src/patches/reiser4-for-2.6.20.patch
 Documentation/Changes | 12 +
 Documentation/filesystems/reiser4.txt | 75 +
 arch/i386/lib/usercopy.c | 2 +
 fs/Kconfig | 2 +
 fs/Makefile | 1 +
 fs/fs-writeback.c | 26 +-
 fs/reiser4/Kconfig | 32 +
 fs/reiser4/Makefile | 99 +
 fs/reiser4/README | 125 +
 fs/reiser4/as_ops.c | 339 +++
 fs/reiser4/block_alloc.c | 1137 ++++++++
 fs/reiser4/block_alloc.h | 175 ++
 fs/reiser4/blocknrset.c | 368 +++
 fs/reiser4/carry.c | 1391 +++++++++
 fs/reiser4/carry.h | 442 +++
 fs/reiser4/carry_ops.c | 2131 ++++++++++++++
 fs/reiser4/carry_ops.h | 42 +
 fs/reiser4/context.c | 288 ++
 fs/reiser4/context.h | 228 ++
 fs/reiser4/coord.c | 935 ++++++
 fs/reiser4/coord.h | 389 +++
 fs/reiser4/debug.c | 308 ++
 fs/reiser4/debug.h | 350 +++
 fs/reiser4/dformat.h | 70 +
 fs/reiser4/dscale.c | 174 ++
 fs/reiser4/dscale.h | 27 +
 fs/reiser4/entd.c | 335 +++
 fs/reiser4/entd.h | 90 +
 fs/reiser4/eottl.c | 509 ++++
 fs/reiser4/estimate.c | 120 +
 fs/reiser4/export_ops.c | 295 ++
 fs/reiser4/flush.c | 3622 ++++++++++++++++++++++++
 fs/reiser4/flush.h | 274 ++
 fs/reiser4/flush_queue.c | 680 +++++
 fs/reiser4/forward.h | 256 ++
 fs/reiser4/fsdata.c | 804 ++++++
 fs/reiser4/fsdata.h | 207 ++
 fs/reiser4/init_super.c | 750 +++++
 fs/reiser4/inode.c | 709 +++++
 fs/reiser4/inode.h | 438 +++
 fs/reiser4/ioctl.h | 41 +
 fs/reiser4/jnode.c | 1925 +++++++++++++
 fs/reiser4/jnode.h | 705 +++++
 fs/reiser4/kassign.c | 661 +++++
 fs/reiser4/kassign.h | 110 +
 fs/reiser4/key.c | 137 +
 fs/reiser4/key.h | 384 +++
 fs/reiser4/ktxnmgrd.c | 215 ++
 fs/reiser4/ktxnmgrd.h | 52 +
 fs/reiser4/lock.c | 1232 ++++++++
 fs/reiser4/lock.h | 249 ++
 fs/reiser4/oid.c | 141 +
 fs/reiser4/page_cache.c | 736 +++++
 fs/reiser4/page_cache.h | 68 +
 fs/reiser4/plugin/Makefile | 26 +
 fs/reiser4/plugin/cluster.c | 71 +
 fs/reiser4/plugin/cluster.h | 343 +++
 fs/reiser4/plugin/compress/Makefile | 6 +
 fs/reiser4/plugin/compress/compress.c | 381 +++
 fs/reiser4/plugin/compress/compress.h | 38 +
 fs/reiser4/plugin/compress/compress_mode.c | 162 ++
 fs/reiser4/plugin/compress/lzoconf.h | 216 ++
 fs/reiser4/plugin/compress/minilzo.c | 1967 +++++++++++++
 fs/reiser4/plugin/compress/minilzo.h | 70 +
 fs/reiser4/plugin/crypto/cipher.c | 37 +
 fs/reiser4/plugin/crypto/cipher.h | 55 +
 fs/reiser4/plugin/crypto/digest.c | 58 +
 fs/reiser4/plugin/dir/Makefile | 5 +
 fs/reiser4/plugin/dir/dir.h | 36 +
 fs/reiser4/plugin/dir/hashed_dir.c | 81 +
 fs/reiser4/plugin/dir/seekable_dir.c | 46 +
 fs/reiser4/plugin/dir_plugin_common.c | 872 ++++++
 fs/reiser4/plugin/disk_format/Makefile | 5 +
 fs/reiser4/plugin/disk_format/disk_format.c | 38 +
 fs/reiser4/plugin/disk_format/disk_format.h | 27 +
 fs/reiser4/plugin/disk_format/disk_format40.c | 655 +++++
 fs/reiser4/plugin/disk_format/disk_format40.h | 109 +
 fs/reiser4/plugin/fibration.c | 175 ++
 fs/reiser4/plugin/fibration.h | 37 +
 fs/reiser4/plugin/file/Makefile | 7 +
 fs/reiser4/plugin/file/cryptcompress.c | 3760 +++++++++++++++++++++++++
 fs/reiser4/plugin/file/cryptcompress.h | 554 ++++
 fs/reiser4/plugin/file/file.c | 2820 ++++++++++++++++++
 fs/reiser4/plugin/file/file.h | 272 ++
 fs/reiser4/plugin/file/file_conversion.c | 594 ++++
 fs/reiser4/plugin/file/invert.c | 493 ++++
 fs/reiser4/plugin/file/symfile.c | 87 +
 fs/reiser4/plugin/file/symlink.c | 95 +
 fs/reiser4/plugin/file/tail_conversion.c | 726 +++++
 fs/reiser4/plugin/file_ops.c | 168 ++
 fs/reiser4/plugin/file_ops_readdir.c | 657 +++++
 fs/reiser4/plugin/file_plugin_common.c | 1007 +++++++
 fs/reiser4/plugin/hash.c | 353 +++
 fs/reiser4/plugin/inode_ops.c | 897 ++++++
 fs/reiser4/plugin/inode_ops_rename.c | 914 ++++++
 fs/reiser4/plugin/item/Makefile | 18 +
 fs/reiser4/plugin/item/acl.h | 66 +
 fs/reiser4/plugin/item/blackbox.c | 142 +
 fs/reiser4/plugin/item/blackbox.h | 33 +
 fs/reiser4/plugin/item/cde.c | 1008 +++++++
 fs/reiser4/plugin/item/cde.h | 87 +
 fs/reiser4/plugin/item/ctail.c | 1570 +++++++++++
 fs/reiser4/plugin/item/ctail.h | 97 +
 fs/reiser4/plugin/item/extent.c | 197 ++
 fs/reiser4/plugin/item/extent.h | 231 ++
 fs/reiser4/plugin/item/extent_file_ops.c | 1435 ++++++++++
 fs/reiser4/plugin/item/extent_flush_ops.c | 1028 +++++++
 fs/reiser4/plugin/item/extent_item_ops.c | 889 ++++++
 fs/reiser4/plugin/item/internal.c | 396 +++
 fs/reiser4/plugin/item/internal.h | 57 +
 fs/reiser4/plugin/item/item.c | 719 +++++
 fs/reiser4/plugin/item/item.h | 400 +++
 fs/reiser4/plugin/item/sde.c | 190 ++
 fs/reiser4/plugin/item/sde.h | 66 +
 fs/reiser4/plugin/item/static_stat.c | 1106 ++++++++
 fs/reiser4/plugin/item/static_stat.h | 224 ++
 fs/reiser4/plugin/item/tail.c | 812 ++++++
 fs/reiser4/plugin/item/tail.h | 58 +
 fs/reiser4/plugin/node/Makefile | 5 +
 fs/reiser4/plugin/node/node.c | 131 +
 fs/reiser4/plugin/node/node.h | 272 ++
 fs/reiser4/plugin/node/node40.c | 2924 +++++++++++++++++++
 fs/reiser4/plugin/node/node40.h | 125 +
 fs/reiser4/plugin/object.c | 516 ++++
 fs/reiser4/plugin/object.h | 121 +
 fs/reiser4/plugin/plugin.c | 578 ++++
 fs/reiser4/plugin/plugin.h | 920 ++++++
 fs/reiser4/plugin/plugin_header.h | 144 +
 fs/reiser4/plugin/plugin_set.c | 379 +++
 fs/reiser4/plugin/plugin_set.h | 77 +
 fs/reiser4/plugin/security/Makefile | 4 +
 fs/reiser4/plugin/security/perm.c | 44 +
 fs/reiser4/plugin/security/perm.h | 82 +
 fs/reiser4/plugin/space/Makefile | 4 +
 fs/reiser4/plugin/space/bitmap.c | 1585 +++++++++++
 fs/reiser4/plugin/space/bitmap.h | 47 +
 fs/reiser4/plugin/space/space_allocator.h | 80 +
 fs/reiser4/plugin/tail_policy.c | 113 +
 fs/reiser4/pool.c | 234 ++
 fs/reiser4/pool.h | 55 +
 fs/reiser4/readahead.c | 138 +
 fs/reiser4/readahead.h | 48 +
 fs/reiser4/reiser4.h | 269 ++
 fs/reiser4/safe_link.c | 351 +++
 fs/reiser4/safe_link.h | 29 +
 fs/reiser4/seal.c | 218 ++
 fs/reiser4/seal.h | 49 +
 fs/reiser4/search.c | 1611 +++++++++++
 fs/reiser4/status_flags.c | 175 ++
 fs/reiser4/status_flags.h | 43 +
 fs/reiser4/super.c | 316 +++
 fs/reiser4/super.h | 464 +++
 fs/reiser4/super_ops.c | 730 +++++
 fs/reiser4/tap.c | 377 +++
 fs/reiser4/tap.h | 70 +
 fs/reiser4/tree.c | 1876 ++++++++++++
 fs/reiser4/tree.h | 577 ++++
 fs/reiser4/tree_mod.c | 386 +++
 fs/reiser4/tree_mod.h | 29 +
 fs/reiser4/tree_walk.c | 927 ++++++
 fs/reiser4/tree_walk.h | 125 +
 fs/reiser4/txnmgr.c | 3164 +++++++++++++++++++++
 fs/reiser4/txnmgr.h | 708 +++++
 fs/reiser4/type_safe_hash.h | 320 +++
 fs/reiser4/vfs_ops.c | 259 ++
 fs/reiser4/vfs_ops.h | 53 +
 fs/reiser4/wander.c | 1797 ++++++++++++
 fs/reiser4/wander.h | 135 +
 fs/reiser4/writeout.h | 21 +
 fs/reiser4/znode.c | 1029 +++++++
 fs/reiser4/znode.h | 434 +++
 include/linux/fs.h | 3 +
 lib/radix-tree.c | 1 +
 mm/filemap.c | 5 +
 mm/readahead.c | 1 +
 175 files changed, 79830 insertions(+), 10 deletions(-)

diff --git a/Documentation/Changes b/Documentation/Changes
index 73a8617..49ee889 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -36,6 +36,7 @@ o module-init-tools 0.9.10 # depmod -V
 o e2fsprogs 1.29 # tune2fs
 o jfsutils 1.1.3 # fsck.jfs -V
 o reiserfsprogs 3.6.3 # reiserfsck -V 2>&1|grep reiserfsprogs
+o reiser4progs 1.0.0 # fsck.reiser4 -V
 o xfsprogs 2.6.0 # xfs_db -V
 o pcmciautils 004 # pccardctl -V
 o quota-tools 3.09 # quota -V
@@ -144,6 +145,13 @@ The reiserfsprogs package should be used for reiserfs-3.6.x
 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
 reiserfsck. These utils work on both i386 and alpha platforms.
 
+Reiser4progs
+------------
+
+The reiser4progs package contains utilities for the reiser4 file system.
+Detailed instructions are provided in the README file located at:
+<ftp://ftp.namesys.com/pub/reiser4progs/README>.
+
 Xfsprogs
 --------
 
@@ -322,6 +330,10 @@ Reiserfsprogs
 -------------
 o <http://www.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.6.3.tar.gz>
+Reiser4progs
+------------
+o <ftp://ftp.namesys.com/pub/reiser4progs/>
+
 Xfsprogs
 --------
 o <ftp://oss.sgi.com/projects/xfs/download/>
diff --git a/Documentation/filesystems/reiser4.txt b/Documentation/filesystems/reiser4.txt
new file mode 100644
index 0000000..8e07c9e
--- /dev/null
+++ b/Documentation/filesystems/reiser4.txt
@@ -0,0 +1,75 @@
+Reiser4 filesystem
+==================
+Reiser4 is a file system based on dancing tree algorithms, and is
+described at http://www.namesys.com
+
+
+References
+==========
+web page	http://namesys.com/v4/v4.html
+source code	ftp://ftp.namesys.com/pub/reiser4-for-2.6/
+userland tools	ftp://ftp.namesys.com/pub/reiser4progs/
+install page	http://www.namesys.com/install_v4.html
+
+Compile options
+===============
+Enable reiser4 debug mode
+	This checks everything imaginable while reiser4
+	runs.
+
+Mount options
+=============
+tmgr.atom_max_size=N
+	Atoms containing more than N blocks will be forced to commit.
+	N is decimal.
+	Default is nr_free_pagecache_pages() / 2 at mount time.
+
+tmgr.atom_max_age=N
+	Atoms older than N seconds will be forced to commit. N is decimal.
+	Default is 600.
+
+tmgr.atom_max_flushers=N
+	Limit of concurrent flushers for one atom. 0 means no limit.
+	Default is 0.
+
+tree.cbk_cache.nr_slots=N
+	Number of slots in the cbk cache.
+
+flush.relocate_threshold=N
+	If flush finds more than N adjacent dirty leaf-level blocks, it
+	will force them to be relocated.
+	Default is 64.
+
+flush.relocate_distance=N
+	If flush can find a block allocation closer than at most N from
+	the preceder, it will relocate to that position.
+	Default is 64.
+
+flush.scan_maxnodes=N
+	The maximum number of nodes to scan left on a level during
+	flush.
+	Default is 10000.
+
+optimal_io_size=N
+	Preferred IO size. This value is used to set st_blksize of
+	struct stat.
+	Default is 65536.
+
+bsdgroups
+	Turn on BSD-style gid assignment.
+
+32bittimes
+	By default files in reiser4 have 64 bit timestamps. Files
+	created when the filesystem is mounted with the 32bittimes
+	option will get 32 bit timestamps.
+
+mtflush
+	Turn off concurrent flushing.
+
+nopseudo
+	Disable pseudo files support. See
+	http://namesys.com/v4/pseudo.html for more about pseudo files.
+
+dont_load_bitmap
+	Don't load all bitmap blocks at mount time; this is useful for
+	machines with tiny RAM and large disks.
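
An illustrative invocation combining several of the options documented above might look like the following (the device and mount point are hypothetical):

	mount -t reiser4 -o tmgr.atom_max_age=300,flush.scan_maxnodes=5000,dont_load_bitmap /dev/sdb1 /mnt/data
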
diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
index d22cfc9..bb4a75a 100644
--- a/arch/i386/lib/usercopy.c
+++ b/arch/i386/lib/usercopy.c
@@ -812,6 +812,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
 #endif
 	return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache);
 
 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
 					unsigned long n)
@@ -827,6 +828,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
 #endif
 	return n;
 }
+EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
 
 /**
  * copy_to_user: - Copy a block of data into user space.
diff --git a/fs/Kconfig b/fs/Kconfig
index 8cd2417..5a97039 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -272,6 +272,8 @@ config FS_MBCACHE
 	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
 	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
 
+source "fs/reiser4/Kconfig"
+
 config REISERFS_FS
 	tristate "Reiserfs support"
 	help
diff --git a/fs/Makefile b/fs/Makefile
index b9ffa63..b4c08ce 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_DLM) += dlm/
 
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
+obj-$(CONFIG_REISER4_FS)	+= reiser4/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
 obj-$(CONFIG_EXT4DEV_FS)	+= ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD)		+= jbd/
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a4b142a..cdcff8c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -296,8 +296,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
  * that it can be located for waiting on in __writeback_single_inode().
  *
- * Called under inode_lock.
- *
  * If `bdi' is non-zero then we're being asked to writeback a specific queue.
  * This function assumes that the blockdev superblock's inodes are backed by
  * a variety of queues, so all inodes are searched. For other superblocks,
@@ -313,11 +311,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * on the writer throttling path, and we get decent balancing between many
  * throttled threads: we don't want them all piling up on __wait_on_inode.
  */
-static void
-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+void
+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 {
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
+	spin_lock(&inode_lock);
+
 	if (!wbc->for_kupdate || list_empty(&sb->s_io))
 		list_splice_init(&sb->s_dirty, &sb->s_io);
 
@@ -397,8 +397,19 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 		if (wbc->nr_to_write <= 0)
 			break;
 	}
+	spin_unlock(&inode_lock);
 	return;		/* Leave any unwritten inodes on s_io */
 }
+EXPORT_SYMBOL(generic_sync_sb_inodes);
+
+static void
+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+{
+	if (sb->s_op->sync_inodes)
+		sb->s_op->sync_inodes(sb, wbc);
+	else
+		generic_sync_sb_inodes(sb, wbc);
+}
 
 /*
  * Start writeback of dirty pagecache data against all unlocked inodes.
@@ -439,11 +450,8 @@ restart:
 	 * be unmounted by the time it is released.
 	 */
 	if (down_read_trylock(&sb->s_umount)) {
-		if (sb->s_root) {
-			spin_lock(&inode_lock);
+		if (sb->s_root)
 			sync_sb_inodes(sb, wbc);
-			spin_unlock(&inode_lock);
-		}
 		up_read(&sb->s_umount);
 	}
 	spin_lock(&sb_lock);
@@ -481,9 +489,7 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 		(inodes_stat.nr_inodes - inodes_stat.nr_unused) +
 		nr_dirty + nr_unstable;
 	wbc.nr_to_write += wbc.nr_to_write / 2;		/* Bit more for luck */
-	spin_lock(&inode_lock);
 	sync_sb_inodes(sb, &wbc);
-	spin_unlock(&inode_lock);
 }
 
 /*
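
The fs-writeback.c hunks above turn per-super-block inode write-back into an overridable operation: the old sync_sb_inodes() body becomes the exported generic_sync_sb_inodes(), which now takes inode_lock itself, and the new sync_sb_inodes() dispatches to a filesystem-provided s_op->sync_inodes() when one exists (reiser4 installs one later in this patch). A minimal sketch of a filesystem hooking the new callback; the myfs_* names are hypothetical:

	/* hypothetical example, not part of the patch */
	static void myfs_sync_inodes(struct super_block *sb,
				     struct writeback_control *wbc)
	{
		/* filesystem-specific preparation could go here, e.g.
		 * ordering or capturing work that must happen before
		 * the generic inode loop */
		generic_sync_sb_inodes(sb, wbc);	/* exported above */
	}

	static struct super_operations myfs_super_ops = {
		/* ... other operations ... */
		.sync_inodes = myfs_sync_inodes,
	};
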
diff --git a/fs/reiser4/Kconfig b/fs/reiser4/Kconfig
new file mode 100644
index 0000000..f6e5195
--- /dev/null
+++ b/fs/reiser4/Kconfig
@@ -0,0 +1,32 @@
+config REISER4_FS
+	tristate "Reiser4 (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	select CRYPTO
+	help
+	  Reiser4 is a filesystem that performs all filesystem operations
+	  as atomic transactions, which means that it either performs a
+	  write, or it does not, and in the event of a crash it does not
+	  partially perform it or corrupt it.
+
+	  It stores files in dancing trees, which are like balanced trees but
+	  faster. It packs small files together so that they share blocks
+	  without wasting space. This means you can use it to store really
+	  small files. It also means that it saves you disk space. It avoids
+	  hassling you with anachronisms like having a maximum number of
+	  inodes, and wasting space if you use less than that number.
+
+	  Reiser4 is a distinct filesystem type from reiserfs (V3).
+	  It's therefore not possible to use reiserfs file systems
+	  with reiser4.
+
+	  To learn more about reiser4, go to http://www.namesys.com
+
+config REISER4_DEBUG
+	bool "Enable reiser4 debug mode"
+	depends on REISER4_FS
+	help
+	  Don't use this unless you are debugging reiser4.
+
+	  If unsure, say N.
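
For reference, enabling the options above as a module in a 2.6.20 .config would look like this (a sketch; the select-ed zlib and crypto options are pulled in automatically):

	CONFIG_REISER4_FS=m
	# CONFIG_REISER4_DEBUG is not set
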
diff --git a/fs/reiser4/Makefile b/fs/reiser4/Makefile
new file mode 100644
index 0000000..e78441e
--- /dev/null
+++ b/fs/reiser4/Makefile
@@ -0,0 +1,99 @@
+#
+# reiser4/Makefile
+#
+
+obj-$(CONFIG_REISER4_FS) += reiser4.o
+
+reiser4-y := \
+	   debug.o \
+	   jnode.o \
+	   znode.o \
+	   key.o \
+	   pool.o \
+	   tree_mod.o \
+	   estimate.o \
+	   carry.o \
+	   carry_ops.o \
+	   lock.o \
+	   tree.o \
+	   context.o \
+	   tap.o \
+	   coord.o \
+	   block_alloc.o \
+	   txnmgr.o \
+	   kassign.o \
+	   flush.o \
+	   wander.o \
+	   eottl.o \
+	   search.o \
+	   page_cache.o \
+	   seal.o \
+	   dscale.o \
+	   flush_queue.o \
+	   ktxnmgrd.o \
+	   blocknrset.o \
+	   super.o \
+	   super_ops.o \
+	   fsdata.o \
+	   export_ops.o \
+	   oid.o \
+	   tree_walk.o \
+	   inode.o \
+	   vfs_ops.o \
+	   as_ops.o \
+	   entd.o \
+	   readahead.o \
+	   status_flags.o \
+	   init_super.o \
+	   safe_link.o \
+	   \
+	   plugin/plugin.o \
+	   plugin/plugin_set.o \
+	   plugin/node/node.o \
+	   plugin/object.o \
+	   plugin/cluster.o \
+	   plugin/inode_ops.o \
+	   plugin/inode_ops_rename.o \
+	   plugin/file_ops.o \
+	   plugin/file_ops_readdir.o \
+	   plugin/file_plugin_common.o \
+	   plugin/file/file.o \
+	   plugin/file/tail_conversion.o \
+	   plugin/file/file_conversion.o \
+	   plugin/file/symlink.o \
+	   plugin/file/cryptcompress.o \
+	   plugin/dir_plugin_common.o \
+	   plugin/dir/hashed_dir.o \
+	   plugin/dir/seekable_dir.o \
+	   plugin/node/node40.o \
+	   \
+	   plugin/crypto/cipher.o \
+	   plugin/crypto/digest.o \
+	   \
+	   plugin/compress/minilzo.o \
+	   plugin/compress/compress.o \
+	   plugin/compress/compress_mode.o \
+	   \
+	   plugin/item/static_stat.o \
+	   plugin/item/sde.o \
+	   plugin/item/cde.o \
+	   plugin/item/blackbox.o \
+	   plugin/item/internal.o \
+	   plugin/item/tail.o \
+	   plugin/item/ctail.o \
+	   plugin/item/extent.o \
+	   plugin/item/extent_item_ops.o \
+	   plugin/item/extent_file_ops.o \
+	   plugin/item/extent_flush_ops.o \
+	   \
+	   plugin/hash.o \
+	   plugin/fibration.o \
+	   plugin/tail_policy.o \
+	   plugin/item/item.o \
+	   \
+	   plugin/security/perm.o \
+	   plugin/space/bitmap.o \
+	   \
+	   plugin/disk_format/disk_format40.o \
+	   plugin/disk_format/disk_format.o
+
diff --git a/fs/reiser4/README b/fs/reiser4/README
new file mode 100644
index 0000000..4637f59
--- /dev/null
+++ b/fs/reiser4/README
@@ -0,0 +1,125 @@
+[LICENSING]
+
+Reiser4 is hereby licensed under the GNU General
+Public License version 2.
+
+Source code files that contain the phrase "licensing governed by
+reiser4/README" are "governed files" throughout this file. Governed
+files are licensed under the GPL. The portions of them owned by Hans
+Reiser, or authorized to be licensed by him, have been in the past,
+and likely will be in the future, licensed to other parties under
+other licenses. If you add your code to governed files, and don't
+want it to be owned by Hans Reiser, put your copyright label on that
+code so the poor blight and his customers can keep things straight.
+All portions of governed files not labeled otherwise are owned by Hans
+Reiser, and by adding your code to it, widely distributing it to
+others or sending us a patch, and leaving the sentence in stating that
+licensing is governed by the statement in this file, you accept this.
+It will be a kindness if you identify whether Hans Reiser is allowed
+to license code labeled as owned by you on your behalf other than
+under the GPL, because he wants to know if it is okay to do so and put
+a check in the mail to you (for non-trivial improvements) when he
+makes his next sale. He makes no guarantees as to the amount if any,
+though he feels motivated to motivate contributors, and you can surely
+discuss this with him before or after contributing. You have the
+right to decline to allow him to license your code contribution other
+than under the GPL.
+
+Further licensing options are available for commercial and/or other
+interests directly from Hans Reiser: reiser@namesys.com. If you interpret
+the GPL as not allowing those additional licensing options, you read
+it wrongly, and Richard Stallman agrees with me, when carefully read
+you can see that those restrictions on additional terms do not apply
+to the owner of the copyright, and my interpretation of this shall
+govern for this license.
+
+[END LICENSING]
+
+Reiser4 is a file system based on dancing tree algorithms, and is
+described at http://www.namesys.com
+
+mkfs.reiser4 and other utilities are on our webpage or wherever your
+Linux provider put them. You really want to be running the latest
+version off the website if you use fsck.
+
+Yes, if you update your reiser4 kernel module you do have to
+recompile your kernel, most of the time. The errors you get will be
+quite cryptic if you forget to do so.
+
+Hideous Commercial Pitch: Spread your development costs across other OS
+vendors. Select from the best in the world, not the best in your
+building, by buying from third party OS component suppliers. Leverage
+the software component development power of the internet. Be the most
+aggressive in taking advantage of the commercial possibilities of
+decentralized internet development, and add value through your branded
+integration that you sell as an operating system. Let your competitors
+be the ones to compete against the entire internet by themselves. Be
+hip, get with the new economic trend, before your competitors do. Send
+email to reiser@namesys.com
+
+Hans Reiser was the primary architect of Reiser4, but a whole team
+chipped their ideas in. He invested everything he had into Namesys
+for 5.5 dark years of no money before Reiser3 finally started to work well
+enough to bring in money. He owns the copyright.
+
+DARPA was the primary sponsor of Reiser4. DARPA does not endorse
+Reiser4, it merely sponsors it. DARPA is, in solely Hans's personal
+opinion, unique in its willingness to invest into things more
+theoretical than the VC community can readily understand, and more
+longterm than allows them to be sure that they will be the ones to
+extract the economic benefits from. DARPA also integrated us into a
+security community that transformed our security worldview.
+
+Vladimir Saveliev is our lead programmer, with us from the beginning,
+and he worked long hours writing the cleanest code. This is why he is
+now the lead programmer after years of commitment to our work. He
+always made the effort to be the best he could be, and to make his
+code the best that it could be. What resulted was quite remarkable. I
+don't think that money can ever motivate someone to work the way he
+did, he is one of the most selfless men I know.
+
+Alexander Lyamin was our sysadmin, and helped to educate us in
+security issues. Moscow State University and IMT were very generous
+in the internet access they provided us, and in lots of other little
+ways that a generous institution can be.
+
+Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the
+locking code, the block allocator, and finished the flushing code.
+His code is always crystal clean and well structured.
+
+Nikita Danilov wrote the core of the balancing code, the core of the
+plugins code, and the directory code. He worked a steady pace of long
+hours that produced a whole lot of well abstracted code. He is our
+senior computer scientist.
+
+Vladimir Demidov wrote the parser. Writing an in kernel parser is
+something very few persons have the skills for, and it is thanks to
+him that we can say that the parser is really not so big compared to
+various bits of our other code, and making a parser work in the kernel
+was not so complicated as everyone would imagine mainly because it was
+him doing it...
+
+Joshua McDonald wrote the transaction manager, and the flush code.
+The flush code unexpectedly turned out be extremely hairy for reasons
+you can read about on our web page, and he did a great job on an
+extremely difficult task.
+
+Nina Reiser handled our accounting, government relations, and much
+more.
+
+Ramon Reiser developed our website.
+
+Beverly Palmer drew our graphics.
+
+Vitaly Fertman developed librepair, userspace plugins repair code, fsck
+and worked with Umka on developing libreiser4 and userspace plugins.
+
+Yury Umanets (aka Umka) developed libreiser4, userspace plugins and
+userspace tools (reiser4progs).
+
+Oleg Drokin (aka Green) is the release manager who fixes everything.
+It is so nice to have someone like that on the team. He (plus Chris
+and Jeff) make it possible for the entire rest of the Namesys team to
+focus on Reiser4, and he fixed a whole lot of Reiser4 bugs also. It
+is just amazing to watch his talent for spotting bugs in action.
+
diff --git a/fs/reiser4/as_ops.c b/fs/reiser4/as_ops.c
new file mode 100644
index 0000000..b4f3375
--- /dev/null
+++ b/fs/reiser4/as_ops.c
@@ -0,0 +1,339 @@
+/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+/* Interface to VFS. Reiser4 address_space_operations are defined here. */
+
+#include "forward.h"
+#include "debug.h"
+#include "dformat.h"
+#include "coord.h"
+#include "plugin/item/item.h"
+#include "plugin/file/file.h"
+#include "plugin/security/perm.h"
+#include "plugin/disk_format/disk_format.h"
+#include "plugin/plugin.h"
+#include "plugin/plugin_set.h"
+#include "plugin/object.h"
+#include "txnmgr.h"
+#include "jnode.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "vfs_ops.h"
+#include "inode.h"
+#include "page_cache.h"
+#include "ktxnmgrd.h"
+#include "super.h"
+#include "reiser4.h"
+#include "entd.h"
+
+#include <linux/profile.h>
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#include <linux/mm.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/list.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/quotaops.h>
+#include <linux/security.h>
+
+/* address space operations */
+
+/**
+ * reiser4_set_page_dirty - set dirty bit, tag in page tree, dirty accounting
+ * @page: page to be dirtied
+ *
+ * Operation of struct address_space_operations. This implementation is used
+ * by the unix and cryptcompress file plugins.
+ *
+ * This is called when a reiser4 page gets dirtied outside of reiser4, for
+ * example, when the dirty bit is moved from the pte to the physical page.
+ *
+ * Tags the page in the mapping's page tree with a special tag so that it is
+ * possible to do all the reiser4-specific work w.r.t. dirty pages (jnode
+ * creation, capturing by an atom) later, because it cannot be done in the
+ * contexts where set_page_dirty is called.
+ */
+int reiser4_set_page_dirty(struct page *page)
+{
+	/* this page can be unformatted only */
+	assert("vs-1734", (page->mapping &&
+			   page->mapping->host &&
+			   reiser4_get_super_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host
+			   && reiser4_get_cc_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host
+			   && reiser4_get_bitmap_fake(page->mapping->host->i_sb) !=
+			   page->mapping->host));
+
+	if (!TestSetPageDirty(page)) {
+		struct address_space *mapping = page->mapping;
+
+		if (mapping) {
+			write_lock_irq(&mapping->tree_lock);
+
+			/* check for race with truncate */
+			if (page->mapping) {
+				assert("vs-1652", page->mapping == mapping);
+				if (mapping_cap_account_dirty(mapping))
+					inc_zone_page_state(page,
+							    NR_FILE_DIRTY);
+				radix_tree_tag_set(&mapping->page_tree,
+						   page->index,
+						   PAGECACHE_TAG_REISER4_MOVED);
+			}
+			write_unlock_irq(&mapping->tree_lock);
+			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+		}
+	}
+	return 0;
+}
+
+/* ->invalidatepage method for reiser4 */
+
+/*
+ * this is called for each truncated page from
+ * truncate_inode_pages()->truncate_{complete,partial}_page().
+ *
+ * At the moment of call, page is under lock, and outstanding io (if any) has
+ * completed.
+ */
+
+/**
+ * reiser4_invalidatepage
+ * @page: page to invalidate
+ * @offset: starting offset for partial invalidation
+ *
+ */
+void reiser4_invalidatepage(struct page *page, unsigned long offset)
+{
+	int ret = 0;
+	reiser4_context *ctx;
+	struct inode *inode;
+	jnode *node;
+
+	/*
+	 * This is called to truncate a file's page.
+	 *
+	 * Originally, reiser4 implemented truncate in a standard way
+	 * (vmtruncate() calls ->invalidatepage() on all truncated pages
+	 * first, then the file system's ->truncate() call-back is invoked).
+	 *
+	 * This led to a problem when ->invalidatepage() was called on a page
+	 * with a jnode that was captured into an atom in the
+	 * ASTAGE_PRE_COMMIT process. That is, truncate was bypassing
+	 * transactions. To avoid this, the try_capture_page_to_invalidate()
+	 * call was added here.
+	 *
+	 * After many troubles with vmtruncate() based truncate (including
+	 * races with flush, tail conversion, etc.) it was re-written in the
+	 * top-to-bottom style: items are killed in reiser4_cut_tree_object()
+	 * and pages belonging to extents are invalidated in kill_hook_extent().
+	 * So probably the additional capture call is no longer needed here.
+	 */
+
+	assert("nikita-3137", PageLocked(page));
+	assert("nikita-3138", !PageWriteback(page));
+	inode = page->mapping->host;
+
+	/*
+	 * ->invalidatepage() should only be called for unformatted
+	 * jnodes. Destruction of all other types of jnodes is performed
+	 * separately. But, in some corner cases (like handling errors
+	 * during mount) it is simpler to let ->invalidatepage be called on
+	 * them. Check for this, and do nothing.
+	 */
+	if (reiser4_get_super_fake(inode->i_sb) == inode)
+		return;
+	if (reiser4_get_cc_fake(inode->i_sb) == inode)
+		return;
+	if (reiser4_get_bitmap_fake(inode->i_sb) == inode)
+		return;
+	assert("vs-1426", PagePrivate(page));
+	assert("vs-1427",
+	       page->mapping == jnode_get_mapping(jnode_by_page(page)));
+	assert("", jprivate(page) != NULL);
+	assert("", ergo(inode_file_plugin(inode) !=
+			file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID),
+			offset == 0));
+
+	ctx = reiser4_init_context(inode->i_sb);
+	if (IS_ERR(ctx))
+		return;
+
+	node = jprivate(page);
+	spin_lock_jnode(node);
+	if (!(node->state & ((1 << JNODE_DIRTY) | (1 << JNODE_FLUSH_QUEUED) |
+	      (1 << JNODE_WRITEBACK) | (1 << JNODE_OVRWR)))) {
+		/* there is no need to capture */
+		jref(node);
+		JF_SET(node, JNODE_HEARD_BANSHEE);
+		page_clear_jnode(page, node);
+		reiser4_uncapture_jnode(node);
+		unhash_unformatted_jnode(node);
+		jput(node);
+		reiser4_exit_context(ctx);
+		return;
+	}
+	spin_unlock_jnode(node);
+
+	/* capture page being truncated. */
+	ret = try_capture_page_to_invalidate(page);
+	if (ret != 0)
+		warning("nikita-3141", "Cannot capture: %i", ret);
+
+	if (offset == 0) {
+		/* remove jnode from transaction and detach it from page. */
+		jref(node);
+		JF_SET(node, JNODE_HEARD_BANSHEE);
+		/* page cannot be detached from jnode concurrently, because it
+		 * is locked */
+		reiser4_uncapture_page(page);
+
+		/* this detaches page from jnode, so that jdelete will not try
+		 * to lock page which is already locked */
+		spin_lock_jnode(node);
+		page_clear_jnode(page, node);
+		spin_unlock_jnode(node);
+		unhash_unformatted_jnode(node);
+
+		jput(node);
+	}
+
+	reiser4_exit_context(ctx);
+}
+
+/* helper function called from reiser4_releasepage(). It returns true if the
+ * jnode can be detached from its page and the page released. */
+int jnode_is_releasable(jnode * node /* node to check */ )
+{
+	assert("nikita-2781", node != NULL);
+	assert_spin_locked(&(node->guard));
+	assert_spin_locked(&(node->load));
+
+	/* if some thread is currently using the jnode page, the latter cannot
+	 * be detached */
+	if (atomic_read(&node->d_count) != 0) {
+		return 0;
+	}
+
+	assert("vs-1214", !jnode_is_loaded(node));
+
+	/*
+	 * can only release the page if a real block number is assigned to
+	 * it. A simple check for ->atom wouldn't do, because it is possible
+	 * for a node to be clean, not in an atom yet, and still have a fake
+	 * block number. For example, a node just created in jinit_new().
+	 */
+	if (reiser4_blocknr_is_fake(jnode_get_block(node)))
+		return 0;
+
+	/*
+	 * pages prepared for write can not be released anyway, so avoid
+	 * detaching jnode from the page
+	 */
+	if (JF_ISSET(node, JNODE_WRITE_PREPARED))
+		return 0;
+
+	/*
+	 * dirty jnode cannot be released. It can however be submitted to disk
+	 * as part of early flushing, but only after getting flush-prepped.
+	 */
+	if (JF_ISSET(node, JNODE_DIRTY))
+		return 0;
+
+	/* overwrite set is only written by log writer. */
+	if (JF_ISSET(node, JNODE_OVRWR))
+		return 0;
+
+	/* jnode is already under writeback */
+	if (JF_ISSET(node, JNODE_WRITEBACK))
+		return 0;
+
+	/* don't flush bitmaps or journal records */
+	if (!jnode_is_znode(node) && !jnode_is_unformatted(node))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * ->releasepage method for reiser4
+ *
+ * This is called by the VM scanner when it comes across a clean page. What we
+ * have to do here is to check whether the page can really be released (freed,
+ * that is) and, if so, detach the jnode from it and remove the page from the
+ * page cache.
+ *
+ * The check for releasability is done by jnode_is_releasable() above.
+ */
+int reiser4_releasepage(struct page *page, gfp_t gfp UNUSED_ARG)
+{
+	jnode *node;
+
+	assert("nikita-2257", PagePrivate(page));
+	assert("nikita-2259", PageLocked(page));
+	assert("nikita-2892", !PageWriteback(page));
+	assert("nikita-3019", reiser4_schedulable());
+
+	/* NOTE-NIKITA: this can be called in the context of a reiser4 call.
+	   It is not clear what to do in this case. A lot of deadlocks seem
+	   possible. */
+	if (page_count(page) > 3)
+		return 0;
+
+	node = jnode_by_page(page);
+	assert("nikita-2258", node != NULL);
+	assert("reiser4-4", page->mapping != NULL);
+	assert("reiser4-5", page->mapping->host != NULL);
+
+	if (PageDirty(page))
+		return 0;
+
+	/* extra page reference is used by reiser4 to protect
+	 * jnode<->page link from this ->releasepage(). */
+	if (page_count(page) > 3)
+		return 0;
+
+	/* releasable() needs jnode lock, because it looks at the jnode fields
+	 * and we need jload_lock here to avoid races with jload(). */
+	spin_lock_jnode(node);
+	spin_lock(&(node->load));
+	if (jnode_is_releasable(node)) {
+		struct address_space *mapping;
+
+		mapping = page->mapping;
+		jref(node);
+		/* there is no need to synchronize against
+		 * jnode_extent_write() here, because pages seen by
+		 * jnode_extent_write() are !releasable(). */
+		page_clear_jnode(page, node);
+		spin_unlock(&(node->load));
+		spin_unlock_jnode(node);
+
+		/* we are under memory pressure so release jnode also. */
+		jput(node);
+
+		return 1;
+	} else {
+		spin_unlock(&(node->load));
+		spin_unlock_jnode(node);
+		assert("nikita-3020", reiser4_schedulable());
+		return 0;
+	}
+}
+
+/* Make Linus happy.
+   Local variables:
+   c-indentation-style: "K&R"
+   mode-name: "LC"
+   c-basic-offset: 8
+   tab-width: 8
+   fill-column: 120
+   End:
+*/
diff --git a/fs/reiser4/block_alloc.c b/fs/reiser4/block_alloc.c
new file mode 100644
index 0000000..c405c5f
--- /dev/null
+++ b/fs/reiser4/block_alloc.c
@@ -0,0 +1,1137 @@
+/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
+
+#include "debug.h"
+#include "dformat.h"
+#include "plugin/plugin.h"
+#include "txnmgr.h"
+#include "znode.h"
+#include "block_alloc.h"
+#include "tree.h"
+#include "super.h"
+
+#include <linux/types.h>	/* for __u?? */
+#include <linux/fs.h>		/* for struct super_block */
+#include <linux/spinlock.h>
+
+/* THE REISER4 DISK SPACE RESERVATION SCHEME. */
+
+/* We need to be able to reserve enough disk space to ensure that an atomic
+   operation will have enough disk space to flush (see flush.c and
+   http://namesys.com/v4/v4.html) and commit it once it is started.
+
+   In our design a call for reserving disk space may fail, but an actual
+   block allocation may not.
+
+   All free blocks, already allocated blocks, and all kinds of reserved blocks
+   are counted in different per-fs block counters.
+
+   A reiser4 super block's set of block counters currently is:
+
+   free -- free blocks,
+   used -- already allocated blocks,
+
+   grabbed -- initially reserved for performing an fs operation. Those blocks
+	   are taken from free blocks; then grabbed disk space leaks from the
+	   grabbed blocks counter to other counters like "fake allocated",
+	   "flush reserved", "used". The rest of the unused grabbed space is
+	   returned to free space at the end of the fs operation;
+
+   fake allocated -- counts all nodes without real disk block numbers assigned;
+		    we have separate accounting for formatted and unformatted
+		    nodes (for easier debugging);
+
+   flush reserved -- disk space needed for flushing and committing an atom.
+		    Each dirty already allocated block could be written as a
+		    part of the atom's overwrite set or as a part of the atom's
+		    relocate set. In both cases one additional block is needed;
+		    it is used as a wandered block if we do overwrite, or as a
+		    new location for a relocated block.
+
+   In addition, blocks in some states are counted on a per-thread and per-atom
+   basis. A reiser4 context has a counter of blocks grabbed by this
+   transaction, and the sb's grabbed blocks counter is a sum of the grabbed
+   blocks counter values of each reiser4 context. Each reiser4 atom has a
+   counter of "flush reserved" blocks, which are reserved for flush processing
+   and atom commit. */
+
+/* AN EXAMPLE: suppose we insert a new item into the reiser4 tree. We estimate
+   the number of blocks to grab for the most expensive case of balancing, when
+   the leaf node we insert the new item into gets split and a new leaf node is
+   allocated.
+
+   So, we need to grab blocks for
+
+   1) one block for possibly dirtying the node we insert an item into. That
+   block would be used for node relocation at flush time or for allocating a
+   wandered one, depending on the result (which set, relocate or overwrite,
+   the node gets assigned to) of the node's processing by the flush algorithm.
+
+   2) one block for either allocating a new node, or dirtying of the right or
+   left clean neighbor; only one case may happen.
+
+   VS-FIXME-HANS: why can only one case happen? I would expect to see dirtying
+   of the left neighbor, the right neighbor, the current node, and creation of
+   a new node. Have I forgotten something? Email me.
+
+   These grabbed blocks are counted in both the reiser4 context "grabbed
+   blocks" counter and in the fs-wide one (both ctx->grabbed_blocks and
+   sbinfo->blocks_grabbed get incremented by 2); the sb's free blocks counter
+   is decremented by 2.
+
+   Suppose both blocks were spent: one for dirtying an already allocated clean
+   node (one block went from "grabbed" to "flush reserved") and one for new
+   block allocation (one block went from "grabbed" to "fake allocated
+   formatted").
+
+   Inserting a child pointer into the parent node caused the parent node to
+   be split; the balancing code takes care of this by grabbing the necessary
+   space immediately, calling reiser4_grab with the BA_RESERVED flag set,
+   which means "can use the 5% reserved disk space".
+
+   At this moment insertion completes and grabbed blocks (if they were not
+   used) should be returned to the free space counter.
+
+   However, the atom life-cycle is not completed. The atom had one "flush
+   reserved" block added by our insertion, and the new fake allocated node is
+   counted as a "fake allocated formatted" one. The atom has to be fully
+   processed by flush before commit. Suppose that the flush moved the first,
+   already allocated node to the atom's overwrite list; the new fake allocated
+   node, obviously, went into the atom's relocate set. The reiser4 flush
+   allocates the new node using one unit from the "fake allocated formatted"
+   counter; the log writer uses one from "flush reserved" for wandered block
+   allocation.
+
+   And that is not the end. When the wandered block is deallocated after the
+   atom gets fully played (see wander.c for the term's description), the disk
+   space occupied by it is returned to free blocks. */
+
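The scheme above is easiest to see as the life-cycle of one counter unit. A minimal sketch, under the assumption that it runs inside an already initialized reiser4 context; the function name is hypothetical, while reiser4_grab_space(), BA_CAN_COMMIT and grabbed2free() are the real entities defined in this file:

	/* Hypothetical illustration of the reservation life-cycle described
	 * above; not part of the patch. */
	static int example_reserve_two_blocks(void)
	{
		int ret;

		/* "free" -= 2, "grabbed" += 2 (fs-wide and per-context);
		 * this is the only step that may fail, with -ENOSPC */
		ret = reiser4_grab_space(2, BA_CAN_COMMIT);
		if (ret != 0)
			return ret;

		/* a tree operation then migrates the units: dirtying an
		 * existing node moves one unit "grabbed" -> "flush reserved";
		 * allocating a new node moves one "grabbed" -> "fake
		 * allocated"; at flush/commit time those move on to "used" */

		/* any units still grabbed flow back "grabbed" -> "free" when
		 * the operation ends (see grabbed2free() below) */
		return 0;
	}
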
+/* BLOCK NUMBERS */
+
+/* Any reiser4 node has a block number assigned to it. We use these numbers for
+   indexing in hash tables, so if a block has not yet been assigned a location
+   on disk we need to give it a temporary fake block number.
+
+   The current implementation of reiser4 uses 64-bit integers for block
+   numbers. We use the highest bit in the 64-bit block number to distinguish
+   fake and real block numbers, so only 63 bits may be used for addressing
+   real device blocks. The "fake" block number space is divided into subspaces
+   of fake block numbers for data blocks and for shadow (working) bitmap
+   blocks.
+
+   Fake block numbers for data blocks are generated by a cyclic counter, which
+   gets incremented after each real block allocation. We assume that it is
+   impossible to overflow this counter during one transaction's life. */
+
+/* Initialize a blocknr hint. */
+void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint)
+{
+	memset(hint, 0, sizeof(reiser4_blocknr_hint));
+}
+
+/* Release any resources of a blocknr hint. */
+void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint UNUSED_ARG)
+{
+	/* No resources should be freed in current blocknr_hint implementation. */
+}
+
+/* see above for explanation of fake block number. */
+/* Audited by: green(2002.06.11) */
+int reiser4_blocknr_is_fake(const reiser4_block_nr * da)
+{
+	/* The reason for not simply returning the result of the '&' operation
+	   is that while the return value is a (possibly 32-bit) int, the
+	   reiser4_block_nr is at least 64 bits long, and the high bit (which
+	   is the only possible non-zero bit after the masking) would be
+	   stripped off. */
+	return (*da & REISER4_FAKE_BLOCKNR_BIT_MASK) ? 1 : 0;
+}
+
+/* Static functions for <reiser4 super block>/<reiser4 context> block counter
+   arithmetic. Mostly, they are isolated so as not to repeat the same
+   assertions in several places. */
+static void sub_from_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+	BUG_ON(ctx->grabbed_blocks < count);
+	assert("zam-527", ctx->grabbed_blocks >= count);
+	ctx->grabbed_blocks -= count;
+}
+
+static void add_to_ctx_grabbed(reiser4_context * ctx, __u64 count)
+{
+	ctx->grabbed_blocks += count;
+}
+
+static void sub_from_sb_grabbed(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("zam-525", sbinfo->blocks_grabbed >= count);
+	sbinfo->blocks_grabbed -= count;
+}
+
+/* Decrease the counter of blocks reserved for flush in the super block. */
+static void
+sub_from_sb_flush_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("vpf-291", sbinfo->blocks_flush_reserved >= count);
+	sbinfo->blocks_flush_reserved -= count;
+}
+
+static void
+sub_from_sb_fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
+			   reiser4_ba_flags_t flags)
+{
+	if (flags & BA_FORMATTED) {
+		assert("zam-806", sbinfo->blocks_fake_allocated >= count);
+		sbinfo->blocks_fake_allocated -= count;
+	} else {
+		assert("zam-528",
+		       sbinfo->blocks_fake_allocated_unformatted >= count);
+		sbinfo->blocks_fake_allocated_unformatted -= count;
+	}
+}
+
+static void sub_from_sb_used(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("zam-530",
+	       sbinfo->blocks_used >= count + sbinfo->min_blocks_used);
+	sbinfo->blocks_used -= count;
+}
+
+static void
+sub_from_cluster_reserved(reiser4_super_info_data * sbinfo, __u64 count)
+{
+	assert("edward-501", sbinfo->blocks_clustered >= count);
+	sbinfo->blocks_clustered -= count;
+}
+
+/* Increase the counter of blocks reserved for flush in the atom. */
+static void add_to_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+	assert("zam-772", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+	atom->flush_reserved += count;
+}
+
+/* Decrease the counter of blocks reserved for flush in the atom. */
+static void sub_from_atom_flush_reserved_nolock(txn_atom * atom, __u32 count)
+{
+	assert("zam-774", atom != NULL);
+	assert_spin_locked(&(atom->alock));
+	assert("nikita-2790", atom->flush_reserved >= count);
+	atom->flush_reserved -= count;
+}
+
+/* The super block keeps a set of block counters: free, used, grabbed, fake
+   allocated (formatted and unformatted), flush reserved and clustered. Their
+   sum must equal the number of blocks on the device; this function checks that. */
+int reiser4_check_block_counters(const struct super_block *super)
+{
+	__u64 sum;
+
+	sum = reiser4_grabbed_blocks(super) + reiser4_free_blocks(super) +
+	    reiser4_data_blocks(super) + reiser4_fake_allocated(super) +
+	    reiser4_fake_allocated_unformatted(super) + reiser4_flush_reserved(super) +
+	    reiser4_clustered_blocks(super);
+	if (reiser4_block_count(super) != sum) {
+		printk("super block counters: "
+		       "used %llu, free %llu, "
+		       "grabbed %llu, fake allocated (formatted %llu, unformatted %llu), "
+		       "reserved %llu, clustered %llu, sum %llu, must be (block count) %llu\n",
+		       (unsigned long long)reiser4_data_blocks(super),
+		       (unsigned long long)reiser4_free_blocks(super),
+		       (unsigned long long)reiser4_grabbed_blocks(super),
+		       (unsigned long long)reiser4_fake_allocated(super),
+		       (unsigned long long)
+		       reiser4_fake_allocated_unformatted(super),
+		       (unsigned long long)reiser4_flush_reserved(super),
+		       (unsigned long long)reiser4_clustered_blocks(super),
+		       (unsigned long long)sum,
+		       (unsigned long long)reiser4_block_count(super));
+		return 0;
+	}
+	return 1;
+}
+
+/* Adjust the "working" free blocks counter for the number of blocks we are
+   going to allocate. Record the number of grabbed blocks in the fs-wide and
+   per-thread counters. This function should be called before bitmap scanning
+   or allocating fake block numbers.
+
+   @super -- pointer to the reiser4 super block;
+   @count -- number of blocks we reserve;
+
+   @return -- 0 on success, -ENOSPC if all free
+   blocks are reserved or already allocated.
+*/
+
+static int
+reiser4_grab(reiser4_context * ctx, __u64 count, reiser4_ba_flags_t flags)
+{
+	__u64 free_blocks;
+	int ret = 0, use_reserved = flags & BA_RESERVED;
+	reiser4_super_info_data *sbinfo;
+
+	assert("vs-1276", ctx == get_current_context());
+
+	/* Do not grab anything on ro-mounted fs. */
+	if (rofs_super(ctx->super)) {
+		ctx->grab_enabled = 0;
+		return 0;
+	}
+
+	sbinfo = get_super_private(ctx->super);
+
+	spin_lock_reiser4_super(sbinfo);
+
+	free_blocks = sbinfo->blocks_free;
+
+	if ((use_reserved && free_blocks < count) ||
+	    (!use_reserved && free_blocks < count + sbinfo->blocks_reserved)) {
+		ret = RETERR(-ENOSPC);
+		goto unlock_and_ret;
+	}
+
+	add_to_ctx_grabbed(ctx, count);
+
+	sbinfo->blocks_grabbed += count;
+	sbinfo->blocks_free -= count;
+
+#if REISER4_DEBUG
+	if (ctx->grabbed_initially == 0)
+		ctx->grabbed_initially = count;
+#endif
+
+	assert("nikita-2986", reiser4_check_block_counters(ctx->super));
+
+	/* disable grab space in current context */
+	ctx->grab_enabled = 0;
+
+unlock_and_ret:
+	spin_unlock_reiser4_super(sbinfo);
+
+	return ret;
+}
+
+int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags)
+{
+	int ret;
+	reiser4_context *ctx;
+
+	assert("nikita-2964", ergo(flags & BA_CAN_COMMIT,
+				   lock_stack_isclean(get_current_lock_stack
+						      ())));
+	ctx = get_current_context();
+	if (!(flags & BA_FORCE) && !is_grab_enabled(ctx)) {
+		return 0;
+	}
+
+	ret = reiser4_grab(ctx, count, flags);
+	if (ret == -ENOSPC) {
+
+		/* Try to commit all transactions if the BA_CAN_COMMIT flag is
+		   present */
+		if (flags & BA_CAN_COMMIT) {
+			txnmgr_force_commit_all(ctx->super, 0);
+			ctx->grab_enabled = 1;
+			ret = reiser4_grab(ctx, count, flags);
+		}
+	}
+	/*
+	 * allocation from the reserved pool cannot fail. This is a severe
+	 * error.
+	 */
+	assert("nikita-3005", ergo(flags & BA_RESERVED, ret == 0));
+	return ret;
+}
+
+/*
+ * SPACE RESERVED FOR UNLINK/TRUNCATE
+ *
+ * Unlink and truncate require space in the transaction (to update stat data,
+ * at least). But we don't want rm(1) to fail with a "No space on device"
+ * error.
+ *
+ * The solution is to reserve 5% of disk space for truncates and
+ * unlinks. Specifically, normal space grabbing requests don't grab space
+ * from the reserved area. Only requests with the BA_RESERVED bit in flags
+ * are allowed to drain it. A per-super-block delete mutex is used to allow
+ * only one thread at a time to grab from the reserved area.
+ *
+ * Grabbing from the reserved area should always be performed with the
+ * BA_CAN_COMMIT flag.
+ */
+
+int reiser4_grab_reserved(struct super_block *super,
+			  __u64 count, reiser4_ba_flags_t flags)
+{
+	reiser4_super_info_data *sbinfo = get_super_private(super);
+
+	assert("nikita-3175", flags & BA_CAN_COMMIT);
+
+	/* Check whether the delete mutex is already taken by us; we assume
+	 * that reading a machine word is atomic. */
+	if (sbinfo->delete_mutex_owner == current) {
+		if (reiser4_grab_space
+		    (count, (flags | BA_RESERVED) & ~BA_CAN_COMMIT)) {
+			warning("zam-1003",
+				"nested call of grab_reserved fails count=(%llu)",
+				(unsigned long long)count);
+			reiser4_release_reserved(super);
+			return RETERR(-ENOSPC);
+		}
+		return 0;
+	}
+
+	if (reiser4_grab_space(count, flags)) {
+		mutex_lock(&sbinfo->delete_mutex);
+		assert("nikita-2929", sbinfo->delete_mutex_owner == NULL);
+		sbinfo->delete_mutex_owner = current;
+
+		if (reiser4_grab_space(count, flags | BA_RESERVED)) {
+			warning("zam-833",
+				"reserved space is not enough (%llu)",
+				(unsigned long long)count);
+			reiser4_release_reserved(super);
+			return RETERR(-ENOSPC);
+		}
+	}
+	return 0;
+}
+
+void reiser4_release_reserved(struct super_block *super)
+{
+	reiser4_super_info_data *info;
+
+	info = get_super_private(super);
+	if (info->delete_mutex_owner == current) {
+		info->delete_mutex_owner = NULL;
+		mutex_unlock(&info->delete_mutex);
+	}
+}
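
Taken together, the pair above implements the "one thread in the reserve" protocol described in the comment. A minimal sketch of a caller, under the assumption that it runs inside a reiser4 context; the function name is hypothetical, while real callers live in the unlink/truncate paths of the plugin code:

	/* Hypothetical caller of the reserved-area protocol; not part of
	 * the patch. */
	static int example_grab_from_reserve(struct super_block *super,
					     __u64 count)
	{
		int ret;

		/* falls back to the 5% reserve (and takes the delete mutex)
		 * only if the normal grab fails; BA_CAN_COMMIT is mandatory */
		ret = reiser4_grab_reserved(super, count, BA_CAN_COMMIT);
		if (ret != 0)
			return ret;	/* RETERR(-ENOSPC) */

		/* ... update stat data, cut tree items ... */

		/* drops the delete mutex if this thread took it */
		reiser4_release_reserved(super);
		return 0;
	}
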
+
+static reiser4_super_info_data *grabbed2fake_allocated_head(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sub_from_ctx_grabbed(ctx, count);
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	/* return sbinfo locked */
+	return sbinfo;
+}
+
+/* is called after @count fake block numbers are allocated and pointers to
+   those blocks are inserted into the tree. */
+static void grabbed2fake_allocated_formatted(void)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = grabbed2fake_allocated_head(1);
+	sbinfo->blocks_fake_allocated++;
+
+	assert("vs-922", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+/**
+ * grabbed2fake_allocated_unformatted
+ * @count:
+ *
+ */
+static void grabbed2fake_allocated_unformatted(int count)
+{
+	reiser4_super_info_data *sbinfo;
+
+	sbinfo = grabbed2fake_allocated_head(count);
+	sbinfo->blocks_fake_allocated_unformatted += count;
+
+	assert("vs-9221", reiser4_check_block_counters(reiser4_get_current_sb()));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void grabbed2cluster_reserved(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sub_from_ctx_grabbed(ctx, count);
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_sb_grabbed(sbinfo, count);
+	sbinfo->blocks_clustered += count;
+
+	assert("edward-504", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+}
+
+void cluster_reserved2grabbed(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+
+	sbinfo = get_super_private(ctx->super);
+	spin_lock_reiser4_super(sbinfo);
+
+	sub_from_cluster_reserved(sbinfo, count);
+	sbinfo->blocks_grabbed += count;
+
+	assert("edward-505", reiser4_check_block_counters(ctx->super));
+
+	spin_unlock_reiser4_super(sbinfo);
+	add_to_ctx_grabbed(ctx, count);
+}
+
+void cluster_reserved2free(int count)
+{
+	reiser4_context *ctx;
+	reiser4_super_info_data *sbinfo;
+
+	ctx = get_current_context();
+	sbinfo = get_super_private(ctx->super);
+
+	cluster_reserved2grabbed(count);
+	grabbed2free(ctx, sbinfo, count);
+}
+
+static DEFINE_SPINLOCK(fake_lock);
+static reiser4_block_nr fake_gen = 0;
+
1541 +/**
1542 + * assign_fake_blocknr
1543 + * @blocknr: where to store the first fake block number of the run
1544 + * @count: number of consecutive fake block numbers to reserve
1545 + *
1546 + * Obtain a fake block number for a new node, which will be used to refer
1547 + * to this newly allocated node until real allocation is done.
1548 + */
1549 +static void assign_fake_blocknr(reiser4_block_nr *blocknr, int count)
1550 +{
1551 + spin_lock(&fake_lock);
1552 + *blocknr = fake_gen;
1553 + fake_gen += count;
1554 + spin_unlock(&fake_lock);
1555 +
1556 + BUG_ON(*blocknr & REISER4_BLOCKNR_STATUS_BIT_MASK);
1557 + /**blocknr &= ~REISER4_BLOCKNR_STATUS_BIT_MASK;*/
1558 + *blocknr |= REISER4_UNALLOCATED_STATUS_VALUE;
1559 + assert("zam-394", zlook(current_tree, blocknr) == NULL);
1560 +}
1561 +
1562 +int assign_fake_blocknr_formatted(reiser4_block_nr * blocknr)
1563 +{
1564 + assign_fake_blocknr(blocknr, 1);
1565 + grabbed2fake_allocated_formatted();
1566 + return 0;
1567 +}
1568 +
1569 +/**
1570 + * fake_blocknr_unformatted
1571 + * @count: number of fake numbers to get
1572 + *
1573 + * Allocates @count fake block numbers which will be assigned to jnodes
1574 + */
1575 +reiser4_block_nr fake_blocknr_unformatted(int count)
1576 +{
1577 + reiser4_block_nr blocknr;
1578 +
1579 + assign_fake_blocknr(&blocknr, count);
1580 + grabbed2fake_allocated_unformatted(count);
1581 +
1582 + return blocknr;
1583 +}
1584 +
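+/* Editor's note: a hedged sketch, not part of the original patch. A caller
+ * creating @n new unformatted nodes could take one run of fake numbers and
+ * hand out consecutive values; assign_to_jnode() below is a hypothetical
+ * placeholder for whatever the caller does with each number:
+ *
+ *    reiser4_block_nr first = fake_blocknr_unformatted(n);
+ *    int i;
+ *
+ *    for (i = 0; i < n; i++)
+ *            assign_to_jnode(jnodes[i], first + i);
+ */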
1585 +/* adjust sb block counters when real (on-disk) block allocation immediately
1586 + follows grabbing of free disk space. */
1587 +static void grabbed2used(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1588 + __u64 count)
1589 +{
1590 + sub_from_ctx_grabbed(ctx, count);
1591 +
1592 + spin_lock_reiser4_super(sbinfo);
1593 +
1594 + sub_from_sb_grabbed(sbinfo, count);
1595 + sbinfo->blocks_used += count;
1596 +
1597 + assert("nikita-2679", reiser4_check_block_counters(ctx->super));
1598 +
1599 + spin_unlock_reiser4_super(sbinfo);
1600 +}
1601 +
1602 +/* adjust sb block counters when @count unallocated blocks get mapped to disk */
1603 +static void fake_allocated2used(reiser4_super_info_data *sbinfo, __u64 count,
1604 + reiser4_ba_flags_t flags)
1605 +{
1606 + spin_lock_reiser4_super(sbinfo);
1607 +
1608 + sub_from_sb_fake_allocated(sbinfo, count, flags);
1609 + sbinfo->blocks_used += count;
1610 +
1611 + assert("nikita-2680",
1612 + reiser4_check_block_counters(reiser4_get_current_sb()));
1613 +
1614 + spin_unlock_reiser4_super(sbinfo);
1615 +}
1616 +
1617 +static void flush_reserved2used(txn_atom * atom, __u64 count)
1618 +{
1619 + reiser4_super_info_data *sbinfo;
1620 +
1621 + assert("zam-787", atom != NULL);
1622 + assert_spin_locked(&(atom->alock));
1623 +
1624 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1625 +
1626 + sbinfo = get_current_super_private();
1627 + spin_lock_reiser4_super(sbinfo);
1628 +
1629 + sub_from_sb_flush_reserved(sbinfo, count);
1630 + sbinfo->blocks_used += count;
1631 +
1632 + assert("zam-789",
1633 + reiser4_check_block_counters(reiser4_get_current_sb()));
1634 +
1635 + spin_unlock_reiser4_super(sbinfo);
1636 +}
1637 +
1638 +/* update the per-fs blocknr hint default value. */
1639 +void
1640 +update_blocknr_hint_default(const struct super_block *s,
1641 + const reiser4_block_nr * block)
1642 +{
1643 + reiser4_super_info_data *sbinfo = get_super_private(s);
1644 +
1645 + assert("nikita-3342", !reiser4_blocknr_is_fake(block));
1646 +
1647 + spin_lock_reiser4_super(sbinfo);
1648 + if (*block < sbinfo->block_count) {
1649 + sbinfo->blocknr_hint_default = *block;
1650 + } else {
1651 + warning("zam-676",
1652 + "block number %llu is too large to be used in a blocknr hint\n",
1653 + (unsigned long long)*block);
1654 + dump_stack();
1655 + DEBUGON(1);
1656 + }
1657 + spin_unlock_reiser4_super(sbinfo);
1658 +}
1659 +
1660 +/* get current value of the default blocknr hint. */
1661 +void get_blocknr_hint_default(reiser4_block_nr * result)
1662 +{
1663 + reiser4_super_info_data *sbinfo = get_current_super_private();
1664 +
1665 + spin_lock_reiser4_super(sbinfo);
1666 + *result = sbinfo->blocknr_hint_default;
1667 + assert("zam-677", *result < sbinfo->block_count);
1668 + spin_unlock_reiser4_super(sbinfo);
1669 +}
1670 +
1671 +/* Allocate "real" disk blocks by calling a proper space allocation plugin
1672 + * method. Blocks are allocated in one contiguous disk region. The plugin
1673 + * independent part accounts blocks by subtracting the allocated amount from
1674 + * the grabbed or fake block counter and adding the same amount to the counter
1675 + * of allocated blocks.
1676 + *
1677 + * @hint -- a reiser4 blocknr hint object which contains further block
1678 + * allocation hints and parameters (search start, a stage of block
1679 + * which will be mapped to disk, etc.),
1680 + * @blk -- an out parameter for the beginning of the allocated region,
1681 + * @len -- in/out parameter, it should contain the maximum number of allocated
1682 + * blocks, after block allocation completes, it contains the length of
1683 + * allocated disk region.
1684 + * @flags -- see reiser4_ba_flags_t description.
1685 + *
1686 + * @return -- 0 on success, error code otherwise.
1687 + */
1688 +int
1689 +reiser4_alloc_blocks(reiser4_blocknr_hint * hint, reiser4_block_nr * blk,
1690 + reiser4_block_nr * len, reiser4_ba_flags_t flags)
1691 +{
1692 + __u64 needed = *len;
1693 + reiser4_context *ctx;
1694 + reiser4_super_info_data *sbinfo;
1695 + int ret;
1696 +
1697 + assert("zam-986", hint != NULL);
1698 +
1699 + ctx = get_current_context();
1700 + sbinfo = get_super_private(ctx->super);
1701 +
1702 + /* For write-optimized data we use default search start value, which is
1703 + * close to last write location. */
1704 + if (flags & BA_USE_DEFAULT_SEARCH_START) {
1705 + get_blocknr_hint_default(&hint->blk);
1706 + }
1707 +
1708 + /* VITALY: allocator should grab this for internal/tx-lists/similar only. */
1709 +/* VS-FIXME-HANS: why is this comment above addressed to vitaly (from vitaly)? */
1710 + if (hint->block_stage == BLOCK_NOT_COUNTED) {
1711 + ret = reiser4_grab_space_force(*len, flags);
1712 + if (ret != 0)
1713 + return ret;
1714 + }
1715 +
1716 + ret =
1717 + sa_alloc_blocks(reiser4_get_space_allocator(ctx->super),
1718 + hint, (int)needed, blk, len);
1719 +
1720 + if (!ret) {
1721 + assert("zam-680", *blk < reiser4_block_count(ctx->super));
1722 + assert("zam-681",
1723 + *blk + *len <= reiser4_block_count(ctx->super));
1724 +
1725 + if (flags & BA_PERMANENT) {
1726 + /* we assume that current atom exists at this moment */
1727 + txn_atom *atom = get_current_atom_locked();
1728 + atom->nr_blocks_allocated += *len;
1729 + spin_unlock_atom(atom);
1730 + }
1731 +
1732 + switch (hint->block_stage) {
1733 + case BLOCK_NOT_COUNTED:
1734 + case BLOCK_GRABBED:
1735 + grabbed2used(ctx, sbinfo, *len);
1736 + break;
1737 + case BLOCK_UNALLOCATED:
1738 + fake_allocated2used(sbinfo, *len, flags);
1739 + break;
1740 + case BLOCK_FLUSH_RESERVED:
1741 + {
1742 + txn_atom *atom = get_current_atom_locked();
1743 + flush_reserved2used(atom, *len);
1744 + spin_unlock_atom(atom);
1745 + }
1746 + break;
1747 + default:
1748 + impossible("zam-531", "wrong block stage");
1749 + }
1750 + } else {
1751 + assert("zam-821",
1752 + ergo(hint->max_dist == 0
1753 + && !hint->backward, ret != -ENOSPC));
1754 + if (hint->block_stage == BLOCK_NOT_COUNTED)
1755 + grabbed2free(ctx, sbinfo, needed);
1756 + }
1757 +
1758 + return ret;
1759 +}
1760 +
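+/* Editor's note: a hedged usage sketch, not part of the original patch.
+ * A caller mapping already-grabbed space to disk might fill the hint like
+ * this (a contiguous region of at most 16 blocks is requested; @len returns
+ * the length actually allocated):
+ *
+ *    reiser4_blocknr_hint hint;
+ *    reiser4_block_nr blk;
+ *    reiser4_block_nr len = 16;
+ *
+ *    reiser4_blocknr_hint_init(&hint);
+ *    hint.block_stage = BLOCK_GRABBED;
+ *    if (reiser4_alloc_blocks(&hint, &blk, &len,
+ *                             BA_PERMANENT | BA_USE_DEFAULT_SEARCH_START) == 0) {
+ *            ... blocks blk .. blk + len - 1 are now counted as used ...
+ *    }
+ *    reiser4_blocknr_hint_done(&hint);
+ */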
1761 +/* used -> fake_allocated -> grabbed -> free */
1762 +
1763 +/* adjust sb block counters when @count unallocated blocks get unmapped from
1764 + disk */
1765 +static void
1766 +used2fake_allocated(reiser4_super_info_data * sbinfo, __u64 count,
1767 + int formatted)
1768 +{
1769 + spin_lock_reiser4_super(sbinfo);
1770 +
1771 + if (formatted)
1772 + sbinfo->blocks_fake_allocated += count;
1773 + else
1774 + sbinfo->blocks_fake_allocated_unformatted += count;
1775 +
1776 + sub_from_sb_used(sbinfo, count);
1777 +
1778 + assert("nikita-2681",
1779 + reiser4_check_block_counters(reiser4_get_current_sb()));
1780 +
1781 + spin_unlock_reiser4_super(sbinfo);
1782 +}
1783 +
1784 +static void
1785 +used2flush_reserved(reiser4_super_info_data * sbinfo, txn_atom * atom,
1786 + __u64 count, reiser4_ba_flags_t flags UNUSED_ARG)
1787 +{
1788 + assert("nikita-2791", atom != NULL);
1789 + assert_spin_locked(&(atom->alock));
1790 +
1791 + add_to_atom_flush_reserved_nolock(atom, (__u32) count);
1792 +
1793 + spin_lock_reiser4_super(sbinfo);
1794 +
1795 + sbinfo->blocks_flush_reserved += count;
1796 + /*add_to_sb_flush_reserved(sbinfo, count); */
1797 + sub_from_sb_used(sbinfo, count);
1798 +
1799 + assert("nikita-2681",
1800 + reiser4_check_block_counters(reiser4_get_current_sb()));
1801 +
1802 + spin_unlock_reiser4_super(sbinfo);
1803 +}
1804 +
1805 +/* disk space virtually used by fake block numbers is counted as "grabbed" again. */
1806 +static void
1807 +fake_allocated2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1808 + __u64 count, reiser4_ba_flags_t flags)
1809 +{
1810 + add_to_ctx_grabbed(ctx, count);
1811 +
1812 + spin_lock_reiser4_super(sbinfo);
1813 +
1814 + assert("nikita-2682", reiser4_check_block_counters(ctx->super));
1815 +
1816 + sbinfo->blocks_grabbed += count;
1817 + sub_from_sb_fake_allocated(sbinfo, count, flags & BA_FORMATTED);
1818 +
1819 + assert("nikita-2683", reiser4_check_block_counters(ctx->super));
1820 +
1821 + spin_unlock_reiser4_super(sbinfo);
1822 +}
1823 +
1824 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags)
1825 +{
1826 + reiser4_context *ctx;
1827 + reiser4_super_info_data *sbinfo;
1828 +
1829 + ctx = get_current_context();
1830 + sbinfo = get_super_private(ctx->super);
1831 +
1832 + fake_allocated2grabbed(ctx, sbinfo, count, flags);
1833 + grabbed2free(ctx, sbinfo, count);
1834 +}
1835 +
1836 +void grabbed2free_mark(__u64 mark)
1837 +{
1838 + reiser4_context *ctx;
1839 + reiser4_super_info_data *sbinfo;
1840 +
1841 + ctx = get_current_context();
1842 + sbinfo = get_super_private(ctx->super);
1843 +
1844 + assert("nikita-3007", (__s64) mark >= 0);
1845 + assert("nikita-3006", ctx->grabbed_blocks >= mark);
1846 + grabbed2free(ctx, sbinfo, ctx->grabbed_blocks - mark);
1847 +}
1848 +
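+/* Editor's note: a hedged sketch, not part of the original patch. A caller
+ * that may grab more than it ends up using can snapshot the counter first
+ * and roll the surplus back afterwards:
+ *
+ *    __u64 mark = get_current_context()->grabbed_blocks;
+ *
+ *    ... grab and (partially) consume space ...
+ *    grabbed2free_mark(mark);  releases everything still grabbed
+ *                              since the snapshot
+ */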
1849 +/**
1850 + * grabbed2free - adjust grabbed and free block counters
1851 + * @ctx: context to update grabbed block counter of
1852 + * @sbinfo: super block to update grabbed and free block counters of
1853 + * @count: number of blocks to adjust counters by
1854 + *
1855 + * Decreases context's and per filesystem's counters of grabbed
1856 + * blocks. Increases per filesystem's counter of free blocks.
1857 + */
1858 +void grabbed2free(reiser4_context *ctx, reiser4_super_info_data *sbinfo,
1859 + __u64 count)
1860 +{
1861 + sub_from_ctx_grabbed(ctx, count);
1862 +
1863 + spin_lock_reiser4_super(sbinfo);
1864 +
1865 + sub_from_sb_grabbed(sbinfo, count);
1866 + sbinfo->blocks_free += count;
1867 + assert("nikita-2684", reiser4_check_block_counters(ctx->super));
1868 +
1869 + spin_unlock_reiser4_super(sbinfo);
1870 +}
1871 +
1872 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count)
1873 +{
1874 + reiser4_context *ctx;
1875 + reiser4_super_info_data *sbinfo;
1876 +
1877 + assert("vs-1095", atom);
1878 +
1879 + ctx = get_current_context();
1880 + sbinfo = get_super_private(ctx->super);
1881 +
1882 + sub_from_ctx_grabbed(ctx, count);
1883 +
1884 + add_to_atom_flush_reserved_nolock(atom, count);
1885 +
1886 + spin_lock_reiser4_super(sbinfo);
1887 +
1888 + sbinfo->blocks_flush_reserved += count;
1889 + sub_from_sb_grabbed(sbinfo, count);
1890 +
1891 + assert("vpf-292", reiser4_check_block_counters(ctx->super));
1892 +
1893 + spin_unlock_reiser4_super(sbinfo);
1894 +}
1895 +
1896 +void grabbed2flush_reserved(__u64 count)
1897 +{
1898 + txn_atom *atom = get_current_atom_locked();
1899 +
1900 + grabbed2flush_reserved_nolock(atom, count);
1901 +
1902 + spin_unlock_atom(atom);
1903 +}
1904 +
1905 +void flush_reserved2grabbed(txn_atom * atom, __u64 count)
1906 +{
1907 + reiser4_context *ctx;
1908 + reiser4_super_info_data *sbinfo;
1909 +
1910 + assert("nikita-2788", atom != NULL);
1911 + assert_spin_locked(&(atom->alock));
1912 +
1913 + ctx = get_current_context();
1914 + sbinfo = get_super_private(ctx->super);
1915 +
1916 + add_to_ctx_grabbed(ctx, count);
1917 +
1918 + sub_from_atom_flush_reserved_nolock(atom, (__u32) count);
1919 +
1920 + spin_lock_reiser4_super(sbinfo);
1921 +
1922 + sbinfo->blocks_grabbed += count;
1923 + sub_from_sb_flush_reserved(sbinfo, count);
1924 +
1925 + assert("vpf-292", reiser4_check_block_counters(ctx->super));
1926 +
1927 + spin_unlock_reiser4_super(sbinfo);
1928 +}
1929 +
1930 +/**
1931 + * all_grabbed2free - releases all blocks grabbed in context
1932 + *
1933 + * Decreases context's and super block's grabbed block counters by number of
1934 + * blocks grabbed by current context and increases super block's free block
1935 + * counter correspondingly.
1936 + */
1937 +void all_grabbed2free(void)
1938 +{
1939 + reiser4_context *ctx = get_current_context();
1940 +
1941 + grabbed2free(ctx, get_super_private(ctx->super), ctx->grabbed_blocks);
1942 +}
1943 +
1944 +/* adjust sb block counters if real (on-disk) blocks do not become unallocated
1945 + after freeing; @count blocks become "grabbed" instead. */
1946 +static void
1947 +used2grabbed(reiser4_context * ctx, reiser4_super_info_data * sbinfo,
1948 + __u64 count)
1949 +{
1950 + add_to_ctx_grabbed(ctx, count);
1951 +
1952 + spin_lock_reiser4_super(sbinfo);
1953 +
1954 + sbinfo->blocks_grabbed += count;
1955 + sub_from_sb_used(sbinfo, count);
1956 +
1957 + assert("nikita-2685", reiser4_check_block_counters(ctx->super));
1958 +
1959 + spin_unlock_reiser4_super(sbinfo);
1960 +}
1961 +
1962 +/* this used to be done through used2grabbed and grabbed2free */
1963 +static void used2free(reiser4_super_info_data * sbinfo, __u64 count)
1964 +{
1965 + spin_lock_reiser4_super(sbinfo);
1966 +
1967 + sbinfo->blocks_free += count;
1968 + sub_from_sb_used(sbinfo, count);
1969 +
1970 + assert("nikita-2685",
1971 + reiser4_check_block_counters(reiser4_get_current_sb()));
1972 +
1973 + spin_unlock_reiser4_super(sbinfo);
1974 +}
1975 +
1976 +#if REISER4_DEBUG
1977 +
1978 +/* check "allocated" state of given block range */
1979 +static void
1980 +reiser4_check_blocks(const reiser4_block_nr * start,
1981 + const reiser4_block_nr * len, int desired)
1982 +{
1983 + sa_check_blocks(start, len, desired);
1984 +}
1985 +
1986 +/* check "allocated" state of given block */
1987 +void reiser4_check_block(const reiser4_block_nr * block, int desired)
1988 +{
1989 + const reiser4_block_nr one = 1;
1990 +
1991 + reiser4_check_blocks(block, &one, desired);
1992 +}
1993 +
1994 +#endif
1995 +
1996 +/* The block deallocation function may do an actual deallocation through the
1997 + space allocator plugin or store deleted block numbers in the atom's
1998 + delete_set data structure, depending on the BA_DEFER flag. */
1999 +
2000 +/* If the BA_DEFER bit is not set, @target_stage means the stage of blocks
2001 + to be deleted from the WORKING bitmap: they might be just unmapped from
2002 + disk, freed while their disk space is still grabbed by the current thread,
2003 + or not counted in any reiser4 sb block counters; see the block_stage_t comment */
2004 +
2005 +/* The BA_FORMATTED bit is only used when BA_DEFER is not present: it is used
2006 + to distinguish blocks allocated for unformatted and formatted nodes */
2007 +
2008 +int
2009 +reiser4_dealloc_blocks(const reiser4_block_nr * start,
2010 + const reiser4_block_nr * len,
2011 + block_stage_t target_stage, reiser4_ba_flags_t flags)
2012 +{
2013 + txn_atom *atom = NULL;
2014 + int ret;
2015 + reiser4_context *ctx;
2016 + reiser4_super_info_data *sbinfo;
2017 +
2018 + ctx = get_current_context();
2019 + sbinfo = get_super_private(ctx->super);
2020 +
2021 + if (REISER4_DEBUG) {
2022 + assert("zam-431", *len != 0);
2023 + assert("zam-432", *start != 0);
2024 + assert("zam-558", !reiser4_blocknr_is_fake(start));
2025 +
2026 + spin_lock_reiser4_super(sbinfo);
2027 + assert("zam-562", *start < sbinfo->block_count);
2028 + spin_unlock_reiser4_super(sbinfo);
2029 + }
2030 +
2031 + if (flags & BA_DEFER) {
2032 + blocknr_set_entry *bsep = NULL;
2033 +
2034 + /* storing deleted block numbers in a blocknr set
2035 + data structure for further actual deletion */
2036 + do {
2037 + atom = get_current_atom_locked();
2038 + assert("zam-430", atom != NULL);
2039 +
2040 + ret =
2041 + blocknr_set_add_extent(atom, &atom->delete_set,
2042 + &bsep, start, len);
2043 +
2044 + if (ret == -ENOMEM)
2045 + return ret;
2046 +
2047 + /* This loop might spin at most two times */
2048 + } while (ret == -E_REPEAT);
2049 +
2050 + assert("zam-477", ret == 0);
2051 + assert("zam-433", atom != NULL);
2052 +
2053 + spin_unlock_atom(atom);
2054 +
2055 + } else {
2056 + assert("zam-425", get_current_super_private() != NULL);
2057 + sa_dealloc_blocks(reiser4_get_space_allocator(ctx->super),
2058 + *start, *len);
2059 +
2060 + if (flags & BA_PERMANENT) {
2061 + /* These blocks were counted as allocated; we have to revert
2062 + * that if the allocation is discarded. */
2063 + txn_atom *atom = get_current_atom_locked();
2064 + atom->nr_blocks_allocated -= *len;
2065 + spin_unlock_atom(atom);
2066 + }
2067 +
2068 + switch (target_stage) {
2069 + case BLOCK_NOT_COUNTED:
2070 + assert("vs-960", flags & BA_FORMATTED);
2071 + /* VITALY: This is what was grabbed for internal/tx-lists/similar only */
2072 + used2free(sbinfo, *len);
2073 + break;
2074 +
2075 + case BLOCK_GRABBED:
2076 + used2grabbed(ctx, sbinfo, *len);
2077 + break;
2078 +
2079 + case BLOCK_UNALLOCATED:
2080 + used2fake_allocated(sbinfo, *len, flags & BA_FORMATTED);
2081 + break;
2082 +
2083 + case BLOCK_FLUSH_RESERVED:{
2084 + txn_atom *atom;
2085 +
2086 + atom = get_current_atom_locked();
2087 + used2flush_reserved(sbinfo, atom, *len,
2088 + flags & BA_FORMATTED);
2089 + spin_unlock_atom(atom);
2090 + break;
2091 + }
2092 + default:
2093 + impossible("zam-532", "wrong block stage");
2094 + }
2095 + }
2096 +
2097 + return 0;
2098 +}
2099 +
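+/* Editor's note: a hedged usage sketch, not part of the original patch.
+ * The two flavours of deallocation above might be invoked as follows.
+ *
+ * Deferred: the block numbers are only recorded in the atom's delete_set;
+ * the working bitmap is updated at commit time by reiser4_post_commit_hook()
+ * through apply_dset(), and @target_stage is not consulted:
+ *
+ *    reiser4_dealloc_blocks(&start, &len, BLOCK_NOT_COUNTED,
+ *                           BA_DEFER | BA_FORMATTED);
+ *
+ * Immediate: the space allocator is called right away, and with
+ * BLOCK_GRABBED the freed space is counted back as grabbed by the
+ * current thread:
+ *
+ *    reiser4_dealloc_blocks(&start, &len, BLOCK_GRABBED, BA_FORMATTED);
+ */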
2100 +/* wrappers for block allocator plugin methods */
2101 +int reiser4_pre_commit_hook(void)
2102 +{
2103 + assert("zam-502", get_current_super_private() != NULL);
2104 + sa_pre_commit_hook();
2105 + return 0;
2106 +}
2107 +
2108 +/* an actor which applies delete set to block allocator data */
2109 +static int
2110 +apply_dset(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
2111 + const reiser4_block_nr * b, void *data UNUSED_ARG)
2112 +{
2113 + reiser4_context *ctx;
2114 + reiser4_super_info_data *sbinfo;
2115 +
2116 + __u64 len = 1;
2117 +
2118 + ctx = get_current_context();
2119 + sbinfo = get_super_private(ctx->super);
2120 +
2121 + assert("zam-877", atom->stage >= ASTAGE_PRE_COMMIT);
2122 + assert("zam-552", sbinfo != NULL);
2123 +
2124 + if (b != NULL)
2125 + len = *b;
2126 +
2127 + if (REISER4_DEBUG) {
2128 + spin_lock_reiser4_super(sbinfo);
2129 +
2130 + assert("zam-554", *a < reiser4_block_count(ctx->super));
2131 + assert("zam-555", *a + len <= reiser4_block_count(ctx->super));
2132 +
2133 + spin_unlock_reiser4_super(sbinfo);
2134 + }
2135 +
2136 + sa_dealloc_blocks(&sbinfo->space_allocator, *a, len);
2137 + /* adjust sb block counters */
2138 + used2free(sbinfo, len);
2139 + return 0;
2140 +}
2141 +
2142 +void reiser4_post_commit_hook(void)
2143 +{
2144 + txn_atom *atom;
2145 +
2146 + atom = get_current_atom_locked();
2147 + assert("zam-452", atom->stage == ASTAGE_POST_COMMIT);
2148 + spin_unlock_atom(atom);
2149 +
2150 + /* do the block deallocation which was deferred
2151 + until commit is done */
2152 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset, NULL, 1);
2153 +
2154 + assert("zam-504", get_current_super_private() != NULL);
2155 + sa_post_commit_hook();
2156 +}
2157 +
2158 +void reiser4_post_write_back_hook(void)
2159 +{
2160 + assert("zam-504", get_current_super_private() != NULL);
2161 +
2162 + sa_post_commit_hook();
2163 +}
2164 +
2165 +/*
2166 + Local variables:
2167 + c-indentation-style: "K&R"
2168 + mode-name: "LC"
2169 + c-basic-offset: 8
2170 + tab-width: 8
2171 + fill-column: 120
2172 + scroll-step: 1
2173 + End:
2174 +*/
2175 diff --git a/fs/reiser4/block_alloc.h b/fs/reiser4/block_alloc.h
2176 new file mode 100644
2177 index 0000000..f4b79f8
2178 --- /dev/null
2179 +++ b/fs/reiser4/block_alloc.h
2180 @@ -0,0 +1,175 @@
2181 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2182 +
2183 +#if !defined (__FS_REISER4_BLOCK_ALLOC_H__)
2184 +#define __FS_REISER4_BLOCK_ALLOC_H__
2185 +
2186 +#include "dformat.h"
2187 +#include "forward.h"
2188 +
2189 +#include <linux/types.h> /* for __u?? */
2190 +#include <linux/fs.h>
2191 +
2192 +/* Mask which, when applied to a given block number, shows whether that block number is a fake one */
2193 +#define REISER4_FAKE_BLOCKNR_BIT_MASK 0x8000000000000000ULL
2194 +/* Mask which isolates the type of object this fake block number was assigned to */
2195 +#define REISER4_BLOCKNR_STATUS_BIT_MASK 0xC000000000000000ULL
2196 +
2197 +/* The result of applying REISER4_BLOCKNR_STATUS_BIT_MASK should be compared
2198 + against these two values to tell whether the object is unallocated or a
2199 + bitmap shadow object (a WORKING BITMAP block, see plugin/space/bitmap.c) */
2200 +#define REISER4_UNALLOCATED_STATUS_VALUE 0xC000000000000000ULL
2201 +#define REISER4_BITMAP_BLOCKS_STATUS_VALUE 0x8000000000000000ULL
2202 +
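+/* Editor's note: an illustrative sketch, not part of the original patch.
+ * Decoding a block number against the masks above might look like:
+ *
+ *    static inline int blocknr_is_unallocated(const reiser4_block_nr *b)
+ *    {
+ *            return (*b & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
+ *                    REISER4_UNALLOCATED_STATUS_VALUE;
+ *    }
+ *
+ * The in-tree helper for the plain "fake" test is reiser4_blocknr_is_fake(),
+ * declared below.
+ */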
2203 +/* specification how block allocation was counted in sb block counters */
2204 +typedef enum {
2205 + BLOCK_NOT_COUNTED = 0, /* reiser4 has no info about this block yet */
2206 + BLOCK_GRABBED = 1, /* free space grabbed for further allocation
2207 + of this block */
2208 + BLOCK_FLUSH_RESERVED = 2, /* block is reserved for flush needs. */
2209 + BLOCK_UNALLOCATED = 3, /* block is used for existing in-memory object
2210 + (unallocated formatted or unformatted
2211 + node) */
2212 + BLOCK_ALLOCATED = 4 /* block is mapped to disk, real on-disk block
2213 + number assigned */
2214 +} block_stage_t;
2215 +
2216 +/* a hint for block allocator */
2217 +struct reiser4_blocknr_hint {
2218 + /* FIXME: I think we want to add a longterm lock on the bitmap block here. This
2219 + is to prevent jnode_flush() calls from interleaving allocations on the same
2220 + bitmap, once a hint is established. */
2221 +
2222 + /* search start hint */
2223 + reiser4_block_nr blk;
2224 + /* if not zero, it is a region size we search for free blocks in */
2225 + reiser4_block_nr max_dist;
2226 + /* level for allocation; it may be useful to have branch-level and
2227 + higher levels write-optimized. */
2228 + tree_level level;
2229 + /* block allocator assumes that blocks, which will be mapped to disk,
2230 + are in this specified block_stage */
2231 + block_stage_t block_stage;
2232 + /* If direction = 1 allocate blocks in backward direction from the end
2233 + * of disk to the beginning of disk. */
2234 + unsigned int backward:1;
2235 +
2236 +};
2237 +
2238 +/* These flags control block allocation/deallocation behavior */
2239 +enum reiser4_ba_flags {
2240 + /* do allocations from reserved (5%) area */
2241 + BA_RESERVED = (1 << 0),
2242 +
2243 + /* block allocator can do commit trying to recover free space */
2244 + BA_CAN_COMMIT = (1 << 1),
2245 +
2246 + /* if operation will be applied to formatted block */
2247 + BA_FORMATTED = (1 << 2),
2248 +
2249 + /* defer actual block freeing until transaction commit */
2250 + BA_DEFER = (1 << 3),
2251 +
2252 + /* allocate blocks for permanent fs objects (formatted or unformatted),
2253 + not wandered or log blocks */
2254 + BA_PERMANENT = (1 << 4),
2255 +
2256 + /* grab space even if it was disabled */
2257 + BA_FORCE = (1 << 5),
2258 +
2259 + /* use default start value for free blocks search. */
2260 + BA_USE_DEFAULT_SEARCH_START = (1 << 6)
2261 +};
2262 +
2263 +typedef enum reiser4_ba_flags reiser4_ba_flags_t;
2264 +
2265 +extern void reiser4_blocknr_hint_init(reiser4_blocknr_hint * hint);
2266 +extern void reiser4_blocknr_hint_done(reiser4_blocknr_hint * hint);
2267 +extern void update_blocknr_hint_default(const struct super_block *,
2268 + const reiser4_block_nr *);
2269 +extern void get_blocknr_hint_default(reiser4_block_nr *);
2270 +
2271 +extern reiser4_block_nr reiser4_fs_reserved_space(struct super_block *super);
2272 +
2273 +int assign_fake_blocknr_formatted(reiser4_block_nr *);
2274 +reiser4_block_nr fake_blocknr_unformatted(int);
2275 +
2276 +/* free -> grabbed -> fake_allocated -> used */
2277 +
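+/* Editor's note: a hedged sketch of the whole counter lifecycle, not part
+ * of the original patch. For one new formatted node the stages named in
+ * the comment above might be traversed like this (error handling omitted):
+ *
+ *    reiser4_block_nr fake, real, len = 1;
+ *    reiser4_blocknr_hint hint;
+ *
+ *    reiser4_grab_space(1, 0);                   free -> grabbed
+ *    assign_fake_blocknr_formatted(&fake);       grabbed -> fake_allocated
+ *
+ *    reiser4_blocknr_hint_init(&hint);
+ *    hint.block_stage = BLOCK_UNALLOCATED;
+ *    reiser4_alloc_blocks(&hint, &real, &len,
+ *                         BA_PERMANENT | BA_FORMATTED);
+ *                                                fake_allocated -> used
+ *    reiser4_blocknr_hint_done(&hint);
+ */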
2278 +int reiser4_grab_space(__u64 count, reiser4_ba_flags_t flags);
2279 +void all_grabbed2free(void);
2280 +void grabbed2free(reiser4_context *, reiser4_super_info_data *, __u64 count);
2281 +void fake_allocated2free(__u64 count, reiser4_ba_flags_t flags);
2282 +void grabbed2flush_reserved_nolock(txn_atom * atom, __u64 count);
2283 +void grabbed2flush_reserved(__u64 count);
2284 +int reiser4_alloc_blocks(reiser4_blocknr_hint * hint,
2285 + reiser4_block_nr * start,
2286 + reiser4_block_nr * len, reiser4_ba_flags_t flags);
2287 +int reiser4_dealloc_blocks(const reiser4_block_nr *,
2288 + const reiser4_block_nr *,
2289 + block_stage_t, reiser4_ba_flags_t flags);
2290 +
2291 +static inline int reiser4_alloc_block(reiser4_blocknr_hint * hint,
2292 + reiser4_block_nr * start,
2293 + reiser4_ba_flags_t flags)
2294 +{
2295 + reiser4_block_nr one = 1;
2296 + return reiser4_alloc_blocks(hint, start, &one, flags);
2297 +}
2298 +
2299 +static inline int reiser4_dealloc_block(const reiser4_block_nr * block,
2300 + block_stage_t stage,
2301 + reiser4_ba_flags_t flags)
2302 +{
2303 + const reiser4_block_nr one = 1;
2304 + return reiser4_dealloc_blocks(block, &one, stage, flags);
2305 +}
2306 +
2307 +#define reiser4_grab_space_force(count, flags) \
2308 + reiser4_grab_space(count, flags | BA_FORCE)
2309 +
2310 +extern void grabbed2free_mark(__u64 mark);
2311 +extern int reiser4_grab_reserved(struct super_block *,
2312 + __u64, reiser4_ba_flags_t);
2313 +extern void reiser4_release_reserved(struct super_block *super);
2314 +
2315 +/* grabbed -> fake_allocated */
2316 +
2317 +/* fake_allocated -> used */
2318 +
2319 +/* used -> fake_allocated -> grabbed -> free */
2320 +
2321 +extern void flush_reserved2grabbed(txn_atom * atom, __u64 count);
2322 +
2323 +extern int reiser4_blocknr_is_fake(const reiser4_block_nr * da);
2324 +
2325 +extern void grabbed2cluster_reserved(int count);
2326 +extern void cluster_reserved2grabbed(int count);
2327 +extern void cluster_reserved2free(int count);
2328 +
2329 +extern int reiser4_check_block_counters(const struct super_block *);
2330 +
2331 +#if REISER4_DEBUG
2332 +
2333 +extern void reiser4_check_block(const reiser4_block_nr *, int);
2334 +
2335 +#else
2336 +
2337 +# define reiser4_check_block(beg, val) noop
2338 +
2339 +#endif
2340 +
2341 +extern int reiser4_pre_commit_hook(void);
2342 +extern void reiser4_post_commit_hook(void);
2343 +extern void reiser4_post_write_back_hook(void);
2344 +
2345 +#endif /* __FS_REISER4_BLOCK_ALLOC_H__ */
2346 +
2347 +/* Make Linus happy.
2348 + Local variables:
2349 + c-indentation-style: "K&R"
2350 + mode-name: "LC"
2351 + c-basic-offset: 8
2352 + tab-width: 8
2353 + fill-column: 120
2354 + End:
2355 +*/
2356 diff --git a/fs/reiser4/blocknrset.c b/fs/reiser4/blocknrset.c
2357 new file mode 100644
2358 index 0000000..da50a5a
2359 --- /dev/null
2360 +++ b/fs/reiser4/blocknrset.c
2361 @@ -0,0 +1,368 @@
2362 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2363 +
2364 +/* This file contains code for various block number sets used by the atom to
2365 + track the deleted set and wandered block mappings. */
2366 +
2367 +#include "debug.h"
2368 +#include "dformat.h"
2369 +#include "txnmgr.h"
2370 +#include "context.h"
2371 +
2372 +#include <linux/slab.h>
2373 +
2374 +/* The data structure for storing unordered block number sets is a list of
2375 +   elements, each of which contains an array of block numbers and/or an array
2376 +   of block number pairs. Such an element, called a blocknr_set_entry, stores
2377 +   single block numbers from the beginning of its data field and pairs (used
2378 +   for extents) from its end. The ->nr_singles and ->nr_pairs fields count
2379 +   the numbers of blocks and extents.
2380 +
2381 +   +------------------- blocknr_set_entry->data ------------------+
2382 +   |block1|block2| ...     <free space>     ... |pair3|pair2|pair1|
2383 +   +--------------------------------------------------------------+
2384 +
2385 +   When the current blocknr_set_entry is full, a new one is allocated. */
2386 +
2387 +/* Usage examples: blocknr sets are used in reiser4 for storing atom's delete
2388 + * set (single blocks and block extents), in that case blocknr pair represent an
2389 + * extent; atom's wandered map is also stored as a blocknr set, blocknr pairs
2390 + * there represent a (real block) -> (wandered block) mapping. */
2391 +
2392 +/* Protection: blocknr sets belong to reiser4 atom, and
2393 + * their modifications are performed with the atom lock held */
2394 +
2395 +typedef struct blocknr_pair blocknr_pair;
2396 +
2397 +/* The total size of a blocknr_set_entry. */
2398 +#define BLOCKNR_SET_ENTRY_SIZE 128
2399 +
2400 +/* The number of blocks that can fit in the blocknr data area. */
2401 +#define BLOCKNR_SET_ENTRIES_NUMBER \
2402 + ((BLOCKNR_SET_ENTRY_SIZE - \
2403 + 2 * sizeof (unsigned) - \
2404 + sizeof(struct list_head)) / \
2405 + sizeof(reiser4_block_nr))
2406 +
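+/* Editor's note: an illustrative calculation, not part of the original
+ * patch. Assuming a 64-bit kernel (8-byte reiser4_block_nr, 4-byte
+ * unsigned, 16-byte struct list_head), the formula above yields
+ * (128 - 2*4 - 16) / 8 = 13 block number slots per entry. */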
2407 +/* An entry of the blocknr_set */
2408 +struct blocknr_set_entry {
2409 + unsigned nr_singles;
2410 + unsigned nr_pairs;
2411 + struct list_head link;
2412 + reiser4_block_nr entries[BLOCKNR_SET_ENTRIES_NUMBER];
2413 +};
2414 +
2415 +/* A pair of blocks as recorded in the blocknr_set_entry data. */
2416 +struct blocknr_pair {
2417 + reiser4_block_nr a;
2418 + reiser4_block_nr b;
2419 +};
2420 +
2421 +/* Return the number of blocknr slots available in a blocknr_set_entry. */
2422 +/* Audited by: green(2002.06.11) */
2423 +static unsigned bse_avail(blocknr_set_entry * bse)
2424 +{
2425 + unsigned used = bse->nr_singles + 2 * bse->nr_pairs;
2426 +
2427 + assert("jmacd-5088", BLOCKNR_SET_ENTRIES_NUMBER >= used);
2428 + cassert(sizeof(blocknr_set_entry) == BLOCKNR_SET_ENTRY_SIZE);
2429 +
2430 + return BLOCKNR_SET_ENTRIES_NUMBER - used;
2431 +}
2432 +
2433 +/* Initialize a blocknr_set_entry. */
2434 +static void bse_init(blocknr_set_entry *bse)
2435 +{
2436 + bse->nr_singles = 0;
2437 + bse->nr_pairs = 0;
2438 + INIT_LIST_HEAD(&bse->link);
2439 +}
2440 +
2441 +/* Allocate and initialize a blocknr_set_entry. */
2442 +/* Audited by: green(2002.06.11) */
2443 +static blocknr_set_entry *bse_alloc(void)
2444 +{
2445 + blocknr_set_entry *e;
2446 +
2447 + if ((e = (blocknr_set_entry *) kmalloc(sizeof(blocknr_set_entry),
2448 + reiser4_ctx_gfp_mask_get())) == NULL)
2449 + return NULL;
2450 +
2451 + bse_init(e);
2452 +
2453 + return e;
2454 +}
2455 +
2456 +/* Free a blocknr_set_entry. */
2457 +/* Audited by: green(2002.06.11) */
2458 +static void bse_free(blocknr_set_entry * bse)
2459 +{
2460 + kfree(bse);
2461 +}
2462 +
2463 +/* Add a block number to a blocknr_set_entry */
2464 +/* Audited by: green(2002.06.11) */
2465 +static void
2466 +bse_put_single(blocknr_set_entry * bse, const reiser4_block_nr * block)
2467 +{
2468 + assert("jmacd-5099", bse_avail(bse) >= 1);
2469 +
2470 + bse->entries[bse->nr_singles++] = *block;
2471 +}
2472 +
2473 +/* Get a pair of block numbers */
2474 +/* Audited by: green(2002.06.11) */
2475 +static inline blocknr_pair *bse_get_pair(blocknr_set_entry * bse, unsigned pno)
2476 +{
2477 + assert("green-1", BLOCKNR_SET_ENTRIES_NUMBER >= 2 * (pno + 1));
2478 +
2479 + return (blocknr_pair *) (bse->entries + BLOCKNR_SET_ENTRIES_NUMBER -
2480 + 2 * (pno + 1));
2481 +}
2482 +
2483 +/* Add a pair of block numbers to a blocknr_set_entry */
2484 +/* Audited by: green(2002.06.11) */
2485 +static void
2486 +bse_put_pair(blocknr_set_entry * bse, const reiser4_block_nr * a,
2487 + const reiser4_block_nr * b)
2488 +{
2489 + blocknr_pair *pair;
2490 +
2491 + assert("jmacd-5100", bse_avail(bse) >= 2 && a != NULL && b != NULL);
2492 +
2493 + pair = bse_get_pair(bse, bse->nr_pairs++);
2494 +
2495 + pair->a = *a;
2496 + pair->b = *b;
2497 +}
2498 +
2499 +/* Add either a block or pair of blocks to the block number set. The first
2500 + blocknr (@a) must be non-NULL. If @b is NULL a single blocknr is added, if
2501 + @b is non-NULL a pair is added. The block number set belongs to atom, and
2502 + the call is made with the atom lock held. There may not be enough space in
2503 + the current blocknr_set_entry. If new_bsep points to a non-NULL
2504 + blocknr_set_entry then it will be added to the blocknr_set and new_bsep
2505 + will be set to NULL. If new_bsep contains NULL then the atom lock will be
2506 + released and a new bse will be allocated in new_bsep. E_REPEAT will be
2507 + returned with the atom unlocked for the operation to be tried again. If
2508 + the operation succeeds, 0 is returned. If new_bsep is non-NULL and not
2509 + used during the call, it will be freed automatically. */
2510 +static int blocknr_set_add(txn_atom *atom, struct list_head *bset,
2511 + blocknr_set_entry **new_bsep, const reiser4_block_nr *a,
2512 + const reiser4_block_nr *b)
2513 +{
2514 + blocknr_set_entry *bse;
2515 + unsigned entries_needed;
2516 +
2517 + assert("jmacd-5101", a != NULL);
2518 +
2519 + entries_needed = (b == NULL) ? 1 : 2;
2520 + if (list_empty(bset) ||
2521 + bse_avail(list_entry(bset->next, blocknr_set_entry, link)) < entries_needed) {
2522 + /* See if a bse was previously allocated. */
2523 + if (*new_bsep == NULL) {
2524 + spin_unlock_atom(atom);
2525 + *new_bsep = bse_alloc();
2526 + return (*new_bsep != NULL) ? -E_REPEAT :
2527 + RETERR(-ENOMEM);
2528 + }
2529 +
2530 + /* Put it on the head of the list. */
2531 + list_add(&((*new_bsep)->link), bset);
2532 +
2533 + *new_bsep = NULL;
2534 + }
2535 +
2536 + /* Add the single or pair. */
2537 + bse = list_entry(bset->next, blocknr_set_entry, link);
2538 + if (b == NULL) {
2539 + bse_put_single(bse, a);
2540 + } else {
2541 + bse_put_pair(bse, a, b);
2542 + }
2543 +
2544 + /* If new_bsep is non-NULL then there was an allocation race, free this copy. */
2545 + if (*new_bsep != NULL) {
2546 + bse_free(*new_bsep);
2547 + *new_bsep = NULL;
2548 + }
2549 +
2550 + return 0;
2551 +}
2552 +
2553 +/* Add an extent to the block set. If the length is 1, it is treated as a
2554 + single block (e.g., reiser4_set_add_block). */
2555 +/* Audited by: green(2002.06.11) */
2556 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2557 + kmalloc might schedule. The only exception is atom spinlock, which is
2558 + properly freed. */
2559 +int
2560 +blocknr_set_add_extent(txn_atom * atom,
2561 + struct list_head * bset,
2562 + blocknr_set_entry ** new_bsep,
2563 + const reiser4_block_nr * start,
2564 + const reiser4_block_nr * len)
2565 +{
2566 + assert("jmacd-5102", start != NULL && len != NULL && *len > 0);
2567 + return blocknr_set_add(atom, bset, new_bsep, start,
2568 + *len == 1 ? NULL : len);
2569 +}
2570 +
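+/* Editor's note: a hedged sketch of the retry protocol, not part of the
+ * original patch; reiser4_dealloc_blocks() in block_alloc.c follows the
+ * same pattern. Because blocknr_set_add() may drop the atom lock to
+ * allocate an entry, callers retry until the preallocated bse is consumed:
+ *
+ *    reiser4_block_nr start, len;    the extent to record
+ *    blocknr_set_entry *bsep = NULL;
+ *    txn_atom *atom;
+ *    int ret;
+ *
+ *    do {
+ *            atom = get_current_atom_locked();
+ *            ret = blocknr_set_add_extent(atom, &atom->delete_set,
+ *                                         &bsep, &start, &len);
+ *    } while (ret == -E_REPEAT);     at most two iterations
+ *    if (ret == 0)
+ *            spin_unlock_atom(atom);
+ */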
2571 +/* Add a block pair to the block set. It adds exactly a pair, which is checked
2572 + * by an assertion that both arguments are not null.*/
2573 +/* Audited by: green(2002.06.11) */
2574 +/* Auditor note: Entire call chain cannot hold any spinlocks, because
2575 + kmalloc might schedule. The only exception is atom spinlock, which is
2576 + properly freed. */
2577 +int
2578 +blocknr_set_add_pair(txn_atom * atom,
2579 + struct list_head * bset,
2580 + blocknr_set_entry ** new_bsep, const reiser4_block_nr * a,
2581 + const reiser4_block_nr * b)
2582 +{
2583 + assert("jmacd-5103", a != NULL && b != NULL);
2584 + return blocknr_set_add(atom, bset, new_bsep, a, b);
2585 +}
2586 +
2587 +/* Initialize a blocknr_set. */
2588 +void blocknr_set_init(struct list_head *bset)
2589 +{
2590 + INIT_LIST_HEAD(bset);
2591 +}
2592 +
2593 +/* Release the entries of a blocknr_set. */
2594 +void blocknr_set_destroy(struct list_head *bset)
2595 +{
2596 + blocknr_set_entry *bse;
2597 +
2598 + while (!list_empty(bset)) {
2599 + bse = list_entry(bset->next, blocknr_set_entry, link);
2600 + list_del_init(&bse->link);
2601 + bse_free(bse);
2602 + }
2603 +}
2604 +
2605 +/* Merge blocknr_set entries out of @from into @into. */
2606 +/* Audited by: green(2002.06.11) */
2607 +/* Auditor comments: This merge does not know whether the merged sets contain
2608 + block pairs (as for wandered sets) or extents, so it cannot really merge
2609 + overlapping ranges if there are any. So I believe it may lead to some
2610 + blocks being present several times in one blocknr_set. To help debug such
2611 + problems it might help to check for duplicate entries on actual processing
2612 + of this set. Testing this kind of stuff right here is also complicated by
2613 + the fact that these sets are not sorted, and going through the whole set
2614 + on each element addition would be a CPU-heavy task */
2615 +void blocknr_set_merge(struct list_head * from, struct list_head * into)
2616 +{
2617 + blocknr_set_entry *bse_into = NULL;
2618 +
2619 + /* If @from is empty, no work to perform. */
2620 + if (list_empty(from))
2621 + return;
2622 + /* If @into is not empty, try merging partial-entries. */
2623 + if (!list_empty(into)) {
2624 +
2625 + /* Neither set is empty, pop the front two members and try to combine them. */
2626 + blocknr_set_entry *bse_from;
2627 + unsigned into_avail;
2628 +
2629 + bse_into = list_entry(into->next, blocknr_set_entry, link);
2630 + list_del_init(&bse_into->link);
2631 + bse_from = list_entry(from->next, blocknr_set_entry, link);
2632 + list_del_init(&bse_from->link);
2633 +
2634 + /* Combine singles. */
2635 + for (into_avail = bse_avail(bse_into);
2636 + into_avail != 0 && bse_from->nr_singles != 0;
2637 + into_avail -= 1) {
2638 + bse_put_single(bse_into,
2639 + &bse_from->entries[--bse_from->
2640 + nr_singles]);
2641 + }
2642 +
2643 + /* Combine pairs. */
2644 + for (; into_avail > 1 && bse_from->nr_pairs != 0;
2645 + into_avail -= 2) {
2646 + blocknr_pair *pair =
2647 + bse_get_pair(bse_from, --bse_from->nr_pairs);
2648 + bse_put_pair(bse_into, &pair->a, &pair->b);
2649 + }
2650 +
2651 + /* If bse_from is empty, delete it now. */
2652 + if (bse_avail(bse_from) == BLOCKNR_SET_ENTRIES_NUMBER) {
2653 + bse_free(bse_from);
2654 + } else {
2655 + /* Otherwise, bse_into is full or nearly full (e.g.,
2656 + it could have one slot avail and bse_from has one
2657 + pair left). Push it back onto the list. bse_from
2658 + becomes bse_into, which will be the new partial. */
2659 + list_add(&bse_into->link, into);
2660 + bse_into = bse_from;
2661 + }
2662 + }
2663 +
2664 + /* Splice lists together. */
2665 + list_splice_init(from, into->prev);
2666 +
2667 + /* Add the partial entry back to the head of the list. */
2668 + if (bse_into != NULL)
2669 + list_add(&bse_into->link, into);
2670 +}
2671 +
2672 +/* Iterate over all blocknr set elements. */
2673 +int blocknr_set_iterator(txn_atom *atom, struct list_head *bset,
2674 + blocknr_set_actor_f actor, void *data, int delete)
2675 +{
2676 +
2677 + blocknr_set_entry *entry;
2678 +
2679 + assert("zam-429", atom != NULL);
2680 + assert("zam-430", atom_is_protected(atom));
2681 + assert("zam-431", bset != 0);
2682 + assert("zam-432", actor != NULL);
2683 +
2684 + entry = list_entry(bset->next, blocknr_set_entry, link);
2685 + while (bset != &entry->link) {
2686 + blocknr_set_entry *tmp = list_entry(entry->link.next, blocknr_set_entry, link);
2687 + unsigned int i;
2688 + int ret;
2689 +
2690 + for (i = 0; i < entry->nr_singles; i++) {
2691 + ret = actor(atom, &entry->entries[i], NULL, data);
2692 +
2693 + /* We can't break the loop if the delete flag is set. */
2694 + if (ret != 0 && !delete)
2695 + return ret;
2696 + }
2697 +
2698 + for (i = 0; i < entry->nr_pairs; i++) {
2699 + struct blocknr_pair *ab;
2700 +
2701 + ab = bse_get_pair(entry, i);
2702 +
2703 + ret = actor(atom, &ab->a, &ab->b, data);
2704 +
2705 + if (ret != 0 && !delete)
2706 + return ret;
2707 + }
2708 +
2709 + if (delete) {
2710 + list_del(&entry->link);
2711 + bse_free(entry);
2712 + }
2713 +
2714 + entry = tmp;
2715 + }
2716 +
2717 + return 0;
2718 +}
2719 +
2720 +/*
2721 + * Local variables:
2722 + * c-indentation-style: "K&R"
2723 + * mode-name: "LC"
2724 + * c-basic-offset: 8
2725 + * tab-width: 8
2726 + * fill-column: 79
2727 + * scroll-step: 1
2728 + * End:
2729 + */
2730 diff --git a/fs/reiser4/carry.c b/fs/reiser4/carry.c
2731 new file mode 100644
2732 index 0000000..c90a0f0
2733 --- /dev/null
2734 +++ b/fs/reiser4/carry.c
2735 @@ -0,0 +1,1391 @@
2736 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
2737 +/* Functions to "carry" tree modification(s) upward. */
2738 +/* Tree is modified one level at a time. As we modify a level we accumulate a
2739 + set of changes that need to be propagated to the next level. We manage
2740 + node locking such that any searches that collide with carrying are
2741 + restarted, from the root if necessary.
2742 +
2743 + Insertion of a new item may result in items being moved among nodes and
2744 + this requires the delimiting key to be updated at the least common parent
2745 + of the nodes modified to preserve search tree invariants. Also, insertion
2746 + may require allocation of a new node. A pointer to the new node has to be
2747 + inserted into some node on the parent level, etc.
2748 +
2749 + Tree carrying is meant to be analogous to arithmetic carrying.
2750 +
2751 + A carry operation is always associated with some node (&carry_node).
2752 +
2753 + Carry process starts with some initial set of operations to be performed
2754 + and an initial set of already locked nodes. Operations are performed one
2755 + by one. Performing each single operation has following possible effects:
2756 +
2757 + - content of carry node associated with operation is modified
2758 + - new carry nodes are locked and involved into carry process on this level
2759 + - new carry operations are posted to the next level
2760 +
2761 + After all carry operations on this level are done, process is repeated for
2762 + the accumulated sequence on carry operations for the next level. This
2763 + starts by trying to lock (in left to right order) all carry nodes
2764 + associated with carry operations on the parent level. After this, we decide
2765 + whether more nodes are required on the left of already locked set. If so,
2766 + all locks taken on the parent level are released, new carry nodes are
2767 + added, and locking process repeats.
2768 +
2769 + It may happen that the balancing process fails owing to an unrecoverable
2770 + error on some of the upper levels of the tree (possible causes are an io
2771 + error, failure to allocate a new node, etc.). In this case we should unmount
2772 + the filesystem, rebooting if it is the root, and possibly advise the use of fsck.
2773 +
2774 + USAGE:
2775 +
2776 + int some_tree_operation( znode *node, ... )
2777 + {
2778 + // Allocate on a stack pool of carry objects: operations and nodes.
2779 + // Most carry processes will only take objects from here, without
2780 + // dynamic allocation.
2781 +
2782 +I feel uneasy about this pool. It adds to code complexity, I understand why it exists, but.... -Hans
2783 +
2784 + carry_pool pool;
2785 + carry_level lowest_level;
2786 + carry_op *op;
2787 +
2788 + init_carry_pool( &pool );
2789 + init_carry_level( &lowest_level, &pool );
2790 +
2791 + // operation may be one of:
2792 + // COP_INSERT --- insert new item into node
2793 + // COP_CUT --- remove part of or whole node
2794 + // COP_PASTE --- increase size of item
2795 + // COP_DELETE --- delete pointer from parent node
2796 + // COP_UPDATE --- update delimiting key in least
2797 + // common ancestor of two
2798 +
2799 + op = reiser4_post_carry( &lowest_level, operation, node, 0 );
2800 + if( IS_ERR( op ) || ( op == NULL ) ) {
2801 + handle error
2802 + } else {
2803 + // fill in remaining fields in @op, according to carry.h:carry_op
2804 + result = carry( &lowest_level, NULL );
2805 + }
2806 + done_carry_pool( &pool );
2807 + }
2808 +
2809 + When you are implementing node plugin method that participates in carry
2810 + (shifting, insertion, deletion, etc.), do the following:
2811 +
2812 + int foo_node_method( znode *node, ..., carry_level *todo )
2813 + {
2814 + carry_op *op;
2815 +
2816 + ....
2817 +
2818 + // note, that last argument to reiser4_post_carry() is non-null
2819 + // here, because @op is to be applied to the parent of @node, rather
2820 + // than to the @node itself as in the previous case.
2821 +
2822 + op = node_post_carry( todo, operation, node, 1 );
2823 + // fill in remaining fields in @op, according to carry.h:carry_op
2824 +
2825 + ....
2826 +
2827 + }
2828 +
2829 + BATCHING:
2830 +
2831 + One of the main advantages of the level-by-level balancing implemented here
2832 + is the ability to batch updates on a parent level and to perform them more
2833 + efficiently as a result.
2834 +
2835 + Description To Be Done (TBD).
2836 +
2837 + DIFFICULTIES AND SUBTLE POINTS:
2838 +
2839 + 1. complex plumbing is required, because:
2840 +
2841 + a. effective allocation through pools is needed
2842 +
2843 + b. target of operation is not exactly known when operation is
2844 + posted. This is worked around through bitfields in &carry_node and
2845 + logic in lock_carry_node()
2846 +
2847 + c. of interaction with locking code: node should be added into sibling
2848 + list when pointer to it is inserted into its parent, which is some time
2849 + after node was created. Between these moments, node is somewhat in
2850 + suspended state and is only registered in the carry lists
2851 +
2852 + 2. whole balancing logic is implemented here, in particular, insertion
2853 + logic is coded in make_space().
2854 +
2855 + 3. special cases like insertion (reiser4_add_tree_root()) or deletion
2856 + (reiser4_kill_tree_root()) of tree root and morphing of paste into insert
2857 + (insert_paste()) have to be handled.
2858 +
2859 + 4. there is non-trivial interdependency between allocation of new nodes
2860 + and almost everything else. This is mainly due to the (1.c) above. I shall
2861 + write about this later.
2862 +
2863 +*/
2864 +
2865 +#include "forward.h"
2866 +#include "debug.h"
2867 +#include "key.h"
2868 +#include "coord.h"
2869 +#include "plugin/item/item.h"
2870 +#include "plugin/item/extent.h"
2871 +#include "plugin/node/node.h"
2872 +#include "jnode.h"
2873 +#include "znode.h"
2874 +#include "tree_mod.h"
2875 +#include "tree_walk.h"
2876 +#include "block_alloc.h"
2877 +#include "pool.h"
2878 +#include "tree.h"
2879 +#include "carry.h"
2880 +#include "carry_ops.h"
2881 +#include "super.h"
2882 +#include "reiser4.h"
2883 +
2884 +#include <linux/types.h>
2885 +
2886 +/* level locking/unlocking */
2887 +static int lock_carry_level(carry_level * level);
2888 +static void unlock_carry_level(carry_level * level, int failure);
2889 +static void done_carry_level(carry_level * level);
2890 +static void unlock_carry_node(carry_level * level, carry_node * node, int fail);
2891 +
2892 +int lock_carry_node(carry_level * level, carry_node * node);
2893 +int lock_carry_node_tail(carry_node * node);
2894 +
2895 +/* carry processing proper */
2896 +static int carry_on_level(carry_level * doing, carry_level * todo);
2897 +
2898 +static carry_op *add_op(carry_level * level, pool_ordering order,
2899 + carry_op * reference);
2900 +
2901 +/* handlers for carry operations. */
2902 +
2903 +static void fatal_carry_error(carry_level * doing, int ecode);
2904 +static int add_new_root(carry_level * level, carry_node * node, znode * fake);
2905 +
2906 +static void print_level(const char *prefix, carry_level * level);
2907 +
2908 +#if REISER4_DEBUG
2909 +typedef enum {
2910 + CARRY_TODO,
2911 + CARRY_DOING
2912 +} carry_queue_state;
2913 +static int carry_level_invariant(carry_level * level, carry_queue_state state);
2914 +#endif
2915 +
2916 +/* main entry point for tree balancing.
2917 +
2918 + Tree carry performs operations from @doing and while doing so accumulates
2919 + information about operations to be performed on the next level ("carried"
2920 + to the parent level). Carried operations are performed, causing possibly
2921 + more operations to be carried upward etc. carry() takes care about
2922 + locking and pinning znodes while operating on them.
2923 +
2924 + For usage, see comment at the top of fs/reiser4/carry.c
2925 +
2926 +*/
2927 +int reiser4_carry(carry_level * doing /* set of carry operations to be
2928 + * performed */ ,
2929 + carry_level * done /* set of nodes, already performed
2930 + * at the previous level.
2931 + * NULL in most cases */)
2932 +{
2933 + int result = 0;
2934 + /* queue of new requests */
2935 + carry_level *todo;
2936 + ON_DEBUG(STORE_COUNTERS);
2937 +
2938 + assert("nikita-888", doing != NULL);
2939 + BUG_ON(done != NULL);
2940 +
2941 + todo = doing + 1;
2942 + init_carry_level(todo, doing->pool);
2943 +
2944 + /* queue of requests performed on the previous level */
2945 + done = todo + 1;
2946 + init_carry_level(done, doing->pool);
2947 +
2948 + /* iterate until there is nothing more to do */
2949 + while (result == 0 && doing->ops_num > 0) {
2950 + carry_level *tmp;
2951 +
2952 + /* at this point @done is locked. */
2953 + /* repeat lock/do/unlock while
2954 +
2955 + (1) lock_carry_level() fails due to deadlock avoidance, or
2956 +
2957 + (2) carry_on_level() decides that more nodes have to
2958 + be involved.
2959 +
2960 + (3) some unexpected error occurred while balancing on the
2961 + upper levels. In this case all changes are rolled back.
2962 +
2963 + */
2964 + while (1) {
2965 + result = lock_carry_level(doing);
2966 + if (result == 0) {
2967 + /* perform operations from @doing and
2968 + accumulate new requests in @todo */
2969 + result = carry_on_level(doing, todo);
2970 + if (result == 0)
2971 + break;
2972 + else if (result != -E_REPEAT ||
2973 + !doing->restartable) {
2974 + warning("nikita-1043",
2975 + "Fatal error during carry: %i",
2976 + result);
2977 + print_level("done", done);
2978 + print_level("doing", doing);
2979 + print_level("todo", todo);
2980 + /* do some rough stuff like aborting
2981 + all pending transcrashes and thus
2982 + pushing tree back to the consistent
2983 + state. Alternatively, just panic.
2984 + */
2985 + fatal_carry_error(doing, result);
2986 + return result;
2987 + }
2988 + } else if (result != -E_REPEAT) {
2989 + fatal_carry_error(doing, result);
2990 + return result;
2991 + }
2992 + unlock_carry_level(doing, 1);
2993 + }
2994 + /* at this point @done can be safely unlocked */
2995 + done_carry_level(done);
2996 +
2997 + /* cyclically shift queues */
2998 + tmp = done;
2999 + done = doing;
3000 + doing = todo;
3001 + todo = tmp;
3002 + init_carry_level(todo, doing->pool);
3003 +
3004 + /* give other threads chance to run */
3005 + reiser4_preempt_point();
3006 + }
3007 + done_carry_level(done);
3008 +
3009 + /* all counters, but x_refs should remain the same. x_refs can change
3010 + owing to transaction manager */
3011 + ON_DEBUG(CHECK_COUNTERS);
3012 + return result;
3013 +}
3014 +
3015 +/* perform carry operations on given level.
3016 +
3017 + Optimizations proposed by pooh:
3018 +
3019 + (1) don't lock all nodes from queue at the same time. Lock nodes lazily as
3020 + required;
3021 +
3022 + (2) unlock node if there are no more operations to be performed upon it and
3023 + node didn't add any operation to @todo. This can be implemented by
3024 + attaching to each node two counters: a counter of operations working on
3025 + this node and a counter of operations carried upward from this node.
3026 +
3027 +*/
3028 +static int carry_on_level(carry_level * doing /* queue of carry operations to
3029 + * do on this level */ ,
3030 + carry_level * todo /* queue where new carry
3031 + * operations to be performed on
3032 + * the parent level are
3033 + * accumulated during @doing
3034 + * processing. */ )
3035 +{
3036 + int result;
3037 + int (*f) (carry_op *, carry_level *, carry_level *);
3038 + carry_op *op;
3039 + carry_op *tmp_op;
3040 +
3041 + assert("nikita-1034", doing != NULL);
3042 + assert("nikita-1035", todo != NULL);
3043 +
3044 + /* @doing->nodes are locked. */
3045 +
3046 + /* This function can be split into two phases: analysis and modification.
3047 +
3048 + Analysis calculates precisely what items should be moved between
3049 + nodes. This information is gathered in some structures attached to
3050 + each carry_node in a @doing queue. Analysis also determines whether
3051 + new nodes are to be allocated etc.
3052 +
3053 + After analysis is completed, actual modification is performed. Here
3054 + we can take advantage of "batch modification": if there are several
3055 + operations acting on the same node, modifications can be performed
3056 + more efficiently when batched together.
3057 +
3058 + Above is an optimization left for the future.
3059 + */
3060 + /* Important, but delayed optimization: it's possible to batch
3061 + operations together and perform them more efficiently as a
3062 + result. For example, deletion of several neighboring items from a
3063 + node can be converted to a single ->cut() operation.
3064 +
3065 + Before processing queue, it should be scanned and "mergeable"
3066 + operations merged.
3067 + */
3068 + result = 0;
3069 + for_all_ops(doing, op, tmp_op) {
3070 + carry_opcode opcode;
3071 +
3072 + assert("nikita-1041", op != NULL);
3073 + opcode = op->op;
3074 + assert("nikita-1042", op->op < COP_LAST_OP);
3075 + f = op_dispatch_table[op->op].handler;
3076 + result = f(op, doing, todo);
3077 + /* locking can fail with -E_REPEAT. Any different error is fatal
3078 + and will be handled by fatal_carry_error() sledgehammer.
3079 + */
3080 + if (result != 0)
3081 + break;
3082 + }
3083 + if (result == 0) {
3084 + carry_plugin_info info;
3085 + carry_node *scan;
3086 + carry_node *tmp_scan;
3087 +
3088 + info.doing = doing;
3089 + info.todo = todo;
3090 +
3091 + assert("nikita-3002",
3092 + carry_level_invariant(doing, CARRY_DOING));
3093 + for_all_nodes(doing, scan, tmp_scan) {
3094 + znode *node;
3095 +
3096 + node = reiser4_carry_real(scan);
3097 + assert("nikita-2547", node != NULL);
3098 + if (node_is_empty(node)) {
3099 + result =
3100 + node_plugin_by_node(node)->
3101 + prepare_removal(node, &info);
3102 + if (result != 0)
3103 + break;
3104 + }
3105 + }
3106 + }
3107 + return result;
3108 +}
3109 +
3110 +/* post carry operation
3111 +
3112 + This is main function used by external carry clients: node layout plugins
3113 + and tree operations to create new carry operation to be performed on some
3114 + level.
3115 +
3116 + New operation will be included in the @level queue. To actually perform it,
3117 + call carry( level, ... ). This function takes write lock on @node. Carry
3118 + manages all its locks by itself, don't worry about this.
3119 +
3120 + This function adds operation and node at the end of the queue. It is up to
3121 + caller to guarantee proper ordering of node queue.
3122 +
3123 +*/
3124 +carry_op * reiser4_post_carry(carry_level * level /* queue where new operation
3125 + * is to be posted at */ ,
3126 + carry_opcode op /* opcode of operation */ ,
3127 + znode * node /* node on which this operation
3128 + * will operate */ ,
3129 + int apply_to_parent_p /* whether operation will
3130 + * operate directly on @node
3131 + * or on its parent. */)
3132 +{
3133 + carry_op *result;
3134 + carry_node *child;
3135 +
3136 + assert("nikita-1046", level != NULL);
3137 + assert("nikita-1788", znode_is_write_locked(node));
3138 +
3139 + result = add_op(level, POOLO_LAST, NULL);
3140 + if (IS_ERR(result))
3141 + return result;
3142 + child = reiser4_add_carry(level, POOLO_LAST, NULL);
3143 + if (IS_ERR(child)) {
3144 + reiser4_pool_free(&level->pool->op_pool, &result->header);
3145 + return (carry_op *) child;
3146 + }
3147 + result->node = child;
3148 + result->op = op;
3149 + child->parent = apply_to_parent_p;
3150 + if (ZF_ISSET(node, JNODE_ORPHAN))
3151 + child->left_before = 1;
3152 + child->node = node;
3153 + return result;
3154 +}
3155 +
3156 +/* initialize carry queue */
3157 +void init_carry_level(carry_level * level /* level to initialize */ ,
3158 + carry_pool * pool /* pool @level will allocate objects
3159 + * from */ )
3160 +{
3161 + assert("nikita-1045", level != NULL);
3162 + assert("nikita-967", pool != NULL);
3163 +
3164 + memset(level, 0, sizeof *level);
3165 + level->pool = pool;
3166 +
3167 + INIT_LIST_HEAD(&level->nodes);
3168 + INIT_LIST_HEAD(&level->ops);
3169 +}
3170 +
3171 +/* allocate carry pool and initialize pools within queue */
3172 +carry_pool *init_carry_pool(int size)
3173 +{
3174 + carry_pool *pool;
3175 +
3176 + assert("", size >= sizeof(carry_pool) + 3 * sizeof(carry_level));
3177 + pool = kmalloc(size, reiser4_ctx_gfp_mask_get());
3178 + if (pool == NULL)
3179 + return ERR_PTR(RETERR(-ENOMEM));
3180 +
3181 + reiser4_init_pool(&pool->op_pool, sizeof(carry_op), CARRIES_POOL_SIZE,
3182 + (char *)pool->op);
3183 + reiser4_init_pool(&pool->node_pool, sizeof(carry_node),
3184 + NODES_LOCKED_POOL_SIZE, (char *)pool->node);
3185 + return pool;
3186 +}
3187 +
3188 +/* finish with queue pools */
3189 +void done_carry_pool(carry_pool * pool /* pool to destroy */ )
3190 +{
3191 + reiser4_done_pool(&pool->op_pool);
3192 + reiser4_done_pool(&pool->node_pool);
3193 + kfree(pool);
3194 +}
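/* A minimal usage sketch of the functions above (illustrative only: the
 * function name and the elided insert-data setup are not part of the
 * original code; compare the callers in fs/reiser4/tree.c). @node must be
 * write-locked, as reiser4_post_carry() asserts. */
static int carry_usage_sketch(znode * node)
{
	carry_pool *pool;
	carry_level *lowest_level;
	carry_op *op;
	int result;

	/* room for the pool header plus three carry levels, matching the
	   assertion in init_carry_pool() */
	pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level));
	if (IS_ERR(pool))
		return PTR_ERR(pool);
	lowest_level = (carry_level *) (pool + 1);
	init_carry_level(lowest_level, pool);

	/* queue a COP_INSERT applied directly to @node (last argument 0:
	   do not redirect the operation to @node's parent) */
	op = reiser4_post_carry(lowest_level, COP_INSERT, node, 0);
	if (IS_ERR(op)) {
		done_carry_pool(pool);
		return PTR_ERR(op);
	}
	/* ... fill op->u.insert.d with coord, key and item data here ... */

	result = reiser4_carry(lowest_level, NULL);
	done_carry_pool(pool);
	return result;
}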
3195 +
3196 +/* add new carry node to the @level.
3197 +
3198 + Returns pointer to the new carry node allocated from pool. It's up to
3199 + callers to maintain proper order in the @level. Assumption is that if carry
3200 +   nodes on one level are already sorted and modifications are performed from
3201 + left to right, carry nodes added on the parent level will be ordered
3202 + automatically. To control ordering use @order and @reference parameters.
3203 +
3204 +*/
3205 +carry_node *reiser4_add_carry_skip(carry_level * level /* &carry_level to add
3206 + * node to */ ,
3207 + pool_ordering order /* where to insert:
3208 + * at the beginning of
3209 + * @level,
3210 + * before @reference,
3211 + * after @reference,
3212 + * at the end of @level
3213 + */ ,
3214 + carry_node * reference/* reference node for
3215 + * insertion */)
3216 +{
3217 + ON_DEBUG(carry_node * orig_ref = reference);
3218 +
3219 + if (order == POOLO_BEFORE) {
3220 + reference = find_left_carry(reference, level);
3221 + if (reference == NULL)
3222 + reference = list_entry(level->nodes.next, carry_node,
3223 + header.level_linkage);
3224 + else
3225 + reference = list_entry(reference->header.level_linkage.next,
3226 + carry_node, header.level_linkage);
3227 + } else if (order == POOLO_AFTER) {
3228 + reference = find_right_carry(reference, level);
3229 + if (reference == NULL)
3230 + reference = list_entry(level->nodes.prev, carry_node,
3231 + header.level_linkage);
3232 + else
3233 + reference = list_entry(reference->header.level_linkage.prev,
3234 + carry_node, header.level_linkage);
3235 + }
3236 + assert("nikita-2209",
3237 + ergo(orig_ref != NULL,
3238 + reiser4_carry_real(reference) ==
3239 + reiser4_carry_real(orig_ref)));
3240 + return reiser4_add_carry(level, order, reference);
3241 +}
3242 +
3243 +carry_node *reiser4_add_carry(carry_level * level /* &carry_level to add node
3244 + * to */ ,
3245 + pool_ordering order /* where to insert: at the
3246 + * beginning of @level, before
3247 + * @reference, after @reference,
3248 + * at the end of @level */ ,
3249 + carry_node * reference /* reference node for
3250 + * insertion */ )
3251 +{
3252 + carry_node *result;
3253 +
3254 + result =
3255 + (carry_node *) reiser4_add_obj(&level->pool->node_pool,
3256 + &level->nodes,
3257 + order, &reference->header);
3258 + if (!IS_ERR(result) && (result != NULL))
3259 + ++level->nodes_num;
3260 + return result;
3261 +}
3262 +
3263 +/* add new carry operation to the @level.
3264 +
3265 +   Returns pointer to the new carry operation allocated from pool. It's up to
3266 + callers to maintain proper order in the @level. To control ordering use
3267 + @order and @reference parameters.
3268 +
3269 +*/
3270 +static carry_op *add_op(carry_level * level /* &carry_level to add node to */ ,
3271 + pool_ordering order /* where to insert: at the beginning of
3272 + * @level, before @reference, after
3273 + * @reference, at the end of @level */ ,
3274 + carry_op *
3275 + reference /* reference node for insertion */ )
3276 +{
3277 + carry_op *result;
3278 +
3279 + result =
3280 + (carry_op *) reiser4_add_obj(&level->pool->op_pool, &level->ops,
3281 + order, &reference->header);
3282 + if (!IS_ERR(result) && (result != NULL))
3283 + ++level->ops_num;
3284 + return result;
3285 +}
3286 +
3287 +/* Return node on the right of which @node was created.
3288 +
3289 +   Each node is created on the right of some existing node (or it is a new
3290 +   root, which is a special case not handled here).
3291 +
3292 +   @node is a new node created on some level, but not yet inserted into its
3293 +   parent; it has the corresponding bit (JNODE_ORPHAN) set in its zstate.
3294 +
3295 +*/
3296 +static carry_node *find_begetting_brother(carry_node * node /* node to start search
3297 + * from */ ,
3298 + carry_level * kin UNUSED_ARG /* level to
3299 + * scan */ )
3300 +{
3301 + carry_node *scan;
3302 +
3303 + assert("nikita-1614", node != NULL);
3304 + assert("nikita-1615", kin != NULL);
3305 + assert("nikita-1616", LOCK_CNT_GTZ(rw_locked_tree));
3306 + assert("nikita-1619", ergo(reiser4_carry_real(node) != NULL,
3307 + ZF_ISSET(reiser4_carry_real(node),
3308 + JNODE_ORPHAN)));
3309 + for (scan = node;;
3310 + scan = list_entry(scan->header.level_linkage.prev, carry_node,
3311 + header.level_linkage)) {
3312 + assert("nikita-1617", &kin->nodes != &scan->header.level_linkage);
3313 + if ((scan->node != node->node) &&
3314 + !ZF_ISSET(scan->node, JNODE_ORPHAN)) {
3315 + assert("nikita-1618", reiser4_carry_real(scan) != NULL);
3316 + break;
3317 + }
3318 + }
3319 + return scan;
3320 +}
3321 +
3322 +static cmp_t
3323 +carry_node_cmp(carry_level * level, carry_node * n1, carry_node * n2)
3324 +{
3325 + assert("nikita-2199", n1 != NULL);
3326 + assert("nikita-2200", n2 != NULL);
3327 +
3328 + if (n1 == n2)
3329 + return EQUAL_TO;
3330 + while (1) {
3331 + n1 = carry_node_next(n1);
3332 + if (carry_node_end(level, n1))
3333 + return GREATER_THAN;
3334 + if (n1 == n2)
3335 + return LESS_THAN;
3336 + }
3337 + impossible("nikita-2201", "End of level reached");
3338 +}
3339 +
3340 +carry_node *find_carry_node(carry_level * level, const znode * node)
3341 +{
3342 + carry_node *scan;
3343 + carry_node *tmp_scan;
3344 +
3345 + assert("nikita-2202", level != NULL);
3346 + assert("nikita-2203", node != NULL);
3347 +
3348 + for_all_nodes(level, scan, tmp_scan) {
3349 + if (reiser4_carry_real(scan) == node)
3350 + return scan;
3351 + }
3352 + return NULL;
3353 +}
3354 +
3355 +znode *reiser4_carry_real(const carry_node * node)
3356 +{
3357 + assert("nikita-3061", node != NULL);
3358 +
3359 + return node->lock_handle.node;
3360 +}
3361 +
3362 +carry_node *insert_carry_node(carry_level * doing, carry_level * todo,
3363 + const znode * node)
3364 +{
3365 + carry_node *base;
3366 + carry_node *scan;
3367 + carry_node *tmp_scan;
3368 + carry_node *proj;
3369 +
3370 + base = find_carry_node(doing, node);
3371 + assert("nikita-2204", base != NULL);
3372 +
3373 + for_all_nodes(todo, scan, tmp_scan) {
3374 + proj = find_carry_node(doing, scan->node);
3375 + assert("nikita-2205", proj != NULL);
3376 + if (carry_node_cmp(doing, proj, base) != LESS_THAN)
3377 + break;
3378 + }
3379 + return scan;
3380 +}
3381 +
3382 +static carry_node *add_carry_atplace(carry_level * doing, carry_level * todo,
3383 + znode * node)
3384 +{
3385 + carry_node *reference;
3386 +
3387 + assert("nikita-2994", doing != NULL);
3388 + assert("nikita-2995", todo != NULL);
3389 + assert("nikita-2996", node != NULL);
3390 +
3391 + reference = insert_carry_node(doing, todo, node);
3392 + assert("nikita-2997", reference != NULL);
3393 +
3394 + return reiser4_add_carry(todo, POOLO_BEFORE, reference);
3395 +}
3396 +
3397 +/* like reiser4_post_carry(), but designed to be called from node plugin methods.
3398 + This function is different from reiser4_post_carry() in that it finds proper
3399 +   place to insert the node in the queue. */
3400 +carry_op *node_post_carry(carry_plugin_info * info /* carry parameters
3401 + * passed down to node
3402 + * plugin */ ,
3403 + carry_opcode op /* opcode of operation */ ,
3404 + znode * node /* node on which this
3405 + * operation will operate */ ,
3406 + int apply_to_parent_p /* whether operation will
3407 + * operate directly on @node
3408 + * or on it parent. */ )
3409 +{
3410 + carry_op *result;
3411 + carry_node *child;
3412 +
3413 + assert("nikita-2207", info != NULL);
3414 + assert("nikita-2208", info->todo != NULL);
3415 +
3416 + if (info->doing == NULL)
3417 + return reiser4_post_carry(info->todo, op, node,
3418 + apply_to_parent_p);
3419 +
3420 + result = add_op(info->todo, POOLO_LAST, NULL);
3421 + if (IS_ERR(result))
3422 + return result;
3423 + child = add_carry_atplace(info->doing, info->todo, node);
3424 + if (IS_ERR(child)) {
3425 + reiser4_pool_free(&info->todo->pool->op_pool, &result->header);
3426 + return (carry_op *) child;
3427 + }
3428 + result->node = child;
3429 + result->op = op;
3430 + child->parent = apply_to_parent_p;
3431 + if (ZF_ISSET(node, JNODE_ORPHAN))
3432 + child->left_before = 1;
3433 + child->node = node;
3434 + return result;
3435 +}
3436 +
3437 +/* lock all carry nodes in @level */
3438 +static int lock_carry_level(carry_level * level /* level to lock */ )
3439 +{
3440 + int result;
3441 + carry_node *node;
3442 + carry_node *tmp_node;
3443 +
3444 + assert("nikita-881", level != NULL);
3445 + assert("nikita-2229", carry_level_invariant(level, CARRY_TODO));
3446 +
3447 + /* lock nodes from left to right */
3448 + result = 0;
3449 + for_all_nodes(level, node, tmp_node) {
3450 + result = lock_carry_node(level, node);
3451 + if (result != 0)
3452 + break;
3453 + }
3454 + return result;
3455 +}
3456 +
3457 +/* Synchronize delimiting keys between @node and its left neighbor.
3458 +
3459 + To reduce contention on dk key and simplify carry code, we synchronize
3460 + delimiting keys only when carry ultimately leaves tree level (carrying
3461 + changes upward) and unlocks nodes at this level.
3462 +
3463 + This function first finds left neighbor of @node and then updates left
3464 +   neighbor's right delimiting key to coincide with the least key in @node.
3465 +
3466 +*/
3467 +
3468 +ON_DEBUG(extern atomic_t delim_key_version;
3469 + )
3470 +
3471 +static void sync_dkeys(znode * spot /* node to update */ )
3472 +{
3473 + reiser4_key pivot;
3474 + reiser4_tree *tree;
3475 +
3476 + assert("nikita-1610", spot != NULL);
3477 + assert("nikita-1612", LOCK_CNT_NIL(rw_locked_dk));
3478 +
3479 + tree = znode_get_tree(spot);
3480 + read_lock_tree(tree);
3481 + write_lock_dk(tree);
3482 +
3483 + assert("nikita-2192", znode_is_loaded(spot));
3484 +
3485 + /* sync left delimiting key of @spot with key in its leftmost item */
3486 + if (node_is_empty(spot))
3487 + pivot = *znode_get_rd_key(spot);
3488 + else
3489 + leftmost_key_in_node(spot, &pivot);
3490 +
3491 + znode_set_ld_key(spot, &pivot);
3492 +
3493 + /* there can be sequence of empty nodes pending removal on the left of
3494 + @spot. Scan them and update their left and right delimiting keys to
3495 + match left delimiting key of @spot. Also, update right delimiting
3496 + key of first non-empty left neighbor.
3497 + */
3498 + while (1) {
3499 + if (!ZF_ISSET(spot, JNODE_LEFT_CONNECTED))
3500 + break;
3501 +
3502 + spot = spot->left;
3503 + if (spot == NULL)
3504 + break;
3505 +
3506 + znode_set_rd_key(spot, &pivot);
3507 + /* don't sink into the domain of another balancing */
3508 + if (!znode_is_write_locked(spot))
3509 + break;
3510 + if (ZF_ISSET(spot, JNODE_HEARD_BANSHEE))
3511 + znode_set_ld_key(spot, &pivot);
3512 + else
3513 + break;
3514 + }
3515 +
3516 + write_unlock_dk(tree);
3517 + read_unlock_tree(tree);
3518 +}
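/* A worked illustration (assumed key values, not part of the original
 * comment): suppose nodes A, B, C, D are neighbors with delimiting keys
 * A:[10,20) B:[20,25) C:[25,30) D:[30,40), and carry has emptied B and C
 * (both write-locked and JNODE_HEARD_BANSHEE). sync_dkeys(D) computes
 * pivot = 30 (the leftmost key of D) and sets D's left delimiting key to
 * it; the loop then walks left, setting both delimiting keys of C and B
 * to 30, and finally sets A's right delimiting key to 30 before stopping,
 * so the surviving neighbors A and D delimit each other again. */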
3519 +
3520 +/* unlock all carry nodes in @level */
3521 +static void unlock_carry_level(carry_level * level /* level to unlock */ ,
3522 + int failure /* true if unlocking owing to
3523 + * failure */ )
3524 +{
3525 + carry_node *node;
3526 + carry_node *tmp_node;
3527 +
3528 + assert("nikita-889", level != NULL);
3529 +
3530 + if (!failure) {
3531 + znode *spot;
3532 +
3533 + spot = NULL;
3534 + /* update delimiting keys */
3535 + for_all_nodes(level, node, tmp_node) {
3536 + if (reiser4_carry_real(node) != spot) {
3537 + spot = reiser4_carry_real(node);
3538 + sync_dkeys(spot);
3539 + }
3540 + }
3541 + }
3542 +
3543 +	/* nodes can be unlocked in arbitrary order. In a preemptible
3544 +	   environment it's better to unlock in the reverse order of locking,
3545 + though.
3546 + */
3547 + for_all_nodes_back(level, node, tmp_node) {
3548 + /* all allocated nodes should be already linked to their
3549 + parents at this moment. */
3550 + assert("nikita-1631",
3551 + ergo(!failure, !ZF_ISSET(reiser4_carry_real(node),
3552 + JNODE_ORPHAN)));
3553 + ON_DEBUG(check_dkeys(reiser4_carry_real(node)));
3554 + unlock_carry_node(level, node, failure);
3555 + }
3556 + level->new_root = NULL;
3557 +}
3558 +
3559 +/* finish with @level
3560 +
3561 + Unlock nodes and release all allocated resources */
3562 +static void done_carry_level(carry_level * level /* level to finish */ )
3563 +{
3564 + carry_node *node;
3565 + carry_node *tmp_node;
3566 + carry_op *op;
3567 + carry_op *tmp_op;
3568 +
3569 + assert("nikita-1076", level != NULL);
3570 +
3571 + unlock_carry_level(level, 0);
3572 + for_all_nodes(level, node, tmp_node) {
3573 + assert("nikita-2113", list_empty_careful(&node->lock_handle.locks_link));
3574 + assert("nikita-2114", list_empty_careful(&node->lock_handle.owners_link));
3575 + reiser4_pool_free(&level->pool->node_pool, &node->header);
3576 + }
3577 + for_all_ops(level, op, tmp_op)
3578 + reiser4_pool_free(&level->pool->op_pool, &op->header);
3579 +}
3580 +
3581 +/* helper function to complete locking of carry node
3582 +
3583 +   Finish locking of a carry node. There are several ways in which a new
3584 +   carry node can be added into a carry level and locked. The normal way is
3585 +   through lock_carry_node(), but also from find_{left|right}_neighbor(). This
3586 +   function factors out the common final part of all locking scenarios. It
3587 +   assumes that @node->lock_handle is the lock handle for the lock just taken
3588 +   and fills ->real_node from this lock handle.
3589 +
3590 +*/
3591 +int lock_carry_node_tail(carry_node * node /* node to complete locking of */ )
3592 +{
3593 + assert("nikita-1052", node != NULL);
3594 + assert("nikita-1187", reiser4_carry_real(node) != NULL);
3595 + assert("nikita-1188", !node->unlock);
3596 +
3597 + node->unlock = 1;
3598 + /* Load node content into memory and install node plugin by
3599 + looking at the node header.
3600 +
3601 + Most of the time this call is cheap because the node is
3602 + already in memory.
3603 +
3604 + Corresponding zrelse() is in unlock_carry_node()
3605 + */
3606 + return zload(reiser4_carry_real(node));
3607 +}
3608 +
3609 +/* lock carry node
3610 +
3611 + "Resolve" node to real znode, lock it and mark as locked.
3612 + This requires recursive locking of znodes.
3613 +
3614 +   When an operation is posted to the parent level, the node it will be
3615 +   applied to is not yet known. For example, when shifting data between two
3616 +   nodes, the delimiting key has to be updated in the parent or parents of the
3617 +   nodes involved. But their parents are not yet locked and, moreover, said
3618 +   nodes can be reparented by concurrent balancing.
3619 +
3620 + To work around this, carry operation is applied to special "carry node"
3621 + rather than to the znode itself. Carry node consists of some "base" or
3622 + "reference" znode and flags indicating how to get to the target of carry
3623 + operation (->real_node field of carry_node) from base.
3624 +
3625 +*/
3626 +int lock_carry_node(carry_level * level /* level @node is in */ ,
3627 + carry_node * node /* node to lock */ )
3628 +{
3629 + int result;
3630 + znode *reference_point;
3631 + lock_handle lh;
3632 + lock_handle tmp_lh;
3633 + reiser4_tree *tree;
3634 +
3635 + assert("nikita-887", level != NULL);
3636 + assert("nikita-882", node != NULL);
3637 +
3638 + result = 0;
3639 + reference_point = node->node;
3640 + init_lh(&lh);
3641 + init_lh(&tmp_lh);
3642 + if (node->left_before) {
3643 + /* handling of new nodes, allocated on the previous level:
3644 +
3645 +		   some carry ops were probably posted from the new node, but
3646 +		   this node neither has its parent pointer set, nor is it
3647 +		   connected. This will be done in ->create_hook() for the
3648 +		   internal item.
3649 +
3650 +		   Nonetheless, the parent of the new node has to be locked.
3651 +		   To do this, first go to the "left" in the carry order. This
3652 +		   depends on the decision to always allocate a new node on
3653 +		   the right of an existing one.
3654 +
3655 + Loop handles case when multiple nodes, all orphans, were
3656 + inserted.
3657 +
3658 + Strictly speaking, taking tree lock is not necessary here,
3659 + because all nodes scanned by loop in
3660 + find_begetting_brother() are write-locked by this thread,
3661 + and thus, their sibling linkage cannot change.
3662 +
3663 + */
3664 + tree = znode_get_tree(reference_point);
3665 + read_lock_tree(tree);
3666 + reference_point = find_begetting_brother(node, level)->node;
3667 + read_unlock_tree(tree);
3668 + assert("nikita-1186", reference_point != NULL);
3669 + }
3670 + if (node->parent && (result == 0)) {
3671 + result =
3672 + reiser4_get_parent(&tmp_lh, reference_point,
3673 + ZNODE_WRITE_LOCK);
3674 + if (result != 0) {
3675 + ; /* nothing */
3676 + } else if (znode_get_level(tmp_lh.node) == 0) {
3677 + assert("nikita-1347", znode_above_root(tmp_lh.node));
3678 + result = add_new_root(level, node, tmp_lh.node);
3679 + if (result == 0) {
3680 + reference_point = level->new_root;
3681 + move_lh(&lh, &node->lock_handle);
3682 + }
3683 + } else if ((level->new_root != NULL)
3684 + && (level->new_root !=
3685 + znode_parent_nolock(reference_point))) {
3686 +			/* parent of node exists, but this level already
3687 +			   created a different new root, so */
3688 + warning("nikita-1109",
3689 + /* it should be "radicis", but tradition is
3690 + tradition. do banshees read latin? */
3691 + "hodie natus est radici frater");
3692 + result = -EIO;
3693 + } else {
3694 + move_lh(&lh, &tmp_lh);
3695 + reference_point = lh.node;
3696 + }
3697 + }
3698 + if (node->left && (result == 0)) {
3699 + assert("nikita-1183", node->parent);
3700 + assert("nikita-883", reference_point != NULL);
3701 + result =
3702 + reiser4_get_left_neighbor(&tmp_lh, reference_point,
3703 + ZNODE_WRITE_LOCK,
3704 + GN_CAN_USE_UPPER_LEVELS);
3705 + if (result == 0) {
3706 + done_lh(&lh);
3707 + move_lh(&lh, &tmp_lh);
3708 + reference_point = lh.node;
3709 + }
3710 + }
3711 + if (!node->parent && !node->left && !node->left_before) {
3712 + result =
3713 + longterm_lock_znode(&lh, reference_point, ZNODE_WRITE_LOCK,
3714 + ZNODE_LOCK_HIPRI);
3715 + }
3716 + if (result == 0) {
3717 + move_lh(&node->lock_handle, &lh);
3718 + result = lock_carry_node_tail(node);
3719 + }
3720 + done_lh(&tmp_lh);
3721 + done_lh(&lh);
3722 + return result;
3723 +}
3724 +
3725 +/* release a lock on &carry_node.
3726 +
3727 +   Release, if necessary, the lock on @node. This operation is the pair of
3728 +   lock_carry_node() and is idempotent: you can call it more than once on the
3729 +   same node.
3730 +
3731 +*/
3732 +static void
3733 +unlock_carry_node(carry_level * level,
3734 + carry_node * node /* node to be released */ ,
3735 +		  int failure /* non-0 if node is unlocked
3736 +			       * due to some error */ )
3737 +{
3738 + znode *real_node;
3739 +
3740 + assert("nikita-884", node != NULL);
3741 +
3742 + real_node = reiser4_carry_real(node);
3743 + /* pair to zload() in lock_carry_node_tail() */
3744 + zrelse(real_node);
3745 + if (node->unlock && (real_node != NULL)) {
3746 + assert("nikita-899", real_node == node->lock_handle.node);
3747 + longterm_unlock_znode(&node->lock_handle);
3748 + }
3749 + if (failure) {
3750 + if (node->deallocate && (real_node != NULL)) {
3751 + /* free node in bitmap
3752 +
3753 + Prepare node for removal. Last zput() will finish
3754 + with it.
3755 + */
3756 + ZF_SET(real_node, JNODE_HEARD_BANSHEE);
3757 + }
3758 + if (node->free) {
3759 + assert("nikita-2177",
3760 + list_empty_careful(&node->lock_handle.locks_link));
3761 + assert("nikita-2112",
3762 + list_empty_careful(&node->lock_handle.owners_link));
3763 + reiser4_pool_free(&level->pool->node_pool,
3764 + &node->header);
3765 + }
3766 + }
3767 +}
3768 +
3769 +/* fatal_carry_error() - all-catching error handling function
3770 +
3771 +   It is possible that carry faces an unrecoverable error, like the inability
3772 +   to insert a pointer at the internal level. Our simple solution is just to
3773 +   panic in this situation. More sophisticated things, like an attempt to
3774 +   remount the file-system as read-only, can be implemented without much difficulty.
3775 +
3776 + It is believed, that:
3777 +
3778 +   1. instead of panicking, all current transactions can be aborted, rolling
3779 +   the system back to a consistent state.
3780 +
3781 +Umm, if you simply panic without doing anything more at all, then all current
3782 +transactions are aborted and the system is rolled back to a consistent state,
3783 +by virtue of the design of the transactional mechanism. Well, wait, let's be
3784 +precise. If an internal node is corrupted on disk due to hardware failure,
3785 +then there may be no consistent state that can be rolled back to, so instead
3786 +we should say that it will roll back the transactions, which, barring other
3787 +factors, means rolling back to a consistent state.
3788 +
3789 +# Nikita: there is a subtle difference between panic and aborting
3790 +# transactions: machine doesn't reboot. Processes aren't killed. Processes
3791 +# not using reiser4 (not that we care about such processes), or using other
3792 +# reiser4 mounts (about them we do care), will simply continue to run. With
3793 +# some luck, even application using aborted file system can survive: it will
3794 +# get some error, like EBADF, from each file descriptor on failed file system,
3795 +# but applications that do care about tolerance will cope with this (squid
3796 +# will).
3797 +
3798 +It would be a nice feature though to support rollback without rebooting
3799 +followed by remount, but this can wait for later versions.
3800 +
3801 + 2. once isolated transactions will be implemented it will be possible to
3802 + roll back offending transaction.
3803 +
3804 +Point 2 means additional code complexity of uncertain value (it implies that a broken tree should be kept in operation), so we must think about
3805 +it more before deciding whether it should be done. -Hans
3806 +
3807 +*/
3808 +static void fatal_carry_error(carry_level * doing UNUSED_ARG /* carry level
3809 + * where
3810 + * unrecoverable
3811 + * error
3812 + * occurred */ ,
3813 + int ecode /* error code */ )
3814 +{
3815 + assert("nikita-1230", doing != NULL);
3816 + assert("nikita-1231", ecode < 0);
3817 +
3818 + reiser4_panic("nikita-1232", "Carry failed: %i", ecode);
3819 +}
3820 +
3821 +/* add new root to the tree
3822 +
3823 + This function itself only manages changes in carry structures and delegates
3824 + all hard work (allocation of znode for new root, changes of parent and
3825 +   sibling pointers) to reiser4_add_tree_root().
3826 +
3827 + Locking: old tree root is locked by carry at this point. Fake znode is also
3828 + locked.
3829 +
3830 +*/
3831 +static int add_new_root(carry_level * level /* carry level in context of which
3832 + * operation is performed */ ,
3833 + carry_node * node /* carry node for existing root */ ,
3834 + znode * fake /* "fake" znode already locked by
3835 + * us */ )
3836 +{
3837 + int result;
3838 +
3839 + assert("nikita-1104", level != NULL);
3840 + assert("nikita-1105", node != NULL);
3841 +
3842 + assert("nikita-1403", znode_is_write_locked(node->node));
3843 + assert("nikita-1404", znode_is_write_locked(fake));
3844 +
3845 + /* trying to create new root. */
3846 + /* @node is root and it's already locked by us. This
3847 + means that nobody else can be trying to add/remove
3848 + tree root right now.
3849 + */
3850 + if (level->new_root == NULL)
3851 + level->new_root = reiser4_add_tree_root(node->node, fake);
3852 + if (!IS_ERR(level->new_root)) {
3853 + assert("nikita-1210", znode_is_root(level->new_root));
3854 + node->deallocate = 1;
3855 + result =
3856 + longterm_lock_znode(&node->lock_handle, level->new_root,
3857 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
3858 + if (result == 0)
3859 + zput(level->new_root);
3860 + } else {
3861 + result = PTR_ERR(level->new_root);
3862 + level->new_root = NULL;
3863 + }
3864 + return result;
3865 +}
3866 +
3867 +/* allocate new znode and add to the @todo level the operation
3868 +   that inserts the pointer to it into the parent node
3869 +
3870 + Allocate new znode, add it into carry queue and post into @todo queue
3871 + request to add pointer to new node into its parent.
3872 +
3873 +   This is a carry-related routine that calls reiser4_new_node() to allocate a
3874 +   new node.
3875 +*/
3876 +carry_node *add_new_znode(znode * brother /* existing left neighbor of new
3877 + * node */ ,
3878 + carry_node * ref /* carry node after which new
3879 + * carry node is to be inserted
3880 + * into queue. This affects
3881 + * locking. */ ,
3882 + carry_level * doing /* carry queue where new node is
3883 + * to be added */ ,
3884 + carry_level * todo /* carry queue where COP_INSERT
3885 + * operation to add pointer to
3886 +						 * new node will be added */ )
3887 +{
3888 + carry_node *fresh;
3889 + znode *new_znode;
3890 + carry_op *add_pointer;
3891 + carry_plugin_info info;
3892 +
3893 + assert("nikita-1048", brother != NULL);
3894 + assert("nikita-1049", todo != NULL);
3895 +
3896 + /* There is a lot of possible variations here: to what parent
3897 + new node will be attached and where. For simplicity, always
3898 + do the following:
3899 +
3900 + (1) new node and @brother will have the same parent.
3901 +
3902 + (2) new node is added on the right of @brother
3903 +
3904 + */
3905 +
3906 + fresh = reiser4_add_carry_skip(doing,
3907 + ref ? POOLO_AFTER : POOLO_LAST, ref);
3908 + if (IS_ERR(fresh))
3909 + return fresh;
3910 +
3911 + fresh->deallocate = 1;
3912 + fresh->free = 1;
3913 +
3914 + new_znode = reiser4_new_node(brother, znode_get_level(brother));
3915 + if (IS_ERR(new_znode))
3916 + /* @fresh will be deallocated automatically by error
3917 + handling code in the caller. */
3918 + return (carry_node *) new_znode;
3919 +
3920 +	/* reiser4_new_node() returned a znode with an x_count of 1. The
3921 +	   caller has to decrease it; make_space() does. */
3922 +
3923 + ZF_SET(new_znode, JNODE_ORPHAN);
3924 + fresh->node = new_znode;
3925 +
3926 + while (ZF_ISSET(reiser4_carry_real(ref), JNODE_ORPHAN)) {
3927 + ref = carry_node_prev(ref);
3928 + assert("nikita-1606", !carry_node_end(doing, ref));
3929 + }
3930 +
3931 + info.todo = todo;
3932 + info.doing = doing;
3933 + add_pointer = node_post_carry(&info, COP_INSERT,
3934 + reiser4_carry_real(ref), 1);
3935 + if (IS_ERR(add_pointer)) {
3936 + /* no need to deallocate @new_znode here: it will be
3937 + deallocated during carry error handling. */
3938 + return (carry_node *) add_pointer;
3939 + }
3940 +
3941 + add_pointer->u.insert.type = COPT_CHILD;
3942 + add_pointer->u.insert.child = fresh;
3943 + add_pointer->u.insert.brother = brother;
3944 +	/* initially the new node spans an empty key range */
3945 + write_lock_dk(znode_get_tree(brother));
3946 + znode_set_ld_key(new_znode,
3947 + znode_set_rd_key(new_znode,
3948 + znode_get_rd_key(brother)));
3949 + write_unlock_dk(znode_get_tree(brother));
3950 + return fresh;
3951 +}
3952 +
3953 +/* DEBUGGING FUNCTIONS.
3954 +
3955 +   Probably we should also leave them enabled even when
3956 +   debugging is turned off, to print dumps at errors.
3957 +*/
3958 +#if REISER4_DEBUG
3959 +static int carry_level_invariant(carry_level * level, carry_queue_state state)
3960 +{
3961 + carry_node *node;
3962 + carry_node *tmp_node;
3963 +
3964 + if (level == NULL)
3965 + return 0;
3966 +
3967 + if (level->track_type != 0 &&
3968 + level->track_type != CARRY_TRACK_NODE &&
3969 + level->track_type != CARRY_TRACK_CHANGE)
3970 + return 0;
3971 +
3972 + /* check that nodes are in ascending order */
3973 + for_all_nodes(level, node, tmp_node) {
3974 + znode *left;
3975 + znode *right;
3976 +
3977 + reiser4_key lkey;
3978 + reiser4_key rkey;
3979 +
3980 + if (node != carry_node_front(level)) {
3981 + if (state == CARRY_TODO) {
3982 + right = node->node;
3983 + left = carry_node_prev(node)->node;
3984 + } else {
3985 + right = reiser4_carry_real(node);
3986 + left = reiser4_carry_real(carry_node_prev(node));
3987 + }
3988 + if (right == NULL || left == NULL)
3989 + continue;
3990 + if (node_is_empty(right) || node_is_empty(left))
3991 + continue;
3992 + if (!keyle(leftmost_key_in_node(left, &lkey),
3993 + leftmost_key_in_node(right, &rkey))) {
3994 + warning("", "wrong key order");
3995 + return 0;
3996 + }
3997 + }
3998 + }
3999 + return 1;
4000 +}
4001 +#endif
4002 +
4003 +/* get symbolic name for boolean */
4004 +static const char *tf(int boolean /* truth value */ )
4005 +{
4006 + return boolean ? "t" : "f";
4007 +}
4008 +
4009 +/* symbolic name for carry operation */
4010 +static const char *carry_op_name(carry_opcode op /* carry opcode */ )
4011 +{
4012 + switch (op) {
4013 + case COP_INSERT:
4014 + return "COP_INSERT";
4015 + case COP_DELETE:
4016 + return "COP_DELETE";
4017 + case COP_CUT:
4018 + return "COP_CUT";
4019 + case COP_PASTE:
4020 + return "COP_PASTE";
4021 + case COP_UPDATE:
4022 + return "COP_UPDATE";
4023 + case COP_EXTENT:
4024 + return "COP_EXTENT";
4025 + case COP_INSERT_FLOW:
4026 + return "COP_INSERT_FLOW";
4027 + default:{
4028 + /* not mt safe, but who cares? */
4029 + static char buf[20];
4030 +
4031 + sprintf(buf, "unknown op: %x", op);
4032 + return buf;
4033 + }
4034 + }
4035 +}
4036 +
4037 +/* dump information about carry node */
4038 +static void print_carry(const char *prefix /* prefix to print */ ,
4039 + carry_node * node /* node to print */ )
4040 +{
4041 + if (node == NULL) {
4042 + printk("%s: null\n", prefix);
4043 + return;
4044 + }
4045 + printk
4046 + ("%s: %p parent: %s, left: %s, unlock: %s, free: %s, dealloc: %s\n",
4047 + prefix, node, tf(node->parent), tf(node->left), tf(node->unlock),
4048 + tf(node->free), tf(node->deallocate));
4049 +}
4050 +
4051 +/* dump information about carry operation */
4052 +static void print_op(const char *prefix /* prefix to print */ ,
4053 + carry_op * op /* operation to print */ )
4054 +{
4055 + if (op == NULL) {
4056 + printk("%s: null\n", prefix);
4057 + return;
4058 + }
4059 + printk("%s: %p carry_opcode: %s\n", prefix, op, carry_op_name(op->op));
4060 + print_carry("\tnode", op->node);
4061 + switch (op->op) {
4062 + case COP_INSERT:
4063 + case COP_PASTE:
4064 + print_coord("\tcoord",
4065 + op->u.insert.d ? op->u.insert.d->coord : NULL, 0);
4066 + reiser4_print_key("\tkey",
4067 + op->u.insert.d ? op->u.insert.d->key : NULL);
4068 + print_carry("\tchild", op->u.insert.child);
4069 + break;
4070 + case COP_DELETE:
4071 + print_carry("\tchild", op->u.delete.child);
4072 + break;
4073 + case COP_CUT:
4074 + if (op->u.cut_or_kill.is_cut) {
4075 + print_coord("\tfrom",
4076 + op->u.cut_or_kill.u.kill->params.from, 0);
4077 + print_coord("\tto", op->u.cut_or_kill.u.kill->params.to,
4078 + 0);
4079 + } else {
4080 + print_coord("\tfrom",
4081 + op->u.cut_or_kill.u.cut->params.from, 0);
4082 + print_coord("\tto", op->u.cut_or_kill.u.cut->params.to,
4083 + 0);
4084 + }
4085 + break;
4086 + case COP_UPDATE:
4087 + print_carry("\tleft", op->u.update.left);
4088 + break;
4089 + default:
4090 + /* do nothing */
4091 + break;
4092 + }
4093 +}
4094 +
4095 +/* dump information about all nodes and operations in a @level */
4096 +static void print_level(const char *prefix /* prefix to print */ ,
4097 + carry_level * level /* level to print */ )
4098 +{
4099 + carry_node *node;
4100 + carry_node *tmp_node;
4101 + carry_op *op;
4102 + carry_op *tmp_op;
4103 +
4104 + if (level == NULL) {
4105 + printk("%s: null\n", prefix);
4106 + return;
4107 + }
4108 + printk("%s: %p, restartable: %s\n",
4109 + prefix, level, tf(level->restartable));
4110 +
4111 + for_all_nodes(level, node, tmp_node)
4112 + print_carry("\tcarry node", node);
4113 + for_all_ops(level, op, tmp_op)
4114 + print_op("\tcarry op", op);
4115 +}
4116 +
4117 +/* Make Linus happy.
4118 + Local variables:
4119 + c-indentation-style: "K&R"
4120 + mode-name: "LC"
4121 + c-basic-offset: 8
4122 + tab-width: 8
4123 + fill-column: 120
4124 + scroll-step: 1
4125 + End:
4126 +*/
4127 diff --git a/fs/reiser4/carry.h b/fs/reiser4/carry.h
4128 new file mode 100644
4129 index 0000000..6341d73
4130 --- /dev/null
4131 +++ b/fs/reiser4/carry.h
4132 @@ -0,0 +1,442 @@
4133 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4134 +
4135 +/* Functions and data types to "carry" tree modification(s) upward.
4136 + See fs/reiser4/carry.c for details. */
4137 +
4138 +#if !defined( __FS_REISER4_CARRY_H__ )
4139 +#define __FS_REISER4_CARRY_H__
4140 +
4141 +#include "forward.h"
4142 +#include "debug.h"
4143 +#include "pool.h"
4144 +#include "znode.h"
4145 +
4146 +#include <linux/types.h>
4147 +
4148 +/* &carry_node - "location" of carry node.
4149 +
4150 + "location" of node that is involved or going to be involved into
4151 + carry process. Node where operation will be carried to on the
4152 + parent level cannot be recorded explicitly. Operation will be carried
4153 + usually to the parent of some node (where changes are performed at
4154 + the current level) or, to the left neighbor of its parent. But while
4155 + modifications are performed at the current level, parent may
4156 + change. So, we have to allow some indirection (or, positevly,
4157 + flexibility) in locating carry nodes.
4158 +
4159 +*/
4160 +typedef struct carry_node {
4161 + /* pool linkage */
4162 + reiser4_pool_header header;
4163 +
4164 + /* base node from which real_node is calculated. See
4165 + fs/reiser4/carry.c:lock_carry_node(). */
4166 + znode *node;
4167 +
4168 + /* how to get ->real_node */
4169 + /* to get ->real_node obtain parent of ->node */
4170 + __u32 parent:1;
4171 + /* to get ->real_node obtain left neighbor of parent of
4172 + ->node */
4173 + __u32 left:1;
4174 + __u32 left_before:1;
4175 +
4176 + /* locking */
4177 +
4178 + /* this node was locked by carry process and should be
4179 + unlocked when carry leaves a level */
4180 + __u32 unlock:1;
4181 +
4182 + /* disk block for this node was allocated by carry process and
4183 + should be deallocated when carry leaves a level */
4184 + __u32 deallocate:1;
4185 + /* this carry node was allocated by carry process and should be
4186 + freed when carry leaves a level */
4187 + __u32 free:1;
4188 +
4189 + /* type of lock we want to take on this node */
4190 + lock_handle lock_handle;
4191 +} carry_node;
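/* An illustrative note (drawn from the code in carry.c, not part of the
 * original comment): reiser4_post_carry() sets ->parent from its
 * apply_to_parent_p argument and sets ->left_before when @node is still a
 * JNODE_ORPHAN. lock_carry_node() then resolves the target as follows:
 *
 *   parent == 0, left == 0, left_before == 0: lock ->node itself;
 *   parent == 1, left == 0: lock the parent of ->node;
 *   parent == 1, left == 1: lock the left neighbor of the parent of ->node;
 *   left_before == 1: first substitute the "begetting brother" found to the
 *   left in the queue for the orphaned ->node, then proceed as above.
 */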
4192 +
4193 +/* &carry_opcode - elementary operations that can be carried upward
4194 +
4195 + Operations that carry() can handle. This list is supposed to be
4196 + expanded.
4197 +
4198 + Each carry operation (cop) is handled by appropriate function defined
4199 + in fs/reiser4/carry.c. For example COP_INSERT is handled by
4200 + fs/reiser4/carry.c:carry_insert() etc. These functions in turn
4201 + call plugins of nodes affected by operation to modify nodes' content
4202 + and to gather operations to be performed on the next level.
4203 +
4204 +*/
4205 +typedef enum {
4206 + /* insert new item into node. */
4207 + COP_INSERT,
4208 + /* delete pointer from parent node */
4209 + COP_DELETE,
4210 + /* remove part of or whole node. */
4211 + COP_CUT,
4212 + /* increase size of item. */
4213 + COP_PASTE,
4214 + /* insert extent (that is sequence of unformatted nodes). */
4215 + COP_EXTENT,
4216 + /* update delimiting key in least common ancestor of two
4217 + nodes. This is performed when items are moved between two
4218 + nodes.
4219 + */
4220 + COP_UPDATE,
4221 + /* insert flow */
4222 + COP_INSERT_FLOW,
4223 + COP_LAST_OP,
4224 +} carry_opcode;
4225 +
4226 +#define CARRY_FLOW_NEW_NODES_LIMIT 20
4227 +
4228 +/* mode (or subtype) of COP_{INSERT|PASTE} operation. Specifies how target
4229 + item is determined. */
4230 +typedef enum {
4231 + /* target item is one containing pointer to the ->child node */
4232 + COPT_CHILD,
4233 + /* target item is given explicitly by @coord */
4234 + COPT_ITEM_DATA,
4235 + /* target item is given by key */
4236 + COPT_KEY,
4237 + /* see insert_paste_common() for more comments on this. */
4238 + COPT_PASTE_RESTARTED,
4239 +} cop_insert_pos_type;
4240 +
4241 +/* flags to cut and delete */
4242 +typedef enum {
4243 + /* don't kill node even if it became completely empty as results of
4244 + * cut. This is needed for eottl handling. See carry_extent() for
4245 + * details. */
4246 + DELETE_RETAIN_EMPTY = (1 << 0)
4247 +} cop_delete_flag;
4248 +
4249 +/*
4250 + * carry() implements "lock handle tracking" feature.
4251 + *
4252 + * Callers supply carry with node where to perform initial operation and lock
4253 + * handle on this node. Trying to optimize node utilization carry may actually
4254 + * move the insertion point to a different node. Callers expect that the
4255 + * lock handle will be transferred to the new node as well.
4256 + *
4257 + */
4258 +typedef enum {
4259 + /* transfer lock handle along with insertion point */
4260 + CARRY_TRACK_CHANGE = 1,
4261 + /* acquire new lock handle to the node where insertion point is. This
4262 + * is used when carry() client doesn't initially possess lock handle
4263 + * on the insertion point node, for example, by extent insertion
4264 + * code. See carry_extent(). */
4265 + CARRY_TRACK_NODE = 2
4266 +} carry_track_type;
4267 +
4268 +/* data supplied to COP_{INSERT|PASTE} by callers */
4269 +typedef struct carry_insert_data {
4270 + /* position where new item is to be inserted */
4271 + coord_t *coord;
4272 + /* new item description */
4273 + reiser4_item_data *data;
4274 + /* key of new item */
4275 + const reiser4_key *key;
4276 +} carry_insert_data;
4277 +
4278 +/* cut and kill are similar, so carry_cut_data and carry_kill_data share the below structure of parameters */
4279 +struct cut_kill_params {
4280 + /* coord where cut starts (inclusive) */
4281 + coord_t *from;
4282 + /* coord where cut stops (inclusive, this item/unit will also be
4283 + * cut) */
4284 + coord_t *to;
4285 + /* starting key. This is necessary when item and unit pos don't
4286 + * uniquely identify what portion or tree to remove. For example, this
4287 + * indicates what portion of extent unit will be affected. */
4288 + const reiser4_key *from_key;
4289 + /* exclusive stop key */
4290 + const reiser4_key *to_key;
4291 + /* if this is not NULL, smallest actually removed key is stored
4292 + * here. */
4293 + reiser4_key *smallest_removed;
4294 + /* kill_node_content() is called for file truncate */
4295 + int truncate;
4296 +};
4297 +
4298 +struct carry_cut_data {
4299 + struct cut_kill_params params;
4300 +};
4301 +
4302 +struct carry_kill_data {
4303 + struct cut_kill_params params;
4304 + /* parameter to be passed to the ->kill_hook() method of item
4305 + * plugin */
4306 + /*void *iplug_params; *//* FIXME: unused currently */
4307 + /* if not NULL---inode whose items are being removed. This is needed
4308 + * for ->kill_hook() of extent item to update VM structures when
4309 + * removing pages. */
4310 + struct inode *inode;
4311 + /* sibling list maintenance is complicated by existence of eottl. When
4312 + * eottl whose left and right neighbors are formatted leaves is
4313 + * removed, one has to connect said leaves in the sibling list. This
4314 + * cannot be done when extent removal is just started as locking rules
4315 + * require sibling list update to happen atomically with removal of
4316 + * extent item. Therefore: 1. pointers to left and right neighbors
4317 + * have to be passed down to the ->kill_hook() of extent item, and
4318 + * 2. said neighbors have to be locked. */
4319 + lock_handle *left;
4320 + lock_handle *right;
4321 + /* flags modifying behavior of kill. Currently, it may have DELETE_RETAIN_EMPTY set. */
4322 + unsigned flags;
4323 + char *buf;
4324 +};
4325 +
4326 +/* &carry_tree_op - operation to "carry" upward.
4327 +
4328 + Description of an operation we want to "carry" to the upper level of
4329 + a tree: e.g, when we insert something and there is not enough space
4330 + we allocate a new node and "carry" the operation of inserting a
4331 + pointer to the new node to the upper level, on removal of empty node,
4332 + we carry up operation of removing appropriate entry from parent.
4333 +
4334 +   There are two types of carry ops: when adding or deleting a node, the
4335 +   node at the parent level where the appropriate modification has to be
4336 +   performed is known in advance. When shifting items between nodes
4337 +   (split, merge), the delimiting key should be changed in the least common
4338 +   parent of the nodes involved, which is not known in advance.
4339 +
4340 +   For operations of the first type we store in &carry_op a pointer to
4341 +   the &carry_node at the parent level. For operations of the second
4342 +   type we store &carry_node for the parents of the left and right nodes
4343 +   modified and keep track of them upward until they coincide.
4344 +
4345 +*/
4346 +typedef struct carry_op {
4347 + /* pool linkage */
4348 + reiser4_pool_header header;
4349 + carry_opcode op;
4350 + /* node on which operation is to be performed:
4351 +
4352 + for insert, paste: node where new item is to be inserted
4353 +
4354 + for delete: node where pointer is to be deleted
4355 +
4356 + for cut: node to cut from
4357 +
4358 + for update: node where delimiting key is to be modified
4359 +
4360 + for modify: parent of modified node
4361 +
4362 + */
4363 + carry_node *node;
4364 + union {
4365 + struct {
4366 + /* (sub-)type of insertion/paste. Taken from
4367 + cop_insert_pos_type. */
4368 + __u8 type;
4369 + /* various operation flags. Taken from
4370 + cop_insert_flag. */
4371 + __u8 flags;
4372 + carry_insert_data *d;
4373 + carry_node *child;
4374 + znode *brother;
4375 + } insert, paste, extent;
4376 +
4377 + struct {
4378 + int is_cut;
4379 + union {
4380 + carry_kill_data *kill;
4381 + carry_cut_data *cut;
4382 + } u;
4383 + } cut_or_kill;
4384 +
4385 + struct {
4386 + carry_node *left;
4387 + } update;
4388 + struct {
4389 + /* changed child */
4390 + carry_node *child;
4391 + /* bitmask of changes. See &cop_modify_flag */
4392 + __u32 flag;
4393 + } modify;
4394 + struct {
4395 + /* flags to deletion operation. Are taken from
4396 + cop_delete_flag */
4397 + __u32 flags;
4398 + /* child to delete from parent. If this is
4399 + NULL, delete op->node. */
4400 + carry_node *child;
4401 + } delete;
4402 + struct {
4403 + /* various operation flags. Taken from
4404 + cop_insert_flag. */
4405 + __u32 flags;
4406 + flow_t *flow;
4407 + coord_t *insert_point;
4408 + reiser4_item_data *data;
4409 +			/* flow insertion is limited by the number of new blocks
4410 +			   added in that operation which receive no data other
4411 +			   than part of the flow. This limit is set by the macro
4412 +			   CARRY_FLOW_NEW_NODES_LIMIT. This field stores the number
4413 +			   of nodes already added during one carry_flow */
4414 + int new_nodes;
4415 + } insert_flow;
4416 + } u;
4417 +} carry_op;
4418 +
4419 +/* &carry_op_pool - preallocated pool of carry operations, and nodes */
4420 +typedef struct carry_pool {
4421 + carry_op op[CARRIES_POOL_SIZE];
4422 + reiser4_pool op_pool;
4423 + carry_node node[NODES_LOCKED_POOL_SIZE];
4424 + reiser4_pool node_pool;
4425 +} carry_pool;
4426 +
4427 +/* &carry_tree_level - carry process on given level
4428 +
4429 + Description of balancing process on the given level.
4430 +
4431 +   No need for locking here, as carry_tree_level is essentially a
4432 +   per-thread thing (for now).
4433 +
4434 +*/
4435 +struct carry_level {
4436 + /* this level may be restarted */
4437 + __u32 restartable:1;
4438 + /* list of carry nodes on this level, ordered by key order */
4439 + struct list_head nodes;
4440 + struct list_head ops;
4441 + /* pool where new objects are allocated from */
4442 + carry_pool *pool;
4443 + int ops_num;
4444 + int nodes_num;
4445 + /* new root created on this level, if any */
4446 + znode *new_root;
4447 +	/* This is set by caller (insert_by_key(), reiser4_resize_item(), etc.)
4448 + when they want ->tracked to automagically wander to the node where
4449 + insertion point moved after insert or paste.
4450 + */
4451 + carry_track_type track_type;
4452 + /* lock handle supplied by user that we are tracking. See
4453 + above. */
4454 + lock_handle *tracked;
4455 +};
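/* An illustrative sketch (the helper name is not part of the original
 * header): a caller owning lock handle @lh that wants it to follow the
 * insertion point would configure its lowest carry level like this before
 * calling reiser4_carry(); see the ->track_type and ->tracked fields above. */
static inline void carry_track_sketch(carry_level * lowest_level,
				      lock_handle * lh)
{
	lowest_level->track_type = CARRY_TRACK_CHANGE;
	lowest_level->tracked = lh;	/* caller-owned lock handle */
}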
4456 +
4457 +/* information carry passes to plugin methods that may add new operations to
4458 + the @todo queue */
4459 +struct carry_plugin_info {
4460 + carry_level *doing;
4461 + carry_level *todo;
4462 +};
4463 +
4464 +int reiser4_carry(carry_level * doing, carry_level * done);
4465 +
4466 +carry_node *reiser4_add_carry(carry_level * level, pool_ordering order,
4467 + carry_node * reference);
4468 +carry_node *reiser4_add_carry_skip(carry_level * level, pool_ordering order,
4469 + carry_node * reference);
4470 +
4471 +extern carry_node *insert_carry_node(carry_level * doing,
4472 + carry_level * todo, const znode * node);
4473 +
4474 +extern carry_pool *init_carry_pool(int);
4475 +extern void done_carry_pool(carry_pool * pool);
4476 +
4477 +extern void init_carry_level(carry_level * level, carry_pool * pool);
4478 +
4479 +extern carry_op *reiser4_post_carry(carry_level * level, carry_opcode op,
4480 + znode * node, int apply_to_parent);
4481 +extern carry_op *node_post_carry(carry_plugin_info * info, carry_opcode op,
4482 + znode * node, int apply_to_parent_p);
4483 +
4484 +carry_node *add_new_znode(znode * brother, carry_node * reference,
4485 + carry_level * doing, carry_level * todo);
4486 +
4487 +carry_node *find_carry_node(carry_level * level, const znode * node);
4488 +
4489 +extern znode *reiser4_carry_real(const carry_node * node);
4490 +
4491 +/* helper macros to iterate over carry queues */
4492 +
4493 +#define carry_node_next( node ) \
4494 + list_entry((node)->header.level_linkage.next, carry_node, \
4495 + header.level_linkage)
4496 +
4497 +#define carry_node_prev( node ) \
4498 + list_entry((node)->header.level_linkage.prev, carry_node, \
4499 + header.level_linkage)
4500 +
4501 +#define carry_node_front( level ) \
4502 + list_entry((level)->nodes.next, carry_node, header.level_linkage)
4503 +
4504 +#define carry_node_back( level ) \
4505 + list_entry((level)->nodes.prev, carry_node, header.level_linkage)
4506 +
4507 +#define carry_node_end( level, node ) \
4508 + (&(level)->nodes == &(node)->header.level_linkage)
4509 +
4510 +/* macro to iterate over all operations in a @level */
4511 +#define for_all_ops( level /* carry level (of type carry_level *) */, \
4512 + op /* pointer to carry operation, modified by loop (of \
4513 + * type carry_op *) */, \
4514 + tmp /* pointer to carry operation (of type carry_op *), \
4515 + * used to make iterator stable in the face of \
4516 + * deletions from the level */ ) \
4517 +for (op = list_entry(level->ops.next, carry_op, header.level_linkage), \
4518 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage); \
4519 + &op->header.level_linkage != &level->ops; \
4520 + op = tmp, \
4521 + tmp = list_entry(op->header.level_linkage.next, carry_op, header.level_linkage))
4522 +
4523 +#if 0
4524 +for( op = ( carry_op * ) pool_level_list_front( &level -> ops ), \
4525 + tmp = ( carry_op * ) pool_level_list_next( &op -> header ) ; \
4526 + ! pool_level_list_end( &level -> ops, &op -> header ) ; \
4527 + op = tmp, tmp = ( carry_op * ) pool_level_list_next( &op -> header ) )
4528 +#endif
4529 +
4530 +/* macro to iterate over all nodes in a @level */
4531 +#define for_all_nodes( level /* carry level (of type carry_level *) */, \
4532 + node /* pointer to carry node, modified by loop (of \
4533 + * type carry_node *) */, \
4534 + tmp /* pointer to carry node (of type carry_node *), \
4535 +		     * used to make iterator stable in the face of \
4536 + * deletions from the level */ ) \
4537 +for (node = list_entry(level->nodes.next, carry_node, header.level_linkage), \
4538 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage); \
4539 + &node->header.level_linkage != &level->nodes; \
4540 + node = tmp, \
4541 + tmp = list_entry(node->header.level_linkage.next, carry_node, header.level_linkage))
4542 +
4543 +#if 0
4544 +for( node = carry_node_front( level ), \
4545 + tmp = carry_node_next( node ) ; ! carry_node_end( level, node ) ; \
4546 + node = tmp, tmp = carry_node_next( node ) )
4547 +#endif
4548 +
4549 +/* macro to iterate over all nodes in a @level in reverse order
4550 +
4551 + This is used, because nodes are unlocked in reversed order of locking */
4552 +#define for_all_nodes_back( level /* carry level (of type carry_level *) */, \
4553 + node /* pointer to carry node, modified by loop \
4554 + * (of type carry_node *) */, \
4555 + tmp /* pointer to carry node (of type carry_node \
4556 + * *), used to make iterator stable in the \
4557 + * face of deletions from the level */ ) \
4558 +for( node = carry_node_back( level ), \
4559 + tmp = carry_node_prev( node ) ; ! carry_node_end( level, node ) ; \
4560 + node = tmp, tmp = carry_node_prev( node ) )
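/* A small usage sketch (the function name is illustrative, not part of the
 * original header): the iterators are deletion-safe because @tmp is read
 * before the loop body runs, so the body may free the current element, just
 * as done_carry_level() in carry.c does. Note that this sketch does not
 * maintain ->ops_num. */
static inline void drain_ops_sketch(carry_level * level)
{
	carry_op *op;
	carry_op *tmp;

	for_all_ops(level, op, tmp)
		reiser4_pool_free(&level->pool->op_pool, &op->header);
}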
4561 +
4562 +/* __FS_REISER4_CARRY_H__ */
4563 +#endif
4564 +
4565 +/* Make Linus happy.
4566 + Local variables:
4567 + c-indentation-style: "K&R"
4568 + mode-name: "LC"
4569 + c-basic-offset: 8
4570 + tab-width: 8
4571 + fill-column: 120
4572 + scroll-step: 1
4573 + End:
4574 +*/
4575 diff --git a/fs/reiser4/carry_ops.c b/fs/reiser4/carry_ops.c
4576 new file mode 100644
4577 index 0000000..8ce8e95
4578 --- /dev/null
4579 +++ b/fs/reiser4/carry_ops.c
4580 @@ -0,0 +1,2131 @@
4581 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
4582 +
4583 +/* implementation of carry operations */
4584 +
4585 +#include "forward.h"
4586 +#include "debug.h"
4587 +#include "key.h"
4588 +#include "coord.h"
4589 +#include "plugin/item/item.h"
4590 +#include "plugin/node/node.h"
4591 +#include "jnode.h"
4592 +#include "znode.h"
4593 +#include "block_alloc.h"
4594 +#include "tree_walk.h"
4595 +#include "pool.h"
4596 +#include "tree_mod.h"
4597 +#include "carry.h"
4598 +#include "carry_ops.h"
4599 +#include "tree.h"
4600 +#include "super.h"
4601 +#include "reiser4.h"
4602 +
4603 +#include <linux/types.h>
4604 +#include <linux/err.h>
4605 +
4606 +static int carry_shift_data(sideof side, coord_t * insert_coord, znode * node,
4607 + carry_level * doing, carry_level * todo,
4608 + unsigned int including_insert_coord_p);
4609 +
4610 +extern int lock_carry_node(carry_level * level, carry_node * node);
4611 +extern int lock_carry_node_tail(carry_node * node);
4612 +
4613 +/* find left neighbor of a carry node
4614 +
4615 + Look for left neighbor of @node and add it to the @doing queue. See
4616 + comments in the body.
4617 +
4618 +*/
4619 +static carry_node *find_left_neighbor(carry_op * op /* node to find left
4620 + * neighbor of */ ,
4621 + carry_level * doing /* level to scan */ )
4622 +{
4623 + int result;
4624 + carry_node *node;
4625 + carry_node *left;
4626 + int flags;
4627 + reiser4_tree *tree;
4628 +
4629 + node = op->node;
4630 +
4631 + tree = current_tree;
4632 + read_lock_tree(tree);
4633 + /* first, check whether left neighbor is already in a @doing queue */
4634 + if (reiser4_carry_real(node)->left != NULL) {
4635 + /* NOTE: there is locking subtlety here. Look into
4636 + * find_right_neighbor() for more info */
4637 + if (find_carry_node(doing,
4638 + reiser4_carry_real(node)->left) != NULL) {
4639 + read_unlock_tree(tree);
4640 + left = node;
4641 + do {
4642 + left = list_entry(left->header.level_linkage.prev,
4643 + carry_node, header.level_linkage);
4644 + assert("nikita-3408", !carry_node_end(doing,
4645 + left));
4646 + } while (reiser4_carry_real(left) ==
4647 + reiser4_carry_real(node));
4648 + return left;
4649 + }
4650 + }
4651 + read_unlock_tree(tree);
4652 +
4653 + left = reiser4_add_carry_skip(doing, POOLO_BEFORE, node);
4654 + if (IS_ERR(left))
4655 + return left;
4656 +
4657 + left->node = node->node;
4658 + left->free = 1;
4659 +
4660 + flags = GN_TRY_LOCK;
4661 +	if (!(op->u.insert.flags & COPI_LOAD_LEFT))
4662 + flags |= GN_NO_ALLOC;
4663 +
4664 + /* then, feeling lucky, peek left neighbor in the cache. */
4665 + result = reiser4_get_left_neighbor(&left->lock_handle,
4666 + reiser4_carry_real(node),
4667 + ZNODE_WRITE_LOCK, flags);
4668 + if (result == 0) {
4669 + /* ok, node found and locked. */
4670 + result = lock_carry_node_tail(left);
4671 + if (result != 0)
4672 + left = ERR_PTR(result);
4673 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT) {
4674 + /* node is leftmost node in a tree, or neighbor wasn't in
4675 + cache, or there is an extent on the left. */
4676 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4677 + left = NULL;
4678 + } else if (doing->restartable) {
4679 + /* if left neighbor is locked, and level is restartable, add
4680 + new node to @doing and restart. */
4681 + assert("nikita-913", node->parent != 0);
4682 + assert("nikita-914", node->node != NULL);
4683 + left->left = 1;
4684 + left->free = 0;
4685 + left = ERR_PTR(-E_REPEAT);
4686 + } else {
4687 + /* left neighbor is locked, level cannot be restarted. Just
4688 + ignore left neighbor. */
4689 + reiser4_pool_free(&doing->pool->node_pool, &left->header);
4690 + left = NULL;
4691 + }
4692 + return left;
4693 +}
4694 +
4695 +/* find right neighbor of a carry node
4696 +
4697 + Look for right neighbor of @node and add it to the @doing queue. See
4698 + comments in the body.
4699 +
4700 +*/
4701 +static carry_node *find_right_neighbor(carry_op * op /* node to find right
4702 + * neighbor of */ ,
4703 + carry_level * doing /* level to scan */ )
4704 +{
4705 + int result;
4706 + carry_node *node;
4707 + carry_node *right;
4708 + lock_handle lh;
4709 + int flags;
4710 + reiser4_tree *tree;
4711 +
4712 + init_lh(&lh);
4713 +
4714 + node = op->node;
4715 +
4716 + tree = current_tree;
4717 + read_lock_tree(tree);
4718 + /* first, check whether right neighbor is already in a @doing queue */
4719 + if (reiser4_carry_real(node)->right != NULL) {
4720 + /*
4721 + * Tree lock is taken here anyway, because, even if _outcome_
4722 +		 * of (find_carry_node() != NULL) doesn't depend on
4723 + * concurrent updates to ->right, find_carry_node() cannot
4724 + * work with second argument NULL. Hence, following comment is
4725 + * of historic importance only.
4726 + *
4727 + * Subtle:
4728 + *
4729 + * Q: why don't we need tree lock here, looking for the right
4730 + * neighbor?
4731 + *
4732 + * A: even if value of node->real_node->right were changed
4733 + * during find_carry_node() execution, outcome of execution
4734 + * wouldn't change, because (in short) other thread cannot add
4735 + * elements to the @doing, and if node->real_node->right
4736 + * already was in @doing, value of node->real_node->right
4737 + * couldn't change, because node cannot be inserted between
4738 + * locked neighbors.
4739 + */
4740 + if (find_carry_node(doing,
4741 + reiser4_carry_real(node)->right) != NULL) {
4742 + read_unlock_tree(tree);
4743 + /*
4744 + * What we are doing here (this is also applicable to
4745 + * the find_left_neighbor()).
4746 + *
4747 + * tree_walk.c code requires that insertion of a
4748 + * pointer to a child, modification of parent pointer
4749 + * in the child, and insertion of the child into
4750 + * sibling list are atomic (see
4751 + * plugin/item/internal.c:create_hook_internal()).
4752 + *
4753 + * carry allocates new node long before pointer to it
4754 + * is inserted into parent and, actually, long before
4755 + * parent is even known. Such allocated-but-orphaned
4756 + * nodes are only trackable through carry level lists.
4757 + *
4758 + * Situation that is handled here is following: @node
4759 + * has valid ->right pointer, but there is
4760 + * allocated-but-orphaned node in the carry queue that
4761 + * is logically between @node and @node->right. Here
4762 + * we are searching for it. Critical point is that
4763 + * this is only possible if @node->right is also in
4764 + * the carry queue (this is checked above), because
4765 + * this is the only way new orphaned node could be
4766 + * inserted between them (before inserting new node,
4767 + * make_space() first tries to shift to the right, so,
4768 + * right neighbor will be locked and queued).
4769 + *
4770 + */
4771 + right = node;
4772 + do {
4773 + right = list_entry(right->header.level_linkage.next,
4774 + carry_node, header.level_linkage);
4775 + assert("nikita-3408", !carry_node_end(doing,
4776 + right));
4777 + } while (reiser4_carry_real(right) ==
4778 + reiser4_carry_real(node));
4779 + return right;
4780 + }
4781 + }
4782 + read_unlock_tree(tree);
4783 +
4784 + flags = GN_CAN_USE_UPPER_LEVELS;
4785 +	if (!(op->u.insert.flags & COPI_LOAD_RIGHT))
4786 + flags = GN_NO_ALLOC;
4787 +
4788 + /* then, try to lock right neighbor */
4789 + init_lh(&lh);
4790 + result = reiser4_get_right_neighbor(&lh,
4791 + reiser4_carry_real(node),
4792 + ZNODE_WRITE_LOCK, flags);
4793 + if (result == 0) {
4794 + /* ok, node found and locked. */
4795 + right = reiser4_add_carry_skip(doing, POOLO_AFTER, node);
4796 + if (!IS_ERR(right)) {
4797 + right->node = lh.node;
4798 + move_lh(&right->lock_handle, &lh);
4799 + right->free = 1;
4800 + result = lock_carry_node_tail(right);
4801 + if (result != 0)
4802 + right = ERR_PTR(result);
4803 + }
4804 + } else if ((result == -E_NO_NEIGHBOR) || (result == -ENOENT)) {
4805 + /* node is rightmost node in a tree, or neighbor wasn't in
4806 + cache, or there is an extent on the right. */
4807 + right = NULL;
4808 + } else
4809 + right = ERR_PTR(result);
4810 + done_lh(&lh);
4811 + return right;
4812 +}
4813 +
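+/* A minimal standalone sketch (illustrative, not part of the carry API)
+   of why the parenthesization in find_right_neighbor() above matters:
+   `!' binds tighter than `&', so "!flags & BIT" evaluates as
+   "(!flags) & BIT", which is almost never the intended flag test. */
+#include <assert.h>
+#define DEMO_BIT (1 << 2)	/* hypothetical flag bit */
+static void demo_precedence(void)
+{
+	unsigned int flags = DEMO_BIT;		/* flag set */
+	assert((!flags & DEMO_BIT) == 0);	/* wrong test: (!flags) & BIT */
+	assert(!(flags & DEMO_BIT) == 0);	/* intended test */
+	flags = 0;				/* flag clear */
+	assert((!flags & DEMO_BIT) == 0);	/* wrong test still yields 0 */
+	assert(!(flags & DEMO_BIT) == 1);	/* intended test yields 1 */
+}
+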
4814 +/* how much free space in @node is needed for @op
4815 +
4816 +   How much space in @node is required for the completion of @op, where @op
4817 +   is an insert or paste operation.
4818 +*/
4819 +static unsigned int space_needed_for_op(znode * node /* znode data are
4820 + * inserted or
4821 + * pasted in */ ,
4822 + carry_op * op /* carry
4823 + operation */ )
4824 +{
4825 + assert("nikita-919", op != NULL);
4826 +
4827 + switch (op->op) {
4828 + default:
4829 + impossible("nikita-1701", "Wrong opcode");
4830 + case COP_INSERT:
4831 + return space_needed(node, NULL, op->u.insert.d->data, 1);
4832 + case COP_PASTE:
4833 + return space_needed(node, op->u.insert.d->coord,
4834 + op->u.insert.d->data, 0);
4835 + }
4836 +}
4837 +
4838 +/* how much space in @node is required to insert or paste @data at
4839 + @coord. */
4840 +unsigned int space_needed(const znode * node /* node data are inserted or
4841 + * pasted in */ ,
4842 + const coord_t * coord /* coord where data are
4843 + * inserted or pasted
4844 + * at */ ,
4845 + const reiser4_item_data * data /* data to insert or
4846 + * paste */ ,
4847 +			  int insertion /* non-0 for insert, 0 for paste */ )
4848 +{
4849 + int result;
4850 + item_plugin *iplug;
4851 +
4852 + assert("nikita-917", node != NULL);
4853 + assert("nikita-918", node_plugin_by_node(node) != NULL);
4854 + assert("vs-230", !insertion || (coord == NULL));
4855 +
4856 + result = 0;
4857 + iplug = data->iplug;
4858 + if (iplug->b.estimate != NULL) {
4859 + /* ask item plugin how much space is needed to insert this
4860 + item */
4861 + result += iplug->b.estimate(insertion ? NULL : coord, data);
4862 + } else {
4863 + /* reasonable default */
4864 + result += data->length;
4865 + }
4866 + if (insertion) {
4867 + node_plugin *nplug;
4868 +
4869 + nplug = node->nplug;
4870 + /* and add node overhead */
4871 + if (nplug->item_overhead != NULL) {
4872 + result += nplug->item_overhead(node, NULL);
4873 + }
4874 + }
4875 + return result;
4876 +}
4877 +
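+/* A condensed standalone sketch of the sizing rule implemented by
+   space_needed() above: required space is the item plugin's estimate
+   (or the raw data length when no estimate method exists) plus, for a
+   fresh insert, the per-item node overhead. All types and names here
+   are illustrative, not the reiser4 ones. */
+struct demo_item_data {
+	int length;			/* raw data length */
+	int (*estimate)(int length);	/* optional size estimator */
+};
+static int demo_space_needed(const struct demo_item_data *data,
+			     int item_overhead, int insertion)
+{
+	int size = data->estimate != NULL ? data->estimate(data->length)
+					  : data->length;
+	if (insertion)
+		size += item_overhead;	/* item header etc. */
+	return size;
+}
+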
4878 +/* find &coord in parent where pointer to new child is to be stored. */
4879 +static int find_new_child_coord(carry_op * op /* COP_INSERT carry operation to
4880 + * insert pointer to new
4881 + * child */ )
4882 +{
4883 + int result;
4884 + znode *node;
4885 + znode *child;
4886 +
4887 + assert("nikita-941", op != NULL);
4888 + assert("nikita-942", op->op == COP_INSERT);
4889 +
4890 + node = reiser4_carry_real(op->node);
4891 + assert("nikita-943", node != NULL);
4892 + assert("nikita-944", node_plugin_by_node(node) != NULL);
4893 +
4894 + child = reiser4_carry_real(op->u.insert.child);
4895 + result =
4896 + find_new_child_ptr(node, child, op->u.insert.brother,
4897 + op->u.insert.d->coord);
4898 +
4899 + build_child_ptr_data(child, op->u.insert.d->data);
4900 + return result;
4901 +}
4902 +
4903 +/* additional amount of free space in @node required to complete @op */
4904 +static int free_space_shortage(znode * node /* node to check */ ,
4905 + carry_op * op /* operation being performed */ )
4906 +{
4907 + assert("nikita-1061", node != NULL);
4908 + assert("nikita-1062", op != NULL);
4909 +
4910 + switch (op->op) {
4911 + default:
4912 + impossible("nikita-1702", "Wrong opcode");
4913 + case COP_INSERT:
4914 + case COP_PASTE:
4915 + return space_needed_for_op(node, op) - znode_free_space(node);
4916 + case COP_EXTENT:
4917 + /* when inserting extent shift data around until insertion
4918 + point is utmost in the node. */
4919 + if (coord_wrt(op->u.insert.d->coord) == COORD_INSIDE)
4920 + return +1;
4921 + else
4922 + return -1;
4923 + }
4924 +}
4925 +
4926 +/* helper function: update the node pointer in the operation after the
4927 +   insertion point may have been shifted into @target. */
4928 +static znode *sync_op(carry_op * op, carry_node * target)
4929 +{
4930 + znode *insertion_node;
4931 +
4932 + /* reget node from coord: shift might move insertion coord to
4933 + the neighbor */
4934 + insertion_node = op->u.insert.d->coord->node;
4935 + /* if insertion point was actually moved into new node,
4936 + update carry node pointer in operation. */
4937 + if (insertion_node != reiser4_carry_real(op->node)) {
4938 + op->node = target;
4939 + assert("nikita-2540",
4940 + reiser4_carry_real(target) == insertion_node);
4941 + }
4942 + assert("nikita-2541",
4943 + reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
4944 + return insertion_node;
4945 +}
4946 +
4947 +/*
4948 + * complete make_space() call: update tracked lock handle if necessary. See
4949 + * comments for fs/reiser4/carry.h:carry_track_type
4950 + */
4951 +static int
4952 +make_space_tail(carry_op * op, carry_level * doing, znode * orig_node)
4953 +{
4954 + int result;
4955 + carry_track_type tracking;
4956 + znode *node;
4957 +
4958 + tracking = doing->track_type;
4959 + node = op->u.insert.d->coord->node;
4960 +
4961 + if (tracking == CARRY_TRACK_NODE ||
4962 + (tracking == CARRY_TRACK_CHANGE && node != orig_node)) {
4963 + /* inserting or pasting into node different from
4964 + original. Update lock handle supplied by caller. */
4965 + assert("nikita-1417", doing->tracked != NULL);
4966 + done_lh(doing->tracked);
4967 + init_lh(doing->tracked);
4968 + result = longterm_lock_znode(doing->tracked, node,
4969 + ZNODE_WRITE_LOCK,
4970 + ZNODE_LOCK_HIPRI);
4971 + } else
4972 + result = 0;
4973 + return result;
4974 +}
4975 +
4976 +/* This is the insertion policy function. It shifts data to the left and
4977 +   right neighbors of the insertion coord and allocates new nodes until
4978 +   there is enough free space to complete @op.
4979 +
4980 +   See comments in the body.
4981 +
4982 +   Assumes that the node format favors insertions at the right end of the
4983 +   node, as node40 does.
4984 +
4985 +   See carry_insert_flow() for details about flow insertion.
4986 +*/
4987 +static int make_space(carry_op * op /* carry operation, insert or paste */ ,
4988 + carry_level * doing /* current carry queue */ ,
4989 + carry_level * todo /* carry queue on the parent level */ )
4990 +{
4991 + znode *node;
4992 + int result;
4993 + int not_enough_space;
4994 + int blk_alloc;
4995 + znode *orig_node;
4996 + __u32 flags;
4997 +
4998 + coord_t *coord;
4999 +
5000 + assert("nikita-890", op != NULL);
5001 + assert("nikita-891", todo != NULL);
5002 + assert("nikita-892",
5003 + op->op == COP_INSERT ||
5004 + op->op == COP_PASTE || op->op == COP_EXTENT);
5005 + assert("nikita-1607",
5006 + reiser4_carry_real(op->node) == op->u.insert.d->coord->node);
5007 +
5008 + flags = op->u.insert.flags;
5009 +
5010 +	/* NOTE: a new node may only be allocated after checking the left
5011 +	 * and right neighbors. This is necessary for the correct operation
5012 +	 * of find_{left,right}_neighbor(). */
5013 + assert("nikita-3410", ergo(flags & COPI_DONT_ALLOCATE,
5014 + flags & COPI_DONT_SHIFT_LEFT));
5015 + assert("nikita-3411", ergo(flags & COPI_DONT_ALLOCATE,
5016 + flags & COPI_DONT_SHIFT_RIGHT));
5017 +
5018 + coord = op->u.insert.d->coord;
5019 + orig_node = node = coord->node;
5020 +
5021 + assert("nikita-908", node != NULL);
5022 + assert("nikita-909", node_plugin_by_node(node) != NULL);
5023 +
5024 + result = 0;
5025 + /* If there is not enough space in a node, try to shift something to
5026 + the left neighbor. This is a bit tricky, as locking to the left is
5027 + low priority. This is handled by restart logic in carry().
5028 + */
5029 + not_enough_space = free_space_shortage(node, op);
5030 + if (not_enough_space <= 0)
5031 +		/* it is possible that carry was called when there actually
5032 +		   was enough space in the node, e.g., when inserting the
5033 +		   leftmost item so that only the delimiting keys have to be updated.
5034 + */
5035 + return make_space_tail(op, doing, orig_node);
5036 + if (!(flags & COPI_DONT_SHIFT_LEFT)) {
5037 + carry_node *left;
5038 +		/* try to move something into the left neighbor */
5040 + left = find_left_neighbor(op, doing);
5041 + if (unlikely(IS_ERR(left))) {
5042 + if (PTR_ERR(left) == -E_REPEAT)
5043 + return -E_REPEAT;
5044 + else {
5045 + /* some error other than restart request
5046 + occurred. This shouldn't happen. Issue a
5047 +				   warning and continue as if the left
5048 +				   neighbor didn't exist.
5049 + */
5050 + warning("nikita-924",
5051 + "Error accessing left neighbor: %li",
5052 + PTR_ERR(left));
5053 + }
5054 + } else if (left != NULL) {
5055 +
5056 + /* shift everything possible on the left of and
5057 + including insertion coord into the left neighbor */
5058 + result = carry_shift_data(LEFT_SIDE, coord,
5059 + reiser4_carry_real(left),
5060 + doing, todo,
5061 + flags & COPI_GO_LEFT);
5062 +
5063 + /* reget node from coord: shift_left() might move
5064 + insertion coord to the left neighbor */
5065 + node = sync_op(op, left);
5066 +
5067 + not_enough_space = free_space_shortage(node, op);
5068 +			/* There is not enough free space in @node, but
5069 +			   maybe there is enough free space in @left.
5070 +			   Various balancing decisions are valid here.
5071 +			   The same goes for shifting to the right.
5072 + */
5073 + }
5074 + }
5075 + /* If there still is not enough space, shift to the right */
5076 + if (not_enough_space > 0 && !(flags & COPI_DONT_SHIFT_RIGHT)) {
5077 + carry_node *right;
5078 +
5079 + right = find_right_neighbor(op, doing);
5080 + if (IS_ERR(right)) {
5081 + warning("nikita-1065",
5082 + "Error accessing right neighbor: %li",
5083 + PTR_ERR(right));
5084 + } else if (right != NULL) {
5085 + /* node containing insertion point, and its right
5086 + neighbor node are write locked by now.
5087 +
5088 + shift everything possible on the right of but
5089 + excluding insertion coord into the right neighbor
5090 + */
5091 + result = carry_shift_data(RIGHT_SIDE, coord,
5092 + reiser4_carry_real(right),
5093 + doing, todo,
5094 + flags & COPI_GO_RIGHT);
5095 + /* reget node from coord: shift_right() might move
5096 + insertion coord to the right neighbor */
5097 + node = sync_op(op, right);
5098 + not_enough_space = free_space_shortage(node, op);
5099 + }
5100 + }
5101 + /* If there is still not enough space, allocate new node(s).
5102 +
5103 +	   We try to allocate new blocks if COPI_DONT_ALLOCATE is not set in
5104 +	   the carry operation flags (currently that flag is set only during
5105 +	   flush).
5106 + */
5107 + for (blk_alloc = 0;
5108 + not_enough_space > 0 && result == 0 && blk_alloc < 2 &&
5109 + !(flags & COPI_DONT_ALLOCATE); ++blk_alloc) {
5110 + carry_node *fresh; /* new node we are allocating */
5111 + coord_t coord_shadow; /* remembered insertion point before
5112 + * shifting data into new node */
5113 + carry_node *node_shadow; /* remembered insertion node before
5114 + * shifting */
5115 + unsigned int gointo; /* whether insertion point should move
5116 + * into newly allocated node */
5117 +
5118 +		/* allocate a new node on the right of @node. A znode and a
5119 +		   fake disk block number for the new node are allocated.
5120 +
5121 + add_new_znode() posts carry operation COP_INSERT with
5122 + COPT_CHILD option to the parent level to add
5123 + pointer to newly created node to its parent.
5124 +
5125 + Subtle point: if several new nodes are required to complete
5126 + insertion operation at this level, they will be inserted
5127 + into their parents in the order of creation, which means
5128 + that @node will be valid "cookie" at the time of insertion.
5129 +
5130 + */
5131 + fresh = add_new_znode(node, op->node, doing, todo);
5132 + if (IS_ERR(fresh))
5133 + return PTR_ERR(fresh);
5134 +
5135 + /* Try to shift into new node. */
5136 + result = lock_carry_node(doing, fresh);
5137 + zput(reiser4_carry_real(fresh));
5138 + if (result != 0) {
5139 + warning("nikita-947",
5140 + "Cannot lock new node: %i", result);
5141 + return result;
5142 + }
5143 +
5144 + /* both nodes are write locked by now.
5145 +
5146 + shift everything possible on the right of and
5147 + including insertion coord into the right neighbor.
5148 + */
5149 + coord_dup(&coord_shadow, op->u.insert.d->coord);
5150 + node_shadow = op->node;
5151 + /* move insertion point into newly created node if:
5152 +
5153 + . insertion point is rightmost in the source node, or
5154 + . this is not the first node we are allocating in a row.
5155 + */
5156 + gointo =
5157 + (blk_alloc > 0) ||
5158 + coord_is_after_rightmost(op->u.insert.d->coord);
5159 +
5160 + if (gointo &&
5161 + op->op == COP_PASTE &&
5162 + coord_is_existing_item(op->u.insert.d->coord) &&
5163 + is_solid_item((item_plugin_by_coord(op->u.insert.d->coord)))) {
5164 +			/* paste into a solid (atomic) item, which can contain
5165 +			   only one unit, so we need to shift it right, to
5166 +			   where the insertion point is supposed to be */
5167 +
5168 + assert("edward-1444", op->u.insert.d->data->iplug ==
5169 + item_plugin_by_id(STATIC_STAT_DATA_ID));
5170 + assert("edward-1445",
5171 + op->u.insert.d->data->length >
5172 + node_plugin_by_node(coord->node)->free_space
5173 + (coord->node));
5174 +
5175 + op->u.insert.d->coord->between = BEFORE_UNIT;
5176 + }
5177 +
5178 + result = carry_shift_data(RIGHT_SIDE, coord,
5179 + reiser4_carry_real(fresh),
5180 + doing, todo, gointo);
5181 + /* if insertion point was actually moved into new node,
5182 + update carry node pointer in operation. */
5183 + node = sync_op(op, fresh);
5184 + not_enough_space = free_space_shortage(node, op);
5185 + if ((not_enough_space > 0) && (node != coord_shadow.node)) {
5186 +			/* there is not enough free space in the new node.
5187 +			   Shift the insertion point back to @node_shadow so
5188 +			   that the next new node will be inserted between
5189 +			   @node_shadow and @fresh.
5190 + */
5191 + coord_normalize(&coord_shadow);
5192 + coord_dup(coord, &coord_shadow);
5193 + node = coord->node;
5194 + op->node = node_shadow;
5195 + if (1 || (flags & COPI_STEP_BACK)) {
5196 +				/* still not enough space?! Maybe there is
5197 +				   enough space in the source node (i.e., the
5198 +				   node data were moved from) now.
5199 + */
5200 + not_enough_space =
5201 + free_space_shortage(node, op);
5202 + }
5203 + }
5204 + }
5205 + if (not_enough_space > 0) {
5206 + if (!(flags & COPI_DONT_ALLOCATE))
5207 + warning("nikita-948", "Cannot insert new item");
5208 + result = -E_NODE_FULL;
5209 + }
5210 + assert("nikita-1622", ergo(result == 0,
5211 + reiser4_carry_real(op->node) == coord->node));
5212 + assert("nikita-2616", coord == op->u.insert.d->coord);
5213 + if (result == 0)
5214 + result = make_space_tail(op, doing, orig_node);
5215 + return result;
5216 +}
5217 +
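+/* A condensed sketch of the policy make_space() implements above, with
+   locking, flags and error handling elided; every name below is
+   illustrative. The shape to notice: shift left, then right, then
+   allocate at most two fresh nodes, re-measuring the shortage after
+   every step. */
+static int demo_make_space(int (*shortage)(void), int (*shift_left)(void),
+			   int (*shift_right)(void), int (*add_node)(void))
+{
+	int blk_alloc;
+
+	if (shortage() <= 0)
+		return 0;		/* already enough room */
+	shift_left();
+	if (shortage() > 0)
+		shift_right();
+	for (blk_alloc = 0; shortage() > 0 && blk_alloc < 2; ++blk_alloc)
+		add_node();		/* allocate and shift into it */
+	return shortage() > 0 ? -1 /* cf. -E_NODE_FULL */ : 0;
+}
+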
5218 +/* insert_paste_common() - common part of insert and paste operations
5219 +
5220 + This function performs common part of COP_INSERT and COP_PASTE.
5221 +
5222 +   There are three ways in which insertion/paste can be requested:
5223 +
5224 + . by directly supplying reiser4_item_data. In this case, op ->
5225 + u.insert.type is set to COPT_ITEM_DATA.
5226 +
5227 +   . by supplying a pointer to a child node that is to be inserted into the
5228 +   parent. In this case op -> u.insert.type == COPT_CHILD.
5229 +
5230 + . by supplying key of new item/unit. This is currently only used during
5231 + extent insertion
5232 +
5233 +   The child-pointer form is required because, when a new node is allocated,
5234 +   we don't know at what position the pointer to it is to be stored in the
5235 +   parent. Actually, we don't even know what its parent will be, because the
5236 +   parent can be re-balanced concurrently and the new node re-parented, and
5237 +   because the parent can be full and the pointer will go into some other node.
5238 +
5239 + insert_paste_common() resolves pointer to child node into position in the
5240 + parent by calling find_new_child_coord(), that fills
5241 + reiser4_item_data. After this, insertion/paste proceeds uniformly.
5242 +
5243 +   Another complication is finding free space during pasting. It may happen
5244 +   that, while shifting items to the neighbors and newly allocated nodes,
5245 +   the insertion coord is no longer in the item we wanted to paste into. At
5246 +   this point, the paste becomes (morphs into) an insert. Moreover, the free
5247 +   space analysis has to be repeated, because the amount of space required
5248 +   for an insert differs from that of a paste (item header overhead, etc).
5249 +
5250 + This function "unifies" different insertion modes (by resolving child
5251 + pointer or key into insertion coord), and then calls make_space() to free
5252 + enough space in the node by shifting data to the left and right and by
5253 + allocating new nodes if necessary. Carry operation knows amount of space
5254 + required for its completion. After enough free space is obtained, caller of
5255 + this function (carry_{insert,paste,etc.}) performs actual insertion/paste
5256 + by calling item plugin method.
5257 +
5258 +*/
5259 +static int insert_paste_common(carry_op * op /* carry operation being
5260 + * performed */ ,
5261 + carry_level * doing /* current carry level */ ,
5262 + carry_level * todo /* next carry level */ ,
5263 + carry_insert_data * cdata /* pointer to
5264 + * cdata */ ,
5265 + coord_t * coord /* insertion/paste coord */ ,
5266 + reiser4_item_data * data /* data to be
5267 + * inserted/pasted */ )
5268 +{
5269 + assert("nikita-981", op != NULL);
5270 + assert("nikita-980", todo != NULL);
5271 + assert("nikita-979", (op->op == COP_INSERT) || (op->op == COP_PASTE)
5272 + || (op->op == COP_EXTENT));
5273 +
5274 + if (op->u.insert.type == COPT_PASTE_RESTARTED) {
5275 + /* nothing to do. Fall through to make_space(). */
5276 + ;
5277 + } else if (op->u.insert.type == COPT_KEY) {
5278 + node_search_result intra_node;
5279 + znode *node;
5280 +		/* The problem with doing batching at the lowest level is that
5281 +		   operations here are given by coords where the modification
5282 +		   is to be performed, and one modification can invalidate the
5283 +		   coords of all following operations.
5284 +
5285 +		   So, we implement yet another operation type that uses the
5286 +		   only "locator" stable across shifting of data between
5287 +		   nodes, etc.: a key (COPT_KEY).
5288 +
5289 + This clause resolves key to the coord in the node.
5290 +
5291 + But node can change also. Probably some pieces have to be
5292 + added to the lock_carry_node(), to lock node by its key.
5293 +
5294 + */
5295 + /* NOTE-NIKITA Lookup bias is fixed to FIND_EXACT. Complain
5296 + if you need something else. */
5297 + op->u.insert.d->coord = coord;
5298 + node = reiser4_carry_real(op->node);
5299 + intra_node = node_plugin_by_node(node)->lookup
5300 + (node, op->u.insert.d->key, FIND_EXACT,
5301 + op->u.insert.d->coord);
5302 + if ((intra_node != NS_FOUND) && (intra_node != NS_NOT_FOUND)) {
5303 + warning("nikita-1715", "Intra node lookup failure: %i",
5304 + intra_node);
5305 + return intra_node;
5306 + }
5307 + } else if (op->u.insert.type == COPT_CHILD) {
5308 + /* if we are asked to insert pointer to the child into
5309 + internal node, first convert pointer to the child into
5310 + coord within parent node.
5311 + */
5312 + znode *child;
5313 + int result;
5314 +
5315 + op->u.insert.d = cdata;
5316 + op->u.insert.d->coord = coord;
5317 + op->u.insert.d->data = data;
5318 + op->u.insert.d->coord->node = reiser4_carry_real(op->node);
5319 + result = find_new_child_coord(op);
5320 + child = reiser4_carry_real(op->u.insert.child);
5321 + if (result != NS_NOT_FOUND) {
5322 + warning("nikita-993",
5323 + "Cannot find a place for child pointer: %i",
5324 + result);
5325 + return result;
5326 + }
5327 +		/* This only happens when we did multiple insertions at
5328 +		   the previous level, trying to insert a single item, and
5329 +		   it so happened that insertion of pointers to all new
5330 +		   nodes before this one already caused the parent node to
5331 +		   split (maybe several times).
5332 +
5333 + I am going to come up with better solution.
5334 +
5335 + You are not expected to understand this.
5336 + -- v6root/usr/sys/ken/slp.c
5337 +
5338 + Basically, what happens here is the following: carry came
5339 + to the parent level and is about to insert internal item
5340 + pointing to the child node that it just inserted in the
5341 + level below. Position where internal item is to be inserted
5342 + was found by find_new_child_coord() above, but node of the
5343 + current carry operation (that is, parent node of child
5344 + inserted on the previous level), was determined earlier in
5345 +		   the lock_carry_level/lock_carry_node. It could so happen
5346 +		   that other carry operations performed earlier on the parent
5347 +		   level have already split the parent node, so that the
5348 +		   insertion point moved into another node. Handle this by
5349 +		   creating a new carry node for the insertion point if necessary.
5350 + */
5351 + if (reiser4_carry_real(op->node) !=
5352 + op->u.insert.d->coord->node) {
5353 + pool_ordering direction;
5354 + znode *z1;
5355 + znode *z2;
5356 + reiser4_key k1;
5357 + reiser4_key k2;
5358 +
5359 + /*
5360 + * determine in what direction insertion point
5361 + * moved. Do this by comparing delimiting keys.
5362 + */
5363 + z1 = op->u.insert.d->coord->node;
5364 + z2 = reiser4_carry_real(op->node);
5365 + if (keyle(leftmost_key_in_node(z1, &k1),
5366 + leftmost_key_in_node(z2, &k2)))
5367 + /* insertion point moved to the left */
5368 + direction = POOLO_BEFORE;
5369 + else
5370 + /* insertion point moved to the right */
5371 + direction = POOLO_AFTER;
5372 +
5373 + op->node = reiser4_add_carry_skip(doing,
5374 + direction, op->node);
5375 + if (IS_ERR(op->node))
5376 + return PTR_ERR(op->node);
5377 + op->node->node = op->u.insert.d->coord->node;
5378 + op->node->free = 1;
5379 + result = lock_carry_node(doing, op->node);
5380 + if (result != 0)
5381 + return result;
5382 + }
5383 +
5384 + /*
5385 +		 * set up the key of the item being inserted: we are inserting
5386 +		 * an internal item and its key is (by the very definition of
5387 +		 * a search tree) the leftmost key in the child node.
5388 + */
5389 + write_lock_dk(znode_get_tree(child));
5390 + op->u.insert.d->key = leftmost_key_in_node(child,
5391 + znode_get_ld_key(child));
5392 + write_unlock_dk(znode_get_tree(child));
5393 + op->u.insert.d->data->arg = op->u.insert.brother;
5394 + } else {
5395 + assert("vs-243", op->u.insert.d->coord != NULL);
5396 + op->u.insert.d->coord->node = reiser4_carry_real(op->node);
5397 + }
5398 +
5399 + /* find free space. */
5400 + return make_space(op, doing, todo);
5401 +}
5402 +
5403 +/* handle carry COP_INSERT operation.
5404 +
5405 + Insert new item into node. New item can be given in one of two ways:
5406 +
5407 + - by passing &tree_coord and &reiser4_item_data as part of @op. This is
5408 + only applicable at the leaf/twig level.
5409 +
5410 +   - by passing a pointer to a child node that is to be inserted by this
5411 +     operation.
5412 +
5413 +*/
5414 +static int carry_insert(carry_op * op /* operation to perform */ ,
5415 + carry_level * doing /* queue of operations @op
5416 + * is part of */ ,
5417 + carry_level * todo /* queue where new operations
5418 + * are accumulated */ )
5419 +{
5420 + znode *node;
5421 + carry_insert_data cdata;
5422 + coord_t coord;
5423 + reiser4_item_data data;
5424 + carry_plugin_info info;
5425 + int result;
5426 +
5427 + assert("nikita-1036", op != NULL);
5428 + assert("nikita-1037", todo != NULL);
5429 + assert("nikita-1038", op->op == COP_INSERT);
5430 +
5431 + coord_init_zero(&coord);
5432 +
5433 + /* perform common functionality of insert and paste. */
5434 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
5435 + if (result != 0)
5436 + return result;
5437 +
5438 + node = op->u.insert.d->coord->node;
5439 + assert("nikita-1039", node != NULL);
5440 + assert("nikita-1040", node_plugin_by_node(node) != NULL);
5441 +
5442 + assert("nikita-949",
5443 + space_needed_for_op(node, op) <= znode_free_space(node));
5444 +
5445 + /* ask node layout to create new item. */
5446 + info.doing = doing;
5447 + info.todo = todo;
5448 + result = node_plugin_by_node(node)->create_item
5449 + (op->u.insert.d->coord, op->u.insert.d->key, op->u.insert.d->data,
5450 + &info);
5451 + doing->restartable = 0;
5452 + znode_make_dirty(node);
5453 +
5454 + return result;
5455 +}
5456 +
5457 +/*
5458 + * Flow insertion code. COP_INSERT_FLOW is a special tree operation that is
5459 + * supplied with a "flow" (that is, a stream of data) and inserts it into the
5460 + * tree by slicing it into multiple items.
5461 + */
5462 +
5463 +#define flow_insert_point(op) ( ( op ) -> u.insert_flow.insert_point )
5464 +#define flow_insert_flow(op) ( ( op ) -> u.insert_flow.flow )
5465 +#define flow_insert_data(op) ( ( op ) -> u.insert_flow.data )
5466 +
5467 +static size_t item_data_overhead(carry_op * op)
5468 +{
5469 + if (flow_insert_data(op)->iplug->b.estimate == NULL)
5470 + return 0;
5471 + return (flow_insert_data(op)->iplug->b.
5472 + estimate(NULL /* estimate insertion */ , flow_insert_data(op)) -
5473 + flow_insert_data(op)->length);
5474 +}
5475 +
5476 +/* FIXME-VS: this is called several times during one make_flow_for_insertion
5477 + and it will always return the same result. Some optimization could be made
5478 + by calculating this value once at the beginning and passing it around. That
5479 + would reduce some flexibility in future changes
5480 +*/
5481 +static int can_paste(coord_t *, const reiser4_key *, const reiser4_item_data *);
5482 +static size_t flow_insertion_overhead(carry_op * op)
5483 +{
5484 + znode *node;
5485 + size_t insertion_overhead;
5486 +
5487 + node = flow_insert_point(op)->node;
5488 + insertion_overhead = 0;
5489 + if (node->nplug->item_overhead &&
5490 + !can_paste(flow_insert_point(op), &flow_insert_flow(op)->key,
5491 + flow_insert_data(op)))
5492 + insertion_overhead =
5493 + node->nplug->item_overhead(node, NULL) +
5494 + item_data_overhead(op);
5495 + return insertion_overhead;
5496 +}
5497 +
5498 +/* how many bytes of the flow fit into the node */
5499 +static int what_can_fit_into_node(carry_op * op)
5500 +{
5501 + size_t free, overhead;
5502 +
5503 + overhead = flow_insertion_overhead(op);
5504 + free = znode_free_space(flow_insert_point(op)->node);
5505 + if (free <= overhead)
5506 + return 0;
5507 + free -= overhead;
5508 +	/* FIXME: flow->length is loff_t only to avoid overflow in case of an expanding truncate */
5509 + if (free < op->u.insert_flow.flow->length)
5510 + return free;
5511 + return (int)op->u.insert_flow.flow->length;
5512 +}
5513 +
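+/* Standalone sketch of the clamping done by what_can_fit_into_node()
+   above: usable payload is free space minus per-item overhead, never
+   negative, and never more than the remaining flow length (illustrative
+   types, not the reiser4 ones). */
+static int demo_what_fits(unsigned long free_space, unsigned long overhead,
+			  long long flow_length)
+{
+	if (free_space <= overhead)
+		return 0;
+	free_space -= overhead;
+	if ((long long)free_space < flow_length)
+		return (int)free_space;
+	return (int)flow_length;
+}
+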
5514 +/* in make_space_for_flow_insertion we need to check whether the whole flow
5515 +   fits into a node, or whether a minimal fraction of the flow does */
5516 +static int enough_space_for_whole_flow(carry_op * op)
5517 +{
5518 + return (unsigned)what_can_fit_into_node(op) ==
5519 + op->u.insert_flow.flow->length;
5520 +}
5521 +
5522 +#define MIN_FLOW_FRACTION 1
5523 +static int enough_space_for_min_flow_fraction(carry_op * op)
5524 +{
5525 + assert("vs-902", coord_is_after_rightmost(flow_insert_point(op)));
5526 +
5527 + return what_can_fit_into_node(op) >= MIN_FLOW_FRACTION;
5528 +}
5529 +
5530 +/* this returns 0 if the left neighbor was obtained successfully, everything
5531 +   up to and including the insertion point was shifted into it, and the left
5532 +   neighbor still has enough free space for a minimal fraction of the flow */
5533 +static int
5534 +make_space_by_shift_left(carry_op * op, carry_level * doing, carry_level * todo)
5535 +{
5536 + carry_node *left;
5537 + znode *orig;
5538 +
5539 + left = find_left_neighbor(op, doing);
5540 + if (unlikely(IS_ERR(left))) {
5541 + warning("vs-899",
5542 + "make_space_by_shift_left: "
5543 + "error accessing left neighbor: %li", PTR_ERR(left));
5544 + return 1;
5545 + }
5546 + if (left == NULL)
5547 + /* left neighbor either does not exist or is unformatted
5548 + node */
5549 + return 1;
5550 +
5551 + orig = flow_insert_point(op)->node;
5552 +	/* try to shift the content of node @orig, from its head up to and
5553 +	   including the insertion point, into the left neighbor */
5554 + carry_shift_data(LEFT_SIDE, flow_insert_point(op),
5555 + reiser4_carry_real(left), doing, todo,
5556 + 1 /* including insert point */);
5557 + if (reiser4_carry_real(left) != flow_insert_point(op)->node) {
5558 + /* insertion point did not move */
5559 + return 1;
5560 + }
5561 +
5562 + /* insertion point is set after last item in the node */
5563 + assert("vs-900", coord_is_after_rightmost(flow_insert_point(op)));
5564 +
5565 + if (!enough_space_for_min_flow_fraction(op)) {
5566 + /* insertion point node does not have enough free space to put
5567 + even minimal portion of flow into it, therefore, move
5568 + insertion point back to orig node (before first item) */
5569 + coord_init_before_first_item(flow_insert_point(op), orig);
5570 + return 1;
5571 + }
5572 +
5573 + /* part of flow is to be written to the end of node */
5574 + op->node = left;
5575 + return 0;
5576 +}
5577 +
5578 +/* this returns 0 if the right neighbor was obtained successfully, everything
5579 +   to the right of the insertion point was shifted to it, and the node got
5580 +   enough free space for a minimal fraction of the flow */
5581 +static int
5582 +make_space_by_shift_right(carry_op * op, carry_level * doing,
5583 + carry_level * todo)
5584 +{
5585 + carry_node *right;
5586 +
5587 + right = find_right_neighbor(op, doing);
5588 + if (unlikely(IS_ERR(right))) {
5589 + warning("nikita-1065", "shift_right_excluding_insert_point: "
5590 + "error accessing right neighbor: %li", PTR_ERR(right));
5591 + return 1;
5592 + }
5593 + if (right) {
5594 + /* shift everything possible on the right of but excluding
5595 + insertion coord into the right neighbor */
5596 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5597 + reiser4_carry_real(right), doing, todo,
5598 + 0 /* not including insert point */);
5599 + } else {
5600 + /* right neighbor either does not exist or is unformatted
5601 + node */
5602 + ;
5603 + }
5604 + if (coord_is_after_rightmost(flow_insert_point(op))) {
5605 + if (enough_space_for_min_flow_fraction(op)) {
5606 + /* part of flow is to be written to the end of node */
5607 + return 0;
5608 + }
5609 + }
5610 +
5611 + /* new node is to be added if insert point node did not get enough
5612 + space for whole flow */
5613 + return 1;
5614 +}
5615 +
5616 +/* this returns 0 when the insert coord is set at the node end and a fraction
5617 +   of the flow fits into that node */
5618 +static int
5619 +make_space_by_new_nodes(carry_op * op, carry_level * doing, carry_level * todo)
5620 +{
5621 + int result;
5622 + znode *node;
5623 + carry_node *new;
5624 +
5625 + node = flow_insert_point(op)->node;
5626 +
5627 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5628 + return RETERR(-E_NODE_FULL);
5629 + /* add new node after insert point node */
5630 + new = add_new_znode(node, op->node, doing, todo);
5631 + if (unlikely(IS_ERR(new))) {
5632 + return PTR_ERR(new);
5633 + }
5634 + result = lock_carry_node(doing, new);
5635 + zput(reiser4_carry_real(new));
5636 + if (unlikely(result)) {
5637 + return result;
5638 + }
5639 + op->u.insert_flow.new_nodes++;
5640 + if (!coord_is_after_rightmost(flow_insert_point(op))) {
5641 + carry_shift_data(RIGHT_SIDE, flow_insert_point(op),
5642 + reiser4_carry_real(new), doing, todo,
5643 + 0 /* not including insert point */);
5644 + assert("vs-901",
5645 + coord_is_after_rightmost(flow_insert_point(op)));
5646 +
5647 + if (enough_space_for_min_flow_fraction(op)) {
5648 + return 0;
5649 + }
5650 + if (op->u.insert_flow.new_nodes == CARRY_FLOW_NEW_NODES_LIMIT)
5651 + return RETERR(-E_NODE_FULL);
5652 +
5653 + /* add one more new node */
5654 + new = add_new_znode(node, op->node, doing, todo);
5655 + if (unlikely(IS_ERR(new))) {
5656 + return PTR_ERR(new);
5657 + }
5658 + result = lock_carry_node(doing, new);
5659 + zput(reiser4_carry_real(new));
5660 + if (unlikely(result)) {
5661 + return result;
5662 + }
5663 + op->u.insert_flow.new_nodes++;
5664 + }
5665 +
5666 + /* move insertion point to new node */
5667 + coord_init_before_first_item(flow_insert_point(op),
5668 + reiser4_carry_real(new));
5669 + op->node = new;
5670 + return 0;
5671 +}
5672 +
5673 +static int
5674 +make_space_for_flow_insertion(carry_op * op, carry_level * doing,
5675 + carry_level * todo)
5676 +{
5677 + __u32 flags = op->u.insert_flow.flags;
5678 +
5679 + if (enough_space_for_whole_flow(op)) {
5680 + /* whole flow fits into insert point node */
5681 + return 0;
5682 + }
5683 +
5684 + if (!(flags & COPI_DONT_SHIFT_LEFT)
5685 + && (make_space_by_shift_left(op, doing, todo) == 0)) {
5686 + /* insert point is shifted to left neighbor of original insert
5687 + point node and is set after last unit in that node. It has
5688 + enough space to fit at least minimal fraction of flow. */
5689 + return 0;
5690 + }
5691 +
5692 + if (enough_space_for_whole_flow(op)) {
5693 + /* whole flow fits into insert point node */
5694 + return 0;
5695 + }
5696 +
5697 + if (!(flags & COPI_DONT_SHIFT_RIGHT)
5698 + && (make_space_by_shift_right(op, doing, todo) == 0)) {
5699 + /* insert point is still set to the same node, but there is
5700 + nothing to the right of insert point. */
5701 + return 0;
5702 + }
5703 +
5704 + if (enough_space_for_whole_flow(op)) {
5705 + /* whole flow fits into insert point node */
5706 + return 0;
5707 + }
5708 +
5709 + return make_space_by_new_nodes(op, doing, todo);
5710 +}
5711 +
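+/* Sketch of the staged strategy above, with the COPI_DONT_SHIFT_* flag
+   checks elided: after every attempt, re-test whether the whole flow now
+   fits before escalating. The helpers return 0 on success, matching
+   make_space_by_shift_{left,right}() and make_space_by_new_nodes();
+   the function-pointer parameters are illustrative. */
+static int demo_flow_space(int (*whole_fits)(void), int (*try_left)(void),
+			   int (*try_right)(void), int (*new_nodes)(void))
+{
+	if (whole_fits() || try_left() == 0 || whole_fits() ||
+	    try_right() == 0 || whole_fits())
+		return 0;
+	return new_nodes();
+}
+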
5712 +/* implements COP_INSERT_FLOW operation */
5713 +static int
5714 +carry_insert_flow(carry_op * op, carry_level * doing, carry_level * todo)
5715 +{
5716 + int result;
5717 + flow_t *f;
5718 + coord_t *insert_point;
5719 + node_plugin *nplug;
5720 + carry_plugin_info info;
5721 + znode *orig_node;
5722 + lock_handle *orig_lh;
5723 +
5724 + f = op->u.insert_flow.flow;
5725 + result = 0;
5726 +
5727 + /* carry system needs this to work */
5728 + info.doing = doing;
5729 + info.todo = todo;
5730 +
5731 + orig_node = flow_insert_point(op)->node;
5732 + orig_lh = doing->tracked;
5733 +
5734 + while (f->length) {
5735 + result = make_space_for_flow_insertion(op, doing, todo);
5736 + if (result)
5737 + break;
5738 +
5739 + insert_point = flow_insert_point(op);
5740 + nplug = node_plugin_by_node(insert_point->node);
5741 +
5742 + /* compose item data for insertion/pasting */
5743 + flow_insert_data(op)->data = f->data;
5744 + flow_insert_data(op)->length = what_can_fit_into_node(op);
5745 +
5746 + if (can_paste(insert_point, &f->key, flow_insert_data(op))) {
5747 +			/* the insert point is set to an item of the file we are writing to, and we have to append to it */
5748 + assert("vs-903", insert_point->between == AFTER_UNIT);
5749 + nplug->change_item_size(insert_point,
5750 + flow_insert_data(op)->length);
5751 + flow_insert_data(op)->iplug->b.paste(insert_point,
5752 + flow_insert_data
5753 + (op), &info);
5754 + } else {
5755 + /* new item must be inserted */
5756 + pos_in_node_t new_pos;
5757 + flow_insert_data(op)->length += item_data_overhead(op);
5758 +
5759 + /* FIXME-VS: this is because node40_create_item changes
5760 + insert_point for obscure reasons */
5761 + switch (insert_point->between) {
5762 + case AFTER_ITEM:
5763 + new_pos = insert_point->item_pos + 1;
5764 + break;
5765 + case EMPTY_NODE:
5766 + new_pos = 0;
5767 + break;
5768 + case BEFORE_ITEM:
5769 + assert("vs-905", insert_point->item_pos == 0);
5770 + new_pos = 0;
5771 + break;
5772 + default:
5773 + impossible("vs-906",
5774 + "carry_insert_flow: invalid coord");
5775 + new_pos = 0;
5776 + break;
5777 + }
5778 +
5779 + nplug->create_item(insert_point, &f->key,
5780 + flow_insert_data(op), &info);
5781 + coord_set_item_pos(insert_point, new_pos);
5782 + }
5783 + coord_init_after_item_end(insert_point);
5784 + doing->restartable = 0;
5785 + znode_make_dirty(insert_point->node);
5786 +
5787 + move_flow_forward(f, (unsigned)flow_insert_data(op)->length);
5788 + }
5789 +
5790 + if (orig_node != flow_insert_point(op)->node) {
5791 + /* move lock to new insert point */
5792 + done_lh(orig_lh);
5793 + init_lh(orig_lh);
5794 + result =
5795 + longterm_lock_znode(orig_lh, flow_insert_point(op)->node,
5796 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
5797 + }
5798 +
5799 + return result;
5800 +}
5801 +
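+/* Standalone sketch of the slicing loop in carry_insert_flow() above:
+   repeatedly write whatever fits into the current node and advance the
+   flow; the real code makes space (shift or allocate) between
+   iterations. Illustrative types only. */
+static unsigned long demo_insert_flow(const char *data, unsigned long length,
+				      unsigned long node_capacity)
+{
+	unsigned long written = 0;
+
+	while (length > 0 && node_capacity > 0) {
+		unsigned long chunk =
+			length < node_capacity ? length : node_capacity;
+		/* ... paste @chunk bytes from @data at the insert point;
+		   carry would make more space here when the node fills ... */
+		data += chunk;
+		written += chunk;
+		length -= chunk;
+	}
+	return written;
+}
+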
5802 +/* implements COP_DELETE operation
5803 +
5804 +   Remove the pointer to @op -> u.delete.child from its parent.
5805 +
5806 +   This function also handles killing of the tree root if the last pointer
5807 +   was removed from it. This is complicated by our handling of the "twig"
5808 +   level: a root at the twig level is never killed.
5809 +
5810 +*/
5811 +static int carry_delete(carry_op * op /* operation to be performed */ ,
5812 +			carry_level * doing /* current carry
5813 +					     * level */ ,
5814 + carry_level * todo /* next carry level */ )
5815 +{
5816 + int result;
5817 + coord_t coord;
5818 + coord_t coord2;
5819 + znode *parent;
5820 + znode *child;
5821 + carry_plugin_info info;
5822 + reiser4_tree *tree;
5823 +
5824 + /*
5825 + * This operation is called to delete internal item pointing to the
5826 + * child node that was removed by carry from the tree on the previous
5827 + * tree level.
5828 + */
5829 +
5830 + assert("nikita-893", op != NULL);
5831 + assert("nikita-894", todo != NULL);
5832 + assert("nikita-895", op->op == COP_DELETE);
5833 +
5834 + coord_init_zero(&coord);
5835 + coord_init_zero(&coord2);
5836 +
5837 + parent = reiser4_carry_real(op->node);
5838 + child = op->u.delete.child ?
5839 + reiser4_carry_real(op->u.delete.child) : op->node->node;
5840 + tree = znode_get_tree(child);
5841 + read_lock_tree(tree);
5842 +
5843 + /*
5844 + * @parent was determined when carry entered parent level
5845 +	 * (lock_carry_level/lock_carry_node). Since then, the actual parent
5846 +	 * of the @child node could have changed due to other carry operations
5847 +	 * performed on the parent level. Check for this.
5848 + */
5849 +
5850 + if (znode_parent(child) != parent) {
5851 + /* NOTE-NIKITA add stat counter for this. */
5852 + parent = znode_parent(child);
5853 + assert("nikita-2581", find_carry_node(doing, parent));
5854 + }
5855 + read_unlock_tree(tree);
5856 +
5857 + assert("nikita-1213", znode_get_level(parent) > LEAF_LEVEL);
5858 +
5859 +	/* Twig level horrors: the tree should be of height at least 2. So,
5860 +	   the last pointer from the root at the twig level is preserved even
5861 +	   if the child is empty. This is ugly, but that is how it was architected.
5862 + */
5863 +
5864 + if (znode_is_root(parent) &&
5865 + znode_get_level(parent) <= REISER4_MIN_TREE_HEIGHT &&
5866 + node_num_items(parent) == 1) {
5867 + /* Delimiting key manipulations. */
5868 + write_lock_dk(tree);
5869 + znode_set_ld_key(child, znode_set_ld_key(parent, reiser4_min_key()));
5870 + znode_set_rd_key(child, znode_set_rd_key(parent, reiser4_max_key()));
5871 + ZF_SET(child, JNODE_DKSET);
5872 + write_unlock_dk(tree);
5873 +
5874 + /* @child escaped imminent death! */
5875 + ZF_CLR(child, JNODE_HEARD_BANSHEE);
5876 + return 0;
5877 + }
5878 +
5879 + /* convert child pointer to the coord_t */
5880 + result = find_child_ptr(parent, child, &coord);
5881 + if (result != NS_FOUND) {
5882 + warning("nikita-994", "Cannot find child pointer: %i", result);
5883 + print_coord_content("coord", &coord);
5884 + return result;
5885 + }
5886 +
5887 + coord_dup(&coord2, &coord);
5888 + info.doing = doing;
5889 + info.todo = todo;
5890 + {
5891 + /*
5892 + * Actually kill internal item: prepare structure with
5893 + * arguments for ->cut_and_kill() method...
5894 + */
5895 +
5896 + struct carry_kill_data kdata;
5897 + kdata.params.from = &coord;
5898 + kdata.params.to = &coord2;
5899 + kdata.params.from_key = NULL;
5900 + kdata.params.to_key = NULL;
5901 + kdata.params.smallest_removed = NULL;
5902 + kdata.params.truncate = 1;
5903 + kdata.flags = op->u.delete.flags;
5904 + kdata.inode = NULL;
5905 + kdata.left = NULL;
5906 + kdata.right = NULL;
5907 + kdata.buf = NULL;
5908 + /* ... and call it. */
5909 + result = node_plugin_by_node(parent)->cut_and_kill(&kdata,
5910 + &info);
5911 + }
5912 + doing->restartable = 0;
5913 +
5914 + /* check whether root should be killed violently */
5915 + if (znode_is_root(parent) &&
5916 + /* don't kill roots at and lower than twig level */
5917 + znode_get_level(parent) > REISER4_MIN_TREE_HEIGHT &&
5918 + node_num_items(parent) == 1) {
5919 + result = reiser4_kill_tree_root(coord.node);
5920 + }
5921 +
5922 + return result < 0 ? : 0;
5923 +}
5924 +
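+/* Note on "result < 0 ? : 0" above: this is the GNU C conditional with
+   the middle operand omitted ("x ? : y"), which yields x when x is
+   non-zero and y otherwise. Since x here is the comparison (result < 0),
+   the expression yields 1 for any negative result and 0 otherwise.
+   Standalone illustration: */
+static int demo_gnu_elvis(int result)
+{
+	/* equivalent to: (result < 0) ? (result < 0) : 0 */
+	return result < 0 ? : 0;
+}
+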
5925 +/* implements COP_CUT operation
5926 +
5927 +   Cuts part or all of the content of a node.
5928 +
5929 +*/
5930 +static int carry_cut(carry_op * op /* operation to be performed */ ,
5931 + carry_level * doing /* current carry level */ ,
5932 + carry_level * todo /* next carry level */ )
5933 +{
5934 + int result;
5935 + carry_plugin_info info;
5936 + node_plugin *nplug;
5937 +
5938 + assert("nikita-896", op != NULL);
5939 + assert("nikita-897", todo != NULL);
5940 + assert("nikita-898", op->op == COP_CUT);
5941 +
5942 + info.doing = doing;
5943 + info.todo = todo;
5944 +
5945 + nplug = node_plugin_by_node(reiser4_carry_real(op->node));
5946 + if (op->u.cut_or_kill.is_cut)
5947 + result = nplug->cut(op->u.cut_or_kill.u.cut, &info);
5948 + else
5949 + result = nplug->cut_and_kill(op->u.cut_or_kill.u.kill, &info);
5950 +
5951 + doing->restartable = 0;
5952 + return result < 0 ? : 0;
5953 +}
5954 +
5955 +/* helper function for carry_paste(): returns true if @op can be continued as
5956 + paste */
5957 +static int
5958 +can_paste(coord_t * icoord, const reiser4_key * key,
5959 + const reiser4_item_data * data)
5960 +{
5961 + coord_t circa;
5962 + item_plugin *new_iplug;
5963 + item_plugin *old_iplug;
5964 +	int result = 0;	/* to keep gcc quiet */
5965 +
5966 + assert("", icoord->between != AT_UNIT);
5967 +
5968 + /* obviously, one cannot paste when node is empty---there is nothing
5969 + to paste into. */
5970 + if (node_is_empty(icoord->node))
5971 + return 0;
5972 +	/* if the insertion point is in the middle of the item, then paste */
5973 + if (!coord_is_between_items(icoord))
5974 + return 1;
5975 + coord_dup(&circa, icoord);
5976 + circa.between = AT_UNIT;
5977 +
5978 + old_iplug = item_plugin_by_coord(&circa);
5979 + new_iplug = data->iplug;
5980 +
5981 + /* check whether we can paste to the item @icoord is "at" when we
5982 + ignore ->between field */
5983 + if (old_iplug == new_iplug && item_can_contain_key(&circa, key, data)) {
5984 + result = 1;
5985 + } else if (icoord->between == BEFORE_UNIT
5986 + || icoord->between == BEFORE_ITEM) {
5987 + /* otherwise, try to glue to the item at the left, if any */
5988 + coord_dup(&circa, icoord);
5989 + if (coord_set_to_left(&circa)) {
5990 + result = 0;
5991 + coord_init_before_item(icoord);
5992 + } else {
5993 + old_iplug = item_plugin_by_coord(&circa);
5994 + result = (old_iplug == new_iplug)
5995 + && item_can_contain_key(icoord, key, data);
5996 + if (result) {
5997 + coord_dup(icoord, &circa);
5998 + icoord->between = AFTER_UNIT;
5999 + }
6000 + }
6001 + } else if (icoord->between == AFTER_UNIT
6002 + || icoord->between == AFTER_ITEM) {
6003 + coord_dup(&circa, icoord);
6004 + /* otherwise, try to glue to the item at the right, if any */
6005 + if (coord_set_to_right(&circa)) {
6006 + result = 0;
6007 + coord_init_after_item(icoord);
6008 + } else {
6009 + int (*cck) (const coord_t *, const reiser4_key *,
6010 + const reiser4_item_data *);
6011 +
6012 + old_iplug = item_plugin_by_coord(&circa);
6013 +
6014 + cck = old_iplug->b.can_contain_key;
6015 + if (cck == NULL)
6016 + /* item doesn't define ->can_contain_key
6017 + method? So it is not expandable. */
6018 + result = 0;
6019 + else {
6020 + result = (old_iplug == new_iplug)
6021 + && cck(&circa /*icoord */ , key, data);
6022 + if (result) {
6023 + coord_dup(icoord, &circa);
6024 + icoord->between = BEFORE_UNIT;
6025 + }
6026 + }
6027 + }
6028 + } else
6029 + impossible("nikita-2513", "Nothing works");
6030 + if (result) {
6031 + if (icoord->between == BEFORE_ITEM) {
6032 + assert("vs-912", icoord->unit_pos == 0);
6033 + icoord->between = BEFORE_UNIT;
6034 + } else if (icoord->between == AFTER_ITEM) {
6035 + coord_init_after_item_end(icoord);
6036 + }
6037 + }
6038 + return result;
6039 +}
6040 +
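+/* A much-simplified sketch of the decision can_paste() makes above:
+   pasting is possible inside an existing item, or at an item boundary
+   when the neighboring item on the matching side has the same plugin
+   and can absorb the key. The parameters are illustrative stand-ins for
+   the coord and plugin checks in the real code. */
+static int demo_can_paste(int inside_item, int before_boundary,
+			  int left_same_plugin_and_absorbs,
+			  int right_same_plugin_and_absorbs)
+{
+	if (inside_item)
+		return 1;
+	return before_boundary ? left_same_plugin_and_absorbs
+			       : right_same_plugin_and_absorbs;
+}
+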
6041 +/* implements COP_PASTE operation
6042 +
6043 +   Paste data into an existing item. This is complicated by the fact that,
6044 +   after we have shifted something to the left or right neighbors trying to
6045 +   free some space, the item we were supposed to paste into may be in a
6046 +   different node than the insertion coord. If so, we are no longer doing a
6047 +   paste but an insert. See comments in insert_paste_common().
6048 +
6049 +*/
6050 +static int carry_paste(carry_op * op /* operation to be performed */ ,
6051 +		       carry_level * doing /* current carry
6052 +					    * level */ ,
6053 + carry_level * todo /* next carry level */ )
6054 +{
6055 + znode *node;
6056 + carry_insert_data cdata;
6057 + coord_t dcoord;
6058 + reiser4_item_data data;
6059 + int result;
6060 + int real_size;
6061 + item_plugin *iplug;
6062 + carry_plugin_info info;
6063 + coord_t *coord;
6064 +
6065 + assert("nikita-982", op != NULL);
6066 + assert("nikita-983", todo != NULL);
6067 + assert("nikita-984", op->op == COP_PASTE);
6068 +
6069 + coord_init_zero(&dcoord);
6070 +
6071 + result = insert_paste_common(op, doing, todo, &cdata, &dcoord, &data);
6072 + if (result != 0)
6073 + return result;
6074 +
6075 + coord = op->u.insert.d->coord;
6076 +
6077 +	/* handle the case when op -> u.insert.d->coord doesn't point to an
6078 +	   item of the required type: restart as insert. */
6079 + if (!can_paste(coord, op->u.insert.d->key, op->u.insert.d->data)) {
6080 + op->op = COP_INSERT;
6081 + op->u.insert.type = COPT_PASTE_RESTARTED;
6082 + result = op_dispatch_table[COP_INSERT].handler(op, doing, todo);
6083 +
6084 + return result;
6085 + }
6086 +
6087 + node = coord->node;
6088 + iplug = item_plugin_by_coord(coord);
6089 + assert("nikita-992", iplug != NULL);
6090 +
6091 + assert("nikita-985", node != NULL);
6092 + assert("nikita-986", node_plugin_by_node(node) != NULL);
6093 +
6094 + assert("nikita-987",
6095 + space_needed_for_op(node, op) <= znode_free_space(node));
6096 +
6097 + assert("nikita-1286", coord_is_existing_item(coord));
6098 +
6099 + /*
6100 +	 * if the item is expanded as a result of this operation, we should
6101 +	 * first change the item size, then call the ->b.paste item method. If
6102 +	 * the item is shrunk, it should be done the other way around: first
6103 +	 * call the ->b.paste method, then reduce the item size.
6104 + */
6105 +
6106 + real_size = space_needed_for_op(node, op);
6107 + if (real_size > 0)
6108 + node->nplug->change_item_size(coord, real_size);
6109 +
6110 + doing->restartable = 0;
6111 + info.doing = doing;
6112 + info.todo = todo;
6113 +
6114 + result = iplug->b.paste(coord, op->u.insert.d->data, &info);
6115 +
6116 + if (real_size < 0)
6117 + node->nplug->change_item_size(coord, real_size);
6118 +
6119 + /* if we pasted at the beginning of the item, update item's key. */
6120 + if (coord->unit_pos == 0 && coord->between != AFTER_UNIT)
6121 + node->nplug->update_item_key(coord, op->u.insert.d->key, &info);
6122 +
6123 + znode_make_dirty(node);
6124 + return result;
6125 +}
6126 +
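+/* Sketch of the restart-as-insert pattern used above: carry handlers are
+   reached through a table indexed by opcode, so morphing a paste into an
+   insert is just changing op->op and re-dispatching. Everything below is
+   illustrative, not the real op_dispatch_table. */
+enum demo_opcode { DEMO_INSERT, DEMO_PASTE };
+struct demo_op { enum demo_opcode op; int arg; };
+static int demo_do_insert(struct demo_op *op) { return op->arg; }
+static int demo_do_paste(struct demo_op *op)
+{
+	/* cannot paste: morph into insert and re-dispatch (the real code
+	   re-enters through op_dispatch_table[COP_INSERT].handler) */
+	op->op = DEMO_INSERT;
+	return demo_do_insert(op);
+}
+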
6127 +/* handle carry COP_EXTENT operation. */
6128 +static int carry_extent(carry_op * op /* operation to perform */ ,
6129 + carry_level * doing /* queue of operations @op
6130 + * is part of */ ,
6131 + carry_level * todo /* queue where new operations
6132 + * are accumulated */ )
6133 +{
6134 + znode *node;
6135 + carry_insert_data cdata;
6136 + coord_t coord;
6137 + reiser4_item_data data;
6138 + carry_op *delete_dummy;
6139 + carry_op *insert_extent;
6140 + int result;
6141 + carry_plugin_info info;
6142 +
6143 + assert("nikita-1751", op != NULL);
6144 + assert("nikita-1752", todo != NULL);
6145 + assert("nikita-1753", op->op == COP_EXTENT);
6146 +
6147 + /* extent insertion overview:
6148 +
6149 +	   extents live on the TWIG LEVEL, which is one level above the leaf
6150 +	   one. This complicates extent insertion logic somewhat: it may
6151 +	   happen (and is going to happen all the time) that in logical key
6152 +	   ordering an extent has to be placed between items I1 and I2,
6153 +	   located at the leaf level, but I1 and I2 are in the same formatted
6154 +	   leaf node N1. To insert an extent one has to
6155 +
6156 +	   (1) reach node N1 and shift data between N1, its neighbors and
6157 +	   possibly newly allocated nodes until I1 and I2 fall into different
6158 +	   nodes. Since I1 and I2 are still neighboring items in logical key
6159 +	   order, they will necessarily be the utmost items in their
6160 +	   respective nodes.
6161 +
6162 + (2) After this new extent item is inserted into node on the twig
6163 + level.
6164 +
6165 +	   Fortunately this process can reuse almost all code from the standard
6166 +	   insertion procedure (viz. make_space() and insert_paste_common()),
6167 +	   due to the following observation: make_space() only shifts data up
6168 +	   to, and excluding or including, the insertion point. It never
6169 +	   "over-moves" through the insertion point. Thus, one can use
6170 +	   make_space() to perform step (1). All that is required is to
6171 +	   instruct free_space_shortage() to keep make_space() shifting data
6172 +	   until the insertion point is at the node border.
6173 +
6174 + */
6175 +
6176 + /* perform common functionality of insert and paste. */
6177 + result = insert_paste_common(op, doing, todo, &cdata, &coord, &data);
6178 + if (result != 0)
6179 + return result;
6180 +
6181 + node = op->u.extent.d->coord->node;
6182 + assert("nikita-1754", node != NULL);
6183 + assert("nikita-1755", node_plugin_by_node(node) != NULL);
6184 + assert("nikita-1700", coord_wrt(op->u.extent.d->coord) != COORD_INSIDE);
6185 +
6186 + /* NOTE-NIKITA add some checks here. Not assertions, -EIO. Check that
6187 + extent fits between items. */
6188 +
6189 + info.doing = doing;
6190 + info.todo = todo;
6191 +
6192 +	/* there is another complication due to the placement of extents on
6193 +	   the twig level: extents are "rigid" in the sense that the key range
6194 +	   occupied by an extent cannot grow indefinitely to the right as it
6195 +	   can for formatted leaf nodes. Because of this, when search finds
6196 +	   two adjacent extents on the twig level, it has to "drill" to the
6197 +	   leaf level, creating a new node. Here we remove this node.
6198 + */
6199 + if (node_is_empty(node)) {
6200 + delete_dummy = node_post_carry(&info, COP_DELETE, node, 1);
6201 + if (IS_ERR(delete_dummy))
6202 + return PTR_ERR(delete_dummy);
6203 + delete_dummy->u.delete.child = NULL;
6204 + delete_dummy->u.delete.flags = DELETE_RETAIN_EMPTY;
6205 + ZF_SET(node, JNODE_HEARD_BANSHEE);
6206 + }
6207 +
6208 + /* proceed with inserting extent item into parent. We are definitely
6209 + inserting rather than pasting if we get that far. */
6210 + insert_extent = node_post_carry(&info, COP_INSERT, node, 1);
6211 + if (IS_ERR(insert_extent))
6212 +		/* @delete_dummy will be automatically destroyed when the
6213 +		   level is exited */
6214 + return PTR_ERR(insert_extent);
6215 + /* NOTE-NIKITA insertion by key is simplest option here. Another
6216 + possibility is to insert on the left or right of already existing
6217 + item.
6218 + */
6219 + insert_extent->u.insert.type = COPT_KEY;
6220 + insert_extent->u.insert.d = op->u.extent.d;
6221 + assert("nikita-1719", op->u.extent.d->key != NULL);
6222 + insert_extent->u.insert.d->data->arg = op->u.extent.d->coord;
6223 + insert_extent->u.insert.flags =
6224 + znode_get_tree(node)->carry.new_extent_flags;
6225 +
6226 + /*
6227 + * if carry was asked to track lock handle we should actually track
6228 + * lock handle on the twig node rather than on the leaf where
6229 + * operation was started from. Transfer tracked lock handle.
6230 + */
6231 + if (doing->track_type) {
6232 + assert("nikita-3242", doing->tracked != NULL);
6233 + assert("nikita-3244", todo->tracked == NULL);
6234 + todo->tracked = doing->tracked;
6235 + todo->track_type = CARRY_TRACK_NODE;
6236 + doing->tracked = NULL;
6237 + doing->track_type = 0;
6238 + }
6239 +
6240 + return 0;
6241 +}
6242 +
6243 +/* update key in @parent between pointers to @left and @right.
6244 +
6245 +   Find coords of @left and @right and update the delimiting key between
6246 +   them. This is a helper function called by carry_update(). It finds the
6247 +   position of the internal item involved, updates the item key, and
6248 +   updates the delimiting keys of the child nodes involved.
6249 +*/
6250 +static int update_delimiting_key(znode * parent /* node key is updated
6251 + * in */ ,
6252 + znode * left /* child of @parent */ ,
6253 + znode * right /* child of @parent */ ,
6254 + carry_level * doing /* current carry
6255 + * level */ ,
6256 + carry_level * todo /* parent carry
6257 + * level */ ,
6258 + const char **error_msg /* place to
6259 + * store error
6260 + * message */ )
6261 +{
6262 + coord_t left_pos;
6263 + coord_t right_pos;
6264 + int result;
6265 + reiser4_key ldkey;
6266 + carry_plugin_info info;
6267 +
6268 + assert("nikita-1177", right != NULL);
6269 +	/* find the position of the right child in the parent */
6270 + result = find_child_ptr(parent, right, &right_pos);
6271 + if (result != NS_FOUND) {
6272 + *error_msg = "Cannot find position of right child";
6273 + return result;
6274 + }
6275 +
6276 + if ((left != NULL) && !coord_is_leftmost_unit(&right_pos)) {
6277 + /* find position of the left child in a parent */
6278 + result = find_child_ptr(parent, left, &left_pos);
6279 + if (result != NS_FOUND) {
6280 + *error_msg = "Cannot find position of left child";
6281 + return result;
6282 + }
6283 + assert("nikita-1355", left_pos.node != NULL);
6284 + } else
6285 + left_pos.node = NULL;
6286 +
6287 + /* check that they are separated by exactly one key and are basically
6288 + sane */
6289 + if (REISER4_DEBUG) {
6290 + if ((left_pos.node != NULL)
6291 + && !coord_is_existing_unit(&left_pos)) {
6292 + *error_msg = "Left child is bastard";
6293 + return RETERR(-EIO);
6294 + }
6295 + if (!coord_is_existing_unit(&right_pos)) {
6296 + *error_msg = "Right child is bastard";
6297 + return RETERR(-EIO);
6298 + }
6299 + if (left_pos.node != NULL &&
6300 + !coord_are_neighbors(&left_pos, &right_pos)) {
6301 + *error_msg = "Children are not direct siblings";
6302 + return RETERR(-EIO);
6303 + }
6304 + }
6305 + *error_msg = NULL;
6306 +
6307 + info.doing = doing;
6308 + info.todo = todo;
6309 +
6310 + /*
6311 + * If child node is not empty, new key of internal item is a key of
6312 + * leftmost item in the child node. If the child is empty, take its
6313 + * right delimiting key as a new key of the internal item. Precise key
6314 + * in the latter case is not important per se, because the child (and
6315 + * the internal item) are going to be killed shortly anyway, but we
6316 + * have to preserve correct order of keys in the parent node.
6317 + */
6318 +
6319 + if (!ZF_ISSET(right, JNODE_HEARD_BANSHEE))
6320 + leftmost_key_in_node(right, &ldkey);
6321 + else {
6322 + read_lock_dk(znode_get_tree(parent));
6323 + ldkey = *znode_get_rd_key(right);
6324 + read_unlock_dk(znode_get_tree(parent));
6325 + }
6326 + node_plugin_by_node(parent)->update_item_key(&right_pos, &ldkey, &info);
6327 + doing->restartable = 0;
6328 + znode_make_dirty(parent);
6329 + return 0;
6330 +}
6331 +
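+/* Compact restatement of the key choice above (illustrative types): a
+   live child contributes its leftmost key, while a child that "heard the
+   banshee" contributes its right delimiting key, which only needs to keep
+   the parent's key order intact until the child is removed. */
+static int demo_choose_new_key(int child_is_dying, int leftmost_key,
+			       int right_delimiting_key)
+{
+	return child_is_dying ? right_delimiting_key : leftmost_key;
+}
+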
6332 +/* implements COP_UPDATE operation
6333 +
6334 + Update delimiting keys.
6335 +
6336 +*/
6337 +static int carry_update(carry_op * op /* operation to be performed */ ,
6338 + carry_level * doing /* current carry level */ ,
6339 + carry_level * todo /* next carry level */ )
6340 +{
6341 + int result;
6342 + carry_node *missing UNUSED_ARG;
6343 + znode *left;
6344 + znode *right;
6345 + carry_node *lchild;
6346 + carry_node *rchild;
6347 + const char *error_msg;
6348 + reiser4_tree *tree;
6349 +
6350 + /*
6351 +	 * This operation is called to update the key of an internal item.
6352 +	 * This is necessary when carry shifted or cut data on the child
6353 +	 * level. Arguments of this operation are:
6354 + *
6355 + * @right --- child node. Operation should update key of internal
6356 + * item pointing to @right.
6357 + *
6358 + * @left --- left neighbor of @right. This parameter is optional.
6359 + */
6360 +
6361 + assert("nikita-902", op != NULL);
6362 + assert("nikita-903", todo != NULL);
6363 + assert("nikita-904", op->op == COP_UPDATE);
6364 +
6365 + lchild = op->u.update.left;
6366 + rchild = op->node;
6367 +
6368 + if (lchild != NULL) {
6369 + assert("nikita-1001", lchild->parent);
6370 + assert("nikita-1003", !lchild->left);
6371 + left = reiser4_carry_real(lchild);
6372 + } else
6373 + left = NULL;
6374 +
6375 + tree = znode_get_tree(rchild->node);
6376 + read_lock_tree(tree);
6377 + right = znode_parent(rchild->node);
6378 + read_unlock_tree(tree);
6379 +
6380 + if (right != NULL) {
6381 + result = update_delimiting_key(right,
6382 + lchild ? lchild->node : NULL,
6383 + rchild->node,
6384 + doing, todo, &error_msg);
6385 + } else {
6386 + error_msg = "Cannot find node to update key in";
6387 + result = RETERR(-EIO);
6388 + }
6389 + /* operation will be reposted to the next level by the
6390 + ->update_item_key() method of node plugin, if necessary. */
6391 +
6392 + if (result != 0) {
6393 + warning("nikita-999", "Error updating delimiting key: %s (%i)",
6394 + error_msg ? : "", result);
6395 + }
6396 + return result;
6397 +}
6398 +
6399 +/* move items from @node during carry */
6400 +static int carry_shift_data(sideof side /* in what direction to move data */ ,
6401 + coord_t * insert_coord /* coord where new item
6402 + * is to be inserted */ ,
6403 + znode * node /* node which data are moved from */ ,
6404 + carry_level * doing /* active carry queue */ ,
6405 + carry_level * todo /* carry queue where new
6406 + * operations are to be put
6407 + * in */ ,
6408 +			    unsigned int including_insert_coord_p /* true if
6409 +							    * @insert_coord
6410 +							    * can be moved */ )
6411 +{
6412 + int result;
6413 + znode *source;
6414 + carry_plugin_info info;
6415 + node_plugin *nplug;
6416 +
6417 + source = insert_coord->node;
6418 +
6419 + info.doing = doing;
6420 + info.todo = todo;
6421 +
6422 + nplug = node_plugin_by_node(node);
6423 + result = nplug->shift(insert_coord, node,
6424 + (side == LEFT_SIDE) ? SHIFT_LEFT : SHIFT_RIGHT, 0,
6425 + (int)including_insert_coord_p, &info);
6426 +	/* the only error the ->shift() method of the node plugin can return
6427 +	   is -ENOMEM, due to carry node/operation allocation. */
6428 + assert("nikita-915", result >= 0 || result == -ENOMEM);
6429 + if (result > 0) {
6430 + /*
6431 + * if some number of bytes was actually shifted, mark nodes
6432 + * dirty, and carry level as non-restartable.
6433 + */
6434 + doing->restartable = 0;
6435 + znode_make_dirty(source);
6436 + znode_make_dirty(node);
6437 + }
6438 +
6439 + assert("nikita-2077", coord_check(insert_coord));
6440 + return 0;
6441 +}
6442 +
6443 +typedef carry_node *(*carry_iterator) (carry_node * node);
6444 +static carry_node *find_dir_carry(carry_node * node, carry_level * level,
6445 + carry_iterator iterator);
6446 +
6447 +static carry_node *pool_level_list_prev(carry_node *node)
6448 +{
6449 + return list_entry(node->header.level_linkage.prev, carry_node, header.level_linkage);
6450 +}
6451 +
6452 +/* look for the left neighbor of a given carry node in a carry queue.
6453 +
6454 + This is used by find_left_neighbor(), but I am not sure that this
6455 + really gives any advantage. More statistics required.
6456 +
6457 +*/
6458 +carry_node *find_left_carry(carry_node * node /* node to find left neighbor
6459 + * of */ ,
6460 + carry_level * level /* level to scan */ )
6461 +{
6462 + return find_dir_carry(node, level,
6463 + (carry_iterator) pool_level_list_prev);
6464 +}
6465 +
6466 +static carry_node *pool_level_list_next(carry_node *node)
6467 +{
6468 + return list_entry(node->header.level_linkage.next, carry_node, header.level_linkage);
6469 +}
6470 +
6471 +/* look for the right neighbor of a given carry node in a
6472 + carry queue.
6473 +
6474 + This is used by find_right_neighbor(), but I am not sure that this
6475 + really gives any advantage. More statistics required.
6476 +
6477 +*/
6478 +carry_node *find_right_carry(carry_node * node /* node to find right neighbor
6479 + * of */ ,
6480 + carry_level * level /* level to scan */ )
6481 +{
6482 + return find_dir_carry(node, level,
6483 + (carry_iterator) pool_level_list_next);
6484 +}
6485 +
6486 +/* look for the left or right neighbor of a given carry node in a carry
6487 + queue.
6488 +
6489 + Helper function used by find_{left|right}_carry().
6490 +*/
6491 +static carry_node *find_dir_carry(carry_node * node /* node to start scanning
6492 + * from */ ,
6493 + carry_level * level /* level to scan */ ,
6494 + carry_iterator iterator /* operation to
6495 + * move to the next
6496 + * node */ )
6497 +{
6498 + carry_node *neighbor;
6499 +
6500 + assert("nikita-1059", node != NULL);
6501 + assert("nikita-1060", level != NULL);
6502 +
6503 +	/* scan the list of carry nodes on this level dir-ward, skipping all
6504 +	   carry nodes referencing the same znode. */
6505 + neighbor = node;
6506 + while (1) {
6507 + neighbor = iterator(neighbor);
6508 + if (carry_node_end(level, neighbor))
6509 + /* list head is reached */
6510 + return NULL;
6511 + if (reiser4_carry_real(neighbor) != reiser4_carry_real(node))
6512 + return neighbor;
6513 + }
6514 +}
6515 +
6516 +/*
6517 + * Memory reservation estimation.
6518 + *
6519 + * Carry process proceeds through tree levels upwards. Carry assumes that it
6520 + * takes tree in consistent state (e.g., that search tree invariants hold),
6521 + * and leaves tree consistent after it finishes. This means that when some
6522 + * error occurs carry cannot simply return if there are pending carry
6523 + * operations. Generic solution for this problem is carry-undo either as
6524 + * transaction manager feature (requiring checkpoints and isolation), or
6525 + * through some carry specific mechanism.
6526 + *
6527 + * Our current approach is to panic if carry hits an error while tree is
6528 + * inconsistent. Unfortunately -ENOMEM can easily be triggered. To work around
6529 + * this "memory reservation" mechanism was added.
6530 + *
6531 + * Memory reservation is implemented by perthread-pages.diff patch from
6532 + * core-patches. Its API is defined in <linux/gfp.h>
6533 + *
6534 + * int perthread_pages_reserve(int nrpages, gfp_t gfp);
6535 + * void perthread_pages_release(int nrpages);
6536 + * int perthread_pages_count(void);
6537 + *
6538 + * carry estimates its worst case memory requirements on entry, reserves
6539 + * enough memory, and releases unused pages before returning.
6540 + *
6541 + * Code below estimates worst case memory requirements for a given carry
6542 + * queue. This is done by summing worst case memory requirements for each
6543 + * operation in the queue.
6544 + *
6545 + */
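+
+/*
+ * Illustrative sketch (editor's addition, not part of the original
+ * patch): the reserve/work/release pairing described above, written
+ * against the perthread-pages API quoted in this comment.
+ * carry_with_reservation() and carry_estimate_space() are hypothetical
+ * names, and the call into the balancing code is assumed to be
+ * reiser4_carry().
+ */
+#if 0
+static int carry_with_reservation(carry_level * doing, carry_level * todo)
+{
+	int nrpages = carry_estimate_space(doing);	/* worst case */
+	int result;
+
+	result = perthread_pages_reserve(nrpages, reiser4_ctx_gfp_mask_get());
+	if (result != 0)
+		return result;	/* fail before the tree is touched */
+	result = reiser4_carry(doing, todo);
+	/* give back whatever was reserved but not consumed */
+	perthread_pages_release(perthread_pages_count());
+	return result;
+}
+#endif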
6546 +
6547 +/*
6548 + * Memory requirements of many operations depend on the tree
6549 + * height. For example, item insertion requires a new node to be inserted at
6550 + * each tree level in the worst case. What tree height should be used for
6551 + * estimation? Current tree height is wrong, because tree height can change
6552 + * between the time when estimation was done and the time when operation is
6553 + * actually performed. Maximal possible tree height (REISER4_MAX_ZTREE_HEIGHT)
6554 + * is also not desirable, because it would lead to huge over-estimation
6555 + * all the time. A plausible solution is "capped tree height": if the current
6556 + * tree height is less than some TREE_HEIGHT_CAP constant, the capped tree
6557 + * height is TREE_HEIGHT_CAP, otherwise it's the current tree height. The idea
6558 + * behind this is that if the tree height is TREE_HEIGHT_CAP or larger, it's
6559 + * extremely unlikely to be increased even more during a short interval of time.
6560 + */
6561 +#define TREE_HEIGHT_CAP (5)
6562 +
6563 +/* return capped tree height for the @tree. See comment above. */
6564 +static int cap_tree_height(reiser4_tree * tree)
6565 +{
6566 + return max_t(int, tree->height, TREE_HEIGHT_CAP);
6567 +}
6568 +
6569 +/* return capped tree height for the current tree. */
6570 +static int capped_height(void)
6571 +{
6572 + return cap_tree_height(current_tree);
6573 +}
6574 +
6575 +/* return number of pages required to store given number of bytes */
6576 +static int bytes_to_pages(int bytes)
6577 +{
6578 + return (bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
6579 +}
6580 +
6581 +/* how many pages are required to allocate znodes during item insertion. */
6582 +static int carry_estimate_znodes(void)
6583 +{
6584 + /*
6585 +	 * Note that we have a problem here: there is no way to
6586 +	 * reserve pages specifically for a given slab. This means that
6587 +	 * these pages can be hijacked for some other purpose.
6588 + */
6589 +
6590 +	/* in the worst case we need 3 new znodes on each tree level */
6591 + return bytes_to_pages(capped_height() * sizeof(znode) * 3);
6592 +}
6593 +
6594 +/*
6595 + * how many pages are required to load bitmaps. One bitmap per level.
6596 + */
6597 +static int carry_estimate_bitmaps(void)
6598 +{
6599 + if (reiser4_is_set(reiser4_get_current_sb(), REISER4_DONT_LOAD_BITMAP)) {
6600 + int bytes;
6601 +
6602 +		bytes = capped_height() * (0 +	/* bnode should be added, but it is private to
6603 +						 * bitmap.c, skip for now. */
6604 + 2 * sizeof(jnode)); /* working and commit jnodes */
6605 + return bytes_to_pages(bytes) + 2; /* and their contents */
6606 + } else
6607 + /* bitmaps were pre-loaded during mount */
6608 + return 0;
6609 +}
6610 +
6611 +/* worst case item insertion memory requirements */
6612 +static int carry_estimate_insert(carry_op * op, carry_level * level)
6613 +{
6614 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6615 + capped_height() + /* new block on each level */
6616 + 1 + /* and possibly extra new block at the leaf level */
6617 + 3; /* loading of leaves into memory */
6618 +}
6619 +
6620 +/* worst case item deletion memory requirements */
6621 +static int carry_estimate_delete(carry_op * op, carry_level * level)
6622 +{
6623 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6624 + 3; /* loading of leaves into memory */
6625 +}
6626 +
6627 +/* worst case tree cut memory requirements */
6628 +static int carry_estimate_cut(carry_op * op, carry_level * level)
6629 +{
6630 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6631 + 3; /* loading of leaves into memory */
6632 +}
6633 +
6634 +/* worst case memory requirements of pasting into item */
6635 +static int carry_estimate_paste(carry_op * op, carry_level * level)
6636 +{
6637 + return carry_estimate_bitmaps() + carry_estimate_znodes() + 1 + /* new atom */
6638 + capped_height() + /* new block on each level */
6639 + 1 + /* and possibly extra new block at the leaf level */
6640 + 3; /* loading of leaves into memory */
6641 +}
6642 +
6643 +/* worst case memory requirements of extent insertion */
6644 +static int carry_estimate_extent(carry_op * op, carry_level * level)
6645 +{
6646 + return carry_estimate_insert(op, level) + /* insert extent */
6647 + carry_estimate_delete(op, level); /* kill leaf */
6648 +}
6649 +
6650 +/* worst case memory requirements of key update */
6651 +static int carry_estimate_update(carry_op * op, carry_level * level)
6652 +{
6653 + return 0;
6654 +}
6655 +
6656 +/* worst case memory requirements of flow insertion */
6657 +static int carry_estimate_insert_flow(carry_op * op, carry_level * level)
6658 +{
6659 + int newnodes;
6660 +
6661 + newnodes = min(bytes_to_pages(op->u.insert_flow.flow->length),
6662 + CARRY_FLOW_NEW_NODES_LIMIT);
6663 + /*
6664 + * roughly estimate insert_flow as a sequence of insertions.
6665 + */
6666 + return newnodes * carry_estimate_insert(op, level);
6667 +}
6668 +
6669 +/* This is the dispatch table for carry operations. It can be trivially
6670 +   abstracted into a useful plugin: a tunable balancing policy is a good
6671 +   thing. */
6672 +carry_op_handler op_dispatch_table[COP_LAST_OP] = {
6673 +	[COP_INSERT] = {
6674 +		.handler = carry_insert,
6675 +		.estimate = carry_estimate_insert
6676 +	},
6677 +	[COP_DELETE] = {
6678 +		.handler = carry_delete,
6679 +		.estimate = carry_estimate_delete
6680 +	},
6681 +	[COP_CUT] = {
6682 +		.handler = carry_cut,
6683 +		.estimate = carry_estimate_cut
6684 +	},
6685 +	[COP_PASTE] = {
6686 +		.handler = carry_paste,
6687 +		.estimate = carry_estimate_paste
6688 +	},
6689 +	[COP_EXTENT] = {
6690 +		.handler = carry_extent,
6691 +		.estimate = carry_estimate_extent
6692 +	},
6693 +	[COP_UPDATE] = {
6694 +		.handler = carry_update,
6695 +		.estimate = carry_estimate_update
6696 +	},
6697 +	[COP_INSERT_FLOW] = {
6698 +		.handler = carry_insert_flow,
6699 +		.estimate = carry_estimate_insert_flow }
6700 +};
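+
+/*
+ * Illustrative sketch (editor's addition): the per-queue summation
+ * described in the "Memory reservation estimation" comment above, done
+ * through this table. for_each_carry_op() is a hypothetical iterator
+ * standing in for the real pool-list traversal of the ops in @level.
+ */
+#if 0
+static int carry_estimate_space(carry_level * level)
+{
+	carry_op *op;
+	int nrpages = 0;
+
+	for_each_carry_op(level, op)
+		nrpages += op_dispatch_table[op->op].estimate(op, level);
+	return nrpages;
+}
+#endif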
6701 +
6702 +/* Make Linus happy.
6703 + Local variables:
6704 + c-indentation-style: "K&R"
6705 + mode-name: "LC"
6706 + c-basic-offset: 8
6707 + tab-width: 8
6708 + fill-column: 120
6709 + scroll-step: 1
6710 + End:
6711 +*/
6712 diff --git a/fs/reiser4/carry_ops.h b/fs/reiser4/carry_ops.h
6713 new file mode 100644
6714 index 0000000..688ca8f
6715 --- /dev/null
6716 +++ b/fs/reiser4/carry_ops.h
6717 @@ -0,0 +1,42 @@
6718 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6719 +
6720 +/* implementation of carry operations. See carry_ops.c for details. */
6721 +
6722 +#if !defined( __CARRY_OPS_H__ )
6723 +#define __CARRY_OPS_H__
6724 +
6725 +#include "forward.h"
6726 +#include "znode.h"
6727 +#include "carry.h"
6728 +
6729 +/* carry operation handlers */
6730 +typedef struct carry_op_handler {
6731 + /* perform operation */
6732 + int (*handler) (carry_op * op, carry_level * doing, carry_level * todo);
6733 + /* estimate memory requirements for @op */
6734 + int (*estimate) (carry_op * op, carry_level * level);
6735 +} carry_op_handler;
6736 +
6737 +/* This is the dispatch table for carry operations. It can be trivially
6738 +   abstracted into a useful plugin: a tunable balancing policy is a good
6739 + thing. */
6740 +extern carry_op_handler op_dispatch_table[COP_LAST_OP];
6741 +
6742 +unsigned int space_needed(const znode * node, const coord_t * coord,
6743 + const reiser4_item_data * data, int inserting);
6744 +extern carry_node *find_left_carry(carry_node * node, carry_level * level);
6745 +extern carry_node *find_right_carry(carry_node * node, carry_level * level);
6746 +
6747 +/* __CARRY_OPS_H__ */
6748 +#endif
6749 +
6750 +/* Make Linus happy.
6751 + Local variables:
6752 + c-indentation-style: "K&R"
6753 + mode-name: "LC"
6754 + c-basic-offset: 8
6755 + tab-width: 8
6756 + fill-column: 120
6757 + scroll-step: 1
6758 + End:
6759 +*/
6760 diff --git a/fs/reiser4/context.c b/fs/reiser4/context.c
6761 new file mode 100644
6762 index 0000000..4b3137f
6763 --- /dev/null
6764 +++ b/fs/reiser4/context.c
6765 @@ -0,0 +1,288 @@
6766 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
6767 +
6768 +/* Manipulation of reiser4_context */
6769 +
6770 +/*
6771 + * global context used during a system call. A variable of this type is
6772 + * allocated on the stack at the beginning of the reiser4 part of the system
6773 + * call and a pointer to it is stored in current->fs_context. This allows us
6774 + * to avoid passing a pointer to the current transaction and current lock
6775 + * stack (both in one-to-one mapping with threads) all over the call chain.
6776 + *
6777 + * It's kind of like those global variables the prof used to tell you not to
6778 + * use in CS1, except thread-specific. ;-) Nikita, this was a good idea.
6779 + *
6780 + * In some situations it is desirable to have the ability to enter reiser4_context
6781 + * more than once for the same thread (nested contexts). For example, there
6782 + * are some functions that can be called either directly from VFS/VM or from
6783 + * already active reiser4 context (->writepage, for example).
6784 + *
6785 + * In such situations the "child" context acts like a dummy: all activity is
6786 + * actually performed in the top level context, and get_current_context()
6787 + * always returns top level context.
6788 + * Of course, reiser4_init_context()/reiser4_done_context() have to be properly
6789 + * nested anyway.
6790 + *
6791 + * Note that there is an important difference between the way reiser4 uses
6792 + * ->fs_context and the way other file systems use it. Other file systems
6793 + * (ext3 and reiserfs) use ->fs_context only for the duration of _transaction_
6794 + * (this is why ->fs_context was initially called ->journal_info). This means,
6795 + * that when ext3 or reiserfs finds that ->fs_context is not NULL on the entry
6796 + * to the file system, they assume that some transaction is already underway,
6797 + * and usually bail out, because starting nested transaction would most likely
6798 + * lead to the deadlock. This gives false positives with reiser4, because we
6799 + * set ->fs_context before starting transaction.
6800 + */
6801 +
6802 +#include "debug.h"
6803 +#include "super.h"
6804 +#include "context.h"
6805 +
6806 +#include <linux/writeback.h> /* balance_dirty_pages() */
6807 +#include <linux/hardirq.h>
6808 +
6809 +static void _reiser4_init_context(reiser4_context * context,
6810 + struct super_block *super)
6811 +{
6812 + memset(context, 0, sizeof(*context));
6813 +
6814 + context->super = super;
6815 + context->magic = context_magic;
6816 + context->outer = current->journal_info;
6817 + current->journal_info = (void *)context;
6818 + context->nr_children = 0;
6819 + context->gfp_mask = GFP_KERNEL;
6820 +
6821 + init_lock_stack(&context->stack);
6822 +
6823 + reiser4_txn_begin(context);
6824 +
6825 + /* initialize head of tap list */
6826 + INIT_LIST_HEAD(&context->taps);
6827 +#if REISER4_DEBUG
6828 + context->task = current;
6829 +#endif
6830 + grab_space_enable();
6831 +}
6832 +
6833 +/* initialize context and bind it to the current thread
6834 +
6835 +   This function should be called at the beginning of the reiser4 part of
6836 +   a syscall.
6837 +*/
6838 +reiser4_context * reiser4_init_context(struct super_block * super)
6839 +{
6840 + reiser4_context *context;
6841 +
6842 + assert("nikita-2662", !in_interrupt() && !in_irq());
6843 + assert("nikita-3357", super != NULL);
6844 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6845 +
6846 + context = get_current_context_check();
6847 + if (context && context->super == super) {
6848 + context = (reiser4_context *) current->journal_info;
6849 + context->nr_children++;
6850 + return context;
6851 + }
6852 +
6853 + context = kmalloc(sizeof(*context), GFP_KERNEL);
6854 + if (context == NULL)
6855 + return ERR_PTR(RETERR(-ENOMEM));
6856 +
6857 + _reiser4_init_context(context, super);
6858 + return context;
6859 +}
6860 +
6861 +/* this is used in scan_mgr, which is called with a spinlock held, and in
6862 +   the reiser4_fill_super magic */
6863 +void init_stack_context(reiser4_context *context, struct super_block *super)
6864 +{
6865 + assert("nikita-2662", !in_interrupt() && !in_irq());
6866 + assert("nikita-3357", super != NULL);
6867 + assert("nikita-3358", super->s_op == NULL || is_reiser4_super(super));
6868 + assert("vs-12", !is_in_reiser4_context());
6869 +
6870 + _reiser4_init_context(context, super);
6871 + context->on_stack = 1;
6872 + return;
6873 +}
6874 +
6875 +/* cast lock stack embedded into reiser4 context up to its container */
6876 +reiser4_context *get_context_by_lock_stack(lock_stack * owner)
6877 +{
6878 + return container_of(owner, reiser4_context, stack);
6879 +}
6880 +
6881 +/* true if there is already _any_ reiser4 context for the current thread */
6882 +int is_in_reiser4_context(void)
6883 +{
6884 + reiser4_context *ctx;
6885 +
6886 + ctx = current->journal_info;
6887 + return ctx != NULL && ((unsigned long)ctx->magic) == context_magic;
6888 +}
6889 +
6890 +/*
6891 + * call balance dirty pages for the current context.
6892 + *
6893 + * A file system is expected to call balance_dirty_pages_ratelimited() whenever
6894 + * it dirties a page. reiser4 does this for unformatted nodes (that is, during
6895 + * write---this covers the vast majority of all dirty traffic), but we cannot
6896 + * do this immediately when a formatted node is dirtied, because a long term
6897 + * lock is usually held at that time. To work around this, dirtying a formatted node
6898 + * simply increases ->nr_marked_dirty counter in the current reiser4
6899 + * context. When we are about to leave this context,
6900 + * balance_dirty_pages_ratelimited() is called, if necessary.
6901 + *
6902 + * This introduces another problem: sometimes we do not want to run
6903 + * balance_dirty_pages_ratelimited() when leaving a context, for example
6904 + * because some important lock (like ->i_mutex on the parent directory) is
6905 + * held. To achieve this, ->nobalance flag can be set in the current context.
6906 + */
6907 +static void balance_dirty_pages_at(reiser4_context *context)
6908 +{
6909 + reiser4_super_info_data *sbinfo = get_super_private(context->super);
6910 +
6911 + /*
6912 + * call balance_dirty_pages_ratelimited() to process formatted nodes
6913 +	 * dirtied during this system call. Do that only if we are not in mount,
6914 +	 * if nodes were dirtied in this context, and if we are neither in
6915 +	 * writepage (to avoid deadlock) nor in pdflush.
6916 + */
6917 + if (sbinfo != NULL && sbinfo->fake != NULL &&
6918 + context->nr_marked_dirty != 0 &&
6919 + !(current->flags & PF_MEMALLOC) &&
6920 + !current_is_pdflush())
6921 + balance_dirty_pages_ratelimited(sbinfo->fake->i_mapping);
6922 +}
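+
+/*
+ * Editor's note, an illustrative sketch: the ->nr_marked_dirty counter
+ * consulted above is maintained on the dirtying side (it counts
+ * non-trivial jnode_set_dirty() calls), conceptually
+ *
+ *	get_current_context()->nr_marked_dirty++;
+ *
+ * so balance_dirty_pages_at() runs only when this context actually
+ * dirtied something.
+ */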
6923 +
6924 +/* release resources associated with context.
6925 +
6926 + This function should be called at the end of "session" with reiser4,
6927 + typically just before leaving reiser4 driver back to VFS.
6928 +
6929 +   This is a good place to put some debugging consistency checks, e.g. that
6930 +   the thread released all locks, closed the transcrash, etc.
6931 +
6932 +*/
6933 +static void reiser4_done_context(reiser4_context * context /* context being released */ )
6934 +{
6935 + assert("nikita-860", context != NULL);
6936 + assert("nikita-859", context->magic == context_magic);
6937 + assert("vs-646", (reiser4_context *) current->journal_info == context);
6938 + assert("zam-686", !in_interrupt() && !in_irq());
6939 +
6940 + /* only do anything when leaving top-level reiser4 context. All nested
6941 + * contexts are just dummies. */
6942 + if (context->nr_children == 0) {
6943 + assert("jmacd-673", context->trans == NULL);
6944 + assert("jmacd-1002", lock_stack_isclean(&context->stack));
6945 + assert("nikita-1936", reiser4_no_counters_are_held());
6946 + assert("nikita-2626", list_empty_careful(reiser4_taps_list()));
6947 + assert("zam-1004", ergo(get_super_private(context->super),
6948 + get_super_private(context->super)->delete_mutex_owner !=
6949 + current));
6950 +
6951 + /* release all grabbed but as yet unused blocks */
6952 + if (context->grabbed_blocks != 0)
6953 + all_grabbed2free();
6954 +
6955 + /*
6956 + * synchronize against longterm_unlock_znode():
6957 + * wake_up_requestor() wakes up requestors without holding
6958 + * zlock (otherwise they will immediately bump into that lock
6959 + * after wake up on another CPU). To work around (rare)
6960 + * situation where requestor has been woken up asynchronously
6961 + * and managed to run until completion (and destroy its
6962 + * context and lock stack) before wake_up_requestor() called
6963 +		 * wake_up() on it, wake_up_requestor() synchronizes on the lock
6964 +		 * stack spin lock. It has actually been observed that the spin
6965 + * lock _was_ locked at this point, because
6966 + * wake_up_requestor() took interrupt.
6967 + */
6968 + spin_lock_stack(&context->stack);
6969 + spin_unlock_stack(&context->stack);
6970 +
6971 + assert("zam-684", context->nr_children == 0);
6972 + /* restore original ->fs_context value */
6973 + current->journal_info = context->outer;
6974 + if (context->on_stack == 0)
6975 + kfree(context);
6976 + } else {
6977 + context->nr_children--;
6978 +#if REISER4_DEBUG
6979 + assert("zam-685", context->nr_children >= 0);
6980 +#endif
6981 + }
6982 +}
6983 +
6984 +/*
6985 + * exit reiser4 context. Call balance_dirty_pages_at() if necessary. Close
6986 + * transaction. Call done_context() to do context related book-keeping.
6987 + */
6988 +void reiser4_exit_context(reiser4_context * context)
6989 +{
6990 + assert("nikita-3021", reiser4_schedulable());
6991 +
6992 + if (context->nr_children == 0) {
6993 + if (!context->nobalance) {
6994 + reiser4_txn_restart(context);
6995 + balance_dirty_pages_at(context);
6996 + }
6997 +
6998 + /* if filesystem is mounted with -o sync or -o dirsync - commit
6999 + transaction. FIXME: TXNH_DONT_COMMIT is used to avoid
7000 +		   committing on exit_context when an inode semaphore is held,
7001 +		   and to have ktxnmgrd do the commit instead for better
7002 +		   concurrent filesystem access. But when one mounts with -o
7003 +		   sync, one cares more about reliability than about
7004 +		   performance. So, for now we have this simple mount -o sync
7005 + support. */
7006 + if (context->super->s_flags & (MS_SYNCHRONOUS | MS_DIRSYNC)) {
7007 + txn_atom *atom;
7008 +
7009 + atom = get_current_atom_locked_nocheck();
7010 + if (atom) {
7011 + atom->flags |= ATOM_FORCE_COMMIT;
7012 + context->trans->flags &= ~TXNH_DONT_COMMIT;
7013 + spin_unlock_atom(atom);
7014 + }
7015 + }
7016 + reiser4_txn_end(context);
7017 + }
7018 + reiser4_done_context(context);
7019 +}
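+
+/*
+ * Illustrative sketch (editor's addition): the expected pairing of
+ * reiser4_init_context() and reiser4_exit_context(). A nested
+ * reiser4_init_context() for the same super block returns the top level
+ * context and only bumps ->nr_children, so exits nest naturally.
+ * example_entry_point() is a hypothetical caller.
+ */
+#if 0
+static int example_entry_point(struct super_block *super)
+{
+	reiser4_context *ctx;
+	int result;
+
+	ctx = reiser4_init_context(super);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+	result = 0;	/* ... reiser4 work, possibly re-entering reiser4 ... */
+	reiser4_exit_context(ctx);
+	return result;
+}
+#endif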
7020 +
7021 +void reiser4_ctx_gfp_mask_set(void)
7022 +{
7023 + reiser4_context *ctx;
7024 +
7025 + ctx = get_current_context();
7026 + if (ctx->entd == 0 &&
7027 + list_empty(&ctx->stack.locks) &&
7028 + ctx->trans->atom == NULL)
7029 + ctx->gfp_mask = GFP_KERNEL;
7030 + else
7031 + ctx->gfp_mask = GFP_NOFS;
7032 +}
7033 +
7034 +void reiser4_ctx_gfp_mask_force(gfp_t mask)
7035 +{
7036 + reiser4_context *ctx;
7037 + ctx = get_current_context();
7038 +
7039 + assert("edward-1454", ctx != NULL);
7040 +
7041 + ctx->gfp_mask = mask;
7042 +}
7043 +
7044 +/*
7045 + * Local variables:
7046 + * c-indentation-style: "K&R"
7047 + * mode-name: "LC"
7048 + * c-basic-offset: 8
7049 + * tab-width: 8
7050 + * fill-column: 120
7051 + * scroll-step: 1
7052 + * End:
7053 + */
7054 diff --git a/fs/reiser4/context.h b/fs/reiser4/context.h
7055 new file mode 100644
7056 index 0000000..da240a9
7057 --- /dev/null
7058 +++ b/fs/reiser4/context.h
7059 @@ -0,0 +1,228 @@
7060 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
7061 + * reiser4/README */
7062 +
7063 +/* Reiser4 context. See context.c for details. */
7064 +
7065 +#if !defined( __REISER4_CONTEXT_H__ )
7066 +#define __REISER4_CONTEXT_H__
7067 +
7068 +#include "forward.h"
7069 +#include "debug.h"
7070 +#include "dformat.h"
7071 +#include "tap.h"
7072 +#include "lock.h"
7073 +
7074 +#include <linux/types.h> /* for __u?? */
7075 +#include <linux/fs.h> /* for struct super_block */
7076 +#include <linux/spinlock.h>
7077 +#include <linux/sched.h> /* for struct task_struct */
7078 +
7079 +/* reiser4 per-thread context */
7080 +struct reiser4_context {
7081 + /* magic constant. For identification of reiser4 contexts. */
7082 + __u32 magic;
7083 +
7084 +	/* current lock stack. See lock.[ch]. This is where the list of all
7085 +	   locks taken by the current thread is kept. This is also used in
7086 +	   deadlock detection. */
7087 + lock_stack stack;
7088 +
7089 + /* current transcrash. */
7090 + txn_handle *trans;
7091 + /* transaction handle embedded into reiser4_context. ->trans points
7092 + * here by default. */
7093 + txn_handle trans_in_ctx;
7094 +
7095 + /* super block we are working with. To get the current tree
7096 + use &get_super_private (reiser4_get_current_sb ())->tree. */
7097 + struct super_block *super;
7098 +
7099 + /* parent fs activation */
7100 + struct fs_activation *outer;
7101 +
7102 + /* per-thread grabbed (for further allocation) blocks counter */
7103 + reiser4_block_nr grabbed_blocks;
7104 +
7105 + /* list of taps currently monitored. See tap.c */
7106 + struct list_head taps;
7107 +
7108 + /* grabbing space is enabled */
7109 + unsigned int grab_enabled:1;
7110 +	/* should be set when we write dirty nodes to disk, in jnode_flush or
7111 +	 * reiser4_write_logs() */
7112 + unsigned int writeout_mode:1;
7113 + /* true, if current thread is an ent thread */
7114 + unsigned int entd:1;
7115 + /* true, if balance_dirty_pages() should not be run when leaving this
7116 +	 * context. This is used to avoid a lengthy balance_dirty_pages()
7117 + * operation when holding some important resource, like directory
7118 + * ->i_mutex */
7119 + unsigned int nobalance:1;
7120 +
7121 +	/* this bit is used in reiser4_done_context to decide whether the
7122 +	   context was kmalloc-ed and has to be kfree-ed */
7123 + unsigned int on_stack:1;
7124 +
7125 + /* count non-trivial jnode_set_dirty() calls */
7126 + unsigned long nr_marked_dirty;
7127 +
7128 + /* reiser4_sync_inodes calls (via generic_sync_sb_inodes)
7129 +	 * reiser4_writepages for each dirty inode. reiser4_writepages
7130 +	 * captures pages. When the number of pages captured in one
7131 +	 * reiser4_sync_inodes call reaches some threshold, some atoms get
7132 +	 * flushed */
7133 + int nr_captured;
7134 + int nr_children; /* number of child contexts */
7135 +#if REISER4_DEBUG
7136 + /* debugging information about reiser4 locks held by the current
7137 + * thread */
7138 + reiser4_lock_counters_info locks;
7139 + struct task_struct *task; /* so we can easily find owner of the stack */
7140 +
7141 + /*
7142 + * disk space grabbing debugging support
7143 + */
7144 + /* how many disk blocks were grabbed by the first call to
7145 + * reiser4_grab_space() in this context */
7146 + reiser4_block_nr grabbed_initially;
7147 +
7148 + /* list of all threads doing flush currently */
7149 + struct list_head flushers_link;
7150 + /* information about last error encountered by reiser4 */
7151 + err_site err;
7152 +#endif
7153 + void *vp;
7154 + gfp_t gfp_mask;
7155 +};
7156 +
7157 +extern reiser4_context *get_context_by_lock_stack(lock_stack *);
7158 +
7159 +/* Debugging helpers. */
7160 +#if REISER4_DEBUG
7161 +extern void print_contexts(void);
7162 +#endif
7163 +
7164 +#define current_tree (&(get_super_private(reiser4_get_current_sb())->tree))
7165 +#define current_blocksize reiser4_get_current_sb()->s_blocksize
7166 +#define current_blocksize_bits reiser4_get_current_sb()->s_blocksize_bits
7167 +
7168 +extern reiser4_context *reiser4_init_context(struct super_block *);
7169 +extern void init_stack_context(reiser4_context *, struct super_block *);
7170 +extern void reiser4_exit_context(reiser4_context *);
7171 +
7172 +/* magic constant we store in reiser4_context allocated on the stack. Used to
7173 +   catch accesses to stale or uninitialized contexts. */
7174 +#define context_magic ((__u32) 0x4b1b5d0b)
7175 +
7176 +extern int is_in_reiser4_context(void);
7177 +
7178 +/*
7179 + * return reiser4_context for the thread @tsk
7180 + */
7181 +static inline reiser4_context *get_context(const struct task_struct *tsk)
7182 +{
7183 + assert("vs-1682",
7184 + ((reiser4_context *) tsk->journal_info)->magic == context_magic);
7185 + return (reiser4_context *) tsk->journal_info;
7186 +}
7187 +
7188 +/*
7189 + * return reiser4 context of the current thread, or NULL if there is none.
7190 + */
7191 +static inline reiser4_context *get_current_context_check(void)
7192 +{
7193 + if (is_in_reiser4_context())
7194 + return get_context(current);
7195 + else
7196 + return NULL;
7197 +}
7198 +
7199 +static inline reiser4_context *get_current_context(void); /* __attribute__((const)); */
7200 +
7201 +/* return context associated with current thread */
7202 +static inline reiser4_context *get_current_context(void)
7203 +{
7204 + return get_context(current);
7205 +}
7206 +
7207 +static inline gfp_t reiser4_ctx_gfp_mask_get(void)
7208 +{
7209 + reiser4_context *ctx;
7210 +
7211 + ctx = get_current_context_check();
7212 + return (ctx == NULL) ? GFP_KERNEL : ctx->gfp_mask;
7213 +}
7214 +
7215 +void reiser4_ctx_gfp_mask_set(void);
7216 +void reiser4_ctx_gfp_mask_force(gfp_t mask);
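+
+/* Editor's note, an illustrative sketch: allocations inside reiser4 are
+   expected to honour the per-context mask, e.g.
+
+	data = kmalloc(size, reiser4_ctx_gfp_mask_get());
+
+   so a thread that holds an atom or long-term locks automatically falls
+   back to GFP_NOFS. */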
7217 +
7218 +/*
7219 + * true if current thread is in the write-out mode. Thread enters write-out
7220 + * mode during jnode_flush and reiser4_write_logs().
7221 + */
7222 +static inline int is_writeout_mode(void)
7223 +{
7224 + return get_current_context()->writeout_mode;
7225 +}
7226 +
7227 +/*
7228 + * enter write-out mode
7229 + */
7230 +static inline void writeout_mode_enable(void)
7231 +{
7232 + assert("zam-941", !get_current_context()->writeout_mode);
7233 + get_current_context()->writeout_mode = 1;
7234 +}
7235 +
7236 +/*
7237 + * leave write-out mode
7238 + */
7239 +static inline void writeout_mode_disable(void)
7240 +{
7241 + assert("zam-942", get_current_context()->writeout_mode);
7242 + get_current_context()->writeout_mode = 0;
7243 +}
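+
+/* Editor's note, an illustrative sketch: the pair above is meant to
+   bracket a write-out section, as jnode_flush() and reiser4_write_logs()
+   do:
+
+	writeout_mode_enable();
+	... submit dirty nodes ...
+	writeout_mode_disable();
+*/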
7244 +
7245 +static inline void grab_space_enable(void)
7246 +{
7247 + get_current_context()->grab_enabled = 1;
7248 +}
7249 +
7250 +static inline void grab_space_disable(void)
7251 +{
7252 + get_current_context()->grab_enabled = 0;
7253 +}
7254 +
7255 +static inline void grab_space_set_enabled(int enabled)
7256 +{
7257 + get_current_context()->grab_enabled = enabled;
7258 +}
7259 +
7260 +static inline int is_grab_enabled(reiser4_context * ctx)
7261 +{
7262 + return ctx->grab_enabled;
7263 +}
7264 +
7265 +/* mark the transaction handle in @context as TXNH_DONT_COMMIT, so that no
7266 + * commit or flush would be performed when it is closed. This is necessary
7267 + * when the handle has to be closed under some coarse semaphore, like the
7268 + * i_mutex of a directory. Commit will be performed by ktxnmgrd. */
7269 +static inline void context_set_commit_async(reiser4_context * context)
7270 +{
7271 + context->nobalance = 1;
7272 + context->trans->flags |= TXNH_DONT_COMMIT;
7273 +}
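+
+/* Editor's note, an illustrative sketch of the protocol above, as a
+   caller holding a directory's ->i_mutex might use it:
+
+	context_set_commit_async(ctx);	(no commit, no balancing on exit)
+	reiser4_exit_context(ctx);	(ktxnmgrd commits the atom later)
+*/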
7274 +
7275 +/* __REISER4_CONTEXT_H__ */
7276 +#endif
7277 +
7278 +/* Make Linus happy.
7279 + Local variables:
7280 + c-indentation-style: "K&R"
7281 + mode-name: "LC"
7282 + c-basic-offset: 8
7283 + tab-width: 8
7284 + fill-column: 120
7285 + scroll-step: 1
7286 + End:
7287 +*/
7288 diff --git a/fs/reiser4/coord.c b/fs/reiser4/coord.c
7289 new file mode 100644
7290 index 0000000..d171786
7291 --- /dev/null
7292 +++ b/fs/reiser4/coord.c
7293 @@ -0,0 +1,935 @@
7294 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
7295 +
7296 +#include "forward.h"
7297 +#include "debug.h"
7298 +#include "dformat.h"
7299 +#include "tree.h"
7300 +#include "plugin/item/item.h"
7301 +#include "znode.h"
7302 +#include "coord.h"
7303 +
7304 +/* Internal constructor. */
7305 +static inline void
7306 +coord_init_values(coord_t * coord, const znode * node, pos_in_node_t item_pos,
7307 + pos_in_node_t unit_pos, between_enum between)
7308 +{
7309 + coord->node = (znode *) node;
7310 + coord_set_item_pos(coord, item_pos);
7311 + coord->unit_pos = unit_pos;
7312 + coord->between = between;
7313 + ON_DEBUG(coord->plug_v = 0);
7314 + ON_DEBUG(coord->body_v = 0);
7315 +
7316 + /*ON_TRACE (TRACE_COORDS, "init coord %p node %p: %u %u %s\n", coord, node, item_pos, unit_pos, coord_tween_tostring (between)); */
7317 +}
7318 +
7319 +/* after shifting of node content, a coord previously set properly may become
7320 +   invalid; try to "normalize" it. */
7321 +void coord_normalize(coord_t * coord)
7322 +{
7323 + znode *node;
7324 +
7325 + node = coord->node;
7326 + assert("vs-683", node);
7327 +
7328 + coord_clear_iplug(coord);
7329 +
7330 + if (node_is_empty(node)) {
7331 + coord_init_first_unit(coord, node);
7332 + } else if ((coord->between == AFTER_ITEM)
7333 + || (coord->between == AFTER_UNIT)) {
7334 + return;
7335 + } else if (coord->item_pos == coord_num_items(coord)
7336 + && coord->between == BEFORE_ITEM) {
7337 + coord_dec_item_pos(coord);
7338 + coord->between = AFTER_ITEM;
7339 + } else if (coord->unit_pos == coord_num_units(coord)
7340 + && coord->between == BEFORE_UNIT) {
7341 + coord->unit_pos--;
7342 + coord->between = AFTER_UNIT;
7343 + } else if (coord->item_pos == coord_num_items(coord)
7344 + && coord->unit_pos == 0 && coord->between == BEFORE_UNIT) {
7345 + coord_dec_item_pos(coord);
7346 + coord->unit_pos = 0;
7347 + coord->between = AFTER_ITEM;
7348 + }
7349 +}
7350 +
7351 +/* Copy a coordinate. */
7352 +void coord_dup(coord_t * coord, const coord_t * old_coord)
7353 +{
7354 + assert("jmacd-9800", coord_check(old_coord));
7355 + coord_dup_nocheck(coord, old_coord);
7356 +}
7357 +
7358 +/* Copy a coordinate without check. Useful when old_coord->node is not
7359 + loaded. As in cbk_tree_lookup -> connect_znode -> connect_one_side */
7360 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord)
7361 +{
7362 + coord->node = old_coord->node;
7363 + coord_set_item_pos(coord, old_coord->item_pos);
7364 + coord->unit_pos = old_coord->unit_pos;
7365 + coord->between = old_coord->between;
7366 + coord->iplugid = old_coord->iplugid;
7367 + ON_DEBUG(coord->plug_v = old_coord->plug_v);
7368 + ON_DEBUG(coord->body_v = old_coord->body_v);
7369 +}
7370 +
7371 +/* Initialize an invalid coordinate. */
7372 +void coord_init_invalid(coord_t * coord, const znode * node)
7373 +{
7374 + coord_init_values(coord, node, 0, 0, INVALID_COORD);
7375 +}
7376 +
7377 +void coord_init_first_unit_nocheck(coord_t * coord, const znode * node)
7378 +{
7379 + coord_init_values(coord, node, 0, 0, AT_UNIT);
7380 +}
7381 +
7382 +/* Initialize a coordinate to point at the first unit of the first item. If the node is
7383 + empty, it is positioned at the EMPTY_NODE. */
7384 +void coord_init_first_unit(coord_t * coord, const znode * node)
7385 +{
7386 + int is_empty = node_is_empty(node);
7387 +
7388 + coord_init_values(coord, node, 0, 0, (is_empty ? EMPTY_NODE : AT_UNIT));
7389 +
7390 + assert("jmacd-9801", coord_check(coord));
7391 +}
7392 +
7393 +/* Initialize a coordinate to point at the last unit of the last item. If the node is
7394 + empty, it is positioned at the EMPTY_NODE. */
7395 +void coord_init_last_unit(coord_t * coord, const znode * node)
7396 +{
7397 + int is_empty = node_is_empty(node);
7398 +
7399 + coord_init_values(coord, node,
7400 + (is_empty ? 0 : node_num_items(node) - 1), 0,
7401 + (is_empty ? EMPTY_NODE : AT_UNIT));
7402 + if (!is_empty)
7403 + coord->unit_pos = coord_last_unit_pos(coord);
7404 + assert("jmacd-9802", coord_check(coord));
7405 +}
7406 +
7407 +/* Initialize a coordinate to before the first item. If the node is empty, it is
7408 + positioned at the EMPTY_NODE. */
7409 +void coord_init_before_first_item(coord_t * coord, const znode * node)
7410 +{
7411 + int is_empty = node_is_empty(node);
7412 +
7413 + coord_init_values(coord, node, 0, 0,
7414 + (is_empty ? EMPTY_NODE : BEFORE_UNIT));
7415 +
7416 + assert("jmacd-9803", coord_check(coord));
7417 +}
7418 +
7419 +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
7420 + at the EMPTY_NODE. */
7421 +void coord_init_after_last_item(coord_t * coord, const znode * node)
7422 +{
7423 + int is_empty = node_is_empty(node);
7424 +
7425 + coord_init_values(coord, node,
7426 + (is_empty ? 0 : node_num_items(node) - 1), 0,
7427 + (is_empty ? EMPTY_NODE : AFTER_ITEM));
7428 +
7429 + assert("jmacd-9804", coord_check(coord));
7430 +}
7431 +
7432 +/* Initialize a coordinate to after last unit in the item. Coord must be set
7433 + already to existing item */
7434 +void coord_init_after_item_end(coord_t * coord)
7435 +{
7436 + coord->between = AFTER_UNIT;
7437 + coord->unit_pos = coord_last_unit_pos(coord);
7438 +}
7439 +
7440 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
7441 +void coord_init_before_item(coord_t * coord)
7442 +{
7443 + coord->unit_pos = 0;
7444 + coord->between = BEFORE_ITEM;
7445 +}
7446 +
7447 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
7448 +void coord_init_after_item(coord_t * coord)
7449 +{
7450 + coord->unit_pos = 0;
7451 + coord->between = AFTER_ITEM;
7452 +}
7453 +
7454 +/* Initialize a coordinate with zeros. Used in places where init_coord was
7455 +   used and it was not clear how exactly. */
7456 +void coord_init_zero(coord_t * coord)
7457 +{
7458 + memset(coord, 0, sizeof(*coord));
7459 +}
7460 +
7461 +/* Return the number of units at the present item. Asserts coord_is_existing_item(). */
7462 +unsigned coord_num_units(const coord_t * coord)
7463 +{
7464 + assert("jmacd-9806", coord_is_existing_item(coord));
7465 +
7466 + return item_plugin_by_coord(coord)->b.nr_units(coord);
7467 +}
7468 +
7469 +/* Returns true if the coord was initialized by coord_init_invalid(). */
7470 +/* Audited by: green(2002.06.15) */
7471 +int coord_is_invalid(const coord_t * coord)
7472 +{
7473 + return coord->between == INVALID_COORD;
7474 +}
7475 +
7476 +/* Returns true if the coordinate is positioned at an existing item, not before or after
7477 + an item. It may be placed at, before, or after any unit within the item, whether
7478 + existing or not. */
7479 +int coord_is_existing_item(const coord_t * coord)
7480 +{
7481 + switch (coord->between) {
7482 + case EMPTY_NODE:
7483 + case BEFORE_ITEM:
7484 + case AFTER_ITEM:
7485 + case INVALID_COORD:
7486 + return 0;
7487 +
7488 + case BEFORE_UNIT:
7489 + case AT_UNIT:
7490 + case AFTER_UNIT:
7491 + return coord->item_pos < coord_num_items(coord);
7492 + }
7493 +
7494 + impossible("jmacd-9900", "unreachable coord: %p", coord);
7495 + return 0;
7496 +}
7497 +
7498 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
7499 + unit. */
7500 +/* Audited by: green(2002.06.15) */
7501 +int coord_is_existing_unit(const coord_t * coord)
7502 +{
7503 + switch (coord->between) {
7504 + case EMPTY_NODE:
7505 + case BEFORE_UNIT:
7506 + case AFTER_UNIT:
7507 + case BEFORE_ITEM:
7508 + case AFTER_ITEM:
7509 + case INVALID_COORD:
7510 + return 0;
7511 +
7512 + case AT_UNIT:
7513 + return (coord->item_pos < coord_num_items(coord)
7514 + && coord->unit_pos < coord_num_units(coord));
7515 + }
7516 +
7517 + impossible("jmacd-9902", "unreachable");
7518 + return 0;
7519 +}
7520 +
7521 +/* Returns true if the coordinate is positioned at the first unit of the first item. Not
7522 + true for empty nodes nor coordinates positioned before the first item. */
7523 +/* Audited by: green(2002.06.15) */
7524 +int coord_is_leftmost_unit(const coord_t * coord)
7525 +{
7526 + return (coord->between == AT_UNIT && coord->item_pos == 0
7527 + && coord->unit_pos == 0);
7528 +}
7529 +
7530 +#if REISER4_DEBUG
7531 +/* For assertions only, checks for a valid coordinate. */
7532 +int coord_check(const coord_t * coord)
7533 +{
7534 + if (coord->node == NULL) {
7535 + return 0;
7536 + }
7537 + if (znode_above_root(coord->node))
7538 + return 1;
7539 +
7540 + switch (coord->between) {
7541 + default:
7542 + case INVALID_COORD:
7543 + return 0;
7544 + case EMPTY_NODE:
7545 + if (!node_is_empty(coord->node)) {
7546 + return 0;
7547 + }
7548 + return coord->item_pos == 0 && coord->unit_pos == 0;
7549 +
7550 + case BEFORE_UNIT:
7551 + case AFTER_UNIT:
7552 + if (node_is_empty(coord->node) && (coord->item_pos == 0)
7553 + && (coord->unit_pos == 0))
7554 + return 1;
7555 + case AT_UNIT:
7556 + break;
7557 + case AFTER_ITEM:
7558 + case BEFORE_ITEM:
7559 + /* before/after item should not set unit_pos. */
7560 + if (coord->unit_pos != 0) {
7561 + return 0;
7562 + }
7563 + break;
7564 + }
7565 +
7566 + if (coord->item_pos >= node_num_items(coord->node)) {
7567 + return 0;
7568 + }
7569 +
7570 + /* FIXME-VS: we are going to check unit_pos. This makes no sense when
7571 +	   between is set to either AFTER_ITEM or BEFORE_ITEM */
7572 + if (coord->between == AFTER_ITEM || coord->between == BEFORE_ITEM)
7573 + return 1;
7574 +
7575 + if (coord_is_iplug_set(coord) &&
7576 + coord->unit_pos >
7577 + item_plugin_by_coord(coord)->b.nr_units(coord) - 1) {
7578 + return 0;
7579 + }
7580 + return 1;
7581 +}
7582 +#endif
7583 +
7584 +/* Adjust coordinate boundaries based on the number of items prior to coord_next/prev.
7585 +   Returns 1 if the new position does not exist. */
7586 +static int coord_adjust_items(coord_t * coord, unsigned items, int is_next)
7587 +{
7588 + /* If the node is invalid, leave it. */
7589 + if (coord->between == INVALID_COORD) {
7590 + return 1;
7591 + }
7592 +
7593 + /* If the node is empty, set it appropriately. */
7594 + if (items == 0) {
7595 + coord->between = EMPTY_NODE;
7596 + coord_set_item_pos(coord, 0);
7597 + coord->unit_pos = 0;
7598 + return 1;
7599 + }
7600 +
7601 + /* If it was empty and it no longer is, set to BEFORE/AFTER_ITEM. */
7602 + if (coord->between == EMPTY_NODE) {
7603 + coord->between = (is_next ? BEFORE_ITEM : AFTER_ITEM);
7604 + coord_set_item_pos(coord, 0);
7605 + coord->unit_pos = 0;
7606 + return 0;
7607 + }
7608 +
7609 +	/* If the item_pos is out-of-range, set it appropriately. */
7610 + if (coord->item_pos >= items) {
7611 + coord->between = AFTER_ITEM;
7612 + coord_set_item_pos(coord, items - 1);
7613 + coord->unit_pos = 0;
7614 + /* If is_next, return 1 (can't go any further). */
7615 + return is_next;
7616 + }
7617 +
7618 + return 0;
7619 +}
7620 +
7621 +/* Advances the coordinate by one unit to the right. If empty, no change. If
7622 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is an
7623 + existing unit. */
7624 +int coord_next_unit(coord_t * coord)
7625 +{
7626 + unsigned items = coord_num_items(coord);
7627 +
7628 + if (coord_adjust_items(coord, items, 1) == 1) {
7629 + return 1;
7630 + }
7631 +
7632 + switch (coord->between) {
7633 + case BEFORE_UNIT:
7634 + /* Now it is positioned at the same unit. */
7635 + coord->between = AT_UNIT;
7636 + return 0;
7637 +
7638 + case AFTER_UNIT:
7639 + case AT_UNIT:
7640 + /* If it was at or after a unit and there are more units in this item,
7641 + advance to the next one. */
7642 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
7643 + coord->unit_pos += 1;
7644 + coord->between = AT_UNIT;
7645 + return 0;
7646 + }
7647 +
7648 + /* Otherwise, it is crossing an item boundary and treated as if it was
7649 + after the current item. */
7650 + coord->between = AFTER_ITEM;
7651 + coord->unit_pos = 0;
7652 + /* FALLTHROUGH */
7653 +
7654 + case AFTER_ITEM:
7655 + /* Check for end-of-node. */
7656 + if (coord->item_pos == items - 1) {
7657 + return 1;
7658 + }
7659 +
7660 + coord_inc_item_pos(coord);
7661 + coord->unit_pos = 0;
7662 + coord->between = AT_UNIT;
7663 + return 0;
7664 +
7665 + case BEFORE_ITEM:
7666 + /* The adjust_items checks ensure that we are valid here. */
7667 + coord->unit_pos = 0;
7668 + coord->between = AT_UNIT;
7669 + return 0;
7670 +
7671 + case INVALID_COORD:
7672 + case EMPTY_NODE:
7673 + /* Handled in coord_adjust_items(). */
7674 + break;
7675 + }
7676 +
7677 + impossible("jmacd-9902", "unreachable");
7678 + return 0;
7679 +}
7680 +
7681 +/* Advances the coordinate by one item to the right. If empty, no change. If
7682 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
7683 + an existing item. */
7684 +int coord_next_item(coord_t * coord)
7685 +{
7686 + unsigned items = coord_num_items(coord);
7687 +
7688 + if (coord_adjust_items(coord, items, 1) == 1) {
7689 + return 1;
7690 + }
7691 +
7692 + switch (coord->between) {
7693 + case AFTER_UNIT:
7694 + case AT_UNIT:
7695 + case BEFORE_UNIT:
7696 + case AFTER_ITEM:
7697 + /* Check for end-of-node. */
7698 + if (coord->item_pos == items - 1) {
7699 + coord->between = AFTER_ITEM;
7700 + coord->unit_pos = 0;
7701 + coord_clear_iplug(coord);
7702 + return 1;
7703 + }
7704 +
7705 + /* Anywhere in an item, go to the next one. */
7706 + coord->between = AT_UNIT;
7707 + coord_inc_item_pos(coord);
7708 + coord->unit_pos = 0;
7709 + return 0;
7710 +
7711 + case BEFORE_ITEM:
7712 + /* The out-of-range check ensures that we are valid here. */
7713 + coord->unit_pos = 0;
7714 + coord->between = AT_UNIT;
7715 + return 0;
7716 + case INVALID_COORD:
7717 + case EMPTY_NODE:
7718 + /* Handled in coord_adjust_items(). */
7719 + break;
7720 + }
7721 +
7722 + impossible("jmacd-9903", "unreachable");
7723 + return 0;
7724 +}
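+
+/*
+ * Illustrative sketch (editor's addition): the iterators above suffice
+ * to visit every unit in a node; a return of 0 means "moved to an
+ * existing unit", non-zero means the walk fell off the node.
+ * example_scan_units() is a hypothetical caller.
+ */
+#if 0
+static void example_scan_units(znode * node)
+{
+	coord_t coord;
+
+	coord_init_before_first_item(&coord, node);
+	while (coord_next_unit(&coord) == 0) {
+		/* coord is AT_UNIT on an existing unit here */
+	}
+}
+#endif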
7725 +
7726 +/* Advances the coordinate by one unit to the left. If empty, no change. If
7727 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7728 + is an existing unit. */
7729 +int coord_prev_unit(coord_t * coord)
7730 +{
7731 + unsigned items = coord_num_items(coord);
7732 +
7733 + if (coord_adjust_items(coord, items, 0) == 1) {
7734 + return 1;
7735 + }
7736 +
7737 + switch (coord->between) {
7738 + case AT_UNIT:
7739 + case BEFORE_UNIT:
7740 + if (coord->unit_pos > 0) {
7741 + coord->unit_pos -= 1;
7742 + coord->between = AT_UNIT;
7743 + return 0;
7744 + }
7745 +
7746 + if (coord->item_pos == 0) {
7747 + coord->between = BEFORE_ITEM;
7748 + return 1;
7749 + }
7750 +
7751 + coord_dec_item_pos(coord);
7752 + coord->unit_pos = coord_last_unit_pos(coord);
7753 + coord->between = AT_UNIT;
7754 + return 0;
7755 +
7756 + case AFTER_UNIT:
7757 + /* What if unit_pos is out-of-range? */
7758 + assert("jmacd-5442",
7759 + coord->unit_pos <= coord_last_unit_pos(coord));
7760 + coord->between = AT_UNIT;
7761 + return 0;
7762 +
7763 + case BEFORE_ITEM:
7764 + if (coord->item_pos == 0) {
7765 + return 1;
7766 + }
7767 +
7768 + coord_dec_item_pos(coord);
7769 + /* FALLTHROUGH */
7770 +
7771 + case AFTER_ITEM:
7772 + coord->between = AT_UNIT;
7773 + coord->unit_pos = coord_last_unit_pos(coord);
7774 + return 0;
7775 +
7776 + case INVALID_COORD:
7777 + case EMPTY_NODE:
7778 + break;
7779 + }
7780 +
7781 + impossible("jmacd-9904", "unreachable");
7782 + return 0;
7783 +}
7784 +
7785 +/* Advances the coordinate by one item to the left. If empty, no change. If
7786 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
7787 + is an existing item. */
7788 +int coord_prev_item(coord_t * coord)
7789 +{
7790 + unsigned items = coord_num_items(coord);
7791 +
7792 + if (coord_adjust_items(coord, items, 0) == 1) {
7793 + return 1;
7794 + }
7795 +
7796 + switch (coord->between) {
7797 + case AT_UNIT:
7798 + case AFTER_UNIT:
7799 + case BEFORE_UNIT:
7800 + case BEFORE_ITEM:
7801 +
7802 + if (coord->item_pos == 0) {
7803 + coord->between = BEFORE_ITEM;
7804 + coord->unit_pos = 0;
7805 + return 1;
7806 + }
7807 +
7808 + coord_dec_item_pos(coord);
7809 + coord->unit_pos = 0;
7810 + coord->between = AT_UNIT;
7811 + return 0;
7812 +
7813 + case AFTER_ITEM:
7814 + coord->between = AT_UNIT;
7815 + coord->unit_pos = 0;
7816 + return 0;
7817 +
7818 + case INVALID_COORD:
7819 + case EMPTY_NODE:
7820 + break;
7821 + }
7822 +
7823 + impossible("jmacd-9905", "unreachable");
7824 + return 0;
7825 +}
7826 +
7827 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
7828 +void coord_init_sideof_unit(coord_t * coord, const znode * node, sideof dir)
7829 +{
7830 + assert("jmacd-9821", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7831 + if (dir == LEFT_SIDE) {
7832 + coord_init_first_unit(coord, node);
7833 + } else {
7834 + coord_init_last_unit(coord, node);
7835 + }
7836 +}
7837 +
7838 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
7839 + argument. */
7840 +/* Audited by: green(2002.06.15) */
7841 +int coord_is_after_sideof_unit(coord_t * coord, sideof dir)
7842 +{
7843 + assert("jmacd-9822", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7844 + if (dir == LEFT_SIDE) {
7845 + return coord_is_before_leftmost(coord);
7846 + } else {
7847 + return coord_is_after_rightmost(coord);
7848 + }
7849 +}
7850 +
7851 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
7852 +/* Audited by: green(2002.06.15) */
7853 +int coord_sideof_unit(coord_t * coord, sideof dir)
7854 +{
7855 + assert("jmacd-9823", dir == LEFT_SIDE || dir == RIGHT_SIDE);
7856 + if (dir == LEFT_SIDE) {
7857 + return coord_prev_unit(coord);
7858 + } else {
7859 + return coord_next_unit(coord);
7860 + }
7861 +}
7862 +
7863 +#if REISER4_DEBUG
7864 +int coords_equal(const coord_t * c1, const coord_t * c2)
7865 +{
7866 + assert("nikita-2840", c1 != NULL);
7867 + assert("nikita-2841", c2 != NULL);
7868 +
7869 + return
7870 + c1->node == c2->node &&
7871 + c1->item_pos == c2->item_pos &&
7872 + c1->unit_pos == c2->unit_pos && c1->between == c2->between;
7873 +}
7874 +#endif /* REISER4_DEBUG */
7875 +
7876 +/* If coord_is_after_rightmost return NCOORD_ON_THE_RIGHT, if coord_is_after_leftmost
7877 + return NCOORD_ON_THE_LEFT, otherwise return NCOORD_INSIDE. */
7878 +/* Audited by: green(2002.06.15) */
7879 +coord_wrt_node coord_wrt(const coord_t * coord)
7880 +{
7881 + if (coord_is_before_leftmost(coord)) {
7882 + return COORD_ON_THE_LEFT;
7883 + }
7884 +
7885 + if (coord_is_after_rightmost(coord)) {
7886 + return COORD_ON_THE_RIGHT;
7887 + }
7888 +
7889 + return COORD_INSIDE;
7890 +}
7891 +
7892 +/* Returns true if the coordinate is positioned after the last item, after the last unit
7893 +   of the last item, or at an empty node. */
7894 +/* Audited by: green(2002.06.15) */
7895 +int coord_is_after_rightmost(const coord_t * coord)
7896 +{
7897 + assert("jmacd-7313", coord_check(coord));
7898 +
7899 + switch (coord->between) {
7900 + case INVALID_COORD:
7901 + case AT_UNIT:
7902 + case BEFORE_UNIT:
7903 + case BEFORE_ITEM:
7904 + return 0;
7905 +
7906 + case EMPTY_NODE:
7907 + return 1;
7908 +
7909 + case AFTER_ITEM:
7910 + return (coord->item_pos == node_num_items(coord->node) - 1);
7911 +
7912 + case AFTER_UNIT:
7913 + return ((coord->item_pos == node_num_items(coord->node) - 1) &&
7914 + coord->unit_pos == coord_last_unit_pos(coord));
7915 + }
7916 +
7917 + impossible("jmacd-9908", "unreachable");
7918 + return 0;
7919 +}
7920 +
7921 +/* Returns true if the coordinate is positioned before the first item or at an empty
7922 +   node. */
7923 +int coord_is_before_leftmost(const coord_t * coord)
7924 +{
7925 + /* FIXME-VS: coord_check requires node to be loaded whereas it is not
7926 + necessary to check if coord is set before leftmost
7927 + assert ("jmacd-7313", coord_check (coord)); */
7928 + switch (coord->between) {
7929 + case INVALID_COORD:
7930 + case AT_UNIT:
7931 + case AFTER_ITEM:
7932 + case AFTER_UNIT:
7933 + return 0;
7934 +
7935 + case EMPTY_NODE:
7936 + return 1;
7937 +
7938 + case BEFORE_ITEM:
7939 + case BEFORE_UNIT:
7940 + return (coord->item_pos == 0) && (coord->unit_pos == 0);
7941 + }
7942 +
7943 + impossible("jmacd-9908", "unreachable");
7944 + return 0;
7945 +}
7946 +
7947 +/* Returns true if the coordinate is positioned after an item, before an item, after the
7948 + last unit of an item, before the first unit of an item, or at an empty node. */
7949 +/* Audited by: green(2002.06.15) */
7950 +int coord_is_between_items(const coord_t * coord)
7951 +{
7952 + assert("jmacd-7313", coord_check(coord));
7953 +
7954 + switch (coord->between) {
7955 + case INVALID_COORD:
7956 + case AT_UNIT:
7957 + return 0;
7958 +
7959 + case AFTER_ITEM:
7960 + case BEFORE_ITEM:
7961 + case EMPTY_NODE:
7962 + return 1;
7963 +
7964 + case BEFORE_UNIT:
7965 + return coord->unit_pos == 0;
7966 +
7967 + case AFTER_UNIT:
7968 + return coord->unit_pos == coord_last_unit_pos(coord);
7969 + }
7970 +
7971 + impossible("jmacd-9908", "unreachable");
7972 + return 0;
7973 +}
7974 +
7975 +#if REISER4_DEBUG
7976 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
7977 + before-after or item boundaries. */
7978 +int coord_are_neighbors(coord_t * c1, coord_t * c2)
7979 +{
7980 + coord_t *left;
7981 + coord_t *right;
7982 +
7983 + assert("nikita-1241", c1 != NULL);
7984 + assert("nikita-1242", c2 != NULL);
7985 + assert("nikita-1243", c1->node == c2->node);
7986 + assert("nikita-1244", coord_is_existing_unit(c1));
7987 + assert("nikita-1245", coord_is_existing_unit(c2));
7988 +
7989 + left = right = NULL;
7990 + switch (coord_compare(c1, c2)) {
7991 + case COORD_CMP_ON_LEFT:
7992 + left = c1;
7993 + right = c2;
7994 + break;
7995 + case COORD_CMP_ON_RIGHT:
7996 + left = c2;
7997 + right = c1;
7998 + break;
7999 + case COORD_CMP_SAME:
8000 + return 0;
8001 + default:
8002 +		wrong_return_value("nikita-1246", "coord_compare()");
8003 + }
8004 + assert("vs-731", left && right);
8005 + if (left->item_pos == right->item_pos) {
8006 + return left->unit_pos + 1 == right->unit_pos;
8007 + } else if (left->item_pos + 1 == right->item_pos) {
8008 + return (left->unit_pos == coord_last_unit_pos(left))
8009 + && (right->unit_pos == 0);
8010 + } else {
8011 + return 0;
8012 + }
8013 +}
8014 +#endif /* REISER4_DEBUG */
8015 +
8016 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
8017 + COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
8018 +/* Audited by: green(2002.06.15) */
8019 +coord_cmp coord_compare(coord_t * c1, coord_t * c2)
8020 +{
8021 + assert("vs-209", c1->node == c2->node);
8022 + assert("vs-194", coord_is_existing_unit(c1)
8023 + && coord_is_existing_unit(c2));
8024 +
8025 + if (c1->item_pos > c2->item_pos)
8026 + return COORD_CMP_ON_RIGHT;
8027 + if (c1->item_pos < c2->item_pos)
8028 + return COORD_CMP_ON_LEFT;
8029 + if (c1->unit_pos > c2->unit_pos)
8030 + return COORD_CMP_ON_RIGHT;
8031 + if (c1->unit_pos < c2->unit_pos)
8032 + return COORD_CMP_ON_LEFT;
8033 + return COORD_CMP_SAME;
8034 +}
8035 +
8036 +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8037 + non-zero if there is no position to the right. */
8038 +int coord_set_to_right(coord_t * coord)
8039 +{
8040 + unsigned items = coord_num_items(coord);
8041 +
8042 + if (coord_adjust_items(coord, items, 1) == 1) {
8043 + return 1;
8044 + }
8045 +
8046 + switch (coord->between) {
8047 + case AT_UNIT:
8048 + return 0;
8049 +
8050 + case BEFORE_ITEM:
8051 + case BEFORE_UNIT:
8052 + coord->between = AT_UNIT;
8053 + return 0;
8054 +
8055 + case AFTER_UNIT:
8056 + if (coord->unit_pos < coord_last_unit_pos(coord)) {
8057 + coord->unit_pos += 1;
8058 + coord->between = AT_UNIT;
8059 + return 0;
8060 + } else {
8061 +
8062 + coord->unit_pos = 0;
8063 +
8064 + if (coord->item_pos == items - 1) {
8065 + coord->between = AFTER_ITEM;
8066 + return 1;
8067 + }
8068 +
8069 + coord_inc_item_pos(coord);
8070 + coord->between = AT_UNIT;
8071 + return 0;
8072 + }
8073 +
8074 + case AFTER_ITEM:
8075 + if (coord->item_pos == items - 1) {
8076 + return 1;
8077 + }
8078 +
8079 + coord_inc_item_pos(coord);
8080 + coord->unit_pos = 0;
8081 + coord->between = AT_UNIT;
8082 + return 0;
8083 +
8084 + case EMPTY_NODE:
8085 + return 1;
8086 +
8087 + case INVALID_COORD:
8088 + break;
8089 + }
8090 +
8091 + impossible("jmacd-9920", "unreachable");
8092 + return 0;
8093 +}
8094 +
8095 +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8096 + non-zero if there is no position to the left. */
8097 +int coord_set_to_left(coord_t * coord)
8098 +{
8099 + unsigned items = coord_num_items(coord);
8100 +
8101 + if (coord_adjust_items(coord, items, 0) == 1) {
8102 + return 1;
8103 + }
8104 +
8105 + switch (coord->between) {
8106 + case AT_UNIT:
8107 + return 0;
8108 +
8109 + case AFTER_UNIT:
8110 + coord->between = AT_UNIT;
8111 + return 0;
8112 +
8113 + case AFTER_ITEM:
8114 + coord->between = AT_UNIT;
8115 + coord->unit_pos = coord_last_unit_pos(coord);
8116 + return 0;
8117 +
8118 + case BEFORE_UNIT:
8119 + if (coord->unit_pos > 0) {
8120 + coord->unit_pos -= 1;
8121 + coord->between = AT_UNIT;
8122 + return 0;
8123 + } else {
8124 +
8125 + if (coord->item_pos == 0) {
8126 + coord->between = BEFORE_ITEM;
8127 + return 1;
8128 + }
8129 +
8130 + coord->unit_pos = coord_last_unit_pos(coord);
8131 + coord_dec_item_pos(coord);
8132 + coord->between = AT_UNIT;
8133 + return 0;
8134 + }
8135 +
8136 + case BEFORE_ITEM:
8137 + if (coord->item_pos == 0) {
8138 + return 1;
8139 + }
8140 +
8141 + coord_dec_item_pos(coord);
8142 + coord->unit_pos = coord_last_unit_pos(coord);
8143 + coord->between = AT_UNIT;
8144 + return 0;
8145 +
8146 + case EMPTY_NODE:
8147 + return 1;
8148 +
8149 + case INVALID_COORD:
8150 + break;
8151 + }
8152 +
8153 + impossible("jmacd-9920", "unreachable");
8154 + return 0;
8155 +}
8156 +
8157 +static const char *coord_tween_tostring(between_enum n)
8158 +{
8159 + switch (n) {
8160 + case BEFORE_UNIT:
8161 + return "before unit";
8162 + case BEFORE_ITEM:
8163 + return "before item";
8164 + case AT_UNIT:
8165 + return "at unit";
8166 + case AFTER_UNIT:
8167 + return "after unit";
8168 + case AFTER_ITEM:
8169 + return "after item";
8170 + case EMPTY_NODE:
8171 + return "empty node";
8172 + case INVALID_COORD:
8173 + return "invalid";
8174 + default:
8175 + {
8176 + static char buf[30];
8177 +
8178 + sprintf(buf, "unknown: %i", n);
8179 + return buf;
8180 + }
8181 + }
8182 +}
8183 +
8184 +void print_coord(const char *mes, const coord_t * coord, int node)
8185 +{
8186 + if (coord == NULL) {
8187 + printk("%s: null\n", mes);
8188 + return;
8189 + }
8190 + printk("%s: item_pos = %d, unit_pos %d, tween=%s, iplug=%d\n",
8191 + mes, coord->item_pos, coord->unit_pos,
8192 + coord_tween_tostring(coord->between), coord->iplugid);
8193 +}
8194 +
8195 +int
8196 +item_utmost_child_real_block(const coord_t * coord, sideof side,
8197 + reiser4_block_nr * blk)
8198 +{
8199 + return item_plugin_by_coord(coord)->f.utmost_child_real_block(coord,
8200 + side,
8201 + blk);
8202 +}
8203 +
8204 +int item_utmost_child(const coord_t * coord, sideof side, jnode ** child)
8205 +{
8206 + return item_plugin_by_coord(coord)->f.utmost_child(coord, side, child);
8207 +}
8208 +
8209 +/* @count bytes of flow @f got written, update correspondingly f->length,
8210 + f->data and f->key */
8211 +void move_flow_forward(flow_t * f, unsigned count)
8212 +{
8213 + if (f->data)
8214 + f->data += count;
8215 + f->length -= count;
8216 + set_key_offset(&f->key, get_key_offset(&f->key) + count);
8217 +}
8218 +
8219 +/*
8220 + Local variables:
8221 + c-indentation-style: "K&R"
8222 + mode-name: "LC"
8223 + c-basic-offset: 8
8224 + tab-width: 8
8225 + fill-column: 120
8226 + scroll-step: 1
8227 + End:
8228 +*/
8229 diff --git a/fs/reiser4/coord.h b/fs/reiser4/coord.h
8230 new file mode 100644
8231 index 0000000..313e615
8232 --- /dev/null
8233 +++ b/fs/reiser4/coord.h
8234 @@ -0,0 +1,389 @@
8235 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8236 +
8237 +/* Coords */
8238 +
8239 +#if !defined( __REISER4_COORD_H__ )
8240 +#define __REISER4_COORD_H__
8241 +
8242 +#include "forward.h"
8243 +#include "debug.h"
8244 +#include "dformat.h"
8245 +#include "key.h"
8246 +
8247 +/* insertions happen between coords in the tree, so we need some means
8248 + of specifying the sense of betweenness. */
8249 +typedef enum {
8250 + BEFORE_UNIT, /* Note: we/init_coord depends on this value being zero. */
8251 + AT_UNIT,
8252 + AFTER_UNIT,
8253 + BEFORE_ITEM,
8254 + AFTER_ITEM,
8255 + INVALID_COORD,
8256 + EMPTY_NODE,
8257 +} between_enum;
8258 +
8259 +/* location of coord w.r.t. its node */
8260 +typedef enum {
8261 + COORD_ON_THE_LEFT = -1,
8262 + COORD_ON_THE_RIGHT = +1,
8263 + COORD_INSIDE = 0
8264 +} coord_wrt_node;
8265 +
8266 +typedef enum {
8267 + COORD_CMP_SAME = 0, COORD_CMP_ON_LEFT = -1, COORD_CMP_ON_RIGHT = +1
8268 +} coord_cmp;
8269 +
8270 +struct coord {
8271 + /* node in a tree */
8272 + /* 0 */ znode *node;
8273 +
8274 + /* position of item within node */
8275 + /* 4 */ pos_in_node_t item_pos;
8276 + /* position of unit within item */
8277 + /* 6 */ pos_in_node_t unit_pos;
8278 + /* optimization: plugin of item is stored in coord_t. Until this was
8279 + implemented, item_plugin_by_coord() was major CPU consumer. ->iplugid
8280 + is invalidated (set to 0xff) on each modification of ->item_pos,
8281 + and all such modifications are funneled through coord_*_item_pos()
8282 + functions below.
8283 + */
8284 + /* 8 */ char iplugid;
8285 + /* position of coord w.r.t. to neighboring items and/or units.
8286 + Values are taken from &between_enum above.
8287 + */
8288 + /* 9 */ char between;
8289 + /* padding. It will be added by the compiler anyway to conform to the
8290 + * C language alignment requirements. We keep it here to be on the
8291 + * safe side and to have a clear picture of the memory layout of this
8292 + * structure. */
8293 + /* 10 */ __u16 pad;
8294 + /* 12 */ int offset;
8295 +#if REISER4_DEBUG
8296 + unsigned long plug_v;
8297 + unsigned long body_v;
8298 +#endif
8299 +};
8300 +
8301 +#define INVALID_PLUGID ((char)((1 << 8) - 1))
8302 +#define INVALID_OFFSET -1
8303 +
8304 +static inline void coord_clear_iplug(coord_t * coord)
8305 +{
8306 + assert("nikita-2835", coord != NULL);
8307 + coord->iplugid = INVALID_PLUGID;
8308 + coord->offset = INVALID_OFFSET;
8309 +}
8310 +
8311 +static inline int coord_is_iplug_set(const coord_t * coord)
8312 +{
8313 + assert("nikita-2836", coord != NULL);
8314 + return coord->iplugid != INVALID_PLUGID;
8315 +}
8316 +
8317 +static inline void coord_set_item_pos(coord_t * coord, pos_in_node_t pos)
8318 +{
8319 + assert("nikita-2478", coord != NULL);
8320 + coord->item_pos = pos;
8321 + coord_clear_iplug(coord);
8322 +}
8323 +
8324 +static inline void coord_dec_item_pos(coord_t * coord)
8325 +{
8326 + assert("nikita-2480", coord != NULL);
8327 + --coord->item_pos;
8328 + coord_clear_iplug(coord);
8329 +}
8330 +
8331 +static inline void coord_inc_item_pos(coord_t * coord)
8332 +{
8333 + assert("nikita-2481", coord != NULL);
8334 + ++coord->item_pos;
8335 + coord_clear_iplug(coord);
8336 +}
8337 +
8338 +static inline void coord_add_item_pos(coord_t * coord, int delta)
8339 +{
8340 + assert("nikita-2482", coord != NULL);
8341 + coord->item_pos += delta;
8342 + coord_clear_iplug(coord);
8343 +}
8344 +
8345 +static inline void coord_invalid_item_pos(coord_t * coord)
8346 +{
8347 + assert("nikita-2832", coord != NULL);
8348 + coord->item_pos = (unsigned short)~0;
8349 + coord_clear_iplug(coord);
8350 +}
8351 +
8352 +/* Reverse a direction. */
8353 +static inline sideof sideof_reverse(sideof side)
8354 +{
8355 + return side == LEFT_SIDE ? RIGHT_SIDE : LEFT_SIDE;
8356 +}
8357 +
8358 +/* NOTE: There is a somewhat odd mixture of the following opposed terms:
8359 +
8360 + "first" and "last"
8361 + "next" and "prev"
8362 + "before" and "after"
8363 + "leftmost" and "rightmost"
8364 +
8365 + But I think the chosen names are decent the way they are.
8366 +*/
8367 +
8368 +/* COORD INITIALIZERS */
8369 +
8370 +/* Initialize an invalid coordinate. */
8371 +extern void coord_init_invalid(coord_t * coord, const znode * node);
8372 +
8373 +extern void coord_init_first_unit_nocheck(coord_t * coord, const znode * node);
8374 +
8375 +/* Initialize a coordinate to point at the first unit of the first item. If the node is
8376 + empty, it is positioned at the EMPTY_NODE. */
8377 +extern void coord_init_first_unit(coord_t * coord, const znode * node);
8378 +
8379 +/* Initialize a coordinate to point at the last unit of the last item. If the node is
8380 + empty, it is positioned at the EMPTY_NODE. */
8381 +extern void coord_init_last_unit(coord_t * coord, const znode * node);
8382 +
8383 +/* Initialize a coordinate to before the first item. If the node is empty, it is
8384 + positioned at the EMPTY_NODE. */
8385 +extern void coord_init_before_first_item(coord_t * coord, const znode * node);
8386 +
8387 +/* Initialize a coordinate to after the last item. If the node is empty, it is positioned
8388 + at the EMPTY_NODE. */
8389 +extern void coord_init_after_last_item(coord_t * coord, const znode * node);
8390 +
8391 +/* Initialize a coordinate to after last unit in the item. Coord must be set
8392 + already to existing item */
8393 +void coord_init_after_item_end(coord_t * coord);
8394 +
8395 +/* Initialize a coordinate to before the item. Coord must be set already to existing item */
8396 +void coord_init_before_item(coord_t *);
8397 +/* Initialize a coordinate to after the item. Coord must be set already to existing item */
8398 +void coord_init_after_item(coord_t *);
8399 +
8400 +/* Calls either coord_init_first_unit or coord_init_last_unit depending on sideof argument. */
8401 +extern void coord_init_sideof_unit(coord_t * coord, const znode * node,
8402 + sideof dir);
8403 +
8404 +/* Initialize a coordinate with 0s. Used in places where init_coord was used
8405 +   and it was not clear how the coordinate was actually supposed to be set.
8406 +   FIXME-VS: added by vs (2002, june, 8) */
8407 +extern void coord_init_zero(coord_t * coord);
8408 +
8409 +/* COORD METHODS */
8410 +
8411 +/* after shifting of node content, a coord previously set properly may become
8412 +   invalid; try to "normalize" it. */
8413 +void coord_normalize(coord_t * coord);
8414 +
8415 +/* Copy a coordinate. */
8416 +extern void coord_dup(coord_t * coord, const coord_t * old_coord);
8417 +
8418 +/* Copy a coordinate without check. */
8419 +void coord_dup_nocheck(coord_t * coord, const coord_t * old_coord);
8420 +
8421 +unsigned coord_num_units(const coord_t * coord);
8422 +
8423 +/* Return the last valid unit number at the present item (i.e.,
8424 + coord_num_units() - 1). */
8425 +static inline unsigned coord_last_unit_pos(const coord_t * coord)
8426 +{
8427 + return coord_num_units(coord) - 1;
8428 +}
8429 +
8430 +#if REISER4_DEBUG
8431 +/* For assertions only, checks for a valid coordinate. */
8432 +extern int coord_check(const coord_t * coord);
8433 +
8434 +extern unsigned long znode_times_locked(const znode * z);
8435 +
8436 +static inline void coord_update_v(coord_t * coord)
8437 +{
8438 + coord->plug_v = coord->body_v = znode_times_locked(coord->node);
8439 +}
8440 +#endif
8441 +
8442 +extern int coords_equal(const coord_t * c1, const coord_t * c2);
8443 +
8444 +extern void print_coord(const char *mes, const coord_t * coord, int print_node);
8445 +
8446 +/* If coord_is_after_rightmost return COORD_ON_THE_RIGHT, if coord_is_before_leftmost
8447 +   return COORD_ON_THE_LEFT, otherwise return COORD_INSIDE. */
8448 +extern coord_wrt_node coord_wrt(const coord_t * coord);
8449 +
8450 +/* Returns true if the coordinates are positioned at adjacent units, regardless of
8451 + before-after or item boundaries. */
8452 +extern int coord_are_neighbors(coord_t * c1, coord_t * c2);
8453 +
8454 +/* Assuming two coordinates are positioned in the same node, return COORD_CMP_ON_RIGHT,
8455 +   COORD_CMP_ON_LEFT, or COORD_CMP_SAME depending on c1's position relative to c2. */
8456 +extern coord_cmp coord_compare(coord_t * c1, coord_t * c2);
8457 +
8458 +/* COORD PREDICATES */
8459 +
8460 +/* Returns true if the coord was initialized by coord_init_invalid(). */
8461 +extern int coord_is_invalid(const coord_t * coord);
8462 +
8463 +/* Returns true if the coordinate is positioned at an existing item, not before or after
8464 + an item. It may be placed at, before, or after any unit within the item, whether
8465 + existing or not. If this is true you can call methods of the item plugin. */
8466 +extern int coord_is_existing_item(const coord_t * coord);
8467 +
8468 +/* Returns true if the coordinate is positioned after an item, before an item, after the
8469 +   last unit of an item, before the first unit of an item, or at an empty node. */
8470 +extern int coord_is_between_items(const coord_t * coord);
8471 +
8472 +/* Returns true if the coordinate is positioned at an existing unit, not before or after a
8473 + unit. */
8474 +extern int coord_is_existing_unit(const coord_t * coord);
8475 +
8476 +/* Returns true if the coordinate is positioned at an empty node. */
8477 +extern int coord_is_empty(const coord_t * coord);
8478 +
8479 +/* Returns true if the coordinate is positioned at the first unit of the first item. Not
8480 + true for empty nodes nor coordinates positioned before the first item. */
8481 +extern int coord_is_leftmost_unit(const coord_t * coord);
8482 +
8483 +/* Returns true if the coordinate is positioned after the last item or after the last unit
8484 + of the last item or it is an empty node. */
8485 +extern int coord_is_after_rightmost(const coord_t * coord);
8486 +
8487 +/* Returns true if the coordinate is positioned before the first item or it is an empty
8488 + node. */
8489 +extern int coord_is_before_leftmost(const coord_t * coord);
8490 +
8491 +/* Calls either coord_is_before_leftmost or coord_is_after_rightmost depending on sideof
8492 + argument. */
8493 +extern int coord_is_after_sideof_unit(coord_t * coord, sideof dir);
8494 +
8495 +/* COORD MODIFIERS */
8496 +
8497 +/* Advances the coordinate by one unit to the right. If empty, no change. If
8498 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8499 + an existing unit. */
8500 +extern int coord_next_unit(coord_t * coord);
8501 +
8502 +/* Advances the coordinate by one item to the right. If empty, no change. If
8503 + coord_is_rightmost_unit, advances to AFTER THE LAST ITEM. Returns 0 if new position is
8504 + an existing item. */
8505 +extern int coord_next_item(coord_t * coord);
8506 +
8507 +/* Advances the coordinate by one unit to the left. If empty, no change. If
8508 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8509 + is an existing unit. */
8510 +extern int coord_prev_unit(coord_t * coord);
8511 +
8512 +/* Advances the coordinate by one item to the left. If empty, no change. If
8513 + coord_is_leftmost_unit, advances to BEFORE THE FIRST ITEM. Returns 0 if new position
8514 + is an existing item. */
8515 +extern int coord_prev_item(coord_t * coord);
8516 +
8517 +/* If the coordinate is between items, shifts it to the right. Returns 0 on success and
8518 + non-zero if there is no position to the right. */
8519 +extern int coord_set_to_right(coord_t * coord);
8520 +
8521 +/* If the coordinate is between items, shifts it to the left. Returns 0 on success and
8522 + non-zero if there is no position to the left. */
8523 +extern int coord_set_to_left(coord_t * coord);
8524 +
8525 +/* If the coordinate is at an existing unit, set to after that unit. Returns 0 on success
8526 + and non-zero if the unit did not exist. */
8527 +extern int coord_set_after_unit(coord_t * coord);
8528 +
8529 +/* Calls either coord_next_unit or coord_prev_unit depending on sideof argument. */
8530 +extern int coord_sideof_unit(coord_t * coord, sideof dir);
8531 +
8532 +/* iterate over all units in @node */
8533 +#define for_all_units( coord, node ) \
8534 + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8535 + coord_next_unit( coord ) == 0 ; )
8536 +
8537 +/* iterate over all items in @node */
8538 +#define for_all_items( coord, node ) \
8539 + for( coord_init_before_first_item( ( coord ), ( node ) ) ; \
8540 + coord_next_item( coord ) == 0 ; )
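For illustration, a full walk over a node with these macros might look as follows (a sketch only; `node` is assumed to be a loaded, long-term locked znode, and print_coord is the debugging helper declared above):

	coord_t coord;

	/* coord visits every existing item of @node in turn */
	for_all_items(&coord, node) {
		print_coord("visiting", &coord, 0);
	}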
8541 +
8542 +/* COORD/ITEM METHODS */
8543 +
8544 +extern int item_utmost_child_real_block(const coord_t * coord, sideof side,
8545 + reiser4_block_nr * blk);
8546 +extern int item_utmost_child(const coord_t * coord, sideof side,
8547 + jnode ** child);
8548 +
8549 +/* a flow is a sequence of bytes being written to or read from the tree. The
8550 + tree will slice the flow into items while storing it into nodes, but all of
8551 + that is hidden from anything outside the tree. */
8552 +
8553 +struct flow {
8554 + reiser4_key key; /* key of start of flow's sequence of bytes */
8555 + loff_t length; /* length of flow's sequence of bytes */
8556 + char *data; /* start of flow's sequence of bytes */
8557 +	int user;		/* 1 if data is in user space, 0 if in kernel space */
8558 + rw_op op; /* NIKITA-FIXME-HANS: comment is where? */
8559 +};
8560 +
8561 +void move_flow_forward(flow_t * f, unsigned count);
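For illustration, a write path consuming a flow would typically advance it in a loop like the following (a sketch; write_some_units() is a hypothetical helper returning the number of bytes it stored):

	while (f->length > 0) {
		int done = write_some_units(f);	/* hypothetical helper */

		if (done <= 0)
			break;
		/* advances f->data, shrinks f->length, shifts f->key offset */
		move_flow_forward(f, done);
	}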
8562 +
8563 +/* &reiser4_item_data - description of data to be inserted or pasted
8564 +
8565 + Q: articulate the reasons for the difference between this and flow.
8566 +
8567 +   A: Besides flows we insert other things into the tree: stat data, directory
8568 +   entries, etc. To insert them into the tree one has to provide this
8569 +   structure. To insert a flow one can use insert_flow, where this structure
8570 +   does not have to be created.
8571 +*/
8572 +struct reiser4_item_data {
8573 + /* actual data to be inserted. If NULL, ->create_item() will not
8574 + do xmemcpy itself, leaving this up to the caller. This can
8575 + save some amount of unnecessary memory copying, for example,
8576 + during insertion of stat data.
8577 +
8578 + */
8579 + char *data;
8580 + /* 1 if 'char * data' contains pointer to user space and 0 if it is
8581 + kernel space */
8582 + int user;
8583 + /* amount of data we are going to insert or paste */
8584 + int length;
8585 + /* "Arg" is opaque data that is passed down to the
8586 + ->create_item() method of node layout, which in turn
8587 + hands it to the ->create_hook() of item being created. This
8588 + arg is currently used by:
8589 +
8590 + . ->create_hook() of internal item
8591 + (fs/reiser4/plugin/item/internal.c:internal_create_hook()),
8592 + . ->paste() method of directory item.
8593 + . ->create_hook() of extent item
8594 +
8595 +	 For an internal item, this is the left "brother" of the new node
8596 +	 being inserted, and it is used to add the new node into the sibling
8597 +	 list after a pointer to it has been inserted into the parent.
8598 +
8599 +	 While ->arg does look like a somewhat unnecessary complication,
8600 +	 it actually saves a lot of headache in many places, because
8601 +	 all data necessary to insert or paste new data into the tree are
8602 +	 collected in one place, and this eliminates a lot of extra
8603 +	 argument passing and storing everywhere.
8604 +
8605 + */
8606 + void *arg;
8607 + /* plugin of item we are inserting */
8608 + item_plugin *iplug;
8609 +};
8610 +
8611 +/* __REISER4_COORD_H__ */
8612 +#endif
8613 +
8614 +/* Make Linus happy.
8615 + Local variables:
8616 + c-indentation-style: "K&R"
8617 + mode-name: "LC"
8618 + c-basic-offset: 8
8619 + tab-width: 8
8620 + fill-column: 120
8621 + scroll-step: 1
8622 + End:
8623 +*/
8624 diff --git a/fs/reiser4/debug.c b/fs/reiser4/debug.c
8625 new file mode 100644
8626 index 0000000..3c55fe8
8627 --- /dev/null
8628 +++ b/fs/reiser4/debug.c
8629 @@ -0,0 +1,308 @@
8630 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
8631 + * reiser4/README */
8632 +
8633 +/* Debugging facilities. */
8634 +
8635 +/*
8636 + * This file contains generic debugging functions used by reiser4. Roughly
8637 + * following:
8638 + *
8639 + * panicking: reiser4_do_panic(), reiser4_print_prefix().
8640 + *
8641 + * locking:
8642 + * reiser4_schedulable(), reiser4_lock_counters(), print_lock_counters(),
8643 + * reiser4_no_counters_are_held(), reiser4_commit_check_locks()
8644 + *
8645 + * error code monitoring (see comment before RETERR macro):
8646 + * reiser4_return_err(), reiser4_report_err().
8647 + *
8648 + * stack back-tracing: fill_backtrace()
8649 + *
8650 + * miscellaneous: reiser4_preempt_point(), call_on_each_assert(),
8651 + * reiser4_debugtrap().
8652 + *
8653 + */
8654 +
8655 +#include "reiser4.h"
8656 +#include "context.h"
8657 +#include "super.h"
8658 +#include "txnmgr.h"
8659 +#include "znode.h"
8660 +
8661 +#include <linux/sysfs.h>
8662 +#include <linux/slab.h>
8663 +#include <linux/types.h>
8664 +#include <linux/fs.h>
8665 +#include <linux/spinlock.h>
8666 +#include <linux/kallsyms.h>
8667 +#include <linux/vmalloc.h>
8668 +#include <linux/ctype.h>
8669 +#include <linux/sysctl.h>
8670 +#include <linux/hardirq.h>
8671 +
8672 +#if 0
8673 +#if REISER4_DEBUG
8674 +static void reiser4_report_err(void);
8675 +#else
8676 +#define reiser4_report_err() noop
8677 +#endif
8678 +#endif /* 0 */
8679 +
8680 +/*
8681 + * global buffer where message given to reiser4_panic is formatted.
8682 + */
8683 +static char panic_buf[REISER4_PANIC_MSG_BUFFER_SIZE];
8684 +
8685 +/*
8686 + * lock protecting consistency of panic_buf under concurrent panics
8687 + */
8688 +static DEFINE_SPINLOCK(panic_guard);
8689 +
8690 +/* Your best friend. Call it on each occasion. This is called by
8691 + fs/reiser4/debug.h:reiser4_panic(). */
8692 +void reiser4_do_panic(const char *format /* format string */ , ... /* rest */ )
8693 +{
8694 + static int in_panic = 0;
8695 + va_list args;
8696 +
8697 + /*
8698 + * check for recursive panic.
8699 + */
8700 + if (in_panic == 0) {
8701 + in_panic = 1;
8702 +
8703 + spin_lock(&panic_guard);
8704 + va_start(args, format);
8705 + vsnprintf(panic_buf, sizeof(panic_buf), format, args);
8706 + va_end(args);
8707 + printk(KERN_EMERG "reiser4 panicked cowardly: %s", panic_buf);
8708 + spin_unlock(&panic_guard);
8709 +
8710 + /*
8711 +		 * if a kernel debugger is configured---drop in. Early dropping
8712 +		 * into kgdb is not always convenient, because the panic message
8713 +		 * has usually not been printed yet. But:
8714 + *
8715 + * (1) message can be extracted from printk_buf[]
8716 + * (declared static inside of printk()), and
8717 + *
8718 + * (2) sometimes serial/kgdb combo dies while printing
8719 + * long panic message, so it's more prudent to break into
8720 + * debugger earlier.
8721 + *
8722 + */
8723 + DEBUGON(1);
8724 + }
8725 + /* to make gcc happy about noreturn attribute */
8726 + panic("%s", panic_buf);
8727 +}
8728 +
8729 +#if 0
8730 +void
8731 +reiser4_print_prefix(const char *level, int reperr, const char *mid,
8732 + const char *function, const char *file, int lineno)
8733 +{
8734 + const char *comm;
8735 + int pid;
8736 +
8737 + if (unlikely(in_interrupt() || in_irq())) {
8738 + comm = "interrupt";
8739 + pid = 0;
8740 + } else {
8741 + comm = current->comm;
8742 + pid = current->pid;
8743 + }
8744 + printk("%sreiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n",
8745 + level, comm, pid, function, file, lineno, mid);
8746 + if (reperr)
8747 + reiser4_report_err();
8748 +}
8749 +#endif /* 0 */
8750 +
8751 +/* Preemption point: this should be called periodically during long running
8752 + operations (carry, allocate, and squeeze are best examples) */
8753 +int reiser4_preempt_point(void)
8754 +{
8755 + assert("nikita-3008", reiser4_schedulable());
8756 + cond_resched();
8757 + return signal_pending(current);
8758 +}
8759 +
8760 +#if REISER4_DEBUG
8761 +/* Debugging aid: return struct where information about locks taken by current
8762 + thread is accumulated. This can be used to formulate lock ordering
8763 + constraints and various assertions.
8764 +
8765 +*/
8766 +reiser4_lock_counters_info *reiser4_lock_counters(void)
8767 +{
8768 + reiser4_context *ctx = get_current_context();
8769 + assert("jmacd-1123", ctx != NULL);
8770 + return &ctx->locks;
8771 +}
8772 +
8773 +/*
8774 + * print human readable information about locks held by the reiser4 context.
8775 + */
8776 +static void print_lock_counters(const char *prefix,
8777 + const reiser4_lock_counters_info * info)
8778 +{
8779 + printk("%s: jnode: %i, tree: %i (r:%i,w:%i), dk: %i (r:%i,w:%i)\n"
8780 + "jload: %i, "
8781 + "txnh: %i, atom: %i, stack: %i, txnmgr: %i, "
8782 + "ktxnmgrd: %i, fq: %i\n"
8783 + "inode: %i, "
8784 + "cbk_cache: %i (r:%i,w%i), "
8785 + "eflush: %i, "
8786 + "zlock: %i,\n"
8787 + "spin: %i, long: %i inode_sem: (r:%i,w:%i)\n"
8788 + "d: %i, x: %i, t: %i\n", prefix,
8789 + info->spin_locked_jnode,
8790 + info->rw_locked_tree, info->read_locked_tree,
8791 + info->write_locked_tree,
8792 + info->rw_locked_dk, info->read_locked_dk, info->write_locked_dk,
8793 + info->spin_locked_jload,
8794 + info->spin_locked_txnh,
8795 + info->spin_locked_atom, info->spin_locked_stack,
8796 + info->spin_locked_txnmgr, info->spin_locked_ktxnmgrd,
8797 + info->spin_locked_fq,
8798 + info->spin_locked_inode,
8799 + info->rw_locked_cbk_cache,
8800 + info->read_locked_cbk_cache,
8801 + info->write_locked_cbk_cache,
8802 + info->spin_locked_super_eflush,
8803 + info->spin_locked_zlock,
8804 + info->spin_locked,
8805 + info->long_term_locked_znode,
8806 + info->inode_sem_r, info->inode_sem_w,
8807 + info->d_refs, info->x_refs, info->t_refs);
8808 +}
8809 +
8810 +/* check that no spinlocks are held */
8811 +int reiser4_schedulable(void)
8812 +{
8813 + if (get_current_context_check() != NULL) {
8814 + if (!LOCK_CNT_NIL(spin_locked)) {
8815 + print_lock_counters("in atomic", reiser4_lock_counters());
8816 + return 0;
8817 + }
8818 + }
8819 + might_sleep();
8820 + return 1;
8821 +}
8822 +/*
8823 + * return true, iff no locks are held.
8824 + */
8825 +int reiser4_no_counters_are_held(void)
8826 +{
8827 + reiser4_lock_counters_info *counters;
8828 +
8829 + counters = reiser4_lock_counters();
8830 + return
8831 + (counters->spin_locked_zlock == 0) &&
8832 + (counters->spin_locked_jnode == 0) &&
8833 + (counters->rw_locked_tree == 0) &&
8834 + (counters->read_locked_tree == 0) &&
8835 + (counters->write_locked_tree == 0) &&
8836 + (counters->rw_locked_dk == 0) &&
8837 + (counters->read_locked_dk == 0) &&
8838 + (counters->write_locked_dk == 0) &&
8839 + (counters->spin_locked_txnh == 0) &&
8840 + (counters->spin_locked_atom == 0) &&
8841 + (counters->spin_locked_stack == 0) &&
8842 + (counters->spin_locked_txnmgr == 0) &&
8843 + (counters->spin_locked_inode == 0) &&
8844 + (counters->spin_locked == 0) &&
8845 + (counters->long_term_locked_znode == 0) &&
8846 + (counters->inode_sem_r == 0) &&
8847 + (counters->inode_sem_w == 0) && (counters->d_refs == 0);
8848 +}
8849 +
8850 +/*
8851 + * return true, iff transaction commit can be done under locks held by the
8852 + * current thread.
8853 + */
8854 +int reiser4_commit_check_locks(void)
8855 +{
8856 + reiser4_lock_counters_info *counters;
8857 + int inode_sem_r;
8858 + int inode_sem_w;
8859 + int result;
8860 +
8861 + /*
8862 + * inode's read/write semaphore is the only reiser4 lock that can be
8863 + * held during commit.
8864 + */
8865 +
8866 + counters = reiser4_lock_counters();
8867 + inode_sem_r = counters->inode_sem_r;
8868 + inode_sem_w = counters->inode_sem_w;
8869 +
8870 + counters->inode_sem_r = counters->inode_sem_w = 0;
8871 + result = reiser4_no_counters_are_held();
8872 + counters->inode_sem_r = inode_sem_r;
8873 + counters->inode_sem_w = inode_sem_w;
8874 + return result;
8875 +}
8876 +
8877 +/*
8878 + * fill "error site" in the current reiser4 context. See comment before RETERR
8879 + * macro for more details.
8880 + */
8881 +void reiser4_return_err(int code, const char *file, int line)
8882 +{
8883 + if (code < 0 && is_in_reiser4_context()) {
8884 + reiser4_context *ctx = get_current_context();
8885 +
8886 + if (ctx != NULL) {
8887 + ctx->err.code = code;
8888 + ctx->err.file = file;
8889 + ctx->err.line = line;
8890 + }
8891 + }
8892 +}
8893 +
8894 +#if 0
8895 +/*
8896 + * report error information recorded by reiser4_return_err().
8897 + */
8898 +static void reiser4_report_err(void)
8899 +{
8900 + reiser4_context *ctx = get_current_context_check();
8901 +
8902 + if (ctx != NULL) {
8903 + if (ctx->err.code != 0) {
8904 + printk("code: %i at %s:%i\n",
8905 + ctx->err.code, ctx->err.file, ctx->err.line);
8906 + }
8907 + }
8908 +}
8909 +#endif /* 0 */
8910 +
8911 +#endif /* REISER4_DEBUG */
8912 +
8913 +#if KERNEL_DEBUGGER
8914 +
8915 +/*
8916 + * this function just drops into the kernel debugger. It is a convenient
8917 + * place to put a breakpoint in.
8918 + */
8919 +void reiser4_debugtrap(void)
8920 +{
8921 + /* do nothing. Put break point here. */
8922 +#if defined(CONFIG_KGDB) && !defined(CONFIG_REISER4_FS_MODULE)
8923 + extern void breakpoint(void);
8924 + breakpoint();
8925 +#endif
8926 +}
8927 +#endif
8928 +
8929 +/* Make Linus happy.
8930 + Local variables:
8931 + c-indentation-style: "K&R"
8932 + mode-name: "LC"
8933 + c-basic-offset: 8
8934 + tab-width: 8
8935 + fill-column: 120
8936 + End:
8937 +*/
8938 diff --git a/fs/reiser4/debug.h b/fs/reiser4/debug.h
8939 new file mode 100644
8940 index 0000000..68e7f31
8941 --- /dev/null
8942 +++ b/fs/reiser4/debug.h
8943 @@ -0,0 +1,350 @@
8944 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
8945 +
8946 +/* Declarations of debug macros. */
8947 +
8948 +#if !defined( __FS_REISER4_DEBUG_H__ )
8949 +#define __FS_REISER4_DEBUG_H__
8950 +
8951 +#include "forward.h"
8952 +#include "reiser4.h"
8953 +
8954 +/* generic function to produce formatted output, decorating it with
8955 + whatever standard prefixes/postfixes we want. "Fun" is a function
8956 + that will be actually called, can be printk, panic etc.
8957 + This is for use by other debugging macros, not by users. */
8958 +#define DCALL(lev, fun, reperr, label, format, ...) \
8959 +({ \
8960 + fun(lev "reiser4[%.16s(%i)]: %s (%s:%i)[%s]:\n" format "\n" , \
8961 + current->comm, current->pid, __FUNCTION__, \
8962 + __FILE__, __LINE__, label, ## __VA_ARGS__); \
8963 +})
8964 +
8965 +/*
8966 + * cause kernel to crash
8967 + */
8968 +#define reiser4_panic(mid, format, ...) \
8969 + DCALL("", reiser4_do_panic, 1, mid, format , ## __VA_ARGS__)
8970 +
8971 +/* print message with indication of current process, file, line and
8972 + function */
8973 +#define reiser4_log(label, format, ...) \
8974 + DCALL(KERN_DEBUG, printk, 0, label, format , ## __VA_ARGS__)
8975 +
8976 +/* Assertion checked during compilation.
8977 + If "cond" is false (0) we get duplicate case label in switch.
8978 + Use this to check something like famous
8979 + cassert (sizeof(struct reiserfs_journal_commit) == 4096) ;
8980 + in 3.x journal.c. If cassertion fails you get compiler error,
8981 + so no "maintainer-id".
8982 +*/
8983 +#define cassert(cond) ({ switch(-1) { case (cond): case 0: break; } })
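For example, the sizes of the on-disk types from dformat.h could be pinned down at compile time like this (illustrative only, assuming dformat.h is in scope at the call site):

	cassert(sizeof(d16) == 2);	/* compile error if d16 is mis-sized */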
8984 +
8985 +#define noop do {;} while(0)
8986 +
8987 +#if REISER4_DEBUG
8988 +/* version of info that only actually prints anything when _d_ebugging
8989 + is on */
8990 +#define dinfo(format, ...) printk(format , ## __VA_ARGS__)
8991 +/* macro to catch logical errors. Put it into `default' clause of
8992 + switch() statement. */
8993 +#define impossible(label, format, ...) \
8994 + reiser4_panic(label, "impossible: " format , ## __VA_ARGS__)
8995 +/* assert assures that @cond is true. If it is not, reiser4_panic() is
8996 + called. Use this for checking logical consistency and _never_ call
8997 + this to check correctness of external data: disk blocks and user-input . */
8998 +#define assert(label, cond) \
8999 +({ \
9000 + /* call_on_each_assert(); */ \
9001 + if (cond) { \
9002 + /* put negated check to avoid using !(cond) that would lose \
9003 + * warnings for things like assert(a = b); */ \
9004 + ; \
9005 + } else { \
9006 + DEBUGON(1); \
9007 + reiser4_panic(label, "assertion failed: %s", #cond); \
9008 + } \
9009 +})
9010 +
9011 +/* like assertion, but @expr is evaluated even if REISER4_DEBUG is off. */
9012 +#define check_me( label, expr ) assert( label, ( expr ) )
9013 +
9014 +#define ON_DEBUG( exp ) exp
9015 +
9016 +extern int reiser4_schedulable(void);
9017 +extern void call_on_each_assert(void);
9018 +
9019 +#else
9020 +
9021 +#define dinfo( format, args... ) noop
9022 +#define impossible( label, format, args... ) noop
9023 +#define assert( label, cond ) noop
9024 +#define check_me( label, expr ) ( ( void ) ( expr ) )
9025 +#define ON_DEBUG( exp )
9026 +#define reiser4_schedulable() might_sleep()
9027 +
9028 +/* REISER4_DEBUG */
9029 +#endif
9030 +
9031 +#if REISER4_DEBUG
9032 +/* per-thread information about lock acquired by this thread. Used by lock
9033 + * ordering checking in spin_macros.h */
9034 +typedef struct reiser4_lock_counters_info {
9035 + int rw_locked_tree;
9036 + int read_locked_tree;
9037 + int write_locked_tree;
9038 +
9039 + int rw_locked_dk;
9040 + int read_locked_dk;
9041 + int write_locked_dk;
9042 +
9043 + int rw_locked_cbk_cache;
9044 + int read_locked_cbk_cache;
9045 + int write_locked_cbk_cache;
9046 +
9047 + int spin_locked_zlock;
9048 + int spin_locked_jnode;
9049 + int spin_locked_jload;
9050 + int spin_locked_txnh;
9051 + int spin_locked_atom;
9052 + int spin_locked_stack;
9053 + int spin_locked_txnmgr;
9054 + int spin_locked_ktxnmgrd;
9055 + int spin_locked_fq;
9056 + int spin_locked_inode;
9057 + int spin_locked_super_eflush;
9058 + int spin_locked;
9059 + int long_term_locked_znode;
9060 +
9061 + int inode_sem_r;
9062 + int inode_sem_w;
9063 +
9064 + int d_refs;
9065 + int x_refs;
9066 + int t_refs;
9067 +} reiser4_lock_counters_info;
9068 +
9069 +extern reiser4_lock_counters_info *reiser4_lock_counters(void);
9070 +#define IN_CONTEXT(a, b) (is_in_reiser4_context() ? (a) : (b))
9071 +
9072 +/* increment lock-counter @counter, if present */
9073 +#define LOCK_CNT_INC(counter) \
9074 + IN_CONTEXT(++(reiser4_lock_counters()->counter), 0)
9075 +
9076 +/* decrement lock-counter @counter, if present */
9077 +#define LOCK_CNT_DEC(counter) \
9078 + IN_CONTEXT(--(reiser4_lock_counters()->counter), 0)
9079 +
9080 +/* check that lock-counter is zero. This is for use in assertions */
9081 +#define LOCK_CNT_NIL(counter) \
9082 + IN_CONTEXT(reiser4_lock_counters()->counter == 0, 1)
9083 +
9084 +/* check that lock-counter is greater than zero. This is for use in
9085 + * assertions */
9086 +#define LOCK_CNT_GTZ(counter) \
9087 + IN_CONTEXT(reiser4_lock_counters()->counter > 0, 1)
9088 +#define LOCK_CNT_LT(counter,n) \
9089 + IN_CONTEXT(reiser4_lock_counters()->counter < n, 1)
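For illustration, a lock wrapper in the style of spin_macros.h would pair the real lock operation with the matching counter update (a sketch; the field name ->guard and the assert label are placeholders):

	spin_lock(&node->guard);		/* hypothetical jnode spinlock */
	LOCK_CNT_INC(spin_locked_jnode);
	LOCK_CNT_INC(spin_locked);

	/* ... critical section ... */

	assert("xxx-1", LOCK_CNT_GTZ(spin_locked_jnode));
	LOCK_CNT_DEC(spin_locked_jnode);
	LOCK_CNT_DEC(spin_locked);
	spin_unlock(&node->guard);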
9090 +
9091 +#else /* REISER4_DEBUG */
9092 +
9093 +/* no-op versions on the above */
9094 +
9095 +typedef struct reiser4_lock_counters_info {
9096 +} reiser4_lock_counters_info;
9097 +
9098 +#define reiser4_lock_counters() ((reiser4_lock_counters_info *)NULL)
9099 +#define LOCK_CNT_INC(counter) noop
9100 +#define LOCK_CNT_DEC(counter) noop
9101 +#define LOCK_CNT_NIL(counter) (1)
9102 +#define LOCK_CNT_GTZ(counter) (1)
9103 +#define LOCK_CNT_LT(counter,n) (1)
9104 +
9105 +#endif /* REISER4_DEBUG */
9106 +
9107 +#define assert_spin_not_locked(lock) BUG_ON(0)
9108 +#define assert_rw_write_locked(lock) BUG_ON(0)
9109 +#define assert_rw_read_locked(lock) BUG_ON(0)
9110 +#define assert_rw_locked(lock) BUG_ON(0)
9111 +#define assert_rw_not_write_locked(lock) BUG_ON(0)
9112 +#define assert_rw_not_read_locked(lock) BUG_ON(0)
9113 +#define assert_rw_not_locked(lock) BUG_ON(0)
9114 +
9115 +/* flags controlling debugging behavior. Are set through debug_flags=N mount
9116 + option. */
9117 +typedef enum {
9118 + /* print a lot of information during panic. When this is on all jnodes
9119 + * are listed. This can be *very* large output. Usually you don't want
9120 + * this. Especially over serial line. */
9121 + REISER4_VERBOSE_PANIC = 0x00000001,
9122 + /* print a lot of information during umount */
9123 + REISER4_VERBOSE_UMOUNT = 0x00000002,
9124 + /* print gathered statistics on umount */
9125 + REISER4_STATS_ON_UMOUNT = 0x00000004,
9126 + /* check node consistency */
9127 + REISER4_CHECK_NODE = 0x00000008
9128 +} reiser4_debug_flags;
9129 +
9130 +extern int is_in_reiser4_context(void);
9131 +
9132 +/*
9133 + * evaluate expression @e only if with reiser4 context
9134 + */
9135 +#define ON_CONTEXT(e) do { \
9136 + if(is_in_reiser4_context()) { \
9137 + e; \
9138 + } } while(0)
9139 +
9140 +/*
9141 + * evaluate expression @e only when within reiser4_context and debugging is
9142 + * on.
9143 + */
9144 +#define ON_DEBUG_CONTEXT( e ) ON_DEBUG( ON_CONTEXT( e ) )
9145 +
9146 +/*
9147 + * complain about unexpected function result and crash. Used in "default"
9148 + * branches of switch statements and alike to assert that invalid results are
9149 + * not silently ignored.
9150 + */
9151 +#define wrong_return_value( label, function ) \
9152 + impossible( label, "wrong return value from " function )
9153 +
9154 +/* Issue different types of reiser4 messages to the console */
9155 +#define warning( label, format, ... ) \
9156 + DCALL( KERN_WARNING, \
9157 + printk, 1, label, "WARNING: " format , ## __VA_ARGS__ )
9158 +#define notice( label, format, ... ) \
9159 + DCALL( KERN_NOTICE, \
9160 + printk, 1, label, "NOTICE: " format , ## __VA_ARGS__ )
9161 +
9162 +/* mark not yet implemented functionality */
9163 +#define not_yet( label, format, ... ) \
9164 + reiser4_panic( label, "NOT YET IMPLEMENTED: " format , ## __VA_ARGS__ )
9165 +
9166 +extern void reiser4_do_panic(const char *format, ...)
9167 + __attribute__ ((noreturn, format(printf, 1, 2)));
9168 +
9169 +extern int reiser4_preempt_point(void);
9170 +extern void reiser4_print_stats(void);
9171 +
9172 +#if REISER4_DEBUG
9173 +extern int reiser4_no_counters_are_held(void);
9174 +extern int reiser4_commit_check_locks(void);
9175 +#else
9176 +#define reiser4_no_counters_are_held() (1)
9177 +#define reiser4_commit_check_locks() (1)
9178 +#endif
9179 +
9180 +/* true if @i is power-of-two. Useful for rate-limited warnings, etc. */
9181 +#define IS_POW(i) \
9182 +({ \
9183 + typeof(i) __i; \
9184 + \
9185 + __i = (i); \
9186 + !(__i & (__i - 1)); \
9187 +})
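For instance, a rate-limited warning could fire only when an error counter reaches a power of two (illustrative; err_count and the label are placeholders):

	if (IS_POW(++err_count))
		warning("xxx-2", "device errors so far: %i", err_count);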
9188 +
9189 +#define KERNEL_DEBUGGER (1)
9190 +
9191 +#if KERNEL_DEBUGGER
9192 +
9193 +extern void reiser4_debugtrap(void);
9194 +
9195 +/*
9196 + * Check condition @cond and drop into kernel debugger (kgdb) if it's true. If
9197 + * kgdb is not compiled in, do nothing.
9198 + */
9199 +#define DEBUGON(cond) \
9200 +({ \
9201 + if (unlikely(cond)) \
9202 + reiser4_debugtrap(); \
9203 +})
9204 +#else
9205 +#define DEBUGON(cond) noop
9206 +#endif
9207 +
9208 +/*
9209 + * Error code tracing facility. (Idea is borrowed from XFS code.)
9210 + *
9211 + * Suppose some strange and/or unexpected code is returned from some function
9212 + * (for example, write(2) returns -EEXIST). It is possible to place a
9213 + * breakpoint in the reiser4_write(), but it is too late here. How to find out
9214 + * in what particular place -EEXIST was generated first?
9215 + *
9216 + * In reiser4 all places where actual error codes are produced (that is,
9217 + * statements of the form
9218 + *
9219 + * return -EFOO; // (1), or
9220 + *
9221 + * result = -EFOO; // (2)
9222 + *
9223 + * are replaced with
9224 + *
9225 + * return RETERR(-EFOO); // (1a), and
9226 + *
9227 + * result = RETERR(-EFOO); // (2a) respectively
9228 + *
9229 + * RETERR() macro fills a backtrace in reiser4_context. This back-trace is
9230 + * printed in error and warning messages. Moreover, it's possible to put a
9231 + * conditional breakpoint in reiser4_return_err (low-level function called
9232 + * by RETERR() to do the actual work) to break into debugger immediately
9233 + * when particular error happens.
9234 + *
9235 + */
9236 +
9237 +#if REISER4_DEBUG
9238 +
9239 +/*
9240 + * data-type to store information about where error happened ("error site").
9241 + */
9242 +typedef struct err_site {
9243 + int code; /* error code */
9244 + const char *file; /* source file, filled by __FILE__ */
9245 + int line; /* source file line, filled by __LINE__ */
9246 +} err_site;
9247 +
9248 +extern void reiser4_return_err(int code, const char *file, int line);
9249 +
9250 +/*
9251 + * fill &get_current_context()->err_site with error information.
9252 + */
9253 +#define RETERR(code) \
9254 +({ \
9255 + typeof(code) __code; \
9256 + \
9257 + __code = (code); \
9258 + reiser4_return_err(__code, __FILE__, __LINE__); \
9259 + __code; \
9260 +})
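For illustration, an error-producing site then looks like the following (a sketch; the magic check is a plausible but hypothetical example built on reiser4_master_sb from dformat.h):

static int check_master_magic(const reiser4_master_sb *msb)
{
	if (strncmp(msb->magic, "ReIsEr4", 7) != 0)
		/* RETERR records -EINVAL together with __FILE__/__LINE__ */
		return RETERR(-EINVAL);
	return 0;
}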
9261 +
9262 +#else
9263 +
9264 +/*
9265 + * no-op versions of the above
9266 + */
9267 +
9268 +typedef struct err_site {
9269 +} err_site;
9270 +#define RETERR(code) code
9271 +#endif
9272 +
9273 +#if REISER4_LARGE_KEY
9274 +/*
9275 + * conditionally compile arguments only if REISER4_LARGE_KEY is on.
9276 + */
9277 +#define ON_LARGE_KEY(...) __VA_ARGS__
9278 +#else
9279 +#define ON_LARGE_KEY(...)
9280 +#endif
9281 +
9282 +/* __FS_REISER4_DEBUG_H__ */
9283 +#endif
9284 +
9285 +/* Make Linus happy.
9286 + Local variables:
9287 + c-indentation-style: "K&R"
9288 + mode-name: "LC"
9289 + c-basic-offset: 8
9290 + tab-width: 8
9291 + fill-column: 120
9292 + End:
9293 +*/
9294 diff --git a/fs/reiser4/dformat.h b/fs/reiser4/dformat.h
9295 new file mode 100644
9296 index 0000000..8bca29e
9297 --- /dev/null
9298 +++ b/fs/reiser4/dformat.h
9299 @@ -0,0 +1,70 @@
9300 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
9301 +
9302 +/* Formats of on-disk data and conversion functions. */
9303 +
9304 +/* put all item formats in the files describing the particular items,
9305 + our model is, everything you need to do to add an item to reiser4,
9306 + (excepting the changes to the plugin that uses the item which go
9307 + into the file defining that plugin), you put into one file. */
9308 +/* Data on disk are stored in little-endian format.
9309 + To declare fields of on-disk structures, use d8, d16, d32 and d64.
9310 + d??tocpu() and cputod??() to convert. */
9311 +
9312 +#if !defined( __FS_REISER4_DFORMAT_H__ )
9313 +#define __FS_REISER4_DFORMAT_H__
9314 +
9315 +#include <asm/byteorder.h>
9316 +#include <asm/unaligned.h>
9317 +#include <linux/types.h>
9318 +
9319 +typedef __u8 d8;
9320 +typedef __le16 d16;
9321 +typedef __le32 d32;
9322 +typedef __le64 d64;
9323 +
9324 +#define PACKED __attribute__((packed))
9325 +
9326 +/* data-type for block number */
9327 +typedef __u64 reiser4_block_nr;
9328 +
9329 +/* data-type for block number on disk, disk format */
9330 +typedef __le64 reiser4_dblock_nr;
9331 +
9332 +/**
9333 + * disk_addr_eq - compare disk addresses
9334 + * @b1: pointer to block number to compare
9335 + * @b2: pointer to block number to compare
9336 + *
9337 + * Returns true if disk addresses are the same
9338 + */
9339 +static inline int disk_addr_eq(const reiser4_block_nr *b1,
9340 + const reiser4_block_nr * b2)
9341 +{
9342 + assert("nikita-1033", b1 != NULL);
9343 + assert("nikita-1266", b2 != NULL);
9344 +
9345 + return !memcmp(b1, b2, sizeof *b1);
9346 +}
9347 +
9348 +/* structure of master reiser4 super block */
9349 +typedef struct reiser4_master_sb {
9350 + char magic[16]; /* "ReIsEr4" */
9351 + __le16 disk_plugin_id; /* id of disk layout plugin */
9352 + __le16 blocksize;
9353 + char uuid[16]; /* unique id */
9354 + char label[16]; /* filesystem label */
9355 + __le64 diskmap; /* location of the diskmap. 0 if not present */
9356 +} reiser4_master_sb;
9357 +
9358 +/* __FS_REISER4_DFORMAT_H__ */
9359 +#endif
9360 +
9361 +/*
9362 + * Local variables:
9363 + * c-indentation-style: "K&R"
9364 + * mode-name: "LC"
9365 + * c-basic-offset: 8
9366 + * tab-width: 8
9367 + * fill-column: 79
9368 + * End:
9369 + */
9370 diff --git a/fs/reiser4/dscale.c b/fs/reiser4/dscale.c
9371 new file mode 100644
9372 index 0000000..a9bc224
9373 --- /dev/null
9374 +++ b/fs/reiser4/dscale.c
9375 @@ -0,0 +1,174 @@
9376 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9377 + * reiser4/README */
9378 +
9379 +/* Scalable on-disk integers */
9380 +
9381 +/*
9382 + * Various on-disk structures contain integer-like structures. Stat-data
9383 + * contain [yes, "data" is plural, check the dictionary] file size, link
9384 + * count; an extent unit contains the extent width, etc. To accommodate the
9385 + * general case, enough space is reserved to keep the largest possible value:
9386 + * 64 bits in all cases above. But in the overwhelming majority of cases the
9387 + * numbers actually stored in these fields are comparatively small, and
9388 + * reserving 8 bytes is a waste of precious disk bandwidth.
9389 + *
9390 + * Scalable integers are one way to solve this problem. dscale_write()
9391 + * function stores __u64 value in the given area consuming from 1 to 9 bytes,
9392 + * depending on the magnitude of the value supplied. dscale_read() reads value
9393 + * previously stored by dscale_write().
9394 + *
9395 + * dscale_write() produces a format not completely unlike UTF-8: two highest
9396 + * bits of the first byte are used to store "tag". One of 4 possible tag
9397 + * values is chosen depending on the number being encoded:
9398 + *
9399 + * 0 ... 0x3f => 0 [table 1]
9400 + * 0x40 ... 0x3fff => 1
9401 + * 0x4000 ... 0x3fffffff => 2
9402 + * 0x40000000 ... 0xffffffffffffffff => 3
9403 + *
9404 + * (see dscale_range() function)
9405 + *
9406 + * Values in the range 0x40000000 ... 0xffffffffffffffff require 8 full bytes
9407 + * to be stored, so in this case there is no place in the first byte to store
9408 + * tag. For such values tag is stored in an extra 9th byte.
9409 + *
9410 + * As _highest_ bits are used for the test (which is natural) scaled integers
9411 + * are stored in BIG-ENDIAN format in contrast with the rest of reiser4 which
9412 + * uses LITTLE-ENDIAN.
9413 + *
9414 + */
9415 +
9416 +#include "debug.h"
9417 +#include "dscale.h"
9418 +
9419 +/* return tag of scaled integer stored at @address */
9420 +static int gettag(const unsigned char *address)
9421 +{
9422 + /* tag is stored in two highest bits */
9423 + return (*address) >> 6;
9424 +}
9425 +
9426 +/* clear tag from value. Clear tag embedded into @value. */
9427 +static void cleartag(__u64 * value, int tag)
9428 +{
9429 + /*
9430 + * W-w-what ?!
9431 + *
9432 + * Actually, this is rather simple: @value passed here was read by
9433 + * dscale_read(), converted from BIG-ENDIAN, and padded to __u64 by
9434 + * zeroes. Tag is still stored in the highest (arithmetically)
9435 + * non-zero bits of @value, but relative position of tag within __u64
9436 + * depends on @tag.
9437 + *
9438 +	 * For example, if @tag is 0, it's stored in the 2 highest bits of the
9439 +	 * lowest byte, and its offset (counting from the lowest bit) is 8 - 2 == 6 bits.
9440 +	 *
9441 +	 * If tag is 1, it's stored in the two highest bits of the 2nd lowest
9442 +	 * byte, and its offset is (2 * 8) - 2 == 14 bits.
9443 + *
9444 + * See table 1 above for details.
9445 + *
9446 + * All these cases are captured by the formula:
9447 + */
9448 + *value &= ~(3 << (((1 << tag) << 3) - 2));
9449 + /*
9450 + * That is, clear two (3 == 0t11) bits at the offset
9451 + *
9452 + * 8 * (2 ^ tag) - 2,
9453 + *
9454 + * that is, two highest bits of (2 ^ tag)-th byte of @value.
9455 + */
9456 +}
9457 +
9458 +/* return tag for @value. See table 1 above for details. */
9459 +static int dscale_range(__u64 value)
9460 +{
9461 + if (value > 0x3fffffff)
9462 + return 3;
9463 + if (value > 0x3fff)
9464 + return 2;
9465 + if (value > 0x3f)
9466 + return 1;
9467 + return 0;
9468 +}
9469 +
9470 +/* restore value stored at @address by dscale_write() and return number of
9471 + * bytes consumed */
9472 +int dscale_read(unsigned char *address, __u64 * value)
9473 +{
9474 + int tag;
9475 +
9476 + /* read tag */
9477 + tag = gettag(address);
9478 + switch (tag) {
9479 + case 3:
9480 + /* In this case tag is stored in an extra byte, skip this byte
9481 + * and decode value stored in the next 8 bytes.*/
9482 + *value = __be64_to_cpu(get_unaligned((__be64 *)(address + 1)));
9483 + /* worst case: 8 bytes for value itself plus one byte for
9484 + * tag. */
9485 + return 9;
9486 + case 0:
9487 + *value = get_unaligned(address);
9488 + break;
9489 + case 1:
9490 + *value = __be16_to_cpu(get_unaligned((__be16 *)address));
9491 + break;
9492 + case 2:
9493 + *value = __be32_to_cpu(get_unaligned((__be32 *)address));
9494 + break;
9495 + default:
9496 + return RETERR(-EIO);
9497 + }
9498 + /* clear tag embedded into @value */
9499 + cleartag(value, tag);
9500 + /* number of bytes consumed is (2 ^ tag)---see table 1. */
9501 + return 1 << tag;
9502 +}
9503 +
9504 +/* store @value at @address and return number of bytes consumed */
9505 +int dscale_write(unsigned char *address, __u64 value)
9506 +{
9507 + int tag;
9508 + int shift;
9509 + __be64 v;
9510 + unsigned char *valarr;
9511 +
9512 + tag = dscale_range(value);
9513 + v = __cpu_to_be64(value);
9514 + valarr = (unsigned char *)&v;
9515 + shift = (tag == 3) ? 1 : 0;
9516 + memcpy(address + shift, valarr + sizeof v - (1 << tag), 1 << tag);
9517 + *address |= (tag << 6);
9518 + return shift + (1 << tag);
9519 +}
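Worked example: the value 0x1234 falls into the tag 1 range of table 1, so dscale_write() emits two big-endian bytes with the tag in the top two bits, and dscale_read() strips the tag again (a sketch, not part of the patch):

	unsigned char buf[9];
	__u64 v;

	dscale_write(buf, 0x1234);	/* returns 2; buf[0] == 0x52, buf[1] == 0x34 */
	dscale_read(buf, &v);		/* returns 2; v == 0x1234 */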
9520 +
9521 +/* number of bytes required to store @value */
9522 +int dscale_bytes(__u64 value)
9523 +{
9524 + int bytes;
9525 +
9526 + bytes = 1 << dscale_range(value);
9527 + if (bytes == 8)
9528 + ++bytes;
9529 + return bytes;
9530 +}
9531 +
9532 +/* returns true if @value and @other require the same number of bytes to be
9533 + * stored. Used to detect when a data structure (like stat-data) has to be
9534 + * expanded or contracted. */
9535 +int dscale_fit(__u64 value, __u64 other)
9536 +{
9537 + return dscale_range(value) == dscale_range(other);
9538 +}
9539 +
9540 +/* Make Linus happy.
9541 + Local variables:
9542 + c-indentation-style: "K&R"
9543 + mode-name: "LC"
9544 + c-basic-offset: 8
9545 + tab-width: 8
9546 + fill-column: 120
9547 + scroll-step: 1
9548 + End:
9549 +*/
9550 diff --git a/fs/reiser4/dscale.h b/fs/reiser4/dscale.h
9551 new file mode 100644
9552 index 0000000..545e111
9553 --- /dev/null
9554 +++ b/fs/reiser4/dscale.h
9555 @@ -0,0 +1,27 @@
9556 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
9557 + * reiser4/README */
9558 +
9559 +/* Scalable on-disk integers. See dscale.c for details. */
9560 +
9561 +#if !defined( __FS_REISER4_DSCALE_H__ )
9562 +#define __FS_REISER4_DSCALE_H__
9563 +
9564 +#include "dformat.h"
9565 +
9566 +extern int dscale_read(unsigned char *address, __u64 * value);
9567 +extern int dscale_write(unsigned char *address, __u64 value);
9568 +extern int dscale_bytes(__u64 value);
9569 +extern int dscale_fit(__u64 value, __u64 other);
9570 +
9571 +/* __FS_REISER4_DSCALE_H__ */
9572 +#endif
9573 +
9574 +/* Make Linus happy.
9575 + Local variables:
9576 + c-indentation-style: "K&R"
9577 + mode-name: "LC"
9578 + c-basic-offset: 8
9579 + tab-width: 8
9580 + fill-column: 120
9581 + End:
9582 +*/
9583 diff --git a/fs/reiser4/entd.c b/fs/reiser4/entd.c
9584 new file mode 100644
9585 index 0000000..1be9fff
9586 --- /dev/null
9587 +++ b/fs/reiser4/entd.c
9588 @@ -0,0 +1,335 @@
9589 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
9590 + * reiser4/README */
9591 +
9592 +/* Ent daemon. */
9593 +
9594 +#include "debug.h"
9595 +#include "txnmgr.h"
9596 +#include "tree.h"
9597 +#include "entd.h"
9598 +#include "super.h"
9599 +#include "context.h"
9600 +#include "reiser4.h"
9601 +#include "vfs_ops.h"
9602 +#include "page_cache.h"
9603 +#include "inode.h"
9604 +
9605 +#include <linux/sched.h> /* struct task_struct */
9606 +#include <linux/suspend.h>
9607 +#include <linux/kernel.h>
9608 +#include <linux/writeback.h>
9609 +#include <linux/time.h> /* INITIAL_JIFFIES */
9610 +#include <linux/backing-dev.h> /* bdi_write_congested */
9611 +#include <linux/wait.h>
9612 +#include <linux/kthread.h>
9613 +#include <linux/freezer.h>
9614 +
9615 +#define DEF_PRIORITY 12
9616 +#define MAX_ENTD_ITERS 10
9617 +
9618 +static void entd_flush(struct super_block *, struct wbq *);
9619 +static int entd(void *arg);
9620 +
9621 +/*
9622 + * set ->comm field of ent thread to make its state visible to the user level
9623 + */
9624 +#define entd_set_comm(state) \
9625 + snprintf(current->comm, sizeof(current->comm), \
9626 + "ent:%s%s", super->s_id, (state))
9627 +
9628 +/**
9629 + * reiser4_init_entd - initialize entd context and start kernel daemon
9630 + * @super: super block to start ent thread for
9631 + *
9632 + * Creates entd contexts, starts kernel thread and waits until it
9633 + * initializes.
9634 + */
9635 +int reiser4_init_entd(struct super_block *super)
9636 +{
9637 + entd_context *ctx;
9638 +
9639 + assert("nikita-3104", super != NULL);
9640 +
9641 + ctx = get_entd_context(super);
9642 +
9643 + memset(ctx, 0, sizeof *ctx);
9644 + spin_lock_init(&ctx->guard);
9645 + init_waitqueue_head(&ctx->wait);
9646 +#if REISER4_DEBUG
9647 + INIT_LIST_HEAD(&ctx->flushers_list);
9648 +#endif
9649 + /* lists of writepage requests */
9650 + INIT_LIST_HEAD(&ctx->todo_list);
9651 + INIT_LIST_HEAD(&ctx->done_list);
9652 + /* start entd */
9653 + ctx->tsk = kthread_run(entd, super, "ent:%s", super->s_id);
9654 + if (IS_ERR(ctx->tsk))
9655 + return PTR_ERR(ctx->tsk);
9656 + return 0;
9657 +}
9658 +
9659 +static void put_wbq(struct wbq *rq)
9660 +{
9661 + iput(rq->mapping->host);
9662 + complete(&rq->completion);
9663 +}
9664 +
9665 +/* ent should be locked */
9666 +static struct wbq *__get_wbq(entd_context * ent)
9667 +{
9668 + struct wbq *wbq;
9669 +
9670 + if (list_empty(&ent->todo_list))
9671 + return NULL;
9672 +
9673 + ent->nr_todo_reqs --;
9674 + wbq = list_entry(ent->todo_list.next, struct wbq, link);
9675 + list_del_init(&wbq->link);
9676 + return wbq;
9677 +}
9678 +
9679 +/* ent thread function */
9680 +static int entd(void *arg)
9681 +{
9682 + struct super_block *super;
9683 + entd_context *ent;
9684 + int done = 0;
9685 +
9686 + super = arg;
9687 + /* do_fork() just copies task_struct into the new
9688 + thread. ->fs_context shouldn't be copied of course. This shouldn't
9689 + be a problem for the rest of the code though.
9690 + */
9691 + current->journal_info = NULL;
9692 +
9693 + ent = get_entd_context(super);
9694 +
9695 + while (!done) {
9696 + try_to_freeze();
9697 +
9698 + spin_lock(&ent->guard);
9699 + while (ent->nr_todo_reqs != 0) {
9700 + struct wbq *rq;
9701 +
9702 + assert("", list_empty(&ent->done_list));
9703 +
9704 + /* take request from the queue head */
9705 + rq = __get_wbq(ent);
9706 + assert("", rq != NULL);
9707 + ent->cur_request = rq;
9708 + spin_unlock(&ent->guard);
9709 +
9710 + entd_set_comm("!");
9711 + entd_flush(super, rq);
9712 +
9713 + put_wbq(rq);
9714 +
9715 + /*
9716 + * wakeup all requestors and iput their inodes
9717 + */
9718 + spin_lock(&ent->guard);
9719 + while (!list_empty(&ent->done_list)) {
9720 + rq = list_entry(ent->done_list.next, struct wbq, link);
9721 + list_del_init(&rq->link);
9722 + ent->nr_done_reqs --;
9723 + spin_unlock(&ent->guard);
9724 + assert("", rq->written == 1);
9725 + put_wbq(rq);
9726 + spin_lock(&ent->guard);
9727 + }
9728 + }
9729 + spin_unlock(&ent->guard);
9730 +
9731 + entd_set_comm(".");
9732 +
9733 + {
9734 + DEFINE_WAIT(__wait);
9735 +
9736 + do {
9737 + prepare_to_wait(&ent->wait, &__wait, TASK_INTERRUPTIBLE);
9738 + if (kthread_should_stop()) {
9739 + done = 1;
9740 + break;
9741 + }
9742 + if (ent->nr_todo_reqs != 0)
9743 + break;
9744 + schedule();
9745 + } while (0);
9746 + finish_wait(&ent->wait, &__wait);
9747 + }
9748 + }
9749 + BUG_ON(ent->nr_todo_reqs != 0);
9750 + return 0;
9751 +}
9752 +
9753 +/**
9754 + * reiser4_done_entd - stop entd kernel thread
9755 + * @super: super block to stop ent thread for
9756 + *
9757 + * It is called on umount. Sends a stop signal to entd and waits until it
9758 + * exits.
9759 + */
9760 +void reiser4_done_entd(struct super_block *super)
9761 +{
9762 + entd_context *ent;
9763 +
9764 + assert("nikita-3103", super != NULL);
9765 +
9766 + ent = get_entd_context(super);
9767 + assert("zam-1055", ent->tsk != NULL);
9768 + kthread_stop(ent->tsk);
9769 +}
9770 +
9771 +/* called at the beginning of jnode_flush to register flusher thread with ent
9772 + * daemon */
9773 +void reiser4_enter_flush(struct super_block *super)
9774 +{
9775 + entd_context *ent;
9776 +
9777 + assert("zam-1029", super != NULL);
9778 + ent = get_entd_context(super);
9779 +
9780 + assert("zam-1030", ent != NULL);
9781 +
9782 + spin_lock(&ent->guard);
9783 + ent->flushers++;
9784 +#if REISER4_DEBUG
9785 + list_add(&get_current_context()->flushers_link, &ent->flushers_list);
9786 +#endif
9787 + spin_unlock(&ent->guard);
9788 +}
9789 +
9790 +/* called at the end of jnode_flush */
9791 +void reiser4_leave_flush(struct super_block *super)
9792 +{
9793 + entd_context *ent;
9794 + int wake_up_ent;
9795 +
9796 + assert("zam-1027", super != NULL);
9797 + ent = get_entd_context(super);
9798 +
9799 + assert("zam-1028", ent != NULL);
9800 +
9801 + spin_lock(&ent->guard);
9802 + ent->flushers--;
9803 + wake_up_ent = (ent->flushers == 0 && ent->nr_todo_reqs != 0);
9804 +#if REISER4_DEBUG
9805 + list_del_init(&get_current_context()->flushers_link);
9806 +#endif
9807 + spin_unlock(&ent->guard);
9808 + if (wake_up_ent)
9809 + wake_up(&ent->wait);
9810 +}
9811 +
9812 +#define ENTD_CAPTURE_APAGE_BURST SWAP_CLUSTER_MAX
9813 +
9814 +static void entd_flush(struct super_block *super, struct wbq *rq)
9815 +{
9816 + reiser4_context ctx;
9817 + int tmp;
9818 +
9819 + init_stack_context(&ctx, super);
9820 + ctx.entd = 1;
9821 + ctx.gfp_mask = GFP_NOFS;
9822 +
9823 + rq->wbc->range_start = page_offset(rq->page);
9824 + rq->wbc->range_end = rq->wbc->range_start +
9825 + (ENTD_CAPTURE_APAGE_BURST << PAGE_CACHE_SHIFT);
9826 + tmp = rq->wbc->nr_to_write;
9827 + rq->mapping->a_ops->writepages(rq->mapping, rq->wbc);
9828 +
9829 + if (rq->wbc->nr_to_write > 0) {
9830 + rq->wbc->range_start = 0;
9831 + rq->wbc->range_end = 0;
9832 + generic_sync_sb_inodes(super, rq->wbc);
9833 + }
9834 + rq->wbc->nr_to_write = ENTD_CAPTURE_APAGE_BURST;
9835 + reiser4_writeout(super, rq->wbc);
9836 +
9837 + context_set_commit_async(&ctx);
9838 + reiser4_exit_context(&ctx);
9839 +}
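/*
 * Illustrative arithmetic (assumes i386 defaults for this kernel, not
 * stated in the patch): with PAGE_CACHE_SHIFT == 12 (4 KiB pages) and
 * SWAP_CLUSTER_MAX == 32, the writepages window set up above spans
 * 32 << 12 == 128 KiB starting at the requested page, so entd tries to
 * flush a whole burst of pages around the one it was asked to write.
 */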
9840 +
9841 +/**
9842 + * write_page_by_ent - ask entd thread to flush this page as part of slum
9843 + * @page: page to be written
9844 + * @wbc: writeback control passed to reiser4_writepage
9845 + *
9846 + * Creates a request, puts it on entd's list of requests, wakes entd up if
9847 + * necessary, and waits until entd completes the request.
9848 + */
9849 +int write_page_by_ent(struct page *page, struct writeback_control *wbc)
9850 +{
9851 + struct super_block *sb;
9852 + struct inode *inode;
9853 + entd_context *ent;
9854 + struct wbq rq;
9855 +
9856 + assert("", PageLocked(page));
9857 + assert("", page->mapping != NULL);
9858 +
9859 + sb = page->mapping->host->i_sb;
9860 + ent = get_entd_context(sb);
9861 + assert("", ent && ent->done == 0);
9862 +
9863 + /*
9864 + * we are going to unlock page and ask ent thread to write the
9865 + * page. Re-dirty page before unlocking so that if ent thread fails to
9866 + * write it - it will remain dirty
9867 + */
9868 + reiser4_set_page_dirty_internal(page);
9869 +
9870 + /*
9871 +	 * pin inode in memory, unlock page, entd_flush will iput. We cannot
9872 +	 * iput here because we cannot allow delete_inode to be called here
9873 + */
9874 + inode = igrab(page->mapping->host);
9875 + unlock_page(page);
9876 + if (inode == NULL)
9877 + /* inode is getting freed */
9878 + return 0;
9879 +
9880 + /* init wbq */
9881 + INIT_LIST_HEAD(&rq.link);
9882 + rq.magic = WBQ_MAGIC;
9883 + rq.wbc = wbc;
9884 + rq.page = page;
9885 + rq.mapping = inode->i_mapping;
9886 + rq.node = NULL;
9887 + rq.written = 0;
9888 + init_completion(&rq.completion);
9889 +
9890 + /* add request to entd's list of writepage requests */
9891 + spin_lock(&ent->guard);
9892 + ent->nr_todo_reqs++;
9893 + list_add_tail(&rq.link, &ent->todo_list);
9894 + if (ent->nr_todo_reqs == 1)
9895 + wake_up(&ent->wait);
9896 +
9897 + spin_unlock(&ent->guard);
9898 +
9899 + /* wait until entd finishes */
9900 + wait_for_completion(&rq.completion);
9901 +
9902 + if (rq.written)
9903 +		/* entd has written the page to disk */
9904 + return 0;
9905 + return 0;
9906 +}
9907 +
9908 +int wbq_available(void)
9909 +{
9910 + struct super_block *sb = reiser4_get_current_sb();
9911 + entd_context *ent = get_entd_context(sb);
9912 + return ent->nr_todo_reqs;
9913 +}
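/*
 * Hypothetical caller sketch (example_writepage is made up; it is not part
 * of the patch): reiser4's ->writepage is assumed to defer to the ent
 * daemon roughly like this when the calling thread cannot start a flush on
 * its own. The page arrives locked and dirty; write_page_by_ent()
 * re-dirties it, unlocks it, queues a struct wbq and sleeps on
 * rq.completion until entd has dealt with the request.
 */
static int example_writepage(struct page *page, struct writeback_control *wbc)
{
	return write_page_by_ent(page, wbc);
}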
9914 +
9915 +/*
9916 + * Local variables:
9917 + * c-indentation-style: "K&R"
9918 + * mode-name: "LC"
9919 + * c-basic-offset: 8
9920 + * tab-width: 8
9921 + * fill-column: 79
9922 + * End:
9923 + */
9924 diff --git a/fs/reiser4/entd.h b/fs/reiser4/entd.h
9925 new file mode 100644
9926 index 0000000..4f79a57
9927 --- /dev/null
9928 +++ b/fs/reiser4/entd.h
9929 @@ -0,0 +1,90 @@
9930 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
9931 +
9932 +/* Ent daemon. */
9933 +
9934 +#ifndef __ENTD_H__
9935 +#define __ENTD_H__
9936 +
9937 +#include "context.h"
9938 +
9939 +#include <linux/fs.h>
9940 +#include <linux/completion.h>
9941 +#include <linux/wait.h>
9942 +#include <linux/spinlock.h>
9943 +#include <linux/sched.h> /* for struct task_struct */
9944 +
9945 +#define WBQ_MAGIC 0x7876dc76
9946 +
9947 +/* write-back request. */
9948 +struct wbq {
9949 + int magic;
9950 + struct list_head link; /* list head of this list is in entd context */
9951 + struct writeback_control *wbc;
9952 + struct page *page;
9953 + struct address_space *mapping;
9954 + struct completion completion;
9955 + jnode *node; /* set if ent thread captured requested page */
9956 + int written; /* set if ent thread wrote requested page */
9957 +};
9958 +
9959 +/* ent-thread context. This is used to synchronize starting/stopping ent
9960 + * threads. */
9961 +typedef struct entd_context {
9962 + /* wait queue that ent thread waits on for more work. It's
9963 + * signaled by write_page_by_ent(). */
9964 + wait_queue_head_t wait;
9965 + /* spinlock protecting other fields */
9966 + spinlock_t guard;
9967 + /* ent thread */
9968 + struct task_struct *tsk;
9969 + /* set to indicate that ent thread should leave. */
9970 + int done;
9971 + /* counter of active flushers */
9972 + int flushers;
9973 + /*
9974 + * when reiser4_writepage asks entd to write a page - it adds struct
9975 + * wbq to this list
9976 + */
9977 + struct list_head todo_list;
9978 + /* number of elements on the above list */
9979 + int nr_todo_reqs;
9980 +
9981 + struct wbq *cur_request;
9982 + /*
9983 + * when entd writes a page it moves write-back request from todo_list
9984 + * to done_list. This list is used at the end of entd iteration to
9985 + * wakeup requestors and iput inodes.
9986 + */
9987 + struct list_head done_list;
9988 + /* number of elements on the above list */
9989 + int nr_done_reqs;
9990 +
9991 +#if REISER4_DEBUG
9992 + /* list of all active flushers */
9993 + struct list_head flushers_list;
9994 +#endif
9995 +} entd_context;
9996 +
9997 +extern int reiser4_init_entd(struct super_block *);
9998 +extern void reiser4_done_entd(struct super_block *);
9999 +
10000 +extern void reiser4_enter_flush(struct super_block *);
10001 +extern void reiser4_leave_flush(struct super_block *);
10002 +
10003 +extern int write_page_by_ent(struct page *, struct writeback_control *);
10004 +extern int wbq_available(void);
10005 +extern void ent_writes_page(struct super_block *, struct page *);
10006 +
10007 +extern jnode *get_jnode_by_wbq(struct super_block *, struct wbq *);
10008 +/* __ENTD_H__ */
10009 +#endif
10010 +
10011 +/* Make Linus happy.
10012 + Local variables:
10013 + c-indentation-style: "K&R"
10014 + mode-name: "LC"
10015 + c-basic-offset: 8
10016 + tab-width: 8
10017 + fill-column: 120
10018 + End:
10019 +*/
10020 diff --git a/fs/reiser4/eottl.c b/fs/reiser4/eottl.c
10021 new file mode 100644
10022 index 0000000..f921b19
10023 --- /dev/null
10024 +++ b/fs/reiser4/eottl.c
10025 @@ -0,0 +1,509 @@
10026 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10027 +
10028 +#include "forward.h"
10029 +#include "debug.h"
10030 +#include "key.h"
10031 +#include "coord.h"
10032 +#include "plugin/item/item.h"
10033 +#include "plugin/node/node.h"
10034 +#include "znode.h"
10035 +#include "block_alloc.h"
10036 +#include "tree_walk.h"
10037 +#include "tree_mod.h"
10038 +#include "carry.h"
10039 +#include "tree.h"
10040 +#include "super.h"
10041 +
10042 +#include <linux/types.h> /* for __u?? */
10043 +
10044 +/*
10045 + * Extents on the twig level (EOTTL) handling.
10046 + *
10047 + * EOTTL poses some problems for tree traversal that are best explained
10048 + * by example.
10049 + *
10050 + * Suppose we have block B1 on the twig level with the following items:
10051 + *
10052 + * 0. internal item I0 with key (0:0:0:0) (locality, key-type, object-id,
10053 + * offset)
10054 + * 1. extent item E1 with key (1:4:100:0), having 10 blocks of 4k each
10055 + * 2. internal item I2 with key (10:0:0:0)
10056 + *
10057 + * We are trying to insert an item with key (5:0:0:0). Lookup finds node B1,
10058 + * and then intra-node lookup is done. This lookup finishes on E1, because the
10059 + * key we are looking for is larger than the key of E1 and smaller than the
10060 + * key of I2.
10061 + *
10062 + * Here search is stuck.
10063 + *
10064 + * After some thought it is clear what is wrong here: extents on the twig level
10065 + * break a basic property of the *search* tree (on the pretext that they
10066 + * restore the balanced-tree property).
10067 + *
10068 + * Said property is the following: if in the internal node of the search tree
10069 + * we have [ ... Key1 Pointer Key2 ... ] then, all data that are or will be
10070 + * keyed in the tree with the Key such that Key1 <= Key < Key2 are accessible
10071 + * through the Pointer.
10072 + *
10073 + * This is not true when Pointer is an Extent-Pointer, simply because an extent
10074 + * cannot expand indefinitely to the right to include any item with
10075 + *
10076 + * Key1 <= Key <= Key2.
10077 + *
10078 + * For example, our E1 extent is only responsible for the data with keys
10079 + *
10080 + * (1:4:100:0) <= key <= (1:4:100:0xffffffffffffffff), and
10081 + *
10082 + * so, key range
10083 + *
10084 + * ( (1:4:100:0xffffffffffffffff), (10:0:0:0) )
10085 + *
10086 + * is orphaned: there is no way to get there from the tree root.
10087 + *
10088 + * In other words, extent pointers are different than normal child pointers as
10089 + * far as search tree is concerned, and this creates such problems.
10090 + *
10091 + * A possible solution for this problem is to insert our item into the node
10092 + * pointed to by I2. There are some problems though:
10093 + *
10094 + * (1) I2 can be in a different node.
10095 + * (2) E1 can be immediately followed by another extent E2.
10096 + *
10097 + * (1) is solved by calling reiser4_get_right_neighbor() and accounting
10098 + * for locks/coords as necessary.
10099 + *
10100 + * (2) is more complex. Solution here is to insert new empty leaf node and
10101 + * insert internal item between E1 and E2 pointing to said leaf node. This is
10102 + * further complicated by possibility that E2 is in a different node, etc.
10103 + *
10104 + * Problems:
10105 + *
10106 + * (1) if there is an internal item I2 immediately on the right of an extent E1
10107 + * and we decided to insert a new item S1 into the node N2 pointed to by I2,
10108 + * then the key of S1 will be less than the smallest key in N2. Normally, the
10109 + * search checks that the key we are looking for is in the range of keys
10110 + * covered by the node being searched. To work around this situation, while
10111 + * preserving the useful consistency check, a new flag CBK_TRUST_DK was added
10112 + * to the cbk flags bitmask. This flag is automatically set on entrance to
10113 + * coord_by_key() and is only cleared when we are about to enter the situation
10114 + * described above.
10115 + *
10116 + * (2) If extent E1 is immediately followed by another extent E2 and we are
10117 + * searching for a key that is between E1 and E2, we only have to insert a new
10118 + * empty leaf node when coord_by_key was called for insertion, rather than just
10119 + * for lookup. To distinguish these cases, a new flag CBK_FOR_INSERT was added
10120 + * to the cbk flags bitmask. This flag is automatically set by coord_by_key
10121 + * calls performed by insert_by_key() and friends.
10122 + *
10123 + * (3) Insertion of new empty leaf node (possibly) requires balancing. In any
10124 + * case it requires modification of node content which is only possible under
10125 + * write lock. It may well happen that we only have read lock on the node where
10126 + * new internal pointer is to be inserted (common case: lookup of non-existent
10127 + * stat-data that fells between two extents). If only read lock is held, tree
10128 + * traversal is restarted with lock_level modified so that next time we hit
10129 + * this problem, write lock will be held. Once we have write lock, balancing
10130 + * will be performed.
10131 + */
10132 +
10133 +/**
10134 + * is_next_item_internal - check whether next item is internal
10135 + * @coord: coordinate of extent item in twig node
10136 + * @key: search key
10137 + * @lh: twig node lock handle
10138 + *
10139 + * Looks at the unit next to @coord. If it is an internal one - 1 is returned,
10140 + * @coord is set to that unit. If that unit is in right neighbor, @lh is moved
10141 + * to that node, @coord is set to its first unit. If next item is not internal
10142 + * or does not exist then 0 is returned, @coord and @lh are left unchanged. 2
10143 + * is returned if search restart has to be done.
10144 + */
10145 +static int
10146 +is_next_item_internal(coord_t *coord, const reiser4_key *key,
10147 + lock_handle *lh)
10148 +{
10149 + coord_t next;
10150 + lock_handle rn;
10151 + int result;
10152 +
10153 + coord_dup(&next, coord);
10154 + if (coord_next_unit(&next) == 0) {
10155 + /* next unit is in this node */
10156 + if (item_is_internal(&next)) {
10157 + coord_dup(coord, &next);
10158 + return 1;
10159 + }
10160 + assert("vs-3", item_is_extent(&next));
10161 + return 0;
10162 + }
10163 +
10164 + /*
10165 + * next unit either does not exist or is in right neighbor. If it is in
10166 +	 * the right neighbor we have to check the right delimiting key because
10167 +	 * a concurrent thread could get there first and insert an item with a key
10168 + * smaller than @key
10169 + */
10170 + read_lock_dk(current_tree);
10171 + result = keycmp(key, znode_get_rd_key(coord->node));
10172 + read_unlock_dk(current_tree);
10173 + assert("vs-6", result != EQUAL_TO);
10174 + if (result == GREATER_THAN)
10175 + return 2;
10176 +
10177 + /* lock right neighbor */
10178 + init_lh(&rn);
10179 + result = reiser4_get_right_neighbor(&rn, coord->node,
10180 + znode_is_wlocked(coord->node) ?
10181 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
10182 + GN_CAN_USE_UPPER_LEVELS);
10183 + if (result == -E_NO_NEIGHBOR) {
10184 + /* we are on the rightmost edge of the tree */
10185 + done_lh(&rn);
10186 + return 0;
10187 + }
10188 +
10189 + if (result) {
10190 + assert("vs-4", result < 0);
10191 + done_lh(&rn);
10192 + return result;
10193 + }
10194 +
10195 + /*
10196 + * check whether concurrent thread managed to insert item with a key
10197 + * smaller than @key
10198 + */
10199 + read_lock_dk(current_tree);
10200 + result = keycmp(key, znode_get_ld_key(rn.node));
10201 + read_unlock_dk(current_tree);
10202 + assert("vs-6", result != EQUAL_TO);
10203 + if (result == GREATER_THAN) {
10204 + done_lh(&rn);
10205 + return 2;
10206 + }
10207 +
10208 + result = zload(rn.node);
10209 + if (result) {
10210 + assert("vs-5", result < 0);
10211 + done_lh(&rn);
10212 + return result;
10213 + }
10214 +
10215 + coord_init_first_unit(&next, rn.node);
10216 + if (item_is_internal(&next)) {
10217 + /*
10218 +		 * next unit is in right neighbor and it is a unit of an internal
10219 + * item. Unlock coord->node. Move @lh to right neighbor. @coord
10220 + * is set to the first unit of right neighbor.
10221 + */
10222 + coord_dup(coord, &next);
10223 + zrelse(rn.node);
10224 + done_lh(lh);
10225 + move_lh(lh, &rn);
10226 + return 1;
10227 + }
10228 +
10229 + /*
10230 +	 * next unit is a unit of an extent item. Return without changing @lh and
10231 + * @coord.
10232 + */
10233 + assert("vs-6", item_is_extent(&next));
10234 + zrelse(rn.node);
10235 + done_lh(&rn);
10236 + return 0;
10237 +}
10238 +
10239 +/**
10240 + * rd_key - calculate key of an item next to the given one
10241 + * @coord: position in a node
10242 + * @key: storage for result key
10243 + *
10244 + * @coord is set between items or after the last item in a node. Calculate key
10245 + * of item to the right of @coord.
10246 + */
10247 +static reiser4_key *rd_key(const coord_t *coord, reiser4_key *key)
10248 +{
10249 + coord_t dup;
10250 +
10251 + assert("nikita-2281", coord_is_between_items(coord));
10252 + coord_dup(&dup, coord);
10253 +
10254 + if (coord_set_to_right(&dup) == 0)
10255 + /* next item is in this node. Return its key. */
10256 + unit_key_by_coord(&dup, key);
10257 + else {
10258 + /*
10259 + * next item either does not exist or is in right
10260 + * neighbor. Return znode's right delimiting key.
10261 + */
10262 + read_lock_dk(current_tree);
10263 + *key = *znode_get_rd_key(coord->node);
10264 + read_unlock_dk(current_tree);
10265 + }
10266 + return key;
10267 +}
10268 +
10269 +/**
10270 + * add_empty_leaf - insert empty leaf between two extents
10271 + * @insert_coord: position in twig node between two extents
10272 + * @lh: twig node lock handle
10273 + * @key: left delimiting key of new node
10274 + * @rdkey: right delimiting key of new node
10275 + *
10276 + * Inserts empty leaf node between two extent items. It is necessary when we
10277 + * have to insert an item on leaf level between two extents (items on the twig
10278 + * level).
10279 + */
10280 +static int
10281 +add_empty_leaf(coord_t *insert_coord, lock_handle *lh,
10282 + const reiser4_key *key, const reiser4_key *rdkey)
10283 +{
10284 + int result;
10285 + carry_pool *pool;
10286 + carry_level *todo;
10287 + reiser4_item_data *item;
10288 + carry_insert_data *cdata;
10289 + carry_op *op;
10290 + znode *node;
10291 + reiser4_tree *tree;
10292 +
10293 + assert("vs-49827", znode_contains_key_lock(insert_coord->node, key));
10294 + tree = znode_get_tree(insert_coord->node);
10295 + node = reiser4_new_node(insert_coord->node, LEAF_LEVEL);
10296 + if (IS_ERR(node))
10297 + return PTR_ERR(node);
10298 +
10299 + /* setup delimiting keys for node being inserted */
10300 + write_lock_dk(tree);
10301 + znode_set_ld_key(node, key);
10302 + znode_set_rd_key(node, rdkey);
10303 + ON_DEBUG(node->creator = current);
10304 + ON_DEBUG(node->first_key = *key);
10305 + write_unlock_dk(tree);
10306 +
10307 + ZF_SET(node, JNODE_ORPHAN);
10308 +
10309 + /*
10310 + * allocate carry_pool, 3 carry_level-s, reiser4_item_data and
10311 + * carry_insert_data
10312 + */
10313 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
10314 + sizeof(*item) + sizeof(*cdata));
10315 + if (IS_ERR(pool))
10316 + return PTR_ERR(pool);
10317 + todo = (carry_level *) (pool + 1);
10318 + init_carry_level(todo, pool);
10319 +
10320 + item = (reiser4_item_data *) (todo + 3);
10321 + cdata = (carry_insert_data *) (item + 1);
10322 +
10323 + op = reiser4_post_carry(todo, COP_INSERT, insert_coord->node, 0);
10324 + if (!IS_ERR(op)) {
10325 + cdata->coord = insert_coord;
10326 + cdata->key = key;
10327 + cdata->data = item;
10328 + op->u.insert.d = cdata;
10329 + op->u.insert.type = COPT_ITEM_DATA;
10330 + build_child_ptr_data(node, item);
10331 + item->arg = NULL;
10332 + /* have @insert_coord to be set at inserted item after
10333 + insertion is done */
10334 + todo->track_type = CARRY_TRACK_CHANGE;
10335 + todo->tracked = lh;
10336 +
10337 + result = reiser4_carry(todo, NULL);
10338 + if (result == 0) {
10339 + /*
10340 + * pin node in memory. This is necessary for
10341 + * znode_make_dirty() below.
10342 + */
10343 + result = zload(node);
10344 + if (result == 0) {
10345 + lock_handle local_lh;
10346 +
10347 + /*
10348 + * if we inserted new child into tree we have
10349 + * to mark it dirty so that flush will be able
10350 + * to process it.
10351 + */
10352 + init_lh(&local_lh);
10353 + result = longterm_lock_znode(&local_lh, node,
10354 + ZNODE_WRITE_LOCK,
10355 + ZNODE_LOCK_LOPRI);
10356 + if (result == 0) {
10357 + znode_make_dirty(node);
10358 +
10359 + /*
10360 + * when internal item pointing to @node
10361 + * was inserted into twig node
10362 + * create_hook_internal did not connect
10363 + * it properly because its right
10364 + * neighbor was not known. Do it
10365 + * here
10366 + */
10367 + write_lock_tree(tree);
10368 + assert("nikita-3312",
10369 + znode_is_right_connected(node));
10370 + assert("nikita-2984",
10371 + node->right == NULL);
10372 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
10373 + write_unlock_tree(tree);
10374 + result =
10375 + connect_znode(insert_coord, node);
10376 + ON_DEBUG(if (result == 0) check_dkeys(node););
10377 +
10378 + done_lh(lh);
10379 + move_lh(lh, &local_lh);
10380 + assert("vs-1676", node_is_empty(node));
10381 + coord_init_first_unit(insert_coord,
10382 + node);
10383 + } else {
10384 + warning("nikita-3136",
10385 + "Cannot lock child");
10386 + }
10387 + done_lh(&local_lh);
10388 + zrelse(node);
10389 + }
10390 + }
10391 + } else
10392 + result = PTR_ERR(op);
10393 + zput(node);
10394 + done_carry_pool(pool);
10395 + return result;
10396 +}
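/*
 * Layout of the single allocation made by add_empty_leaf() above, as
 * implied by its pointer arithmetic (a sketch, not from the patch):
 *
 *   +------------+----------------+-------------------+-------------------+
 *   | carry_pool | carry_level[3] | reiser4_item_data | carry_insert_data |
 *   +------------+----------------+-------------------+-------------------+
 *   pool          todo == (carry_level *)(pool + 1)
 *                                  item == (reiser4_item_data *)(todo + 3)
 *                                                      cdata == item + 1
 *
 * Only todo (the first carry_level) is initialized here; the other two
 * levels are presumably consumed internally by reiser4_carry().
 */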
10397 +
10398 +/**
10399 + * handle_eottl - handle extent-on-the-twig-level cases in tree traversal
10400 + * @h: search handle
10401 + * @outcome: flag saying whether search has to restart or is done
10402 + *
10403 + * Handles search on twig level. If this function completes search itself then
10404 + * it returns 1. If search has to go one level down then 0 is returned. If
10405 + * an error happens then LOOKUP_DONE is returned via @outcome and the error code
10406 + * is saved in @h->result.
10407 + */
10408 +int handle_eottl(cbk_handle *h, int *outcome)
10409 +{
10410 + int result;
10411 + reiser4_key key;
10412 + coord_t *coord;
10413 +
10414 + coord = h->coord;
10415 +
10416 + if (h->level != TWIG_LEVEL ||
10417 + (coord_is_existing_item(coord) && item_is_internal(coord))) {
10418 + /* Continue to traverse tree downward. */
10419 + return 0;
10420 + }
10421 +
10422 + /*
10423 + * make sure that @h->coord is set to twig node and that it is either
10424 + * set to extent item or after extent item
10425 + */
10426 + assert("vs-356", h->level == TWIG_LEVEL);
10427 + assert("vs-357", ( {
10428 + coord_t lcoord;
10429 + coord_dup(&lcoord, coord);
10430 + check_me("vs-733", coord_set_to_left(&lcoord) == 0);
10431 + item_is_extent(&lcoord);
10432 + }
10433 + ));
10434 +
10435 + if (*outcome == NS_FOUND) {
10436 + /* we have found desired key on twig level in extent item */
10437 + h->result = CBK_COORD_FOUND;
10438 + *outcome = LOOKUP_DONE;
10439 + return 1;
10440 + }
10441 +
10442 + if (!(h->flags & CBK_FOR_INSERT)) {
10443 + /* tree traversal is not for insertion. Just return
10444 + CBK_COORD_NOTFOUND. */
10445 + h->result = CBK_COORD_NOTFOUND;
10446 + *outcome = LOOKUP_DONE;
10447 + return 1;
10448 + }
10449 +
10450 + /* take a look at the item to the right of h -> coord */
10451 + result = is_next_item_internal(coord, h->key, h->active_lh);
10452 + if (unlikely(result < 0)) {
10453 + h->error = "get_right_neighbor failed";
10454 + h->result = result;
10455 + *outcome = LOOKUP_DONE;
10456 + return 1;
10457 + }
10458 + if (result == 0) {
10459 + /*
10460 + * item to the right is also an extent one. Allocate a new node
10461 + * and insert pointer to it after item h -> coord.
10462 + *
10463 + * This is a result of extents being located at the twig
10464 + * level. For explanation, see comment just above
10465 + * is_next_item_internal().
10466 + */
10467 + znode *loaded;
10468 +
10469 + if (cbk_lock_mode(h->level, h) != ZNODE_WRITE_LOCK) {
10470 + /*
10471 + * we got node read locked, restart coord_by_key to
10472 + * have write lock on twig level
10473 + */
10474 + h->lock_level = TWIG_LEVEL;
10475 + h->lock_mode = ZNODE_WRITE_LOCK;
10476 + *outcome = LOOKUP_REST;
10477 + return 1;
10478 + }
10479 +
10480 + loaded = coord->node;
10481 + result =
10482 + add_empty_leaf(coord, h->active_lh, h->key,
10483 + rd_key(coord, &key));
10484 + if (result) {
10485 + h->error = "could not add empty leaf";
10486 + h->result = result;
10487 + *outcome = LOOKUP_DONE;
10488 + return 1;
10489 + }
10490 + /* added empty leaf is locked (h->active_lh), its parent node
10491 + is unlocked, h->coord is set as EMPTY */
10492 + assert("vs-13", coord->between == EMPTY_NODE);
10493 + assert("vs-14", znode_is_write_locked(coord->node));
10494 + assert("vs-15",
10495 + WITH_DATA(coord->node, node_is_empty(coord->node)));
10496 + assert("vs-16", jnode_is_leaf(ZJNODE(coord->node)));
10497 + assert("vs-17", coord->node == h->active_lh->node);
10498 + *outcome = LOOKUP_DONE;
10499 + h->result = CBK_COORD_NOTFOUND;
10500 + return 1;
10501 + } else if (result == 1) {
10502 + /*
10503 + * this is special case mentioned in the comment on
10504 + * tree.h:cbk_flags. We have found internal item immediately on
10505 + * the right of extent, and we are going to insert new item
10506 + * there. Key of item we are going to insert is smaller than
10507 + * leftmost key in the node pointed to by said internal item
10508 + * (otherwise search wouldn't come to the extent in the first
10509 + * place).
10510 + *
10511 + * This is a result of extents being located at the twig
10512 + * level. For explanation, see comment just above
10513 + * is_next_item_internal().
10514 + */
10515 + h->flags &= ~CBK_TRUST_DK;
10516 + } else {
10517 + assert("vs-8", result == 2);
10518 + *outcome = LOOKUP_REST;
10519 + return 1;
10520 + }
10521 + assert("vs-362", WITH_DATA(coord->node, item_is_internal(coord)));
10522 + return 0;
10523 +}
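/*
 * Hypothetical caller sketch (example_twig_step is made up): based on the
 * contract documented above, a twig-level step of coord_by_key() could
 * dispatch on handle_eottl() roughly like this.
 */
static int example_twig_step(cbk_handle *h, int ns_result)
{
	/* @ns_result is the intra-node lookup outcome, e.g. NS_FOUND */
	int outcome = ns_result;

	if (handle_eottl(h, &outcome))
		/* search is finished or must restart: @outcome is
		 * LOOKUP_DONE or LOOKUP_REST */
		return outcome;
	/* otherwise @h->coord points at an internal item: descend a level */
	return 0;
}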
10524 +
10525 +/*
10526 + * Local variables:
10527 + * c-indentation-style: "K&R"
10528 + * mode-name: "LC"
10529 + * c-basic-offset: 8
10530 + * tab-width: 8
10531 + * fill-column: 120
10532 + * scroll-step: 1
10533 + * End:
10534 + */
10535 diff --git a/fs/reiser4/estimate.c b/fs/reiser4/estimate.c
10536 new file mode 100644
10537 index 0000000..656c20b
10538 --- /dev/null
10539 +++ b/fs/reiser4/estimate.c
10540 @@ -0,0 +1,120 @@
10541 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10542 +
10543 +#include "debug.h"
10544 +#include "dformat.h"
10545 +#include "tree.h"
10546 +#include "carry.h"
10547 +#include "inode.h"
10548 +#include "plugin/cluster.h"
10549 +#include "plugin/item/ctail.h"
10550 +
10551 +/* this returns how many nodes might get dirty, and how many new nodes might get added, if @children nodes are dirtied
10552 +
10553 +   The number of internal nodes which will get dirty or get allocated is estimated as ~5% of the children + 1 balancing.
10554 +   1 balancing is 2 neighbours, 2 new blocks and the current block on the leaf level, 2 neighbour nodes + the current (or
10555 +   1 neighbour and 1 new and the current) on the twig level, 2 neighbour nodes on upper levels and 1 for a new root. So 5
10556 +   for the leaf level, 3 for the twig level, 2 per upper level + 1 for the root.
10557 +
10558 +   The current node of the lowest level is not counted here - this is overhead only.
10559 +
10560 +   @children is almost always 1 here. The exception is flow insertion.
10561 +*/
10562 +static reiser4_block_nr
10563 +max_balance_overhead(reiser4_block_nr children, tree_level tree_height)
10564 +{
10565 + reiser4_block_nr ten_percent;
10566 +
10567 +	ten_percent = ((103 * children) >> 10);
10568 +
10569 +	/* If too many balancings happen at the same time, the tree height can grow
10570 +	   by more than 1. Assume that once tree_height is 5, it can grow by 1 only. */
10571 + return ((tree_height < 5 ? 5 : tree_height) * 2 + (4 + ten_percent));
10572 +}
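/*
 * Worked example (editor's illustration): for a single dirtied child in a
 * tree of height 5, ten_percent == (103 * 1) >> 10 == 0 (note that the code
 * reserves 103/1024, roughly 10%, rather than the 5% quoted above), so
 * max_balance_overhead(1, 5) == 5 * 2 + (4 + 0) == 14, and therefore
 * calc_estimate_one_insert(5) below == 1 + 14 == 15 blocks.
 */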
10573 +
10574 +/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10575 + perform insertion of one item into the tree */
10576 +/* it is only called when tree height changes, or gets initialized */
10577 +reiser4_block_nr calc_estimate_one_insert(tree_level height)
10578 +{
10579 + return 1 + max_balance_overhead(1, height);
10580 +}
10581 +
10582 +reiser4_block_nr estimate_one_insert_item(reiser4_tree * tree)
10583 +{
10584 + return tree->estimate_one_insert;
10585 +}
10586 +
10587 +/* this returns maximal possible number of nodes which can be modified plus number of new nodes which can be required to
10588 + perform insertion of one unit into an item in the tree */
10589 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree * tree)
10590 +{
10591 + /* estimate insert into item just like item insertion */
10592 + return tree->estimate_one_insert;
10593 +}
10594 +
10595 +reiser4_block_nr estimate_one_item_removal(reiser4_tree * tree)
10596 +{
10597 +	/* on item removal reiser4 does not try to pack nodes more compactly, so only one node may be dirtied on the leaf
10598 +	   level */
10599 + return tree->estimate_one_insert;
10600 +}
10601 +
10602 +/* on leaf level insert_flow may add CARRY_FLOW_NEW_NODES_LIMIT new nodes and dirty 3 existing nodes (insert point and
10603 + both its neighbors). Max_balance_overhead should estimate number of blocks which may change/get added on internal
10604 + levels */
10605 +reiser4_block_nr estimate_insert_flow(tree_level height)
10606 +{
10607 + return 3 + CARRY_FLOW_NEW_NODES_LIMIT + max_balance_overhead(3 +
10608 + CARRY_FLOW_NEW_NODES_LIMIT,
10609 + height);
10610 +}
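/*
 * Worked example (illustrative; assumes CARRY_FLOW_NEW_NODES_LIMIT == 20,
 * as defined in carry.h elsewhere in this patch): for a tree of height 5,
 * children == 3 + 20 == 23, ten_percent == (103 * 23) >> 10 == 2, so
 * estimate_insert_flow(5) == 23 + (5 * 2 + 4 + 2) == 39 blocks.
 */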
10611 +
10612 +/* returns the max number of nodes that can be occupied by a disk cluster */
10613 +static reiser4_block_nr estimate_cluster(struct inode * inode, int unprepped)
10614 +{
10615 + int per_cluster;
10616 + per_cluster = (unprepped ? 1 : cluster_nrpages(inode));
10617 + return 3 + per_cluster +
10618 + max_balance_overhead(3 + per_cluster,
10619 + REISER4_MAX_ZTREE_HEIGHT);
10620 +}
10621 +
10622 +/* how many nodes might get dirty and added
10623 + during insertion of a disk cluster */
10624 +reiser4_block_nr estimate_insert_cluster(struct inode * inode)
10625 +{
10626 + return estimate_cluster(inode, 1); /* 24 */
10627 +}
10628 +
10629 +/* how many nodes might get dirty and added
10630 + during update of a (prepped or unprepped) disk cluster */
10631 +reiser4_block_nr estimate_update_cluster(struct inode * inode)
10632 +{
10633 + return estimate_cluster(inode, 0); /* 44, for 64K-cluster */
10634 +}
10635 +
10636 +/* How many nodes occupied by a disk cluster might get dirty.
10637 +   Note that this estimation is not precise (i.e. a disk cluster
10638 +   can occupy more nodes).
10639 +   Q: Why don't we use a precise estimation?
10640 +   A: 1. Because a precise estimation is fairly bad: 65536 nodes
10641 +	  for a 64K logical cluster means 256M of dead space on
10642 +	  a partition.
10643 +      2. It is a very rare case when a disk cluster occupies more
10644 +	  nodes than this estimation returns.
10645 +*/
10646 +reiser4_block_nr estimate_dirty_cluster(struct inode * inode)
10647 +{
10648 + return cluster_nrpages(inode) + 4;
10649 +}
10650 +
10651 +/* Make Linus happy.
10652 + Local variables:
10653 + c-indentation-style: "K&R"
10654 + mode-name: "LC"
10655 + c-basic-offset: 8
10656 + tab-width: 8
10657 + fill-column: 120
10658 + scroll-step: 1
10659 + End:
10660 +*/
10661 diff --git a/fs/reiser4/export_ops.c b/fs/reiser4/export_ops.c
10662 new file mode 100644
10663 index 0000000..b75afe7
10664 --- /dev/null
10665 +++ b/fs/reiser4/export_ops.c
10666 @@ -0,0 +1,295 @@
10667 +/* Copyright 2005 by Hans Reiser, licensing governed by
10668 + * reiser4/README */
10669 +
10670 +#include "inode.h"
10671 +#include "plugin/plugin.h"
10672 +
10673 +/*
10674 + * Supported file-handle types
10675 + */
10676 +typedef enum {
10677 + FH_WITH_PARENT = 0x10, /* file handle with parent */
10678 + FH_WITHOUT_PARENT = 0x11 /* file handle without parent */
10679 +} reiser4_fhtype;
10680 +
10681 +#define NFSERROR (255)
10682 +
10683 +/* initialize place-holder for object */
10684 +static void object_on_wire_init(reiser4_object_on_wire *o)
10685 +{
10686 + o->plugin = NULL;
10687 +}
10688 +
10689 +/* finish with @o */
10690 +static void object_on_wire_done(reiser4_object_on_wire *o)
10691 +{
10692 + if (o->plugin != NULL)
10693 + o->plugin->wire.done(o);
10694 +}
10695 +
10696 +/*
10697 + * read serialized object identity from @addr and store information about
10698 + * object in @obj. This is dual to encode_inode().
10699 + */
10700 +static char *decode_inode(struct super_block *s, char *addr,
10701 + reiser4_object_on_wire * obj)
10702 +{
10703 + file_plugin *fplug;
10704 +
10705 + /* identifier of object plugin is stored in the first two bytes,
10706 + * followed by... */
10707 + fplug = file_plugin_by_disk_id(reiser4_get_tree(s), (d16 *) addr);
10708 + if (fplug != NULL) {
10709 + addr += sizeof(d16);
10710 + obj->plugin = fplug;
10711 + assert("nikita-3520", fplug->wire.read != NULL);
10712 + /* plugin specific encoding of object identity. */
10713 + addr = fplug->wire.read(addr, obj);
10714 + } else
10715 + addr = ERR_PTR(RETERR(-EINVAL));
10716 + return addr;
10717 +}
10718 +
10719 +/**
10720 + * reiser4_decode_fh - decode_fh of export operations
10721 + * @super: super block
10722 + * @fh: nfsd file handle
10723 + * @len: length of file handle
10724 + * @fhtype: type of file handle
10725 + * @acceptable: acceptability testing function
10726 + * @context: argument for @acceptable
10727 + *
10728 + * Returns dentry referring to the same file as @fh.
10729 + */
10730 +static struct dentry *reiser4_decode_fh(struct super_block *super, __u32 *fh,
10731 + int len, int fhtype,
10732 + int (*acceptable) (void *context,
10733 + struct dentry *de),
10734 + void *context)
10735 +{
10736 + reiser4_context *ctx;
10737 + reiser4_object_on_wire object;
10738 + reiser4_object_on_wire parent;
10739 + char *addr;
10740 + int with_parent;
10741 +
10742 + ctx = reiser4_init_context(super);
10743 + if (IS_ERR(ctx))
10744 + return (struct dentry *)ctx;
10745 +
10746 + assert("vs-1482",
10747 + fhtype == FH_WITH_PARENT || fhtype == FH_WITHOUT_PARENT);
10748 +
10749 + with_parent = (fhtype == FH_WITH_PARENT);
10750 +
10751 + addr = (char *)fh;
10752 +
10753 + object_on_wire_init(&object);
10754 + object_on_wire_init(&parent);
10755 +
10756 + addr = decode_inode(super, addr, &object);
10757 + if (!IS_ERR(addr)) {
10758 + if (with_parent)
10759 + addr = decode_inode(super, addr, &parent);
10760 + if (!IS_ERR(addr)) {
10761 + struct dentry *d;
10762 + typeof(super->s_export_op->find_exported_dentry) fn;
10763 +
10764 + fn = super->s_export_op->find_exported_dentry;
10765 + assert("nikita-3521", fn != NULL);
10766 + d = fn(super, &object, with_parent ? &parent : NULL,
10767 + acceptable, context);
10768 + if (d != NULL && !IS_ERR(d))
10769 + /* FIXME check for -ENOMEM */
10770 + reiser4_get_dentry_fsdata(d)->stateless = 1;
10771 + addr = (char *)d;
10772 + }
10773 + }
10774 +
10775 + object_on_wire_done(&object);
10776 + object_on_wire_done(&parent);
10777 +
10778 + reiser4_exit_context(ctx);
10779 + return (void *)addr;
10780 +}
10781 +
10782 +/*
10783 + * Object serialization support.
10784 + *
10785 + * To support knfsd file system provides export_operations that are used to
10786 + * construct and interpret NFS file handles. As a generalization of this,
10787 + * reiser4 object plugins have serialization support: it provides methods to
10788 + * create on-wire representation of identity of reiser4 object, and
10789 + * re-create/locate object given its on-wire identity.
10790 + *
10791 + */
10792 +
10793 +/*
10794 + * return number of bytes that on-wire representation of @inode's identity
10795 + * consumes.
10796 + */
10797 +static int encode_inode_size(struct inode *inode)
10798 +{
10799 + assert("nikita-3514", inode != NULL);
10800 + assert("nikita-3515", inode_file_plugin(inode) != NULL);
10801 + assert("nikita-3516", inode_file_plugin(inode)->wire.size != NULL);
10802 +
10803 + return inode_file_plugin(inode)->wire.size(inode) + sizeof(d16);
10804 +}
10805 +
10806 +/*
10807 + * store on-wire representation of @inode's identity at the area beginning at
10808 + * @start.
10809 + */
10810 +static char *encode_inode(struct inode *inode, char *start)
10811 +{
10812 + assert("nikita-3517", inode != NULL);
10813 + assert("nikita-3518", inode_file_plugin(inode) != NULL);
10814 + assert("nikita-3519", inode_file_plugin(inode)->wire.write != NULL);
10815 +
10816 + /*
10817 + * first, store two-byte identifier of object plugin, then
10818 + */
10819 + save_plugin_id(file_plugin_to_plugin(inode_file_plugin(inode)),
10820 + (d16 *) start);
10821 + start += sizeof(d16);
10822 + /*
10823 + * call plugin to serialize object's identity
10824 + */
10825 + return inode_file_plugin(inode)->wire.write(inode, start);
10826 +}
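/*
 * On-wire identity layout, as implied by encode_inode()/decode_inode()
 * (a sketch; field widths beyond the d16 id are plugin-specific):
 *
 *   +---------------------+--------------------------------------------+
 *   | d16 plugin id (2 B) | plugin-specific data (fplug->wire.write()) |
 *   +---------------------+--------------------------------------------+
 *
 * For FH_WITH_PARENT file handles a second record of the same shape,
 * describing the parent, follows immediately after the first.
 */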
10827 +
10828 +/* the number of 32-bit words consumed by the file handle is returned via
10829 + * @lenp. 255 (NFSERROR) is returned if the file handle cannot be stored */
10830 +/**
10831 + * reiser4_encode_fh - encode_fh of export operations
10832 + * @dentry:
10833 + * @fh:
10834 + * @lenp:
10835 + * @need_parent:
10836 + *
10837 + */
10838 +static int
10839 +reiser4_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
10840 + int need_parent)
10841 +{
10842 + struct inode *inode;
10843 + struct inode *parent;
10844 + char *addr;
10845 + int need;
10846 + int delta;
10847 + int result;
10848 + reiser4_context *ctx;
10849 +
10850 + /*
10851 +	 * knfsd asks us to serialize the object in @dentry and, optionally, its
10852 + * parent (if need_parent != 0).
10853 + *
10854 +	 * encode_inode() and encode_inode_size() are used to build
10855 +	 * representations of the object and its parent. All hard work is done
10856 + * object plugins.
10857 + */
10858 + inode = dentry->d_inode;
10859 + parent = dentry->d_parent->d_inode;
10860 +
10861 + addr = (char *)fh;
10862 +
10863 + need = encode_inode_size(inode);
10864 + if (need < 0)
10865 + return NFSERROR;
10866 + if (need_parent) {
10867 + delta = encode_inode_size(parent);
10868 + if (delta < 0)
10869 + return NFSERROR;
10870 + need += delta;
10871 + }
10872 +
10873 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
10874 + if (IS_ERR(ctx))
10875 + return PTR_ERR(ctx);
10876 +
10877 + if (need <= sizeof(__u32) * (*lenp)) {
10878 + addr = encode_inode(inode, addr);
10879 + if (need_parent)
10880 + addr = encode_inode(parent, addr);
10881 +
10882 + /* store in lenp number of 32bit words required for file
10883 + * handle. */
10884 + *lenp = (need + sizeof(__u32) - 1) >> 2;
10885 + result = need_parent ? FH_WITH_PARENT : FH_WITHOUT_PARENT;
10886 + } else
10887 +		/* not enough space in file handle */
10888 + result = NFSERROR;
10889 + reiser4_exit_context(ctx);
10890 + return result;
10891 +}
10892 +
10893 +/**
10894 + * reiser4_get_dentry_parent - get_parent of export operations
10895 + * @child:
10896 + *
10897 + */
10898 +static struct dentry *reiser4_get_dentry_parent(struct dentry *child)
10899 +{
10900 + struct inode *dir;
10901 + dir_plugin *dplug;
10902 +
10903 + assert("nikita-3527", child != NULL);
10904 + /* see comment in reiser4_get_dentry() about following assertion */
10905 + assert("nikita-3528", is_in_reiser4_context());
10906 +
10907 + dir = child->d_inode;
10908 + assert("nikita-3529", dir != NULL);
10909 + dplug = inode_dir_plugin(dir);
10910 + assert("nikita-3531", ergo(dplug != NULL, dplug->get_parent != NULL));
10911 + if (dplug != NULL)
10912 + return dplug->get_parent(dir);
10913 + else
10914 + return ERR_PTR(RETERR(-ENOTDIR));
10915 +}
10916 +
10917 +/**
10918 + * reiser4_get_dentry - get_dentry of export operations
10919 + * @super:
10920 + * @data:
10921 + *
10922 + *
10923 + */
10924 +static struct dentry *reiser4_get_dentry(struct super_block *super, void *data)
10925 +{
10926 + reiser4_object_on_wire *o;
10927 +
10928 + assert("nikita-3522", super != NULL);
10929 + assert("nikita-3523", data != NULL);
10930 + /*
10931 + * this is only supposed to be called by
10932 + *
10933 + * reiser4_decode_fh->find_exported_dentry
10934 + *
10935 + * so, reiser4_context should be here already.
10936 + */
10937 + assert("nikita-3526", is_in_reiser4_context());
10938 +
10939 + o = (reiser4_object_on_wire *)data;
10940 + assert("nikita-3524", o->plugin != NULL);
10941 + assert("nikita-3525", o->plugin->wire.get != NULL);
10942 +
10943 + return o->plugin->wire.get(super, o);
10944 +}
10945 +
10946 +struct export_operations reiser4_export_operations = {
10947 + .encode_fh = reiser4_encode_fh,
10948 + .decode_fh = reiser4_decode_fh,
10949 + .get_parent = reiser4_get_dentry_parent,
10950 + .get_dentry = reiser4_get_dentry
10951 +};
10952 +
10953 +/*
10954 + * Local variables:
10955 + * c-indentation-style: "K&R"
10956 + * mode-name: "LC"
10957 + * c-basic-offset: 8
10958 + * tab-width: 8
10959 + * fill-column: 79
10960 + * End:
10961 + */
10962 diff --git a/fs/reiser4/flush.c b/fs/reiser4/flush.c
10963 new file mode 100644
10964 index 0000000..49b6ca5
10965 --- /dev/null
10966 +++ b/fs/reiser4/flush.c
10967 @@ -0,0 +1,3622 @@
10968 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
10969 +
10970 +/* The design document for this file is at http://www.namesys.com/v4/v4.html. */
10971 +
10972 +#include "forward.h"
10973 +#include "debug.h"
10974 +#include "dformat.h"
10975 +#include "key.h"
10976 +#include "coord.h"
10977 +#include "plugin/item/item.h"
10978 +#include "plugin/plugin.h"
10979 +#include "plugin/object.h"
10980 +#include "txnmgr.h"
10981 +#include "jnode.h"
10982 +#include "znode.h"
10983 +#include "block_alloc.h"
10984 +#include "tree_walk.h"
10985 +#include "carry.h"
10986 +#include "tree.h"
10987 +#include "vfs_ops.h"
10988 +#include "inode.h"
10989 +#include "page_cache.h"
10990 +#include "wander.h"
10991 +#include "super.h"
10992 +#include "entd.h"
10993 +#include "reiser4.h"
10994 +#include "flush.h"
10995 +#include "writeout.h"
10996 +
10997 +#include <asm/atomic.h>
10998 +#include <linux/fs.h> /* for struct super_block */
10999 +#include <linux/mm.h> /* for struct page */
11000 +#include <linux/bio.h> /* for struct bio */
11001 +#include <linux/pagemap.h>
11002 +#include <linux/blkdev.h>
11003 +
11004 +/* IMPLEMENTATION NOTES */
11005 +
11006 +/* PARENT-FIRST: Some terminology: A parent-first traversal is a way of assigning a total
11007 + order to the nodes of the tree in which the parent is placed before its children, which
11008 + are ordered (recursively) in left-to-right order. When we speak of a "parent-first preceder", it
11009 + describes the node that "came before in forward parent-first order". When we speak of a
11010 + "parent-first follower", it describes the node that "comes next in parent-first
11011 + order" (alternatively the node that "came before in reverse parent-first order").
11012 +
11013 + The following pseudo-code prints the nodes of a tree in forward parent-first order:
11014 +
11015 + void parent_first (node)
11016 + {
11017 + print_node (node);
11018 + if (node->level > leaf) {
11019 + for (i = 0; i < num_children; i += 1) {
11020 + parent_first (node->child[i]);
11021 + }
11022 + }
11023 + }
11024 +*/
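/*
 * Illustration (not from the original): for a root R with children A and B,
 * where A has leaf children a1 and a2 and B has leaf child b1, the
 * pseudo-code above prints the forward parent-first order R, A, a1, a2, B, b1.
 */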
11025 +
11026 +/* JUST WHAT ARE WE TRYING TO OPTIMIZE, HERE? The idea is to optimize block allocation so
11027 + that a left-to-right scan of the tree's data (i.e., the leaves in left-to-right order)
11028 + can be accomplished with sequential reads, which results in reading nodes in their
11029 + parent-first order. This is a read-optimization aspect of the flush algorithm, and
11030 + there is also a write-optimization aspect, which is that we wish to make large
11031 + sequential writes to the disk by allocating or reallocating blocks so that they can be
11032 + written in sequence. Sometimes the read-optimization and write-optimization goals
11033 + conflict with each other, as we discuss in more detail below.
11034 +*/
11035 +
11036 +/* STATE BITS: The flush code revolves around the state of the jnodes it covers. Here are
11037 +   the relevant jnode->state bits and their relevance to flush:
11038 +
11039 + JNODE_DIRTY: If a node is dirty, it must be flushed. But in order to be written it
11040 + must be allocated first. In order to be considered allocated, the jnode must have
11041 + exactly one of { JNODE_OVRWR, JNODE_RELOC } set. These two bits are exclusive, and
11042 + all dirtied jnodes eventually have one of these bits set during each transaction.
11043 +
11044 + JNODE_CREATED: The node was freshly created in its transaction and has no previous
11045 + block address, so it is unconditionally assigned to be relocated, although this is
11046 + mainly for code-convenience. It is not being 'relocated' from anything, but in
11047 + almost every regard it is treated as part of the relocate set. The JNODE_CREATED bit
11048 + remains set even after JNODE_RELOC is set, so the actual relocate can be
11049 + distinguished from the created-and-allocated set easily: relocate-set members
11050 + (belonging to the preserve-set) have (JNODE_RELOC) set and created-set members which
11051 + have no previous location to preserve have (JNODE_RELOC | JNODE_CREATED) set.
11052 +
11053 + JNODE_OVRWR: The node belongs to atom's overwrite set. The flush algorithm made the
11054 + decision to maintain the pre-existing location for this node and it will be written
11055 + to the wandered-log.
11056 +
11057 + JNODE_RELOC: The flush algorithm made the decision to relocate this block (if it was
11058 + not created, see note above). A block with JNODE_RELOC set is eligible for
11059 + early-flushing and may be submitted during flush_empty_queues. When the JNODE_RELOC
11060 + bit is set on a znode, the parent node's internal item is modified and the znode is
11061 + rehashed.
11062 +
11063 +   JNODE_SQUEEZABLE: Before shifting everything left, the flush algorithm scans the node
11064 +   and calls the plugin->f.squeeze() method for its items. This is how disk clusters of
11065 +   cryptcompress objects get updated. Also, if the leftmost point found by the flush scan
11066 +   has this flag (a rare case, racing with write()), the flush algorithm decides to pass
11067 +   it to squalloc() in spite of its flushprepped status: for squeezing, not for
11068 +   repeated allocation.
11069 +
11070 + JNODE_FLUSH_QUEUED: This bit is set when a call to flush enters the jnode into its
11071 + flush queue. This means the jnode is not on any clean or dirty list, instead it is
11072 + moved to one of the flush queue (see flush_queue.h) object private list. This
11073 + prevents multiple concurrent flushes from attempting to start flushing from the
11074 + same node.
11075 +
11076 + (DEAD STATE BIT) JNODE_FLUSH_BUSY: This bit was set during the bottom-up
11077 + squeeze-and-allocate on a node while its children are actively being squeezed and
11078 + allocated. This flag was created to avoid submitting a write request for a node
11079 + while its children are still being allocated and squeezed. Then flush queue was
11080 +   re-implemented to allow an unlimited number of nodes to be queued. This flag support was
11081 + commented out in source code because we decided that there was no reason to submit
11082 + queued nodes before jnode_flush() finishes. However, current code calls fq_write()
11083 + during a slum traversal and may submit "busy nodes" to disk. Probably we can
11084 + re-enable the JNODE_FLUSH_BUSY bit support in future.
11085 +
11086 + With these state bits, we describe a test used frequently in the code below,
11087 + jnode_is_flushprepped() (and the spin-lock-taking jnode_check_flushprepped()). The
11088 + test for "flushprepped" returns true if any of the following are true:
11089 +
11090 + - The node is not dirty
11091 + - The node has JNODE_RELOC set
11092 + - The node has JNODE_OVRWR set
11093 +
11094 + If either the node is not dirty or it has already been processed by flush (and assigned
11095 + JNODE_OVRWR or JNODE_RELOC), then it is prepped. If jnode_is_flushprepped() returns
11096 +   false then flush has work to do on that node.
11097 +*/
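/*
 * A minimal sketch of the "flushprepped" test described above (illustrative,
 * not the patch's definition; JF_ISSET stands in for whatever jnode
 * state-bit accessor the real code uses):
 *
 *	jnode_is_flushprepped(node) :=
 *		!JF_ISSET(node, JNODE_DIRTY) ||
 *		JF_ISSET(node, JNODE_RELOC)  ||
 *		JF_ISSET(node, JNODE_OVRWR)
 */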
11098 +
11099 +/* FLUSH_PREP_ONCE_PER_TRANSACTION: Within a single transaction a node is never
11100 + flushprepped twice (unless an explicit call to flush_unprep is made as described in
11101 + detail below). For example a node is dirtied, allocated, and then early-flushed to
11102 + disk and set clean. Before the transaction commits, the page is dirtied again and, due
11103 + to memory pressure, the node is flushed again. The flush algorithm will not relocate
11104 + the node to a new disk location, it will simply write it to the same, previously
11105 + relocated position again.
11106 +*/
11107 +
11108 +/* THE BOTTOM-UP VS. TOP-DOWN ISSUE: This code implements a bottom-up algorithm where we
11109 + start at a leaf node and allocate in parent-first order by iterating to the right. At
11110 + each step of the iteration, we check for the right neighbor. Before advancing to the
11111 + right neighbor, we check if the current position and the right neighbor share the same
11112 + parent. If they do not share the same parent, the parent is allocated before the right
11113 + neighbor.
11114 +
11115 +   This process goes recursively up the tree and squeezes nodes level by level as long as
11116 +   the right neighbor and the current position have different parents, then it allocates
11117 +   the right-neighbors-with-different-parents on the way back down. This process is
11118 +   described in more detail in flush_squalloc_changed_ancestor and the recursive function
11119 +   squalloc_one_changed_ancestor. But the purpose here is not so much to discuss the
11120 +   specifics of the bottom-up approach as it is to contrast the bottom-up and top-down
11121 +   approaches.
11122 +
11123 + The top-down algorithm was implemented earlier (April-May 2002). In the top-down
11124 + approach, we find a starting point by scanning left along each level past dirty nodes,
11125 + then going up and repeating the process until the left node and the parent node are
11126 + clean. We then perform a parent-first traversal from the starting point, which makes
11127 + allocating in parent-first order trivial. After one subtree has been allocated in this
11128 + manner, we move to the right, try moving upward, then repeat the parent-first
11129 + traversal.
11130 +
11131 + Both approaches have problems that need to be addressed. Both are approximately the
11132 + same amount of code, but the bottom-up approach has advantages in the order it acquires
11133 + locks which, at the very least, make it the better approach. At first glance each one
11134 + makes the other one look simpler, so it is important to remember a few of the problems
11135 + with each one.
11136 +
11137 + Main problem with the top-down approach: When you encounter a clean child during the
11138 + parent-first traversal, what do you do? You would like to avoid searching through a
11139 + large tree of nodes just to find a few dirty leaves at the bottom, and there is not an
11140 + obvious solution. One of the advantages of the top-down approach is that during the
11141 + parent-first traversal you check every child of a parent to see if it is dirty. In
11142 + this way, the top-down approach easily handles the main problem of the bottom-up
11143 + approach: unallocated children.
11144 +
11145 + The unallocated children problem is that before writing a node to disk we must make
11146 +   sure that all of its children are allocated. Otherwise, writing the node means
11147 + extra I/O because the node will have to be written again when the child is finally
11148 + allocated.
11149 +
11150 + WE HAVE NOT YET ELIMINATED THE UNALLOCATED CHILDREN PROBLEM. Except for bugs, this
11151 +   should not cause any file system corruption; it only degrades I/O performance because a
11152 + node may be written when it is sure to be written at least one more time in the same
11153 + transaction when the remaining children are allocated. What follows is a description
11154 + of how we will solve the problem.
11155 +*/
11156 +
11157 +/* HANDLING UNALLOCATED CHILDREN: During flush we may allocate a parent node then,
11158 + proceeding in parent first order, allocate some of its left-children, then encounter a
11159 + clean child in the middle of the parent. We do not allocate the clean child, but there
11160 + may remain unallocated (dirty) children to the right of the clean child. If we were to
11161 + stop flushing at this moment and write everything to disk, the parent might still
11162 + contain unallocated children.
11163 +
11164 +   We could try to allocate all the descendants of every node that we allocate, but this
11165 + is not necessary. Doing so could result in allocating the entire tree: if the root
11166 + node is allocated then every unallocated node would have to be allocated before
11167 + flushing. Actually, we do not have to write a node just because we allocate it. It is
11168 + possible to allocate but not write a node during flush, when it still has unallocated
11169 + children. However, this approach is probably not optimal for the following reason.
11170 +
11171 + The flush algorithm is designed to allocate nodes in parent-first order in an attempt
11172 + to optimize reads that occur in the same order. Thus we are read-optimizing for a
11173 + left-to-right scan through all the leaves in the system, and we are hoping to
11174 + write-optimize at the same time because those nodes will be written together in batch.
11175 + What happens, however, if we assign a block number to a node in its read-optimized
11176 + order but then avoid writing it because it has unallocated children? In that
11177 + situation, we lose out on the write-optimization aspect because a node will have to be
11178 +   written again to its location on the device later, which likely means seeking back
11179 + to that location.
11180 +
11181 + So there are tradeoffs. We can choose either:
11182 +
11183 + A. Allocate all unallocated children to preserve both write-optimization and
11184 + read-optimization, but this is not always desirable because it may mean having to
11185 + allocate and flush very many nodes at once.
11186 +
11187 + B. Defer writing nodes with unallocated children, keep their read-optimized locations,
11188 + but sacrifice write-optimization because those nodes will be written again.
11189 +
11190 + C. Defer writing nodes with unallocated children, but do not keep their read-optimized
11191 + locations. Instead, choose to write-optimize them later, when they are written. To
11192 + facilitate this, we "undo" the read-optimized allocation that was given to the node so
11193 + that later it can be write-optimized, thus "unpreparing" the flush decision. This is a
11194 + case where we disturb the FLUSH_PREP_ONCE_PER_TRANSACTION rule described above. By a
11195 + call to flush_unprep() we will: if the node was wandered, unset the JNODE_OVRWR bit;
11196 + if the node was relocated, unset the JNODE_RELOC bit, non-deferred-deallocate its block
11197 + location, and set the JNODE_CREATED bit, effectively setting the node back to an
11198 + unallocated state.
11199 +
11200 + We will take the following approach in v4.0: for twig nodes we will always finish
11201 + allocating unallocated children (A). For nodes with (level > TWIG) we will defer
11202 + writing and choose write-optimization (C).
11203 +
11204 + To summarize, there are several parts to a solution that avoids the problem with
11205 + unallocated children:
11206 +
11207 +   FIXME-ZAM: No approach has been implemented yet to eliminate the "UNALLOCATED CHILDREN"
11208 +   problem, because an experiment showed that we have only 1-2 nodes with unallocated
11209 +   children per thousands of written nodes. The experiment was simple, such as copying or
11210 +   deleting the Linux kernel sources. However, the problem can arise in more complex
11211 +   tests. I think we could use jnode_io_hook to insert a check for unallocated children
11212 +   and see what kind of problem we have.
11213 +
11214 + 1. When flush reaches a stopping point (e.g., a clean node), it should continue calling
11215 + squeeze-and-allocate on any remaining unallocated children. FIXME: Difficulty to
11216 + implement: should be simple -- amounts to adding a while loop to jnode_flush, see
11217 + comments in that function.
11218 +
11219 + 2. When flush reaches flush_empty_queue(), some of the (level > TWIG) nodes may still
11220 + have unallocated children. If the twig level has unallocated children it is an
11221 + assertion failure. If a higher-level node has unallocated children, then it should be
11222 + explicitly de-allocated by a call to flush_unprep(). FIXME: Difficulty to implement:
11223 + should be simple.
11224 +
11225 + 3. (CPU-Optimization) Checking whether a node has unallocated children may consume more
11226 + CPU cycles than we would like, and it is possible (but medium complexity) to optimize
11227 + this somewhat in the case where large sub-trees are flushed. The following observation
11228 + helps: if both the left- and right-neighbor of a node are processed by the flush
11229 + algorithm then the node itself is guaranteed to have all of its children allocated.
11230 + However, this check may not be so expensive after all: it is not needed for
11231 + leaves and flush can guarantee this property for twigs. That leaves only (level >
11232 + TWIG) nodes that have to be checked, so this optimization only helps if at least three
11233 + (level > TWIG) nodes are flushed in one pass, and the savings will be very small unless
11234 + there are many more (level > TWIG) nodes. But if there are many (level > TWIG) nodes
11235 + then the number of blocks being written will be very large, so the savings may be
11236 + insignificant. That said, the idea is to maintain both the left and right edges of
11237 + nodes that are processed in flush. When flush_empty_queue() is called, a relatively
11238 + simple test will tell whether the (level > TWIG) node is on the edge. If it is on the
11239 + edge, the slow check is necessary, but if it is in the interior then it can be assumed
11240 + to have all of its children allocated. FIXME: medium complexity to implement, but
11241 + simple to verify given that we must have a slow check anyway.
11242 +
11243 + 4. (Optional) This part is optional, not for v4.0--flush should work independently of
11244 + whether this option is used or not. Called RAPID_SCAN, the idea is to amend the
11245 + left-scan operation to take unallocated children into account. Normally, the left-scan
11246 + operation goes left as long as adjacent nodes are dirty up until some large maximum
11247 + value (FLUSH_SCAN_MAXNODES) at which point it stops and begins flushing. But scan-left
11248 + may stop at a position where there are unallocated children to the left with the same
11249 + parent. When RAPID_SCAN is enabled, the ordinary scan-left operation stops after
11250 + FLUSH_RELOCATE_THRESHOLD, which is much smaller than FLUSH_SCAN_MAXNODES, then proceeds
11251 + with a rapid scan. The rapid scan skips all the interior children of a node--if the
11252 + leftmost child of a twig is dirty, check its left neighbor (the rightmost child of the
11253 + twig to the left). If the left neighbor of the leftmost child is also dirty, then
11254 + continue the scan at the left twig and repeat. This option will cause flush to
11255 + allocate more twigs in a single pass, but it also has the potential to write many more
11256 + nodes than would otherwise be written without the RAPID_SCAN option. RAPID_SCAN
11257 + was partially implemented, code removed August 12, 2002 by JMACD.
11258 +*/
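+
+/* A minimal sketch (hypothetical, not part of this patch) of the "unprepping"
+   described in option (C) above, expressed with the jnode flag helpers used
+   elsewhere in this file. The non-deferred deallocation of the node's block
+   location is elided; it would go through the block allocator. */
+#if 0
+static void sketch_flush_unprep(jnode * node)
+{
+	if (JF_ISSET(node, JNODE_OVRWR)) {
+		/* node was wandered: forget the overwrite decision */
+		JF_CLR(node, JNODE_OVRWR);
+	} else if (JF_ISSET(node, JNODE_RELOC)) {
+		/* node was relocated: forget the relocation decision, ... */
+		JF_CLR(node, JNODE_RELOC);
+		/* ... non-deferred-deallocate its block location (elided), and
+		   mark the node as unallocated again */
+		JF_SET(node, JNODE_CREATED);
+	}
+}
+#endif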
11259 +
11260 +/* FLUSH CALLED ON NON-LEAF LEVEL. Most of our design considerations assume that the
11261 + starting point for flush is a leaf node, but actually the flush code cares very little
11262 + about whether or not this is true. It is possible that all the leaf nodes are flushed
11263 + and dirty parent nodes still remain, in which case jnode_flush() is called on a
11264 + non-leaf argument. Flush doesn't care--it treats the argument node as if it were a
11265 + leaf, even when it is not. This is a simple approach, and there may be a more optimal
11266 + policy but until a problem with this approach is discovered, simplest is probably best.
11267 +
11268 + NOTE: In this case, the ordering produced by flush is parent-first only if you ignore
11269 + the leaves. This is done as a matter of simplicity and there is only one (shaky)
11270 + justification. When an atom commits, it flushes all leaf level nodes first, followed
11271 + by twigs, and so on. With flushing done in this order, if flush is eventually called
11272 + on a non-leaf node it means that (somehow) we reached a point where all leaves are
11273 + clean and only internal nodes need to be flushed. If that is the case, then it means
11274 + there were no leaves that were the parent-first preceder/follower of the parent. This
11275 + is expected to be a rare case, which is why we do nothing special about it. However,
11276 + memory pressure may pass an internal node to flush when there are still dirty leaf
11277 + nodes that need to be flushed, which could prove our original assumptions
11278 + "inoperative". If this needs to be fixed, then scan_left/right should have
11279 + special checks for the non-leaf levels. For example, instead of passing from a node to
11280 + the left neighbor, it should pass from the node to the left neighbor's rightmost
11281 + descendant (if dirty).
11282 +
11283 +*/
11284 +
11285 +/* UNIMPLEMENTED AS YET: REPACKING AND RESIZING. We walk the tree in 4MB-16MB chunks, dirtying everything and putting
11286 + it into a transaction. We tell the allocator to allocate the blocks as far as possible towards one end of the
11287 + logical device--the left (starting) end of the device if we are walking from left to right, the right end of the
11288 + device if we are walking from right to left. We then make passes in alternating directions, and as we do this the
11289 + device becomes sorted such that tree order and block number order fully correlate.
11290 +
11291 + Resizing is done by shifting everything either all the way to the left or all the way
11292 + to the right, and then reporting the last block.
11293 +*/
11294 +
11295 +/* RELOCATE DECISIONS: The code makes a decision to relocate in several places. This
11296 + describes the policy from the highest level:
11297 +
11298 + The FLUSH_RELOCATE_THRESHOLD parameter: If we count this many consecutive nodes on the
11299 + leaf level during flush-scan (right, left), then we unconditionally decide to relocate
11300 + leaf nodes.
11301 +
11302 + Otherwise, there are two contexts in which we make a decision to relocate:
11303 +
11304 + 1. The REVERSE PARENT-FIRST context: Implemented in reverse_relocate_test().
11305 + During the initial stages of flush, after scan-right completes, we want to ask the
11306 + question: should we relocate this leaf node and thus dirty the parent node? Then if
11307 + the node is a leftmost child, its parent is its own parent-first preceder, so we repeat
11308 + the question at the next level up, and so on. In these cases we are moving in the
11309 + reverse parent-first direction.
11310 +
11311 + There is another case which is considered the reverse direction, which comes at the end
11312 + of a twig in reverse_relocate_end_of_twig(). As we finish processing a twig we may
11313 + reach a point where there is a clean twig to the right with a dirty leftmost child. In
11314 + this case, we may wish to relocate the child by testing if it should be relocated
11315 + relative to its parent.
11316 +
11317 + 2. The FORWARD PARENT-FIRST context: Testing for forward relocation is done in
11318 + allocate_znode. What distinguishes the forward parent-first case from the
11319 + reverse parent-first case is that the preceder has already been allocated in the
11320 + forward case, whereas in the reverse case we don't know what the preceder is until we
11321 + finish "going in reverse". That simplifies the forward case considerably, and there we
11322 + actually use the block allocator to determine whether, e.g., a block closer to the
11323 + preceder is available.
11324 +*/
11325 +
11326 +/* SQUEEZE_LEFT_EDGE: Unimplemented idea for future consideration. The idea is, once we
11327 + finish scan-left and find a starting point, if the parent's left neighbor is dirty then
11328 + squeeze the parent's left neighbor and the parent. This may change the
11329 + flush-starting-node's parent. Repeat until the child's parent is stable. If the child
11330 + is a leftmost child, repeat this left-edge squeezing operation at the next level up.
11331 + Note that we cannot allocate extents during this or they will be out of parent-first
11332 + order. There are also some difficult coordinate maintenance issues. We can't do a tree
11333 + search to find coordinates again (because we hold locks), we have to determine them
11334 + from the two nodes being squeezed. Looks difficult, but has potential to increase
11335 + space utilization. */
11336 +
11337 +/* Flush-scan helper functions. */
11338 +static void scan_init(flush_scan * scan);
11339 +static void scan_done(flush_scan * scan);
11340 +
11341 +/* Flush-scan algorithm. */
11342 +static int scan_left(flush_scan * scan, flush_scan * right, jnode * node,
11343 + unsigned limit);
11344 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit);
11345 +static int scan_common(flush_scan * scan, flush_scan * other);
11346 +static int scan_formatted(flush_scan * scan);
11347 +static int scan_unformatted(flush_scan * scan, flush_scan * other);
11348 +static int scan_by_coord(flush_scan * scan);
11349 +
11350 +/* Initial flush-point ancestor allocation. */
11351 +static int alloc_pos_and_ancestors(flush_pos_t * pos);
11352 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos);
11353 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos);
11354 +
11355 +/* Main flush algorithm. Note on abbreviation: "squeeze and allocate" == "squalloc". */
11356 +static int squalloc(flush_pos_t * pos);
11357 +
11358 +/* Flush squeeze implementation. */
11359 +static int squeeze_right_non_twig(znode * left, znode * right);
11360 +static int shift_one_internal_unit(znode * left, znode * right);
11361 +
11362 +/* Flush reverse parent-first relocation routines. */
11363 +static int reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
11364 + const reiser4_block_nr * nblk);
11365 +static int reverse_relocate_test(jnode * node, const coord_t * parent_coord,
11366 + flush_pos_t * pos);
11367 +static int reverse_relocate_check_dirty_parent(jnode * node,
11368 + const coord_t * parent_coord,
11369 + flush_pos_t * pos);
11370 +
11371 +/* Flush allocate write-queueing functions: */
11372 +static int allocate_znode(znode * node, const coord_t * parent_coord,
11373 + flush_pos_t * pos);
11374 +static int allocate_znode_update(znode * node, const coord_t * parent_coord,
11375 + flush_pos_t * pos);
11376 +static int lock_parent_and_allocate_znode(znode *, flush_pos_t *);
11377 +
11378 +/* Flush helper functions: */
11379 +static int jnode_lock_parent_coord(jnode * node,
11380 + coord_t * coord,
11381 + lock_handle * parent_lh,
11382 + load_count * parent_zh,
11383 + znode_lock_mode mode, int try);
11384 +static int neighbor_in_slum(znode * node, lock_handle * right_lock, sideof side,
11385 + znode_lock_mode mode, int check_dirty);
11386 +static int znode_same_parents(znode * a, znode * b);
11387 +
11388 +static int znode_check_flushprepped(znode * node)
11389 +{
11390 + return jnode_check_flushprepped(ZJNODE(node));
11391 +}
11392 +
11393 +/* Flush position functions */
11394 +static void pos_init(flush_pos_t * pos);
11395 +static int pos_valid(flush_pos_t * pos);
11396 +static void pos_done(flush_pos_t * pos);
11397 +static int pos_stop(flush_pos_t * pos);
11398 +
11399 +/* check that @org is the first jnode of its extent unit, if the extent is unallocated,
11400 + * because all jnodes of an unallocated extent are dirty and belong to the same atom. */
11401 +#define checkchild(scan) \
11402 +assert("nikita-3435", \
11403 + ergo(scan->direction == LEFT_SIDE && \
11404 + (scan->parent_coord.node->level == TWIG_LEVEL) && \
11405 + jnode_is_unformatted(scan->node) && \
11406 + extent_is_unallocated(&scan->parent_coord), \
11407 + extent_unit_index(&scan->parent_coord) == index_jnode(scan->node)))
11408 +
11409 +/* This flush_cnt variable is used to track the number of concurrent flush operations,
11410 + useful for debugging. It is initialized in txnmgr.c out of laziness (because flush has
11411 + no static initializer function...) */
11412 +ON_DEBUG(atomic_t flush_cnt;
11413 + )
11414 +
11415 +/* check fs backing device for write congestion */
11416 +static int check_write_congestion(void)
11417 +{
11418 + struct super_block *sb;
11419 + struct backing_dev_info *bdi;
11420 +
11421 + sb = reiser4_get_current_sb();
11422 + bdi = reiser4_get_super_fake(sb)->i_mapping->backing_dev_info;
11423 + return bdi_write_congested(bdi);
11424 +}
11425 +
11426 +/* conditionally write flush queue */
11427 +static int write_prepped_nodes(flush_pos_t * pos)
11428 +{
11429 + int ret;
11430 +
11431 + assert("zam-831", pos);
11432 + assert("zam-832", pos->fq);
11433 +
11434 + if (!(pos->flags & JNODE_FLUSH_WRITE_BLOCKS))
11435 + return 0;
11436 +
11437 + if (check_write_congestion())
11438 + return 0;
11439 +
11440 + ret = reiser4_write_fq(pos->fq, pos->nr_written,
11441 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
11442 + return ret;
11443 +}
11444 +
11445 +/* Properly release all flush position resources, then move the flush position
11446 + to the new locked node */
11447 +static void move_flush_pos(flush_pos_t * pos, lock_handle * new_lock,
11448 + load_count * new_load, const coord_t * new_coord)
11449 +{
11450 + assert("zam-857", new_lock->node == new_load->node);
11451 +
11452 + if (new_coord) {
11453 + assert("zam-858", new_coord->node == new_lock->node);
11454 + coord_dup(&pos->coord, new_coord);
11455 + } else {
11456 + coord_init_first_unit(&pos->coord, new_lock->node);
11457 + }
11458 +
11459 + if (pos->child) {
11460 + jput(pos->child);
11461 + pos->child = NULL;
11462 + }
11463 +
11464 + move_load_count(&pos->load, new_load);
11465 + done_lh(&pos->lock);
11466 + move_lh(&pos->lock, new_lock);
11467 +}
11468 +
11469 +/* delete an empty node whose link from the parent still exists. */
11470 +static int delete_empty_node(znode * node)
11471 +{
11472 + reiser4_key smallest_removed;
11473 +
11474 + assert("zam-1019", node != NULL);
11475 + assert("zam-1020", node_is_empty(node));
11476 + assert("zam-1023", znode_is_wlocked(node));
11477 +
11478 + return reiser4_delete_node(node, &smallest_removed, NULL, 1);
11479 +}
11480 +
11481 +/* Prepare flush position for alloc_pos_and_ancestors() and squalloc() */
11482 +static int prepare_flush_pos(flush_pos_t * pos, jnode * org)
11483 +{
11484 + int ret;
11485 + load_count load;
11486 + lock_handle lock;
11487 +
11488 + init_lh(&lock);
11489 + init_load_count(&load);
11490 +
11491 + if (jnode_is_znode(org)) {
11492 + ret = longterm_lock_znode(&lock, JZNODE(org),
11493 + ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI);
11494 + if (ret)
11495 + return ret;
11496 +
11497 + ret = incr_load_count_znode(&load, JZNODE(org));
11498 + if (ret)
11499 + return ret;
11500 +
11501 + pos->state =
11502 + (jnode_get_level(org) ==
11503 + LEAF_LEVEL) ? POS_ON_LEAF : POS_ON_INTERNAL;
11504 + move_flush_pos(pos, &lock, &load, NULL);
11505 + } else {
11506 + coord_t parent_coord;
11507 + ret = jnode_lock_parent_coord(org, &parent_coord, &lock,
11508 + &load, ZNODE_WRITE_LOCK, 0);
11509 + if (ret)
11510 + goto done;
11511 + if (!item_is_extent(&parent_coord)) {
11512 +			/* file was converted to tail, @org became HEARD_BANSHEE,
11513 +			   and we found an internal item */
11514 + ret = -EAGAIN;
11515 + goto done;
11516 + }
11517 +
11518 + pos->state = POS_ON_EPOINT;
11519 + move_flush_pos(pos, &lock, &load, &parent_coord);
11520 + pos->child = jref(org);
11521 + if (extent_is_unallocated(&parent_coord)
11522 + && extent_unit_index(&parent_coord) != index_jnode(org)) {
11523 +			/* @org is not the first child of its parent unit. This may happen
11524 +			   because the long-term lock on its parent node was released between
11525 +			   scan_left and scan_right. For now, work around this by having flush repeat. */
11526 + ret = -EAGAIN;
11527 + }
11528 + }
11529 +
11530 + done:
11531 + done_load_count(&load);
11532 + done_lh(&lock);
11533 + return ret;
11534 +}
11535 +
11536 +/* TODO LIST (no particular order): */
11537 +/* I have labelled most of the legitimate FIXME comments in this file with letters to
11538 + indicate which issue they relate to. There are a few miscellaneous FIXMEs with
11539 + specific names mentioned instead that need to be inspected/resolved. */
11540 +/* B. There is an issue described in reverse_relocate_test having to do with an
11541 + imprecise is_preceder? check on partially-dirty extents. The code that
11542 + sets preceder hints and computes the preceder is basically untested. Careful testing
11543 + needs to be done to verify that preceder calculations are correct, since they do not
11544 + affect correctness, and so mistakes will not be caught during regular testing. */
11545 +/* C. EINVAL, E_DEADLOCK, E_NO_NEIGHBOR, ENOENT handling. It is unclear which of these are
11546 + considered expected but unlikely conditions. Flush currently returns 0 (i.e., success
11547 + but no progress, i.e., restart) whenever it receives any of these in jnode_flush().
11548 + Many of the calls that may produce one of these return values (i.e.,
11549 + longterm_lock_znode, reiser4_get_parent, reiser4_get_neighbor, ...) check some of these
11550 + values themselves and, for instance, stop flushing instead of resulting in a restart.
11551 + If any of these results are true error conditions then flush will go into a busy-loop,
11552 + as we noticed during testing when a corrupt tree caused find_child_ptr to return
11553 + ENOENT. It needs careful thought and testing of corner conditions.
11554 +*/
11555 +/* D. Atomicity of flush_prep against deletion and flush concurrency. Suppose a created
11556 + block is assigned a block number then early-flushed to disk. It is dirtied again and
11557 + flush is called again. Concurrently, that block is deleted, and the de-allocation of
11558 + its block number does not need to be deferred, since it is not part of the preserve set
11559 + (i.e., it didn't exist before the transaction). I think there may be a race condition
11560 + where flush writes the dirty, created block after the non-deferred deallocated block
11561 + number is re-allocated, making it possible to write deleted data on top of non-deleted
11562 + data. It's just a theory, but it needs to be thought out. */
11563 +/* F. bio_alloc() failure is not handled gracefully. */
11564 +/* G. Unallocated children. */
11565 +/* H. Add a WANDERED_LIST to the atom to clarify the placement of wandered blocks. */
11566 +/* I. Rename flush-scan to scan-point, (flush-pos to flush-point?) */
11567 +
11568 +/* JNODE_FLUSH: MAIN ENTRY POINT */
11569 +/* This is the main entry point for flushing a jnode and its dirty neighborhood (dirty
11570 + neighborhood is named a "slum"). jnode_flush() is called when reiser4 has to write
11571 + dirty blocks to disk; this happens when the Linux VM decides to reduce the number of
11572 + dirty pages, or as part of transaction commit.
11573 +
11574 + Our objective here is to prep and flush the slum the jnode belongs to. We want to
11575 + squish the slum together, and allocate the nodes in it as we squish because allocation
11576 + of children affects squishing of parents.
11577 +
11578 + The "argument" @node tells flush where to start. From there, flush finds the left edge
11579 + of the slum, and calls squalloc (in which nodes are squeezed and allocated). To find a
11580 + "better place" to start squalloc first we perform a flush_scan.
11581 +
11582 + Flush-scanning may be performed in both left and right directions, but for different
11583 + purposes. When scanning to the left, we are searching for a node that precedes a
11584 + sequence of parent-first-ordered nodes which we will then flush in parent-first order.
11585 + During flush-scanning, we also take the opportunity to count the number of consecutive
11586 + leaf nodes. If this number is past some threshold (FLUSH_RELOCATE_THRESHOLD), then we
11587 + make a decision to reallocate leaf nodes (thus favoring write-optimization).
11588 +
11589 + Since the flush argument node can be anywhere in a sequence of dirty leaves, there may
11590 + also be dirty nodes to the right of the argument. If the scan-left operation does not
11591 + count at least FLUSH_RELOCATE_THRESHOLD nodes then we follow it with a right-scan
11592 + operation to see whether there are, in fact, enough nodes to meet the relocate
11593 + threshold. Each right- and left-scan operation uses a single flush_scan object.
11594 +
11595 + After left-scan and possibly right-scan, we prepare a flush_position object with the
11596 + starting flush point or parent coordinate, which was determined using scan-left.
11597 +
11598 + Next we call the main flush routine, squalloc, which iterates along the
11599 + leaf level, squeezing and allocating nodes (and placing them into the flush queue).
11600 +
11601 + After squalloc returns we take extra steps to ensure that all the children
11602 + of the final twig node are allocated--this involves repeating squalloc
11603 + until we finish at a twig with no unallocated children.
11604 +
11605 + Finally, we call flush_empty_queue to submit write-requests to disk. If we encounter
11606 + any above-twig nodes during flush_empty_queue that still have unallocated children, we
11607 + flush_unprep them.
11608 +
11609 + Flush treats several "failure" cases as non-failures, essentially causing them to start
11610 + over. E_DEADLOCK is one example. FIXME:(C) EINVAL, E_NO_NEIGHBOR, ENOENT: these should
11611 + probably be handled properly rather than restarting, but there are a bunch of cases to
11612 + audit.
11613 +*/
11614 +
11615 +static int
11616 +jnode_flush(jnode * node, long nr_to_write, long *nr_written,
11617 + flush_queue_t * fq, int flags)
11618 +{
11619 + long ret = 0;
11620 + flush_scan *right_scan;
11621 + flush_scan *left_scan;
11622 + flush_pos_t *flush_pos;
11623 + int todo;
11624 + struct super_block *sb;
11625 + reiser4_super_info_data *sbinfo;
11626 + jnode *leftmost_in_slum = NULL;
11627 +
11628 + assert("jmacd-76619", lock_stack_isclean(get_current_lock_stack()));
11629 + assert("nikita-3022", reiser4_schedulable());
11630 +
11631 + assert("nikita-3185",
11632 + get_current_super_private()->delete_mutex_owner != current);
11633 +
11634 + /* allocate right_scan, left_scan and flush_pos */
11635 + right_scan =
11636 + kmalloc(2 * sizeof(*right_scan) + sizeof(*flush_pos),
11637 + reiser4_ctx_gfp_mask_get());
11638 + if (right_scan == NULL)
11639 + return RETERR(-ENOMEM);
11640 + left_scan = right_scan + 1;
11641 + flush_pos = (flush_pos_t *) (left_scan + 1);
11642 +
11643 + sb = reiser4_get_current_sb();
11644 + sbinfo = get_super_private(sb);
11645 +
11646 + /* Flush-concurrency debug code */
11647 +#if REISER4_DEBUG
11648 + atomic_inc(&flush_cnt);
11649 +#endif
11650 +
11651 + reiser4_enter_flush(sb);
11652 +
11653 + /* Initialize a flush position. */
11654 + pos_init(flush_pos);
11655 +
11656 + flush_pos->nr_written = nr_written;
11657 + flush_pos->fq = fq;
11658 + flush_pos->flags = flags;
11659 + flush_pos->nr_to_write = nr_to_write;
11660 +
11661 + scan_init(right_scan);
11662 + scan_init(left_scan);
11663 +
11664 + /* First scan left and remember the leftmost scan position. If the leftmost
11665 +	   position is unformatted we remember its parent_coord. We scan until we have
11666 +	   counted FLUSH_SCAN_MAXNODES.
11667 +
11668 + If starting @node is unformatted, at the beginning of left scan its
11669 + parent (twig level node, containing extent item) will be long term
11670 + locked and lock handle will be stored in the
11671 + @right_scan->parent_lock. This lock is used to start the rightward
11672 + scan without redoing the tree traversal (necessary to find parent)
11673 + and, hence, is kept during leftward scan. As a result, we have to
11674 + use try-lock when taking long term locks during the leftward scan.
11675 + */
11676 + ret = scan_left(left_scan, right_scan,
11677 + node, sbinfo->flush.scan_maxnodes);
11678 + if (ret != 0)
11679 + goto failed;
11680 +
11681 + leftmost_in_slum = jref(left_scan->node);
11682 + scan_done(left_scan);
11683 +
11684 + /* Then possibly go right to decide if we will use a policy of relocating leaves.
11685 + This is only done if we did not scan past (and count) enough nodes during the
11686 + leftward scan. If we do scan right, we only care to go far enough to establish
11687 + that at least FLUSH_RELOCATE_THRESHOLD number of nodes are being flushed. The
11688 + scan limit is the difference between left_scan.count and the threshold. */
11689 +
11690 + todo = sbinfo->flush.relocate_threshold - left_scan->count;
11691 + /* scan right is inherently deadlock prone, because we are
11692 + * (potentially) holding a lock on the twig node at this moment.
11693 +	 * FIXME: this comment is incorrect: the lock is not held */
11694 + if (todo > 0) {
11695 + ret = scan_right(right_scan, node, (unsigned)todo);
11696 + if (ret != 0)
11697 + goto failed;
11698 + }
11699 +
11700 +	/* Only the right-scan count is needed; release any rightward locks right away. */
11701 + scan_done(right_scan);
11702 +
11703 + /* ... and the answer is: we should relocate leaf nodes if at least
11704 + FLUSH_RELOCATE_THRESHOLD nodes were found. */
11705 + flush_pos->leaf_relocate = JF_ISSET(node, JNODE_REPACK) ||
11706 + (left_scan->count + right_scan->count >=
11707 + sbinfo->flush.relocate_threshold);
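+	/* Worked example (hypothetical numbers): with a relocate_threshold of,
+	 * say, 64, counting 40 dirty leaves to the left and 30 to the right
+	 * gives 70 >= 64, so leaf_relocate is set and the leaves will be
+	 * relocated rather than overwritten in place. */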
11708 +
11709 +	/* Funny business here. We set the 'point' in the flush_position prior to
11710 + starting squalloc regardless of whether the first point is
11711 + formatted or unformatted. Without this there would be an invariant, in the
11712 + rest of the code, that if the flush_position is unformatted then
11713 + flush_position->point is NULL and flush_position->parent_{lock,coord} is set,
11714 + and if the flush_position is formatted then flush_position->point is non-NULL
11715 + and no parent info is set.
11716 +
11717 + This seems lazy, but it makes the initial calls to reverse_relocate_test
11718 +	   (which ask "is pos->point the leftmost child of its parent?") much easier
11719 + because we know the first child already. Nothing is broken by this, but the
11720 + reasoning is subtle. Holding an extra reference on a jnode during flush can
11721 + cause us to see nodes with HEARD_BANSHEE during squalloc, because nodes are not
11722 + removed from sibling lists until they have zero reference count. Flush would
11723 +	   never observe a HEARD_BANSHEE node on the left-edge of flush; nodes are only
11724 + deleted to the right. So if nothing is broken, why fix it?
11725 +
11726 + NOTE-NIKITA actually, flush can meet HEARD_BANSHEE node at any
11727 + point and in any moment, because of the concurrent file system
11728 + activity (for example, truncate). */
11729 +
11730 + /* Check jnode state after flush_scan completed. Having a lock on this
11731 + node or its parent (in case of unformatted) helps us in case of
11732 + concurrent flushing. */
11733 + if (jnode_check_flushprepped(leftmost_in_slum)
11734 + && !jnode_convertible(leftmost_in_slum)) {
11735 + ret = 0;
11736 + goto failed;
11737 + }
11738 +
11739 + /* Now setup flush_pos using scan_left's endpoint. */
11740 + ret = prepare_flush_pos(flush_pos, leftmost_in_slum);
11741 + if (ret)
11742 + goto failed;
11743 +
11744 + if (znode_get_level(flush_pos->coord.node) == LEAF_LEVEL
11745 + && node_is_empty(flush_pos->coord.node)) {
11746 + znode *empty = flush_pos->coord.node;
11747 +
11748 + assert("zam-1022", !ZF_ISSET(empty, JNODE_HEARD_BANSHEE));
11749 + ret = delete_empty_node(empty);
11750 + goto failed;
11751 + }
11752 +
11753 + if (jnode_check_flushprepped(leftmost_in_slum)
11754 + && !jnode_convertible(leftmost_in_slum)) {
11755 + ret = 0;
11756 + goto failed;
11757 + }
11758 +
11759 + /* Set pos->preceder and (re)allocate pos and its ancestors if it is needed */
11760 + ret = alloc_pos_and_ancestors(flush_pos);
11761 + if (ret)
11762 + goto failed;
11763 +
11764 + /* Do the main rightward-bottom-up squeeze and allocate loop. */
11765 + ret = squalloc(flush_pos);
11766 + pos_stop(flush_pos);
11767 + if (ret)
11768 + goto failed;
11769 +
11770 + /* FIXME_NFQUCMPD: Here, handle the twig-special case for unallocated children.
11771 + First, the pos_stop() and pos_valid() routines should be modified
11772 + so that pos_stop() sets a flush_position->stop flag to 1 without
11773 + releasing the current position immediately--instead release it in
11774 + pos_done(). This is a better implementation than the current one anyway.
11775 +
11776 + It is not clear that all fields of the flush_position should not be released,
11777 + but at the very least the parent_lock, parent_coord, and parent_load should
11778 +	   remain held because they hold the last twig when pos_stop() is
11779 + called.
11780 +
11781 + When we reach this point in the code, if the parent_coord is set to after the
11782 + last item then we know that flush reached the end of a twig (and according to
11783 + the new flush queueing design, we will return now). If parent_coord is not
11784 + past the last item, we should check if the current twig has any unallocated
11785 + children to the right (we are not concerned with unallocated children to the
11786 + left--in that case the twig itself should not have been allocated). If the
11787 + twig has unallocated children to the right, set the parent_coord to that
11788 + position and then repeat the call to squalloc.
11789 +
11790 + Testing for unallocated children may be defined in two ways: if any internal
11791 + item has a fake block number, it is unallocated; if any extent item is
11792 + unallocated then all of its children are unallocated. But there is a more
11793 + aggressive approach: if there are any dirty children of the twig to the right
11794 + of the current position, we may wish to relocate those nodes now. Checking for
11795 + potential relocation is more expensive as it requires knowing whether there are
11796 + any dirty children that are not unallocated. The extent_needs_allocation
11797 + should be used after setting the correct preceder.
11798 +
11799 + When we reach the end of a twig at this point in the code, if the flush can
11800 + continue (when the queue is ready) it will need some information on the future
11801 + starting point. That should be stored away in the flush_handle using a seal, I
11802 + believe. Holding a jref() on the future starting point may break other code
11803 + that deletes that node.
11804 + */
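+
+	/* A sketch of the unimplemented twig-special case proposed above; the
+	 * helpers named here are hypothetical and nothing below is implemented:
+	 * keep repeating squalloc while the current twig still has unallocated
+	 * children to the right of the position. */
+#if 0
+	while (pos_on_twig(flush_pos) &&
+	       twig_has_unallocated_children_to_right(flush_pos)) {
+		set_pos_to_leftmost_unallocated_child(flush_pos);
+		ret = squalloc(flush_pos);
+		if (ret)
+			goto failed;
+	}
+#endif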
11805 +
11806 + /* FIXME_NFQUCMPD: Also, we don't want to do any flushing when flush is called
11807 + above the twig level. If the VM calls flush above the twig level, do nothing
11808 + and return (but figure out why this happens). The txnmgr should be modified to
11809 + only flush its leaf-level dirty list. This will do all the necessary squeeze
11810 + and allocate steps but leave unallocated branches and possibly unallocated
11811 + twigs (when the twig's leftmost child is not dirty). After flushing the leaf
11812 + level, the remaining unallocated nodes should be given write-optimized
11813 + locations. (Possibly, the remaining unallocated twigs should be allocated just
11814 + before their leftmost child.)
11815 + */
11816 +
11817 + /* Any failure reaches this point. */
11818 + failed:
11819 +
11820 + switch (ret) {
11821 + case -E_REPEAT:
11822 + case -EINVAL:
11823 + case -E_DEADLOCK:
11824 + case -E_NO_NEIGHBOR:
11825 + case -ENOENT:
11826 + /* FIXME(C): Except for E_DEADLOCK, these should probably be handled properly
11827 + in each case. They already are handled in many cases. */
11828 + /* Something bad happened, but difficult to avoid... Try again! */
11829 + ret = 0;
11830 + }
11831 +
11832 + if (leftmost_in_slum)
11833 + jput(leftmost_in_slum);
11834 +
11835 + pos_done(flush_pos);
11836 + scan_done(left_scan);
11837 + scan_done(right_scan);
11838 + kfree(right_scan);
11839 +
11840 + ON_DEBUG(atomic_dec(&flush_cnt));
11841 +
11842 + reiser4_leave_flush(sb);
11843 +
11844 + return ret;
11845 +}
11846 +
11847 +/* The reiser4 flush subsystem can be put into "rapid flush mode", which means
11848 + * that the flusher should submit all prepped nodes immediately, without keeping
11849 + * them in flush queues for a long time. The reason for rapid flush mode is to free
11850 + * memory as fast as possible. */
11851 +
11852 +#if REISER4_USE_RAPID_FLUSH
11853 +
11854 +/**
11855 + * submit all prepped nodes if rapid flush mode is set,
11856 + * turn rapid flush mode off.
11857 + */
11858 +
11859 +static int rapid_flush(flush_pos_t * pos)
11860 +{
11861 + if (!wbq_available())
11862 + return 0;
11863 +
11864 + return write_prepped_nodes(pos);
11865 +}
11866 +
11867 +#else
11868 +
11869 +#define rapid_flush(pos) (0)
11870 +
11871 +#endif /* REISER4_USE_RAPID_FLUSH */
11872 +
11873 +static jnode *find_flush_start_jnode(jnode *start, txn_atom *atom,
11874 + flush_queue_t *fq, int *nr_queued,
11875 + int flags)
11876 +{
11877 + jnode * node;
11878 +
11879 + if (start != NULL) {
11880 + spin_lock_jnode(start);
11881 + if (!jnode_is_flushprepped(start)) {
11882 + assert("zam-1056", start->atom == atom);
11883 + node = start;
11884 + goto enter;
11885 + }
11886 + spin_unlock_jnode(start);
11887 + }
11888 + /*
11889 +	 * In this loop we process all nodes that were already prepped (RELOC or OVRWR)
11890 +	 * and then dirtied again. The atom spin lock is not released until all dirty
11891 +	 * nodes are processed or a not-yet-prepped node is found in the atom's dirty lists.
11892 + */
11893 + while ((node = find_first_dirty_jnode(atom, flags))) {
11894 + spin_lock_jnode(node);
11895 + enter:
11896 + assert("zam-881", JF_ISSET(node, JNODE_DIRTY));
11897 + assert("zam-898", !JF_ISSET(node, JNODE_OVRWR));
11898 +
11899 + if (JF_ISSET(node, JNODE_WRITEBACK)) {
11900 + /* move node to the end of atom's writeback list */
11901 + list_move_tail(&node->capture_link, ATOM_WB_LIST(atom));
11902 +
11903 + /*
11904 +			 * the jnode is not necessarily on the dirty list: if it was dirtied
11905 +			 * while on the flush queue, it does not get moved to the dirty list
11906 + */
11907 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node),
11908 + WB_LIST, 1));
11909 +
11910 + } else if (jnode_is_znode(node)
11911 + && znode_above_root(JZNODE(node))) {
11912 + /*
11913 + * A special case for znode-above-root. The above-root (fake)
11914 + * znode is captured and dirtied when the tree height changes or
11915 + * when the root node is relocated. This causes atoms to fuse so
11916 + * that changes at the root are serialized. However, this node is
11917 + * never flushed. This special case used to be in lock.c to
11918 + * prevent the above-root node from ever being captured, but now
11919 + * that it is captured we simply prevent it from flushing. The
11920 + * log-writer code relies on this to properly log superblock
11921 + * modifications of the tree height.
11922 + */
11923 + jnode_make_wander_nolock(node);
11924 + } else if (JF_ISSET(node, JNODE_RELOC)) {
11925 + queue_jnode(fq, node);
11926 + ++(*nr_queued);
11927 + } else
11928 + break;
11929 +
11930 + spin_unlock_jnode(node);
11931 + }
11932 + return node;
11933 +}
11934 +
11935 +/* Flush some nodes of the current atom, usually a slum. Return -E_REPEAT if there are
11936 + * more nodes to flush; return 0 if the atom's dirty lists are empty, keeping the
11937 + * current atom locked; return other errors as they are. */
11938 +int
11939 +flush_current_atom(int flags, long nr_to_write, long *nr_submitted,
11940 + txn_atom ** atom, jnode *start)
11941 +{
11942 + reiser4_super_info_data *sinfo = get_current_super_private();
11943 + flush_queue_t *fq = NULL;
11944 + jnode *node;
11945 + int nr_queued;
11946 + int ret;
11947 +
11948 + assert("zam-889", atom != NULL && *atom != NULL);
11949 + assert_spin_locked(&((*atom)->alock));
11950 + assert("zam-892", get_current_context()->trans->atom == *atom);
11951 +
11952 + nr_to_write = LONG_MAX;
11953 + while (1) {
11954 + ret = reiser4_fq_by_atom(*atom, &fq);
11955 + if (ret != -E_REPEAT)
11956 + break;
11957 + *atom = get_current_atom_locked();
11958 + }
11959 + if (ret)
11960 + return ret;
11961 +
11962 + assert_spin_locked(&((*atom)->alock));
11963 +
11964 + /* parallel flushers limit */
11965 + if (sinfo->tmgr.atom_max_flushers != 0) {
11966 + while ((*atom)->nr_flushers >= sinfo->tmgr.atom_max_flushers) {
11967 +			/* A reiser4_atom_send_event() call is inside
11968 + reiser4_fq_put_nolock() which is called when flush is
11969 + finished and nr_flushers is decremented. */
11970 + reiser4_atom_wait_event(*atom);
11971 + *atom = get_current_atom_locked();
11972 + }
11973 + }
11974 +
11975 +	/* count ourselves as a flusher */
11976 + (*atom)->nr_flushers++;
11977 +
11978 + writeout_mode_enable();
11979 +
11980 + nr_queued = 0;
11981 + node = find_flush_start_jnode(start, *atom, fq, &nr_queued, flags);
11982 +
11983 + if (node == NULL) {
11984 + if (nr_queued == 0) {
11985 + (*atom)->nr_flushers--;
11986 + reiser4_fq_put_nolock(fq);
11987 + reiser4_atom_send_event(*atom);
11988 + /* current atom remains locked */
11989 + writeout_mode_disable();
11990 + return 0;
11991 + }
11992 + spin_unlock_atom(*atom);
11993 + } else {
11994 + jref(node);
11995 + BUG_ON((*atom)->super != node->tree->super);
11996 + spin_unlock_atom(*atom);
11997 + spin_unlock_jnode(node);
11998 + BUG_ON(nr_to_write == 0);
11999 + ret = jnode_flush(node, nr_to_write, nr_submitted, fq, flags);
12000 + jput(node);
12001 + }
12002 +
12003 + ret =
12004 + reiser4_write_fq(fq, nr_submitted,
12005 + WRITEOUT_SINGLE_STREAM | WRITEOUT_FOR_PAGE_RECLAIM);
12006 +
12007 + *atom = get_current_atom_locked();
12008 + (*atom)->nr_flushers--;
12009 + reiser4_fq_put_nolock(fq);
12010 + reiser4_atom_send_event(*atom);
12011 + spin_unlock_atom(*atom);
12012 +
12013 + writeout_mode_disable();
12014 +
12015 + if (ret == 0)
12016 + ret = -E_REPEAT;
12017 +
12018 + return ret;
12019 +}
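+
+/* Illustrative caller pattern (a sketch, not part of this patch; the real
+ * callers live in the transaction manager): keep flushing while the atom
+ * reports that more nodes remain. On a 0 return the atom is still spin-locked
+ * and its dirty lists are empty. */
+#if 0
+	do {
+		ret = flush_current_atom(flags, LONG_MAX, &nr_submitted,
+					 &atom, NULL);
+	} while (ret == -E_REPEAT);
+#endif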
12020 +
12021 +/* REVERSE PARENT-FIRST RELOCATION POLICIES */
12022 +
12023 +/* This implements the is-it-close-enough-to-its-preceder? test for relocation in the
12024 + reverse parent-first relocate context. Here all we know is the preceder and the block
12025 + number. Since we are going in reverse, the preceder may still be relocated as well, so
12026 + we can't ask the block allocator "is there a closer block available to relocate?" here.
12027 + In the _forward_ parent-first relocate context (not here) we actually call the block
12028 + allocator to try and find a closer location. */
12029 +static int
12030 +reverse_relocate_if_close_enough(const reiser4_block_nr * pblk,
12031 + const reiser4_block_nr * nblk)
12032 +{
12033 + reiser4_block_nr dist;
12034 +
12035 + assert("jmacd-7710", *pblk != 0 && *nblk != 0);
12036 + assert("jmacd-7711", !reiser4_blocknr_is_fake(pblk));
12037 + assert("jmacd-7712", !reiser4_blocknr_is_fake(nblk));
12038 +
12039 + /* Distance is the absolute value. */
12040 + dist = (*pblk > *nblk) ? (*pblk - *nblk) : (*nblk - *pblk);
12041 +
12042 + /* If the block is less than FLUSH_RELOCATE_DISTANCE blocks away from its preceder
12043 + block, do not relocate. */
12044 + if (dist <= get_current_super_private()->flush.relocate_distance) {
12045 + return 0;
12046 + }
12047 +
12048 + return 1;
12049 +}
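+
+/* Worked example (hypothetical numbers): with a relocate_distance of, say, 64
+ * and a preceder at block 1000, a node at block 1040 stays put (distance 40 <=
+ * 64, return 0 -> overwrite), while a node at block 2000 is considered far
+ * enough away to relocate (distance 1000 > 64, return 1). */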
12050 +
12051 +/* This function is a predicate that tests for relocation. Always called in the
12052 + reverse-parent-first context, when we are asking whether the current node should be
12053 + relocated in order to expand the flush by dirtying the parent level (and thus
12054 + proceeding to flush that level). When traversing in the forward parent-first direction
12055 + (not here), relocation decisions are handled in two places: allocate_znode() and
12056 + extent_needs_allocation(). */
12057 +static int
12058 +reverse_relocate_test(jnode * node, const coord_t * parent_coord,
12059 + flush_pos_t * pos)
12060 +{
12061 + reiser4_block_nr pblk = 0;
12062 + reiser4_block_nr nblk = 0;
12063 +
12064 + assert("jmacd-8989", !jnode_is_root(node));
12065 +
12066 + /*
12067 + * This function is called only from the
12068 + * reverse_relocate_check_dirty_parent() and only if the parent
12069 + * node is clean. This implies that the parent has the real (i.e., not
12070 + * fake) block number, and, so does the child, because otherwise the
12071 + * parent would be dirty.
12072 + */
12073 +
12074 + /* New nodes are treated as if they are being relocated. */
12075 + if (JF_ISSET (node, JNODE_CREATED) ||
12076 + (pos->leaf_relocate && jnode_get_level(node) == LEAF_LEVEL)) {
12077 + return 1;
12078 + }
12079 +
12080 + /* Find the preceder. FIXME(B): When the child is an unformatted, previously
12081 +	   existing node, the coord may be leftmost even though the parent is not the
12082 +	   parent-first preceder of the child. If the first dirty node appears somewhere
12083 + in the middle of the first extent unit, this preceder calculation is wrong.
12084 + Needs more logic in here. */
12085 + if (coord_is_leftmost_unit(parent_coord)) {
12086 + pblk = *znode_get_block(parent_coord->node);
12087 + } else {
12088 + pblk = pos->preceder.blk;
12089 + }
12090 + check_preceder(pblk);
12091 +
12092 + /* If (pblk == 0) then the preceder isn't allocated or isn't known: relocate. */
12093 + if (pblk == 0) {
12094 + return 1;
12095 + }
12096 +
12097 + nblk = *jnode_get_block(node);
12098 +
12099 + if (reiser4_blocknr_is_fake(&nblk))
12100 + /* child is unallocated, mark parent dirty */
12101 + return 1;
12102 +
12103 + return reverse_relocate_if_close_enough(&pblk, &nblk);
12104 +}
12105 +
12106 +/* This function calls reverse_relocate_test to make a reverse-parent-first
12107 + relocation decision and then, if yes, it marks the parent dirty. */
12108 +static int
12109 +reverse_relocate_check_dirty_parent(jnode * node, const coord_t * parent_coord,
12110 + flush_pos_t * pos)
12111 +{
12112 + int ret;
12113 +
12114 + if (!JF_ISSET(ZJNODE(parent_coord->node), JNODE_DIRTY)) {
12115 +
12116 + ret = reverse_relocate_test(node, parent_coord, pos);
12117 + if (ret < 0) {
12118 + return ret;
12119 + }
12120 +
12121 + /* FIXME-ZAM
12122 + if parent is already relocated - we do not want to grab space, right? */
12123 + if (ret == 1) {
12124 + int grabbed;
12125 +
12126 + grabbed = get_current_context()->grabbed_blocks;
12127 + if (reiser4_grab_space_force((__u64) 1, BA_RESERVED) !=
12128 + 0)
12129 + reiser4_panic("umka-1250",
12130 + "No space left during flush.");
12131 +
12132 + assert("jmacd-18923",
12133 + znode_is_write_locked(parent_coord->node));
12134 + znode_make_dirty(parent_coord->node);
12135 + grabbed2free_mark(grabbed);
12136 + }
12137 + }
12138 +
12139 + return 0;
12140 +}
12141 +
12142 +/* INITIAL ALLOCATE ANCESTORS STEP (REVERSE PARENT-FIRST ALLOCATION BEFORE FORWARD
12143 + PARENT-FIRST LOOP BEGINS) */
12144 +
12145 +/* Get the leftmost child for given coord. */
12146 +static int get_leftmost_child_of_unit(const coord_t * coord, jnode ** child)
12147 +{
12148 + int ret;
12149 +
12150 + ret = item_utmost_child(coord, LEFT_SIDE, child);
12151 +
12152 + if (ret)
12153 + return ret;
12154 +
12155 + if (IS_ERR(*child))
12156 + return PTR_ERR(*child);
12157 +
12158 + return 0;
12159 +}
12160 +
12161 +/* This step occurs after the left- and right-scans are completed, before starting the
12162 + forward parent-first traversal. Here we attempt to allocate ancestors of the starting
12163 + flush point, which means continuing in the reverse parent-first direction to the
12164 + parent, grandparent, and so on (as long as the child is a leftmost child). This
12165 + routine calls a recursive process, alloc_one_ancestor, which does the real work,
12166 + except there is special-case handling here for the first ancestor, which may be a twig.
12167 + At each level (here and alloc_one_ancestor), we check for relocation and then, if
12168 + the child is a leftmost child, repeat at the next level. On the way back down (the
12169 + recursion), we allocate the ancestors in parent-first order. */
12170 +static int alloc_pos_and_ancestors(flush_pos_t * pos)
12171 +{
12172 + int ret = 0;
12173 + lock_handle plock;
12174 + load_count pload;
12175 + coord_t pcoord;
12176 +
12177 + if (znode_check_flushprepped(pos->lock.node))
12178 + return 0;
12179 +
12180 + coord_init_invalid(&pcoord, NULL);
12181 + init_lh(&plock);
12182 + init_load_count(&pload);
12183 +
12184 + if (pos->state == POS_ON_EPOINT) {
12185 + /* a special case for pos on twig level, where we already have
12186 + a lock on parent node. */
12187 + /* The parent may not be dirty, in which case we should decide
12188 + whether to relocate the child now. If decision is made to
12189 + relocate the child, the parent is marked dirty. */
12190 + ret =
12191 + reverse_relocate_check_dirty_parent(pos->child, &pos->coord,
12192 + pos);
12193 + if (ret)
12194 + goto exit;
12195 +
12196 + /* FIXME_NFQUCMPD: We only need to allocate the twig (if child
12197 + is leftmost) and the leaf/child, so recursion is not needed.
12198 + Levels above the twig will be allocated for
12199 + write-optimization before the transaction commits. */
12200 +
12201 + /* Do the recursive step, allocating zero or more of our
12202 + * ancestors. */
12203 + ret = alloc_one_ancestor(&pos->coord, pos);
12204 +
12205 + } else {
12206 + if (!znode_is_root(pos->lock.node)) {
12207 + /* all formatted nodes except tree root */
12208 + ret =
12209 + reiser4_get_parent(&plock, pos->lock.node,
12210 + ZNODE_WRITE_LOCK);
12211 + if (ret)
12212 + goto exit;
12213 +
12214 + ret = incr_load_count_znode(&pload, plock.node);
12215 + if (ret)
12216 + goto exit;
12217 +
12218 + ret =
12219 + find_child_ptr(plock.node, pos->lock.node, &pcoord);
12220 + if (ret)
12221 + goto exit;
12222 +
12223 + ret =
12224 + reverse_relocate_check_dirty_parent(ZJNODE
12225 + (pos->lock.
12226 + node), &pcoord,
12227 + pos);
12228 + if (ret)
12229 + goto exit;
12230 +
12231 + ret = alloc_one_ancestor(&pcoord, pos);
12232 + if (ret)
12233 + goto exit;
12234 + }
12235 +
12236 + ret = allocate_znode(pos->lock.node, &pcoord, pos);
12237 + }
12238 + exit:
12239 + done_load_count(&pload);
12240 + done_lh(&plock);
12241 + return ret;
12242 +}
12243 +
12244 +/* This is the recursive step described in alloc_pos_and_ancestors, above. Ignoring the
12245 + call to set_preceder, which is the next function described, this checks if the
12246 + child is a leftmost child and returns if it is not. If the child is a leftmost child
12247 + it checks for relocation, possibly dirtying the parent. Then it performs the recursive
12248 + step. */
12249 +static int alloc_one_ancestor(const coord_t * coord, flush_pos_t * pos)
12250 +{
12251 + int ret = 0;
12252 + lock_handle alock;
12253 + load_count aload;
12254 + coord_t acoord;
12255 +
12256 + /* As we ascend at the left-edge of the region to flush, take this opportunity at
12257 + the twig level to find our parent-first preceder unless we have already set
12258 + it. */
12259 + if (pos->preceder.blk == 0) {
12260 + ret = set_preceder(coord, pos);
12261 + if (ret != 0)
12262 + return ret;
12263 + }
12264 +
12265 + /* If the ancestor is clean or already allocated, or if the child is not a
12266 + leftmost child, stop going up, even leaving coord->node not flushprepped. */
12267 + if (znode_check_flushprepped(coord->node)
12268 + || !coord_is_leftmost_unit(coord))
12269 + return 0;
12270 +
12271 + init_lh(&alock);
12272 + init_load_count(&aload);
12273 + coord_init_invalid(&acoord, NULL);
12274 +
12275 + /* Only ascend to the next level if it is a leftmost child, but write-lock the
12276 + parent in case we will relocate the child. */
12277 + if (!znode_is_root(coord->node)) {
12278 +
12279 + ret =
12280 + jnode_lock_parent_coord(ZJNODE(coord->node), &acoord,
12281 + &alock, &aload, ZNODE_WRITE_LOCK,
12282 + 0);
12283 + if (ret != 0) {
12284 + /* FIXME(C): check EINVAL, E_DEADLOCK */
12285 + goto exit;
12286 + }
12287 +
12288 + ret =
12289 + reverse_relocate_check_dirty_parent(ZJNODE(coord->node),
12290 + &acoord, pos);
12291 + if (ret != 0) {
12292 + goto exit;
12293 + }
12294 +
12295 + /* Recursive call. */
12296 + if (!znode_check_flushprepped(acoord.node)) {
12297 + ret = alloc_one_ancestor(&acoord, pos);
12298 + if (ret)
12299 + goto exit;
12300 + }
12301 + }
12302 +
12303 + /* Note: we call allocate with the parent write-locked (except at the root) in
12304 + case we relocate the child, in which case it will modify the parent during this
12305 + call. */
12306 + ret = allocate_znode(coord->node, &acoord, pos);
12307 +
12308 + exit:
12309 + done_load_count(&aload);
12310 + done_lh(&alock);
12311 + return ret;
12312 +}
12313 +
12314 +/* During the reverse parent-first alloc_pos_and_ancestors process described above there is
12315 + a call to this function at the twig level. During alloc_pos_and_ancestors we may ask:
12316 + should this node be relocated (in reverse parent-first context)? We repeat this
12317 + process as long as the child is the leftmost child, eventually reaching an ancestor of
12318 + the flush point that is not a leftmost child. The preceder of that ancestor, which is
12319 + not a leftmost child, is actually on the leaf level. The preceder of that block is the
12320 + left-neighbor of the flush point. The preceder of that block is the rightmost child of
12321 + the twig on the left. So, when alloc_pos_and_ancestors passes upward through the twig
12322 + level, it stops momentarily to remember the block of the rightmost child of the twig on
12323 + the left and sets it to the flush_position's preceder_hint.
12324 +
12325 + There is one other place where we may set the flush_position's preceder hint, which is
12326 + during scan-left.
12327 +*/
12328 +static int set_preceder(const coord_t * coord_in, flush_pos_t * pos)
12329 +{
12330 + int ret;
12331 + coord_t coord;
12332 + lock_handle left_lock;
12333 + load_count left_load;
12334 +
12335 + coord_dup(&coord, coord_in);
12336 +
12337 + init_lh(&left_lock);
12338 + init_load_count(&left_load);
12339 +
12340 + /* FIXME(B): Same FIXME as in "Find the preceder" in reverse_relocate_test.
12341 + coord_is_leftmost_unit is not the right test if the unformatted child is in the
12342 + middle of the first extent unit. */
12343 + if (!coord_is_leftmost_unit(&coord)) {
12344 + coord_prev_unit(&coord);
12345 + } else {
12346 + ret =
12347 + reiser4_get_left_neighbor(&left_lock, coord.node,
12348 + ZNODE_READ_LOCK, GN_SAME_ATOM);
12349 + if (ret) {
12350 + /* If we fail for any reason it doesn't matter because the
12351 + preceder is only a hint. We are low-priority at this point, so
12352 + this must be the case. */
12353 + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
12354 + ret == -ENOENT || ret == -EINVAL
12355 + || ret == -E_DEADLOCK) {
12356 + ret = 0;
12357 + }
12358 + goto exit;
12359 + }
12360 +
12361 + ret = incr_load_count_znode(&left_load, left_lock.node);
12362 + if (ret)
12363 + goto exit;
12364 +
12365 + coord_init_last_unit(&coord, left_lock.node);
12366 + }
12367 +
12368 + ret =
12369 + item_utmost_child_real_block(&coord, RIGHT_SIDE,
12370 + &pos->preceder.blk);
12371 + exit:
12372 + check_preceder(pos->preceder.blk);
12373 + done_load_count(&left_load);
12374 + done_lh(&left_lock);
12375 + return ret;
12376 +}
12377 +
12378 +/* MAIN SQUEEZE AND ALLOCATE LOOP (THREE BIG FUNCTIONS) */
12379 +
12380 +/* This procedure implements the outer loop of the flush algorithm. To put this in
12381 + context, here is the general list of steps taken by the flush routine as a whole:
12382 +
12383 + 1. Scan-left
12384 + 2. Scan-right (maybe)
12385 + 3. Allocate initial flush position and its ancestors
12386 + 4. <handle extents>
12387 +   5. <squeeze the next position and its ancestors to-the-right,
12388 + then update position to-the-right>
12389 + 6. <repeat from #4 until flush is stopped>
12390 +
12391 + This procedure implements the loop in steps 4 through 6 in the above listing.
12392 +
12393 + Step 4: if the current flush position is an extent item (position on the twig level),
12394 + it allocates the extent (allocate_extent_item_in_place) then shifts to the next
12395 + coordinate. If the next coordinate's leftmost child needs flushprep, we will continue.
12396 + If the next coordinate is an internal item, we descend back to the leaf level,
12397 +   otherwise we repeat step #4 (labeled ALLOC_EXTENTS below). If the "next coordinate"
12398 + brings us past the end of the twig level, then we call
12399 + reverse_relocate_end_of_twig to possibly dirty the next (right) twig, prior to
12400 + step #5 which moves to the right.
12401 +
12402 + Step 5: calls squalloc_changed_ancestors, which initiates a recursive call up the
12403 + tree to allocate any ancestors of the next-right flush position that are not also
12404 + ancestors of the current position. Those ancestors (in top-down order) are the next in
12405 + parent-first order. We squeeze adjacent nodes on the way up until the right node and
12406 + current node share the same parent, then allocate on the way back down. Finally, this
12407 + step sets the flush position to the next-right node. Then repeat steps 4 and 5.
12408 +*/
12409 +
12410 +/* SQUEEZE CODE */
12411 +
12412 +/* squalloc_right_twig helper function: cut a range of extent items from
12413 +   node @to->node, from the beginning up to coord @to. */
12414 +static int squalloc_right_twig_cut(coord_t * to, reiser4_key * to_key,
12415 + znode * left)
12416 +{
12417 + coord_t from;
12418 + reiser4_key from_key;
12419 +
12420 + coord_init_first_unit(&from, to->node);
12421 + item_key_by_coord(&from, &from_key);
12422 +
12423 + return cut_node_content(&from, to, &from_key, to_key, NULL);
12424 +}
12425 +
12426 +/* Copy as many of the leading extents as possible from @right to @left, allocating
12427 + unallocated extents as they are copied. Returns SQUEEZE_TARGET_FULL or
12428 + SQUEEZE_SOURCE_EMPTY when no more can be shifted. If the next item is an
12429 + internal item it calls shift_one_internal_unit and may then return
12430 + SUBTREE_MOVED. */
12431 +static int squeeze_right_twig(znode * left, znode * right, flush_pos_t * pos)
12432 +{
12433 + int ret = SUBTREE_MOVED;
12434 + coord_t coord; /* used to iterate over items */
12435 + reiser4_key stop_key;
12436 +
12437 + assert("jmacd-2008", !node_is_empty(right));
12438 + coord_init_first_unit(&coord, right);
12439 +
12440 + /* FIXME: can be optimized to cut once */
12441 + while (!node_is_empty(coord.node) && item_is_extent(&coord)) {
12442 + ON_DEBUG(void *vp);
12443 +
12444 + assert("vs-1468", coord_is_leftmost_unit(&coord));
12445 + ON_DEBUG(vp = shift_check_prepare(left, coord.node));
12446 +
12447 + /* stop_key is used to find what was copied and what to cut */
12448 + stop_key = *reiser4_min_key();
12449 + ret = squalloc_extent(left, &coord, pos, &stop_key);
12450 + if (ret != SQUEEZE_CONTINUE) {
12451 + ON_DEBUG(kfree(vp));
12452 + break;
12453 + }
12454 + assert("vs-1465", !keyeq(&stop_key, reiser4_min_key()));
12455 +
12456 + /* Helper function to do the cutting. */
12457 + set_key_offset(&stop_key, get_key_offset(&stop_key) - 1);
12458 + check_me("vs-1466",
12459 + squalloc_right_twig_cut(&coord, &stop_key, left) == 0);
12460 +
12461 + ON_DEBUG(shift_check(vp, left, coord.node));
12462 + }
12463 +
12464 + if (node_is_empty(coord.node))
12465 + ret = SQUEEZE_SOURCE_EMPTY;
12466 +
12467 + if (ret == SQUEEZE_TARGET_FULL) {
12468 + goto out;
12469 + }
12470 +
12471 + if (node_is_empty(right)) {
12472 + /* The whole right node was copied into @left. */
12473 + assert("vs-464", ret == SQUEEZE_SOURCE_EMPTY);
12474 + goto out;
12475 + }
12476 +
12477 + coord_init_first_unit(&coord, right);
12478 +
12479 + if (!item_is_internal(&coord)) {
12480 +		/* we do not want to squeeze anything else to the left neighbor
12481 +		   because the "slum" is over */
12482 + ret = SQUEEZE_TARGET_FULL;
12483 + goto out;
12484 + }
12485 + assert("jmacd-433", item_is_internal(&coord));
12486 +
12487 + /* Shift an internal unit. The child must be allocated before shifting any more
12488 + extents, so we stop here. */
12489 + ret = shift_one_internal_unit(left, right);
12490 +
12491 + out:
12492 + assert("jmacd-8612", ret < 0 || ret == SQUEEZE_TARGET_FULL
12493 + || ret == SUBTREE_MOVED || ret == SQUEEZE_SOURCE_EMPTY);
12494 +
12495 + if (ret == SQUEEZE_TARGET_FULL) {
12496 + /* We submit prepped nodes here and expect that this @left twig
12497 + * will not be modified again during this jnode_flush() call. */
12498 + int ret1;
12499 +
12500 + /* NOTE: seems like io is done under long term locks. */
12501 + ret1 = write_prepped_nodes(pos);
12502 + if (ret1 < 0)
12503 + return ret1;
12504 + }
12505 +
12506 + return ret;
12507 +}
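+
+/* A minimal, self-contained sketch of the squeeze return-code protocol used
+ * above. The names below (struct sq_node, sq_squeeze) are illustrative
+ * stand-ins, not reiser4 API: a driver moves units from @right to @left
+ * until the source empties or the target fills, mirroring the
+ * SQUEEZE_SOURCE_EMPTY / SQUEEZE_TARGET_FULL contract. */
+#if 0
+#include <stdio.h>
+
+enum { SQ_SOURCE_EMPTY = 1, SQ_TARGET_FULL = 2 };
+
+struct sq_node {
+	int nr_units;	/* units currently stored */
+	int capacity;	/* maximum units the node can hold */
+};
+
+static int sq_squeeze(struct sq_node *left, struct sq_node *right)
+{
+	/* shift one unit at a time, as the loop above does per item */
+	while (right->nr_units > 0) {
+		if (left->nr_units == left->capacity)
+			return SQ_TARGET_FULL;
+		left->nr_units++;
+		right->nr_units--;
+	}
+	return SQ_SOURCE_EMPTY;
+}
+
+int main(void)
+{
+	struct sq_node left = { .nr_units = 3, .capacity = 5 };
+	struct sq_node right = { .nr_units = 4, .capacity = 5 };
+
+	/* @left fills after two shifts: prints 2 (SQ_TARGET_FULL) */
+	printf("%d\n", sq_squeeze(&left, &right));
+	return 0;
+}
+#endif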
12508 +
12509 +#if REISER4_DEBUG
12510 +static void item_convert_invariant(flush_pos_t * pos)
12511 +{
12512 + assert("edward-1225", coord_is_existing_item(&pos->coord));
12513 + if (chaining_data_present(pos)) {
12514 + item_plugin *iplug = item_convert_plug(pos);
12515 +
12516 + assert("edward-1000",
12517 + iplug == item_plugin_by_coord(&pos->coord));
12518 + assert("edward-1001", iplug->f.convert != NULL);
12519 + } else
12520 + assert("edward-1226", pos->child == NULL);
12521 +}
12522 +#else
12523 +
12524 +#define item_convert_invariant(pos) noop
12525 +
12526 +#endif
12527 +
12528 +/* Scan node items starting from the first one and apply to each
12529 +   item its flush ->convert() method (if any). This method may
12530 +   resize/kill the item, so the tree will be changed.
12531 +*/
12532 +static int convert_node(flush_pos_t * pos, znode * node)
12533 +{
12534 + int ret = 0;
12535 + item_plugin *iplug;
12536 +
12537 + assert("edward-304", pos != NULL);
12538 + assert("edward-305", pos->child == NULL);
12539 + assert("edward-475", znode_convertible(node));
12540 + assert("edward-669", znode_is_wlocked(node));
12541 + assert("edward-1210", !node_is_empty(node));
12542 +
12543 + if (znode_get_level(node) != LEAF_LEVEL)
12544 + /* unsupported */
12545 + goto exit;
12546 +
12547 + coord_init_first_unit(&pos->coord, node);
12548 +
12549 + while (1) {
12550 + ret = 0;
12551 + coord_set_to_left(&pos->coord);
12552 + item_convert_invariant(pos);
12553 +
12554 + iplug = item_plugin_by_coord(&pos->coord);
12555 + assert("edward-844", iplug != NULL);
12556 +
12557 + if (iplug->f.convert) {
12558 + ret = iplug->f.convert(pos);
12559 + if (ret)
12560 + goto exit;
12561 + }
12562 + assert("edward-307", pos->child == NULL);
12563 +
12564 + if (coord_next_item(&pos->coord)) {
12565 + /* node is over */
12566 +
12567 + if (!chaining_data_present(pos))
12568 + /* finished this node */
12569 + break;
12570 + if (should_chain_next_node(pos)) {
12571 + /* go to next node */
12572 + move_chaining_data(pos, 0 /* to next node */ );
12573 + break;
12574 + }
12575 + /* repeat this node */
12576 + move_chaining_data(pos, 1 /* this node */ );
12577 + continue;
12578 + }
12579 +		/* Node is not over.
12580 +		   Check if there is attached convert data.
12581 +		   If so, roll one item position back and repeat
12582 +		   on this node.
12583 +		*/
12584 + if (chaining_data_present(pos)) {
12585 +
12586 + if (iplug != item_plugin_by_coord(&pos->coord))
12587 + set_item_convert_count(pos, 0);
12588 +
12589 + ret = coord_prev_item(&pos->coord);
12590 + assert("edward-1003", !ret);
12591 +
12592 + move_chaining_data(pos, 1 /* this node */ );
12593 + }
12594 + }
12595 + JF_CLR(ZJNODE(node), JNODE_CONVERTIBLE);
12596 + znode_make_dirty(node);
12597 + exit:
12598 + assert("edward-1004", !ret);
12599 + return ret;
12600 +}
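+
+/* A hedged sketch of the per-item ->convert() dispatch implemented by
+ * convert_node() above, using simplified stand-in types (struct toy_item,
+ * struct toy_plug are not reiser4 types): walk the items of a node and
+ * invoke the optional hook, stopping on the first error (the chaining and
+ * repeat logic of the real function is omitted). */
+#if 0
+#include <stddef.h>
+
+struct toy_item;
+
+struct toy_plug {
+	int (*convert)(struct toy_item *item);	/* may be NULL */
+};
+
+struct toy_item {
+	struct toy_plug *plug;
+};
+
+static int toy_convert_node(struct toy_item *items, int nr_items)
+{
+	int i, ret;
+
+	for (i = 0; i < nr_items; i++) {
+		if (items[i].plug->convert == NULL)
+			continue;	/* no hook for this item type */
+		ret = items[i].plug->convert(&items[i]);
+		if (ret)
+			return ret;	/* propagate the first failure */
+	}
+	return 0;
+}
+#endif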
12601 +
12602 +/* Squeeze and allocate the right neighbor. This is called after @left and
12603 + its current children have been squeezed and allocated already. This
12604 +   procedure's job is to squeeze and allocate items from @right to @left.
12605 +
12606 + If at the leaf level, use the shift_everything_left memcpy-optimized
12607 + version of shifting (squeeze_right_leaf).
12608 +
12609 + If at the twig level, extents are allocated as they are shifted from @right
12610 + to @left (squalloc_right_twig).
12611 +
12612 + At any other level, shift one internal item and return to the caller
12613 + (squalloc_parent_first) so that the shifted-subtree can be processed in
12614 + parent-first order.
12615 +
12616 +   When a unit of an internal item is moved, squeezing stops and SUBTREE_MOVED is
12617 + returned. When all content of @right is squeezed, SQUEEZE_SOURCE_EMPTY is
12618 + returned. If nothing can be moved into @left anymore, SQUEEZE_TARGET_FULL
12619 + is returned.
12620 +*/
12621 +
12622 +static int squeeze_right_neighbor(flush_pos_t * pos, znode * left,
12623 + znode * right)
12624 +{
12625 + int ret;
12626 +
12627 +	/* FIXME: it is possible to see an empty hasn't-heard-banshee node in
12628 +	 * a tree, owing to an error (for example, ENOSPC) in write */
12629 + /* assert("jmacd-9321", !node_is_empty(left)); */
12630 + assert("jmacd-9322", !node_is_empty(right));
12631 + assert("jmacd-9323", znode_get_level(left) == znode_get_level(right));
12632 +
12633 + switch (znode_get_level(left)) {
12634 + case TWIG_LEVEL:
12635 + /* Shift with extent allocating until either an internal item
12636 +		   is encountered, or everything is shifted, or no free space
12637 +		   is left in @left */
12638 + ret = squeeze_right_twig(left, right, pos);
12639 + break;
12640 +
12641 + default:
12642 + /* All other levels can use shift_everything until we implement per-item
12643 + flush plugins. */
12644 + ret = squeeze_right_non_twig(left, right);
12645 + break;
12646 + }
12647 +
12648 + assert("jmacd-2011", (ret < 0 ||
12649 + ret == SQUEEZE_SOURCE_EMPTY
12650 + || ret == SQUEEZE_TARGET_FULL
12651 + || ret == SUBTREE_MOVED));
12652 + return ret;
12653 +}
12654 +
12655 +static int squeeze_right_twig_and_advance_coord(flush_pos_t * pos,
12656 + znode * right)
12657 +{
12658 + int ret;
12659 +
12660 + ret = squeeze_right_twig(pos->lock.node, right, pos);
12661 + if (ret < 0)
12662 + return ret;
12663 + if (ret > 0) {
12664 + coord_init_after_last_item(&pos->coord, pos->lock.node);
12665 + return ret;
12666 + }
12667 +
12668 + coord_init_last_unit(&pos->coord, pos->lock.node);
12669 + return 0;
12670 +}
12671 +
12672 +/* forward declaration */
12673 +static int squalloc_upper_levels(flush_pos_t *, znode *, znode *);
12674 +
12675 +/* do a fast check for "same parents" condition before calling
12676 + * squalloc_upper_levels() */
12677 +static inline int check_parents_and_squalloc_upper_levels(flush_pos_t * pos,
12678 + znode * left,
12679 + znode * right)
12680 +{
12681 + if (znode_same_parents(left, right))
12682 + return 0;
12683 +
12684 + return squalloc_upper_levels(pos, left, right);
12685 +}
12686 +
12687 +/* Check whether the parent of the given @right node needs to be processed
12688 +   ((re)allocated) prior to processing of the child. If @left and @right do
12689 +   not share a parent, the parent of @right stands after @left but before
12690 +   @right in parent-first order, so we have to (re)allocate it before @right
12691 +   gets (re)allocated. */
12692 +static int squalloc_upper_levels(flush_pos_t * pos, znode * left, znode * right)
12693 +{
12694 + int ret;
12695 +
12696 + lock_handle left_parent_lock;
12697 + lock_handle right_parent_lock;
12698 +
12699 + load_count left_parent_load;
12700 + load_count right_parent_load;
12701 +
12702 + init_lh(&left_parent_lock);
12703 + init_lh(&right_parent_lock);
12704 +
12705 + init_load_count(&left_parent_load);
12706 + init_load_count(&right_parent_load);
12707 +
12708 + ret = reiser4_get_parent(&left_parent_lock, left, ZNODE_WRITE_LOCK);
12709 + if (ret)
12710 + goto out;
12711 +
12712 + ret = reiser4_get_parent(&right_parent_lock, right, ZNODE_WRITE_LOCK);
12713 + if (ret)
12714 + goto out;
12715 +
12716 + /* Check for same parents */
12717 + if (left_parent_lock.node == right_parent_lock.node)
12718 + goto out;
12719 +
12720 + if (znode_check_flushprepped(right_parent_lock.node)) {
12721 +		/* Keep parent-first order. In that order, the right parent node
12722 +		   stands before the @right node. If it is already allocated, we
12723 +		   set the preceder (next block search start point) to its block
12724 +		   number; the @right node should be allocated after it.
12725 +
12726 +		   However, the preceder is set only if the right parent is on the
12727 +		   twig level. The explanation: new branch nodes are allocated over
12728 +		   already allocated children while the tree grows, and it is hard
12729 +		   to keep the tree ordered, so we assume that only leaves and
12730 +		   twigs are correctly allocated. Thus, only twigs are used as a
12731 +		   preceder for allocating the rest of the slum. */
12732 + if (znode_get_level(right_parent_lock.node) == TWIG_LEVEL) {
12733 + pos->preceder.blk =
12734 + *znode_get_block(right_parent_lock.node);
12735 + check_preceder(pos->preceder.blk);
12736 + }
12737 + goto out;
12738 + }
12739 +
12740 + ret = incr_load_count_znode(&left_parent_load, left_parent_lock.node);
12741 + if (ret)
12742 + goto out;
12743 +
12744 + ret = incr_load_count_znode(&right_parent_load, right_parent_lock.node);
12745 + if (ret)
12746 + goto out;
12747 +
12748 + ret =
12749 + squeeze_right_neighbor(pos, left_parent_lock.node,
12750 + right_parent_lock.node);
12751 +	/* We stop on error. We also stop if some items/units were shifted
12752 +	 * (ret == 0) and thus @right changed its parent; in that case we need
12753 +	 * not process the right_parent node prior to processing of @right.
12754 +	 * Positive return values say that no items were shifted because of
12755 +	 * the "empty source" or "target full" condition. */
12756 + if (ret <= 0)
12757 + goto out;
12758 +
12759 + /* parent(@left) and parent(@right) may have different parents also. We
12760 + * do a recursive call for checking that. */
12761 + ret =
12762 + check_parents_and_squalloc_upper_levels(pos, left_parent_lock.node,
12763 + right_parent_lock.node);
12764 + if (ret)
12765 + goto out;
12766 +
12767 + /* allocate znode when going down */
12768 + ret = lock_parent_and_allocate_znode(right_parent_lock.node, pos);
12769 +
12770 + out:
12771 + done_load_count(&left_parent_load);
12772 + done_load_count(&right_parent_load);
12773 +
12774 + done_lh(&left_parent_lock);
12775 + done_lh(&right_parent_lock);
12776 +
12777 + return ret;
12778 +}
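+
+/* A hedged sketch of the recursive "process ancestors first" shape of
+ * squalloc_upper_levels() above; struct toy_node and toy_allocate() are
+ * illustrative stand-ins, not znode API. When two adjacent nodes have
+ * different parents, the right parent is handled before the right node,
+ * recursing upward as long as the parents themselves differ. */
+#if 0
+#include <stddef.h>
+
+struct toy_node {
+	struct toy_node *parent;
+	int allocated;
+};
+
+static void toy_allocate(struct toy_node *n)
+{
+	n->allocated = 1;
+}
+
+static void toy_squalloc_upper(struct toy_node *left, struct toy_node *right)
+{
+	/* adjacent nodes on the same level share depth, so either both
+	 * parents exist or neither does */
+	if (left->parent == NULL || left->parent == right->parent)
+		return;		/* common parent: nothing above to do */
+	/* parents differ: recurse so that higher ancestors go first ... */
+	toy_squalloc_upper(left->parent, right->parent);
+	/* ... then allocate the right parent before @right itself */
+	toy_allocate(right->parent);
+}
+#endif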
12779 +
12780 +/* Check the leftmost child's "flushprepped" status; also return true if the
12781 + * child node was not found in cache. */
12782 +static int leftmost_child_of_unit_check_flushprepped(const coord_t * coord)
12783 +{
12784 + int ret;
12785 + int prepped;
12786 +
12787 + jnode *child;
12788 +
12789 + ret = get_leftmost_child_of_unit(coord, &child);
12790 +
12791 + if (ret)
12792 + return ret;
12793 +
12794 + if (child) {
12795 + prepped = jnode_check_flushprepped(child);
12796 + jput(child);
12797 + } else {
12798 +		/* We treat a non-existent child as a node to which slum
12799 +		   processing should not continue. A node that is not cached
12800 +		   is clean, so it is flushprepped. */
12801 + prepped = 1;
12802 + }
12803 +
12804 + return prepped;
12805 +}
12806 +
12807 +/* (re)allocate znode with automated getting parent node */
12808 +static int lock_parent_and_allocate_znode(znode * node, flush_pos_t * pos)
12809 +{
12810 + int ret;
12811 + lock_handle parent_lock;
12812 + load_count parent_load;
12813 + coord_t pcoord;
12814 +
12815 + assert("zam-851", znode_is_write_locked(node));
12816 +
12817 + init_lh(&parent_lock);
12818 + init_load_count(&parent_load);
12819 +
12820 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
12821 + if (ret)
12822 + goto out;
12823 +
12824 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
12825 + if (ret)
12826 + goto out;
12827 +
12828 + ret = find_child_ptr(parent_lock.node, node, &pcoord);
12829 + if (ret)
12830 + goto out;
12831 +
12832 + ret = allocate_znode(node, &pcoord, pos);
12833 +
12834 + out:
12835 + done_load_count(&parent_load);
12836 + done_lh(&parent_lock);
12837 + return ret;
12838 +}
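+
+/* The function above follows the init/goto-out cleanup idiom used throughout
+ * this file: resources are initialized unconditionally up front, every
+ * failure jumps to a single exit label, and the teardown calls are safe
+ * whether or not the corresponding acquisition succeeded. A self-contained
+ * illustration with hypothetical names (toy_init, toy_acquire, toy_done): */
+#if 0
+struct toy_res {
+	int held;
+};
+
+static void toy_init(struct toy_res *r)    { r->held = 0; }
+static int  toy_acquire(struct toy_res *r) { r->held = 1; return 0; }
+static void toy_done(struct toy_res *r)    { r->held = 0; }
+
+static int toy_locked_op(void)
+{
+	int ret;
+	struct toy_res a, b;
+
+	toy_init(&a);		/* mark "not held"; toy_done() is now safe */
+	toy_init(&b);
+
+	ret = toy_acquire(&a);
+	if (ret)
+		goto out;
+	ret = toy_acquire(&b);
+	if (ret)
+		goto out;
+	/* ... do the real work under both resources here ... */
+ out:
+	toy_done(&b);		/* release in reverse acquisition order */
+	toy_done(&a);
+	return ret;
+}
+#endif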
12839 +
12840 +/* Process formatted nodes until an unformatted node or the rightmost node
12841 + * in the slum is reached. */
12842 +static int handle_pos_on_formatted(flush_pos_t * pos)
12843 +{
12844 + int ret;
12845 + lock_handle right_lock;
12846 + load_count right_load;
12847 +
12848 + init_lh(&right_lock);
12849 + init_load_count(&right_load);
12850 +
12851 + if (should_convert_node(pos, pos->lock.node)) {
12852 + ret = convert_node(pos, pos->lock.node);
12853 + if (ret)
12854 + return ret;
12855 + }
12856 +
12857 + while (1) {
12858 + ret =
12859 + neighbor_in_slum(pos->lock.node, &right_lock, RIGHT_SIDE,
12860 + ZNODE_WRITE_LOCK,
12861 + !should_convert_next_node(pos,
12862 + right_lock.
12863 + node));
12864 + if (ret)
12865 + break;
12866 +
12867 +		/* we don't prep (allocate) nodes for flushing twice. This can
12868 +		 * be suboptimal, or it can be optimal. For now we choose to
12869 +		 * live with the risk of suboptimality because coding something
12870 +		 * smarter would be quite complex. */
12871 + if (znode_check_flushprepped(right_lock.node)
12872 + && !znode_convertible(right_lock.node)) {
12873 + assert("edward-1005",
12874 + !should_convert_next_node(pos, right_lock.node));
12875 + pos_stop(pos);
12876 + break;
12877 + }
12878 +
12879 + ret = incr_load_count_znode(&right_load, right_lock.node);
12880 + if (ret)
12881 + break;
12882 +
12883 + if (should_convert_node(pos, right_lock.node)) {
12884 + ret = convert_node(pos, right_lock.node);
12885 + if (ret)
12886 + break;
12887 + if (node_is_empty(right_lock.node)) {
12888 + /* node became empty after converting, repeat */
12889 + done_load_count(&right_load);
12890 + done_lh(&right_lock);
12891 + continue;
12892 + }
12893 + }
12894 +
12895 + /* squeeze _before_ going upward. */
12896 + ret =
12897 + squeeze_right_neighbor(pos, pos->lock.node,
12898 + right_lock.node);
12899 + if (ret < 0)
12900 + break;
12901 +
12902 + if (znode_check_flushprepped(right_lock.node)) {
12903 + if (should_convert_next_node(pos, right_lock.node)) {
12904 + /* in spite of flushprepped status of the node,
12905 + its right slum neighbor should be converted */
12906 + assert("edward-953", convert_data(pos));
12907 + assert("edward-954", item_convert_data(pos));
12908 +
12909 + if (node_is_empty(right_lock.node)) {
12910 + done_load_count(&right_load);
12911 + done_lh(&right_lock);
12912 + } else
12913 + move_flush_pos(pos, &right_lock,
12914 + &right_load, NULL);
12915 + continue;
12916 + }
12917 + pos_stop(pos);
12918 + break;
12919 + }
12920 +
12921 + if (node_is_empty(right_lock.node)) {
12922 + /* repeat if right node was squeezed completely */
12923 + done_load_count(&right_load);
12924 + done_lh(&right_lock);
12925 + continue;
12926 + }
12927 +
12928 + /* parent(right_lock.node) has to be processed before
12929 + * (right_lock.node) due to "parent-first" allocation order. */
12930 + ret =
12931 + check_parents_and_squalloc_upper_levels(pos, pos->lock.node,
12932 + right_lock.node);
12933 + if (ret)
12934 + break;
12935 + /* (re)allocate _after_ going upward */
12936 + ret = lock_parent_and_allocate_znode(right_lock.node, pos);
12937 + if (ret)
12938 + break;
12939 +
12940 + if (should_terminate_squalloc(pos)) {
12941 + set_item_convert_count(pos, 0);
12942 + break;
12943 + }
12944 +
12945 + /* advance the flush position to the right neighbor */
12946 + move_flush_pos(pos, &right_lock, &right_load, NULL);
12947 +
12948 + ret = rapid_flush(pos);
12949 + if (ret)
12950 + break;
12951 + }
12952 +
12953 + assert("edward-1006", !convert_data(pos) || !item_convert_data(pos));
12954 +
12955 + done_load_count(&right_load);
12956 + done_lh(&right_lock);
12957 +
12958 +	/* This function indicates via @pos whether to stop, go to twig, or
12959 +	 * continue on the current level. */
12960 + return ret;
12961 +
12962 +}
12963 +
12964 +/* Process nodes on the leaf level until an unformatted node or the rightmost
12965 + * node in the slum is reached. */
12966 +static int handle_pos_on_leaf(flush_pos_t * pos)
12967 +{
12968 + int ret;
12969 +
12970 + assert("zam-845", pos->state == POS_ON_LEAF);
12971 +
12972 + ret = handle_pos_on_formatted(pos);
12973 +
12974 + if (ret == -E_NO_NEIGHBOR) {
12975 + /* cannot get right neighbor, go process extents. */
12976 + pos->state = POS_TO_TWIG;
12977 + return 0;
12978 + }
12979 +
12980 + return ret;
12981 +}
12982 +
12983 +/* Process slum on level > 1 */
12984 +static int handle_pos_on_internal(flush_pos_t * pos)
12985 +{
12986 + assert("zam-850", pos->state == POS_ON_INTERNAL);
12987 + return handle_pos_on_formatted(pos);
12988 +}
12989 +
12990 +/* check whether squalloc should stop before processing given extent */
12991 +static int squalloc_extent_should_stop(flush_pos_t * pos)
12992 +{
12993 + assert("zam-869", item_is_extent(&pos->coord));
12994 +
12995 +	/* pos->child is the jnode handle_pos_on_extent() should start with,
12996 +	 * instead of the first child of the first extent unit. */
12997 + if (pos->child) {
12998 + int prepped;
12999 +
13000 + assert("vs-1383", jnode_is_unformatted(pos->child));
13001 + prepped = jnode_check_flushprepped(pos->child);
13002 + pos->pos_in_unit =
13003 + jnode_get_index(pos->child) -
13004 + extent_unit_index(&pos->coord);
13005 + assert("vs-1470",
13006 + pos->pos_in_unit < extent_unit_width(&pos->coord));
13007 + assert("nikita-3434",
13008 + ergo(extent_is_unallocated(&pos->coord),
13009 + pos->pos_in_unit == 0));
13010 + jput(pos->child);
13011 + pos->child = NULL;
13012 +
13013 + return prepped;
13014 + }
13015 +
13016 + pos->pos_in_unit = 0;
13017 + if (extent_is_unallocated(&pos->coord))
13018 + return 0;
13019 +
13020 + return leftmost_child_of_unit_check_flushprepped(&pos->coord);
13021 +}
13022 +
13023 +/* Handle the case when the regular reiser4 tree (znodes connected to their
13024 + * neighbors by sibling pointers) is interrupted on the leaf level by one or
13025 + * more unformatted nodes. By holding a lock on the twig level and using
13026 + * extent code routines to process the unformatted nodes, we navigate around
13027 + * an irregular part of the reiser4 tree. */
13028 +static int handle_pos_on_twig(flush_pos_t * pos)
13029 +{
13030 + int ret;
13031 +
13032 + assert("zam-844", pos->state == POS_ON_EPOINT);
13033 + assert("zam-843", item_is_extent(&pos->coord));
13034 +
13035 +	/* We decide whether to continue slum processing with the current
13036 +	   extent unit: if the leftmost child of the current extent unit is
13037 +	   flushprepped (i.e. clean or already processed by flush) we stop
13038 +	   squalloc(). There is a fast check for unallocated extents, which we
13039 +	   assume contain only not-yet-flushprepped nodes. */
13040 +	/* FIXME: Here we implement a simple check; we only look at the
13041 +	   leftmost child. */
13042 + ret = squalloc_extent_should_stop(pos);
13043 + if (ret != 0) {
13044 + pos_stop(pos);
13045 + return ret;
13046 + }
13047 +
13048 + while (pos_valid(pos) && coord_is_existing_unit(&pos->coord)
13049 + && item_is_extent(&pos->coord)) {
13050 + ret = reiser4_alloc_extent(pos);
13051 + if (ret) {
13052 + break;
13053 + }
13054 + coord_next_unit(&pos->coord);
13055 + }
13056 +
13057 + if (coord_is_after_rightmost(&pos->coord)) {
13058 + pos->state = POS_END_OF_TWIG;
13059 + return 0;
13060 + }
13061 + if (item_is_internal(&pos->coord)) {
13062 + pos->state = POS_TO_LEAF;
13063 + return 0;
13064 + }
13065 +
13066 + assert("zam-860", item_is_extent(&pos->coord));
13067 +
13068 + /* "slum" is over */
13069 + pos->state = POS_INVALID;
13070 + return 0;
13071 +}
13072 +
13073 +/* When about to return the flush position from twig to leaf level, we can
13074 + * either process the right twig node or move the position to the leaf. This
13075 + * processes the right twig if possible and jumps to the leaf level if not. */
13076 +static int handle_pos_end_of_twig(flush_pos_t * pos)
13077 +{
13078 + int ret;
13079 + lock_handle right_lock;
13080 + load_count right_load;
13081 + coord_t at_right;
13082 + jnode *child = NULL;
13083 +
13084 + assert("zam-848", pos->state == POS_END_OF_TWIG);
13085 + assert("zam-849", coord_is_after_rightmost(&pos->coord));
13086 +
13087 + init_lh(&right_lock);
13088 + init_load_count(&right_load);
13089 +
13090 +	/* We get a lock on the right twig node even if it is not dirty because
13091 +	 * the slum continues or discontinues on the leaf level, not on the next
13092 +	 * twig. This lock on the right twig is needed to get its leftmost child. */
13093 + ret =
13094 + reiser4_get_right_neighbor(&right_lock, pos->lock.node,
13095 + ZNODE_WRITE_LOCK, GN_SAME_ATOM);
13096 + if (ret)
13097 + goto out;
13098 +
13099 + ret = incr_load_count_znode(&right_load, right_lock.node);
13100 + if (ret)
13101 + goto out;
13102 +
13103 + /* right twig could be not dirty */
13104 + if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY)) {
13105 +		/* If the right twig node is dirty we always attempt to squeeze
13106 +		 * its content to the left... */
13107 + became_dirty:
13108 + ret =
13109 + squeeze_right_twig_and_advance_coord(pos, right_lock.node);
13110 + if (ret <= 0) {
13111 + /* pos->coord is on internal item, go to leaf level, or
13112 + * we have an error which will be caught in squalloc() */
13113 + pos->state = POS_TO_LEAF;
13114 + goto out;
13115 + }
13116 +
13117 +		/* If the right twig was squeezed completely we have to re-lock
13118 +		 * the right twig; now this is done through the top-level
13119 +		 * squalloc routine. */
13120 + if (node_is_empty(right_lock.node))
13121 + goto out;
13122 +
13123 + /* ... and prep it if it is not yet prepped */
13124 + if (!znode_check_flushprepped(right_lock.node)) {
13125 + /* As usual, process parent before ... */
13126 + ret =
13127 + check_parents_and_squalloc_upper_levels(pos,
13128 + pos->lock.
13129 + node,
13130 + right_lock.
13131 + node);
13132 + if (ret)
13133 + goto out;
13134 +
13135 + /* ... processing the child */
13136 + ret =
13137 + lock_parent_and_allocate_znode(right_lock.node,
13138 + pos);
13139 + if (ret)
13140 + goto out;
13141 + }
13142 + } else {
13143 + coord_init_first_unit(&at_right, right_lock.node);
13144 +
13145 + /* check first child of next twig, should we continue there ? */
13146 + ret = get_leftmost_child_of_unit(&at_right, &child);
13147 + if (ret || child == NULL || jnode_check_flushprepped(child)) {
13148 + pos_stop(pos);
13149 + goto out;
13150 + }
13151 +
13152 + /* check clean twig for possible relocation */
13153 + if (!znode_check_flushprepped(right_lock.node)) {
13154 + ret =
13155 + reverse_relocate_check_dirty_parent(child,
13156 + &at_right, pos);
13157 + if (ret)
13158 + goto out;
13159 + if (JF_ISSET(ZJNODE(right_lock.node), JNODE_DIRTY))
13160 + goto became_dirty;
13161 + }
13162 + }
13163 +
13164 + assert("zam-875", znode_check_flushprepped(right_lock.node));
13165 +
13166 +	/* Update the preceder with the block number of the just-processed
13167 +	 * right twig node. The code above could miss the preceder update
13168 +	 * because allocate_znode() might not have been called for this node. */
13169 + pos->preceder.blk = *znode_get_block(right_lock.node);
13170 + check_preceder(pos->preceder.blk);
13171 +
13172 + coord_init_first_unit(&at_right, right_lock.node);
13173 + assert("zam-868", coord_is_existing_unit(&at_right));
13174 +
13175 + pos->state = item_is_extent(&at_right) ? POS_ON_EPOINT : POS_TO_LEAF;
13176 + move_flush_pos(pos, &right_lock, &right_load, &at_right);
13177 +
13178 + out:
13179 + done_load_count(&right_load);
13180 + done_lh(&right_lock);
13181 +
13182 + if (child)
13183 + jput(child);
13184 +
13185 + return ret;
13186 +}
13187 +
13188 +/* Move pos->lock to the leaf node pointed to by pos->coord and check
13189 + * whether we should continue there. */
13190 +static int handle_pos_to_leaf(flush_pos_t * pos)
13191 +{
13192 + int ret;
13193 + lock_handle child_lock;
13194 + load_count child_load;
13195 + jnode *child;
13196 +
13197 + assert("zam-846", pos->state == POS_TO_LEAF);
13198 + assert("zam-847", item_is_internal(&pos->coord));
13199 +
13200 + init_lh(&child_lock);
13201 + init_load_count(&child_load);
13202 +
13203 + ret = get_leftmost_child_of_unit(&pos->coord, &child);
13204 + if (ret)
13205 + return ret;
13206 + if (child == NULL) {
13207 + pos_stop(pos);
13208 + return 0;
13209 + }
13210 +
13211 + if (jnode_check_flushprepped(child)) {
13212 + pos->state = POS_INVALID;
13213 + goto out;
13214 + }
13215 +
13216 + ret =
13217 + longterm_lock_znode(&child_lock, JZNODE(child), ZNODE_WRITE_LOCK,
13218 + ZNODE_LOCK_LOPRI);
13219 + if (ret)
13220 + goto out;
13221 +
13222 + ret = incr_load_count_znode(&child_load, JZNODE(child));
13223 + if (ret)
13224 + goto out;
13225 +
13226 + ret = allocate_znode(JZNODE(child), &pos->coord, pos);
13227 + if (ret)
13228 + goto out;
13229 +
13230 + /* move flush position to leaf level */
13231 + pos->state = POS_ON_LEAF;
13232 + move_flush_pos(pos, &child_lock, &child_load, NULL);
13233 +
13234 + if (node_is_empty(JZNODE(child))) {
13235 + ret = delete_empty_node(JZNODE(child));
13236 + pos->state = POS_INVALID;
13237 + }
13238 + out:
13239 + done_load_count(&child_load);
13240 + done_lh(&child_lock);
13241 + jput(child);
13242 +
13243 + return ret;
13244 +}
13245 +
13246 +/* Move pos from leaf to twig: move both the position and the lock from the
13247 +   leaf to the upper (twig) level. */
13248 +static int handle_pos_to_twig(flush_pos_t * pos)
13249 +{
13250 + int ret;
13251 +
13252 + lock_handle parent_lock;
13253 + load_count parent_load;
13254 + coord_t pcoord;
13255 +
13256 + assert("zam-852", pos->state == POS_TO_TWIG);
13257 +
13258 + init_lh(&parent_lock);
13259 + init_load_count(&parent_load);
13260 +
13261 + ret =
13262 + reiser4_get_parent(&parent_lock, pos->lock.node, ZNODE_WRITE_LOCK);
13263 + if (ret)
13264 + goto out;
13265 +
13266 + ret = incr_load_count_znode(&parent_load, parent_lock.node);
13267 + if (ret)
13268 + goto out;
13269 +
13270 + ret = find_child_ptr(parent_lock.node, pos->lock.node, &pcoord);
13271 + if (ret)
13272 + goto out;
13273 +
13274 + assert("zam-870", item_is_internal(&pcoord));
13275 + coord_next_item(&pcoord);
13276 +
13277 + if (coord_is_after_rightmost(&pcoord))
13278 + pos->state = POS_END_OF_TWIG;
13279 + else if (item_is_extent(&pcoord))
13280 + pos->state = POS_ON_EPOINT;
13281 + else {
13282 +		/* Here we understand that getting -E_NO_NEIGHBOR in
13283 +		 * handle_pos_on_leaf() was just because we reached the edge
13284 +		 * of the slum */
13285 + pos_stop(pos);
13286 + goto out;
13287 + }
13288 +
13289 + move_flush_pos(pos, &parent_lock, &parent_load, &pcoord);
13290 +
13291 + out:
13292 + done_load_count(&parent_load);
13293 + done_lh(&parent_lock);
13294 +
13295 + return ret;
13296 +}
13297 +
13298 +typedef int (*pos_state_handle_t) (flush_pos_t *);
13299 +static pos_state_handle_t flush_pos_handlers[] = {
13300 + /* process formatted nodes on leaf level, keep lock on a leaf node */
13301 + [POS_ON_LEAF] = handle_pos_on_leaf,
13302 + /* process unformatted nodes, keep lock on twig node, pos->coord points to extent currently
13303 + * being processed */
13304 + [POS_ON_EPOINT] = handle_pos_on_twig,
13305 + /* move a lock from leaf node to its parent for further processing of unformatted nodes */
13306 + [POS_TO_TWIG] = handle_pos_to_twig,
13307 + /* move a lock from twig to leaf level when a processing of unformatted nodes finishes,
13308 + * pos->coord points to the leaf node we jump to */
13309 + [POS_TO_LEAF] = handle_pos_to_leaf,
13310 +	/* after processing the last extent in the twig node, attempt to shift items from the
13311 +	 * twig's right neighbor and process them while shifting */
13312 + [POS_END_OF_TWIG] = handle_pos_end_of_twig,
13313 + /* process formatted nodes on internal level, keep lock on an internal node */
13314 + [POS_ON_INTERNAL] = handle_pos_on_internal
13315 +};
13316 +
13317 +/* Advance flush position horizontally, prepare for flushing ((re)allocate, squeeze,
13318 + * encrypt) nodes and their ancestors in "parent-first" order */
13319 +static int squalloc(flush_pos_t * pos)
13320 +{
13321 + int ret = 0;
13322 +
13323 + /* maybe needs to be made a case statement with handle_pos_on_leaf as first case, for
13324 + * greater CPU efficiency? Measure and see.... -Hans */
13325 + while (pos_valid(pos)) {
13326 + ret = flush_pos_handlers[pos->state] (pos);
13327 + if (ret < 0)
13328 + break;
13329 +
13330 + ret = rapid_flush(pos);
13331 + if (ret)
13332 + break;
13333 + }
13334 +
13335 +	/* Any positive value or -E_NO_NEIGHBOR is a legal return code for the
13336 +	   handle_pos* routines; -E_NO_NEIGHBOR means the slum edge was reached */
13337 + if (ret > 0 || ret == -E_NO_NEIGHBOR)
13338 + ret = 0;
13339 +
13340 + return ret;
13341 +}
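+
+/* squalloc() above is a table-driven state machine: an array of handlers
+ * indexed by the position state, driven until the position becomes invalid.
+ * A self-contained sketch of the same shape (the states and handlers below
+ * are illustrative, not the real POS_* set): */
+#if 0
+#include <stdio.h>
+
+enum toy_state { ST_A, ST_B, ST_DONE, ST_NR };
+
+struct toy_pos {
+	enum toy_state state;
+};
+
+static int handle_a(struct toy_pos *p) { p->state = ST_B;    return 0; }
+static int handle_b(struct toy_pos *p) { p->state = ST_DONE; return 0; }
+
+/* designated initializers, like flush_pos_handlers[] above */
+static int (*toy_handlers[ST_NR])(struct toy_pos *) = {
+	[ST_A] = handle_a,
+	[ST_B] = handle_b,
+};
+
+int main(void)
+{
+	struct toy_pos pos = { .state = ST_A };
+	int ret = 0;
+
+	while (pos.state != ST_DONE) {
+		ret = toy_handlers[pos.state](&pos);
+		if (ret < 0)
+			break;	/* negative means hard error, as in squalloc() */
+	}
+	printf("finished with %d\n", ret);
+	return 0;
+}
+#endif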
13342 +
13343 +static void update_ldkey(znode * node)
13344 +{
13345 + reiser4_key ldkey;
13346 +
13347 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
13348 + if (node_is_empty(node))
13349 + return;
13350 +
13351 + znode_set_ld_key(node, leftmost_key_in_node(node, &ldkey));
13352 +}
13353 +
13354 +/* This is to be called after the node's shift method has shifted data from @right to
13355 +   @left. It sets the left delimiting keys of @left and @right to the keys of their first
13356 +   items respectively, and sets the right delimiting key of @left to the first key of @right */
13357 +static void update_znode_dkeys(znode * left, znode * right)
13358 +{
13359 + assert_rw_write_locked(&(znode_get_tree(right)->dk_lock));
13360 + assert("vs-1629", (znode_is_write_locked(left) &&
13361 + znode_is_write_locked(right)));
13362 +
13363 +	/* we need to update the left delimiting key of @left if it was empty before the shift */
13364 + update_ldkey(left);
13365 + update_ldkey(right);
13366 + if (node_is_empty(right))
13367 + znode_set_rd_key(left, znode_get_rd_key(right));
13368 + else
13369 + znode_set_rd_key(left, znode_get_ld_key(right));
13370 +}
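+
+/* A small sketch of the delimiting-key contract maintained above, with a
+ * plain integer standing in for reiser4_key: after a right-to-left shift,
+ * each non-empty node's left delimiting key equals its first key, and the
+ * left node's right delimiting key equals the right node's left delimiting
+ * key (or its old right delimiting key if @right was emptied). */
+#if 0
+struct toy_dk_node {
+	int first_key;	/* key of the first remaining item */
+	int ld_key;	/* left delimiting key */
+	int rd_key;	/* right delimiting key */
+	int empty;	/* nonzero if no items remain */
+};
+
+static void toy_update_dkeys(struct toy_dk_node *left,
+			     struct toy_dk_node *right)
+{
+	if (!left->empty)
+		left->ld_key = left->first_key;
+	if (!right->empty) {
+		right->ld_key = right->first_key;
+		left->rd_key = right->ld_key;
+	} else {
+		/* everything moved left: @left now extends to @right's old
+		 * right boundary */
+		left->rd_key = right->rd_key;
+	}
+}
+#endif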
13371 +
13372 +/* try to shift everything from @right to @left. If everything was shifted -
13373 + @right is removed from the tree. Result is the number of bytes shifted. */
13374 +static int
13375 +shift_everything_left(znode * right, znode * left, carry_level * todo)
13376 +{
13377 + coord_t from;
13378 + node_plugin *nplug;
13379 + carry_plugin_info info;
13380 +
13381 + coord_init_after_last_item(&from, right);
13382 +
13383 + nplug = node_plugin_by_node(right);
13384 + info.doing = NULL;
13385 + info.todo = todo;
13386 + return nplug->shift(&from, left, SHIFT_LEFT,
13387 + 1 /* delete @right if it becomes empty */ ,
13388 + 1
13389 + /* move coord @from to node @left if everything will be shifted */
13390 + ,
13391 + &info);
13392 +}
13393 +
13394 +/* Shift as much as possible from @right to @left using the memcpy-optimized
13395 + shift_everything_left. @left and @right are formatted neighboring nodes on
13396 + leaf level. */
13397 +static int squeeze_right_non_twig(znode * left, znode * right)
13398 +{
13399 + int ret;
13400 + carry_pool *pool;
13401 + carry_level *todo;
13402 +
13403 + assert("nikita-2246", znode_get_level(left) == znode_get_level(right));
13404 +
13405 + if (!JF_ISSET(ZJNODE(left), JNODE_DIRTY) ||
13406 + !JF_ISSET(ZJNODE(right), JNODE_DIRTY))
13407 + return SQUEEZE_TARGET_FULL;
13408 +
13409 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo));
13410 + if (IS_ERR(pool))
13411 + return PTR_ERR(pool);
13412 + todo = (carry_level *) (pool + 1);
13413 + init_carry_level(todo, pool);
13414 +
13415 + ret = shift_everything_left(right, left, todo);
13416 + if (ret > 0) {
13417 + /* something was shifted */
13418 + reiser4_tree *tree;
13419 + __u64 grabbed;
13420 +
13421 + znode_make_dirty(left);
13422 + znode_make_dirty(right);
13423 +
13424 + /* update delimiting keys of nodes which participated in
13425 + shift. FIXME: it would be better to have this in shift
13426 + node's operation. But it can not be done there. Nobody
13427 + remembers why, though */
13428 + tree = znode_get_tree(left);
13429 + write_lock_dk(tree);
13430 + update_znode_dkeys(left, right);
13431 + write_unlock_dk(tree);
13432 +
13433 + /* Carry is called to update delimiting key and, maybe, to remove empty
13434 + node. */
13435 + grabbed = get_current_context()->grabbed_blocks;
13436 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13437 + assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13438 + ret = reiser4_carry(todo, NULL /* previous level */ );
13439 + grabbed2free_mark(grabbed);
13440 + } else {
13441 + /* Shifting impossible, we return appropriate result code */
13442 + ret =
13443 + node_is_empty(right) ? SQUEEZE_SOURCE_EMPTY :
13444 + SQUEEZE_TARGET_FULL;
13445 + }
13446 +
13447 + done_carry_pool(pool);
13448 +
13449 + return ret;
13450 +}
13451 +
13452 +#if REISER4_DEBUG
13453 +static int sibling_link_is_ok(const znode *left, const znode *right)
13454 +{
13455 + int result;
13456 +
13457 + read_lock_tree(znode_get_tree(left));
13458 + result = (left->right == right && left == right->left);
13459 + read_unlock_tree(znode_get_tree(left));
13460 + return result;
13461 +}
13462 +#endif
13463 +
13464 +/* Shift the first unit of the first item if it is an internal one.  Return
13465 + SQUEEZE_TARGET_FULL if it fails to shift an item, otherwise return
13466 + SUBTREE_MOVED. */
13467 +static int shift_one_internal_unit(znode * left, znode * right)
13468 +{
13469 + int ret;
13470 + carry_pool *pool;
13471 + carry_level *todo;
13472 + coord_t *coord;
13473 + carry_plugin_info *info;
13474 + int size, moved;
13475 +
13476 + assert("nikita-2247", znode_get_level(left) == znode_get_level(right));
13477 + assert("nikita-2435", znode_is_write_locked(left));
13478 + assert("nikita-2436", znode_is_write_locked(right));
13479 + assert("nikita-2434", sibling_link_is_ok(left, right));
13480 +
13481 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*todo) +
13482 + sizeof(*coord) + sizeof(*info)
13483 +#if REISER4_DEBUG
13484 + + sizeof(*coord) + 2 * sizeof(reiser4_key)
13485 +#endif
13486 + );
13487 + if (IS_ERR(pool))
13488 + return PTR_ERR(pool);
13489 + todo = (carry_level *) (pool + 1);
13490 + init_carry_level(todo, pool);
13491 +
13492 + coord = (coord_t *) (todo + 3);
13493 + coord_init_first_unit(coord, right);
13494 + info = (carry_plugin_info *) (coord + 1);
13495 +
13496 +#if REISER4_DEBUG
13497 + if (!node_is_empty(left)) {
13498 + coord_t *last;
13499 + reiser4_key *right_key;
13500 + reiser4_key *left_key;
13501 +
13502 + last = (coord_t *) (info + 1);
13503 + right_key = (reiser4_key *) (last + 1);
13504 + left_key = right_key + 1;
13505 + coord_init_last_unit(last, left);
13506 +
13507 + assert("nikita-2463",
13508 + keyle(item_key_by_coord(last, left_key),
13509 + item_key_by_coord(coord, right_key)));
13510 + }
13511 +#endif
13512 +
13513 + assert("jmacd-2007", item_is_internal(coord));
13514 +
13515 + size = item_length_by_coord(coord);
13516 + info->todo = todo;
13517 + info->doing = NULL;
13518 +
13519 + ret = node_plugin_by_node(left)->shift(coord, left, SHIFT_LEFT,
13520 + 1
13521 + /* delete @right if it becomes empty */
13522 + ,
13523 + 0
13524 + /* do not move coord @coord to node @left */
13525 + ,
13526 + info);
13527 +
13528 + /* If shift returns positive, then we shifted the item. */
13529 + assert("vs-423", ret <= 0 || size == ret);
13530 + moved = (ret > 0);
13531 +
13532 + if (moved) {
13533 + /* something was moved */
13534 + reiser4_tree *tree;
13535 + int grabbed;
13536 +
13537 + znode_make_dirty(left);
13538 + znode_make_dirty(right);
13539 + tree = znode_get_tree(left);
13540 + write_lock_dk(tree);
13541 + update_znode_dkeys(left, right);
13542 + write_unlock_dk(tree);
13543 +
13544 + /* reserve space for delimiting keys after shifting */
13545 + grabbed = get_current_context()->grabbed_blocks;
13546 + ret = reiser4_grab_space_force(tree->height, BA_RESERVED);
13547 + assert("nikita-3003", ret == 0); /* reserved space is exhausted. Ask Hans. */
13548 +
13549 + ret = reiser4_carry(todo, NULL /* previous level */ );
13550 + grabbed2free_mark(grabbed);
13551 + }
13552 +
13553 + done_carry_pool(pool);
13554 +
13555 + if (ret != 0) {
13556 + /* Shift or carry operation failed. */
13557 + assert("jmacd-7325", ret < 0);
13558 + return ret;
13559 + }
13560 +
13561 + return moved ? SUBTREE_MOVED : SQUEEZE_TARGET_FULL;
13562 +}
13563 +
13564 +/* Make the final relocate/wander decision during forward parent-first squalloc for a
13565 + znode. For unformatted nodes this is done in plugin/item/extent.c:extent_needs_allocation(). */
13566 +static int
13567 +allocate_znode_loaded(znode * node,
13568 + const coord_t * parent_coord, flush_pos_t * pos)
13569 +{
13570 + int ret;
13571 + reiser4_super_info_data *sbinfo = get_current_super_private();
13572 + /* FIXME(D): We have the node write-locked and should have checked for !
13573 + allocated() somewhere before reaching this point, but there can be a race, so
13574 + this assertion is bogus. */
13575 + assert("jmacd-7987", !jnode_check_flushprepped(ZJNODE(node)));
13576 + assert("jmacd-7988", znode_is_write_locked(node));
13577 + assert("jmacd-7989", coord_is_invalid(parent_coord)
13578 + || znode_is_write_locked(parent_coord->node));
13579 +
13580 + if (ZF_ISSET(node, JNODE_REPACK) || ZF_ISSET(node, JNODE_CREATED) ||
13581 + znode_is_root(node) ||
13582 + /* We have enough nodes to relocate no matter what. */
13583 + (pos->leaf_relocate != 0 && znode_get_level(node) == LEAF_LEVEL)) {
13584 + /* No need to decide with new nodes, they are treated the same as
13585 + relocate. If the root node is dirty, relocate. */
13586 + if (pos->preceder.blk == 0) {
13587 +		/* the preceder is unknown and we have decided to relocate the
13588 +		   node -- using the default value for the search start is
13589 +		   better than searching from block #0. */
13590 + get_blocknr_hint_default(&pos->preceder.blk);
13591 + check_preceder(pos->preceder.blk);
13592 + }
13593 +
13594 + goto best_reloc;
13595 +
13596 + } else if (pos->preceder.blk == 0) {
13597 + /* If we don't know the preceder, leave it where it is. */
13598 + jnode_make_wander(ZJNODE(node));
13599 + } else {
13600 + /* Make a decision based on block distance. */
13601 + reiser4_block_nr dist;
13602 + reiser4_block_nr nblk = *znode_get_block(node);
13603 +
13604 + assert("jmacd-6172", !reiser4_blocknr_is_fake(&nblk));
13605 + assert("jmacd-6173", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13606 + assert("jmacd-6174", pos->preceder.blk != 0);
13607 +
13608 + if (pos->preceder.blk == nblk - 1) {
13609 + /* Ideal. */
13610 + jnode_make_wander(ZJNODE(node));
13611 + } else {
13612 +
13613 + dist =
13614 + (nblk <
13615 + pos->preceder.blk) ? (pos->preceder.blk -
13616 + nblk) : (nblk -
13617 + pos->preceder.blk);
13618 +
13619 + /* See if we can find a closer block (forward direction only). */
13620 + pos->preceder.max_dist =
13621 + min((reiser4_block_nr) sbinfo->flush.
13622 + relocate_distance, dist);
13623 + pos->preceder.level = znode_get_level(node);
13624 +
13625 + ret = allocate_znode_update(node, parent_coord, pos);
13626 +
13627 + pos->preceder.max_dist = 0;
13628 +
13629 + if (ret && (ret != -ENOSPC))
13630 + return ret;
13631 +
13632 + if (ret == 0) {
13633 + /* Got a better allocation. */
13634 + znode_make_reloc(node, pos->fq);
13635 + } else if (dist < sbinfo->flush.relocate_distance) {
13636 + /* The present allocation is good enough. */
13637 + jnode_make_wander(ZJNODE(node));
13638 + } else {
13639 + /* Otherwise, try to relocate to the best position. */
13640 + best_reloc:
13641 + ret =
13642 + allocate_znode_update(node, parent_coord,
13643 + pos);
13644 + if (ret != 0)
13645 + return ret;
13646 +
13647 + /* set JNODE_RELOC bit _after_ node gets allocated */
13648 + znode_make_reloc(node, pos->fq);
13649 + }
13650 + }
13651 + }
13652 +
13653 + /* This is the new preceder. */
13654 + pos->preceder.blk = *znode_get_block(node);
13655 + check_preceder(pos->preceder.blk);
13656 + pos->alloc_cnt += 1;
13657 +
13658 + assert("jmacd-4277", !reiser4_blocknr_is_fake(&pos->preceder.blk));
13659 +
13660 + return 0;
13661 +}
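+
+/* The relocate/wander choice above reduces to a distance test against the
+ * preceder. A hedged, self-contained sketch of that heuristic (the block
+ * numbers and threshold are example values only; the real policy also
+ * covers dirty roots, JNODE_CREATED nodes and the leaf_relocate override): */
+#if 0
+#include <stdio.h>
+
+typedef unsigned long long blocknr_t;
+
+enum toy_decision { TOY_WANDER, TOY_RELOCATE };
+
+static enum toy_decision toy_decide(blocknr_t preceder, blocknr_t current,
+				    blocknr_t relocate_distance)
+{
+	blocknr_t dist;
+
+	if (current == preceder + 1)
+		return TOY_WANDER;	/* already ideally placed */
+	dist = current < preceder ? preceder - current : current - preceder;
+	/* close to the preceder: overwrite in place (wander); far away:
+	 * relocate toward the preceder so flushes stay contiguous */
+	return dist < relocate_distance ? TOY_WANDER : TOY_RELOCATE;
+}
+
+int main(void)
+{
+	/* a 64-block threshold, purely as an example value */
+	printf("%d\n", toy_decide(1000, 1010, 64));	/* 0: wander */
+	printf("%d\n", toy_decide(1000, 5000, 64));	/* 1: relocate */
+	return 0;
+}
+#endif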
13662 +
13663 +static int
13664 +allocate_znode(znode * node, const coord_t * parent_coord, flush_pos_t * pos)
13665 +{
13666 + /*
13667 + * perform znode allocation with znode pinned in memory to avoid races
13668 + * with asynchronous emergency flush (which plays with
13669 + * JNODE_FLUSH_RESERVED bit).
13670 + */
13671 + return WITH_DATA(node, allocate_znode_loaded(node, parent_coord, pos));
13672 +}
13673 +
13674 +/* A subroutine of allocate_znode, this is called first to see if there is a close
13675 +   position to relocate to. It may return -ENOSPC if there is no close position,
13676 +   in which case it does not relocate. This takes care of updating the parent node
13677 +   with the relocated block address. */
13678 +static int
13679 +allocate_znode_update(znode * node, const coord_t * parent_coord,
13680 + flush_pos_t * pos)
13681 +{
13682 + int ret;
13683 + reiser4_block_nr blk;
13684 + lock_handle uber_lock;
13685 + int flush_reserved_used = 0;
13686 + int grabbed;
13687 + reiser4_context *ctx;
13688 + reiser4_super_info_data *sbinfo;
13689 +
13690 + init_lh(&uber_lock);
13691 +
13692 + ctx = get_current_context();
13693 + sbinfo = get_super_private(ctx->super);
13694 +
13695 + grabbed = ctx->grabbed_blocks;
13696 +
13697 + /* discard e-flush allocation */
13698 + ret = zload(node);
13699 + if (ret)
13700 + return ret;
13701 +
13702 + if (ZF_ISSET(node, JNODE_CREATED)) {
13703 + assert("zam-816", reiser4_blocknr_is_fake(znode_get_block(node)));
13704 + pos->preceder.block_stage = BLOCK_UNALLOCATED;
13705 + } else {
13706 + pos->preceder.block_stage = BLOCK_GRABBED;
13707 +
13708 +		/* The disk space for relocating @node is already reserved in the "flush
13709 +		 * reserved" counter if @node is a leaf; otherwise we grab space using
13710 +		 * BA_RESERVED (which means grab space from the whole disk, not from only 95%). */
13711 + if (znode_get_level(node) == LEAF_LEVEL) {
13712 + /*
13713 + * earlier (during do_jnode_make_dirty()) we decided
13714 + * that @node can possibly go into overwrite set and
13715 + * reserved block for its wandering location.
13716 + */
13717 + txn_atom *atom = get_current_atom_locked();
13718 + assert("nikita-3449",
13719 + ZF_ISSET(node, JNODE_FLUSH_RESERVED));
13720 + flush_reserved2grabbed(atom, (__u64) 1);
13721 + spin_unlock_atom(atom);
13722 + /*
13723 + * we are trying to move node into relocate
13724 + * set. Allocation of relocated position "uses"
13725 + * reserved block.
13726 + */
13727 + ZF_CLR(node, JNODE_FLUSH_RESERVED);
13728 + flush_reserved_used = 1;
13729 + } else {
13730 + ret = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
13731 + if (ret != 0)
13732 + goto exit;
13733 + }
13734 + }
13735 +
13736 +	/* We may not be using the 5% of reserved disk space here, and then flush will not pack tightly. */
13737 + ret = reiser4_alloc_block(&pos->preceder, &blk,
13738 + BA_FORMATTED | BA_PERMANENT);
13739 + if (ret)
13740 + goto exit;
13741 +
13742 + if (!ZF_ISSET(node, JNODE_CREATED) &&
13743 + (ret =
13744 + reiser4_dealloc_block(znode_get_block(node), 0,
13745 + BA_DEFER | BA_FORMATTED)))
13746 + goto exit;
13747 +
13748 + if (likely(!znode_is_root(node))) {
13749 + item_plugin *iplug;
13750 +
13751 + iplug = item_plugin_by_coord(parent_coord);
13752 + assert("nikita-2954", iplug->f.update != NULL);
13753 + iplug->f.update(parent_coord, &blk);
13754 +
13755 + znode_make_dirty(parent_coord->node);
13756 +
13757 + } else {
13758 + reiser4_tree *tree = znode_get_tree(node);
13759 + znode *uber;
13760 +
13761 + /* We take a longterm lock on the fake node in order to change
13762 + the root block number. This may cause atom fusion. */
13763 + ret = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
13764 + &uber_lock);
13765 + /* The fake node cannot be deleted, and we must have priority
13766 + here, and may not be confused with ENOSPC. */
13767 + assert("jmacd-74412",
13768 + ret != -EINVAL && ret != -E_DEADLOCK && ret != -ENOSPC);
13769 +
13770 + if (ret)
13771 + goto exit;
13772 +
13773 + uber = uber_lock.node;
13774 +
13775 + write_lock_tree(tree);
13776 + tree->root_block = blk;
13777 + write_unlock_tree(tree);
13778 +
13779 + znode_make_dirty(uber);
13780 + }
13781 +
13782 + ret = znode_rehash(node, &blk);
13783 + exit:
13784 + if (ret) {
13785 + /* Get flush reserved block back if something fails, because
13786 + * callers assume that on error block wasn't relocated and its
13787 + * flush reserved block wasn't used. */
13788 + if (flush_reserved_used) {
13789 + /*
13790 + * ok, we failed to move node into relocate
13791 + * set. Restore status quo.
13792 + */
13793 + grabbed2flush_reserved((__u64) 1);
13794 + ZF_SET(node, JNODE_FLUSH_RESERVED);
13795 + }
13796 + }
13797 + zrelse(node);
13798 + done_lh(&uber_lock);
13799 + grabbed2free_mark(grabbed);
13800 + return ret;
13801 +}
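+
+/* The relocation above follows an "allocate new, defer-free old, repoint
+ * parent" sequence, with the reservation rolled back on failure. A compact
+ * sketch of that sequence; all names (toy_alloc_block, toy_defer_dealloc)
+ * are illustrative stand-ins, not reiser4 API: */
+#if 0
+typedef unsigned long long blocknr_t;
+
+struct toy_znode {
+	blocknr_t blk;
+	blocknr_t *parent_ptr;	/* parent's pointer to this node's block */
+};
+
+static int toy_alloc_block(blocknr_t *out)
+{
+	*out = 42;	/* stands for a freshly allocated block number */
+	return 0;
+}
+
+static void toy_defer_dealloc(blocknr_t b)
+{
+	(void)b;	/* the old block is only freed at transaction commit */
+}
+
+static int toy_relocate(struct toy_znode *node)
+{
+	blocknr_t blk;
+	int ret = toy_alloc_block(&blk);
+
+	if (ret)
+		return ret;	/* caller rolls back its reservation */
+	/* the old location stays valid until the transaction commits */
+	toy_defer_dealloc(node->blk);
+	node->blk = blk;
+	*node->parent_ptr = blk;	/* repoint the parent (or the root) */
+	return 0;
+}
+#endif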
13802 +
13803 +/* JNODE INTERFACE */
13804 +
13805 +/* Lock a node (if formatted), then get its parent locked and set the child's
13806 + coordinate in the parent. If the child is the root node, the above_root
13807 + znode is returned but the coord is not set. This function may cause atom
13808 + fusion, but it is only used for read locks (at this point) and therefore
13809 + fusion only occurs when the parent is already dirty. */
13810 +/* Hans adds this note: remember to ask how expensive this operation is vs. storing parent
13811 + pointer in jnodes. */
13812 +static int
13813 +jnode_lock_parent_coord(jnode * node,
13814 + coord_t * coord,
13815 + lock_handle * parent_lh,
13816 + load_count * parent_zh,
13817 + znode_lock_mode parent_mode, int try)
13818 +{
13819 + int ret;
13820 +
13821 + assert("edward-53", jnode_is_unformatted(node) || jnode_is_znode(node));
13822 + assert("edward-54", jnode_is_unformatted(node)
13823 + || znode_is_any_locked(JZNODE(node)));
13824 +
13825 + if (!jnode_is_znode(node)) {
13826 + reiser4_key key;
13827 + tree_level stop_level = TWIG_LEVEL;
13828 + lookup_bias bias = FIND_EXACT;
13829 +
13830 + assert("edward-168", !(jnode_get_type(node) == JNODE_BITMAP));
13831 +
13832 +		/* The case when the node is not a znode but can have a parent
13833 +		   coord (unformatted node, node which represents a cluster
13834 +		   page, etc.). Generate a key for the appropriate entry and
13835 +		   search in the tree using coord_by_key(), which handles
13836 +		   locking for us. */
13837 +
13838 + /*
13839 + * nothing is locked at this moment, so, nothing prevents
13840 + * concurrent truncate from removing jnode from inode. To
13841 + * prevent this spin-lock jnode. jnode can be truncated just
13842 + * after call to the jnode_build_key(), but this is ok,
13843 + * because coord_by_key() will just fail to find appropriate
13844 + * extent.
13845 + */
13846 + spin_lock_jnode(node);
13847 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13848 + jnode_build_key(node, &key);
13849 + ret = 0;
13850 + } else
13851 + ret = RETERR(-ENOENT);
13852 + spin_unlock_jnode(node);
13853 +
13854 + if (ret != 0)
13855 + return ret;
13856 +
13857 + if (jnode_is_cluster_page(node))
13858 + stop_level = LEAF_LEVEL;
13859 +
13860 + assert("jmacd-1812", coord != NULL);
13861 +
13862 + ret = coord_by_key(jnode_get_tree(node), &key, coord, parent_lh,
13863 + parent_mode, bias, stop_level, stop_level,
13864 + CBK_UNIQUE, NULL /*ra_info */ );
13865 + switch (ret) {
13866 + case CBK_COORD_NOTFOUND:
13867 + assert("edward-1038",
13868 + ergo(jnode_is_cluster_page(node),
13869 + JF_ISSET(node, JNODE_HEARD_BANSHEE)));
13870 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE))
13871 + warning("nikita-3177", "Parent not found");
13872 + return ret;
13873 + case CBK_COORD_FOUND:
13874 + if (coord->between != AT_UNIT) {
13875 + /* FIXME: comment needed */
13876 + done_lh(parent_lh);
13877 + if (!JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
13878 + warning("nikita-3178",
13879 + "Found but not happy: %i",
13880 + coord->between);
13881 + }
13882 + return RETERR(-ENOENT);
13883 + }
13884 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13885 + if (ret != 0)
13886 + return ret;
13887 + /* if (jnode_is_cluster_page(node)) {
13888 + races with write() are possible
13889 + check_child_cluster (parent_lh->node);
13890 + }
13891 + */
13892 + break;
13893 + default:
13894 + return ret;
13895 + }
13896 +
13897 + } else {
13898 + int flags;
13899 + znode *z;
13900 +
13901 + z = JZNODE(node);
13902 + /* Formatted node case: */
13903 + assert("jmacd-2061", !znode_is_root(z));
13904 +
13905 + flags = GN_ALLOW_NOT_CONNECTED;
13906 + if (try)
13907 + flags |= GN_TRY_LOCK;
13908 +
13909 + ret =
13910 + reiser4_get_parent_flags(parent_lh, z, parent_mode, flags);
13911 + if (ret != 0)
13912 + /* -E_REPEAT is ok here, it is handled by the caller. */
13913 + return ret;
13914 +
13915 + /* Make the child's position "hint" up-to-date. (Unless above
13916 + root, which caller must check.) */
13917 + if (coord != NULL) {
13918 +
13919 + ret = incr_load_count_znode(parent_zh, parent_lh->node);
13920 + if (ret != 0) {
13921 + warning("jmacd-976812386",
13922 + "incr_load_count_znode failed: %d",
13923 + ret);
13924 + return ret;
13925 + }
13926 +
13927 + ret = find_child_ptr(parent_lh->node, z, coord);
13928 + if (ret != 0) {
13929 + warning("jmacd-976812",
13930 + "find_child_ptr failed: %d", ret);
13931 + return ret;
13932 + }
13933 + }
13934 + }
13935 +
13936 + return 0;
13937 +}
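+
+/* A hedged sketch of the two-way parent lookup above, with stand-in types:
+ * formatted nodes reach their parent through a cached pointer, while
+ * unformatted ones must build a key and search the tree for it.
+ * toy_tree_lookup() below is hypothetical, standing in for coord_by_key(). */
+#if 0
+#include <stddef.h>
+
+struct toy_node {
+	int formatted;
+	struct toy_node *parent;	/* valid only when formatted */
+	unsigned long key;		/* used for the lookup otherwise */
+};
+
+static struct toy_node *toy_tree_lookup(unsigned long key)
+{
+	(void)key;
+	return NULL;	/* a real top-down tree search would go here */
+}
+
+static struct toy_node *toy_get_parent(struct toy_node *n)
+{
+	if (n->formatted)
+		return n->parent;	/* cheap: follow the cached link */
+	/* expensive: a full search by key, as for unformatted jnodes */
+	return toy_tree_lookup(n->key);
+}
+#endif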
13938 +
13939 +/* Get the (locked) next neighbor of a znode which is dirty and a member of the same atom.
13940 + If there is no next neighbor or the neighbor is not in memory or if there is a
13941 + neighbor but it is not dirty or not in the same atom, -E_NO_NEIGHBOR is returned.
13942 +   In some cases the slum may include nodes which are not dirty; if so, @check_dirty should be 0. */
13943 +static int neighbor_in_slum(znode * node, /* starting point */
13944 + lock_handle * lock, /* lock on starting point */
13945 + sideof side, /* left or right direction we seek the next node in */
13946 + znode_lock_mode mode, /* kind of lock we want */
13947 +			    int check_dirty /* true if the neighbor should be dirty */)
13948 +{
13949 + int ret;
13950 +
13951 + assert("jmacd-6334", znode_is_connected(node));
13952 +
13953 + ret =
13954 + reiser4_get_neighbor(lock, node, mode,
13955 + GN_SAME_ATOM | (side ==
13956 + LEFT_SIDE ? GN_GO_LEFT : 0));
13957 +
13958 + if (ret) {
13959 + /* May return -ENOENT or -E_NO_NEIGHBOR. */
13960 + /* FIXME(C): check EINVAL, E_DEADLOCK */
13961 + if (ret == -ENOENT) {
13962 + ret = RETERR(-E_NO_NEIGHBOR);
13963 + }
13964 +
13965 + return ret;
13966 + }
13967 + if (!check_dirty)
13968 + return 0;
13969 + /* Check dirty bit of locked znode, no races here */
13970 + if (JF_ISSET(ZJNODE(lock->node), JNODE_DIRTY))
13971 + return 0;
13972 +
13973 + done_lh(lock);
13974 + return RETERR(-E_NO_NEIGHBOR);
13975 +}
13976 +
13977 +/* Return true if two znodes have the same parent. This is called with both nodes
13978 + write-locked (for squeezing) so no tree lock is needed. */
13979 +static int znode_same_parents(znode * a, znode * b)
13980 +{
13981 + int result;
13982 +
13983 + assert("jmacd-7011", znode_is_write_locked(a));
13984 + assert("jmacd-7012", znode_is_write_locked(b));
13985 +
13986 + /* We lock the whole tree for this check.... I really don't like whole tree
13987 + * locks... -Hans */
13988 + read_lock_tree(znode_get_tree(a));
13989 + result = (znode_parent(a) == znode_parent(b));
13990 + read_unlock_tree(znode_get_tree(a));
13991 + return result;
13992 +}
13993 +
13994 +/* FLUSH SCAN */
13995 +
13996 +/* Initialize the flush_scan data structure. */
13997 +static void scan_init(flush_scan * scan)
13998 +{
13999 + memset(scan, 0, sizeof(*scan));
14000 + init_lh(&scan->node_lock);
14001 + init_lh(&scan->parent_lock);
14002 + init_load_count(&scan->parent_load);
14003 + init_load_count(&scan->node_load);
14004 + coord_init_invalid(&scan->parent_coord, NULL);
14005 +}
14006 +
14007 +/* Release any resources held by the flush scan, e.g., release locks, free memory, etc. */
14008 +static void scan_done(flush_scan * scan)
14009 +{
14010 + done_load_count(&scan->node_load);
14011 + if (scan->node != NULL) {
14012 + jput(scan->node);
14013 + scan->node = NULL;
14014 + }
14015 + done_load_count(&scan->parent_load);
14016 + done_lh(&scan->parent_lock);
14017 + done_lh(&scan->node_lock);
14018 +}
14019 +
14020 +/* Returns true if flush scanning is finished. */
14021 +int reiser4_scan_finished(flush_scan * scan)
14022 +{
14023 + return scan->stop || (scan->direction == RIGHT_SIDE &&
14024 + scan->count >= scan->max_count);
14025 +}
14026 +
14027 +/* Return true if the scan should continue to @tonode, i.e. if the node meets the
14028 +   same_slum_check condition. If not, put the reference to @tonode and stop the scan. */
14029 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode)
14030 +{
14031 + int go = same_slum_check(scan->node, tonode, 1, 0);
14032 +
14033 + if (!go) {
14034 + scan->stop = 1;
14035 + jput(tonode);
14036 + }
14037 +
14038 + return go;
14039 +}
14040 +
14041 +/* Set the current scan->node, refcount it, increment count by @add_count (the
14042 +   number to count, e.g., skipped unallocated nodes), drop the reference to the
14043 +   previous current, and copy the current parent coordinate. */
14044 +int
14045 +scan_set_current(flush_scan * scan, jnode * node, unsigned add_count,
14046 + const coord_t * parent)
14047 +{
14048 + /* Release the old references, take the new reference. */
14049 + done_load_count(&scan->node_load);
14050 +
14051 + if (scan->node != NULL) {
14052 + jput(scan->node);
14053 + }
14054 + scan->node = node;
14055 + scan->count += add_count;
14056 +
14057 + /* This next stmt is somewhat inefficient. The reiser4_scan_extent() code could
14058 + delay this update step until it finishes and update the parent_coord only once.
14059 + It did that before, but there was a bug and this was the easiest way to make it
14060 + correct. */
14061 + if (parent != NULL) {
14062 + coord_dup(&scan->parent_coord, parent);
14063 + }
14064 +
14065 + /* Failure may happen at the incr_load_count call, but the caller can assume the reference
14066 + is safely taken. */
14067 + return incr_load_count_jnode(&scan->node_load, node);
14068 +}
14069 +
14070 +/* Return true if scanning in the leftward direction. */
14071 +int reiser4_scanning_left(flush_scan * scan)
14072 +{
14073 + return scan->direction == LEFT_SIDE;
14074 +}
14075 +
14076 +/* Performs leftward scanning starting from either kind of node. Counts the starting
14077 + node. The right-scan object is passed in for the left-scan in order to copy the parent
14078 + of an unformatted starting position. This way we avoid searching for the unformatted
14079 +   node's parent when scanning in each direction. If we search for the parent once, it is
14080 +   set in both scan objects. The limit parameter tells flush-scan when to stop.
14081 +
14082 + Rapid scanning is used only during scan_left, where we are interested in finding the
14083 + 'leftpoint' where we begin flushing. We are interested in stopping at the left child
14084 + of a twig that does not have a dirty left neighbor. THIS IS A SPECIAL CASE. The
14085 + problem is finding a way to flush only those nodes without unallocated children, and it
14086 + is difficult to solve in the bottom-up flushing algorithm we are currently using. The
14087 + problem can be solved by scanning left at every level as we go upward, but this would
14088 + basically bring us back to using a top-down allocation strategy, which we already tried
14089 + (see BK history from May 2002), and has a different set of problems. The top-down
14090 + strategy makes avoiding unallocated children easier, but makes it difficult to
14091 +   properly flush dirty children with clean parents that would otherwise stop the
14092 + top-down flush, only later to dirty the parent once the children are flushed. So we
14093 + solve the problem in the bottom-up algorithm with a special case for twigs and leaves
14094 + only.
14095 +
14096 + The first step in solving the problem is this rapid leftward scan. After we determine
14097 +   that enough nodes have been counted to qualify for FLUSH_RELOCATE_THRESHOLD, we
14098 +   are no longer interested in the exact count; we are only interested in finding the
14099 +   best place to start the flush. We could choose one of two possibilities:
14100 +
14101 + 1. Stop at the leftmost child (of a twig) that does not have a dirty left neighbor.
14102 +   This requires checking one leaf per rapid-scan twig.
14103 +
14104 + 2. Stop at the leftmost child (of a twig) where there are no dirty children of the twig
14105 + to the left. This requires checking possibly all of the in-memory children of each
14106 + twig during the rapid scan.
14107 +
14108 + For now we implement the first policy.
14109 +*/
14110 +static int
14111 +scan_left(flush_scan * scan, flush_scan * right, jnode * node, unsigned limit)
14112 +{
14113 + int ret = 0;
14114 +
14115 + scan->max_count = limit;
14116 + scan->direction = LEFT_SIDE;
14117 +
14118 + ret = scan_set_current(scan, jref(node), 1, NULL);
14119 + if (ret != 0) {
14120 + return ret;
14121 + }
14122 +
14123 + ret = scan_common(scan, right);
14124 + if (ret != 0) {
14125 + return ret;
14126 + }
14127 +
14128 +	/* Before rapid scanning, we need a lock on scan->node so that we can
14129 +	   get its parent, but only if it is formatted. */
14130 + if (jnode_is_znode(scan->node)) {
14131 + ret = longterm_lock_znode(&scan->node_lock, JZNODE(scan->node),
14132 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI);
14133 + }
14134 +
14135 + /* Rapid_scan would go here (with limit set to FLUSH_RELOCATE_THRESHOLD). */
14136 + return ret;
14137 +}
14138 +
14139 +/* Performs rightward scanning... Does not count the starting node. The limit parameter
14140 + is described in scan_left. If the starting node is unformatted then the
14141 + parent_coord was already set during scan_left. The rapid_after parameter is not used
14142 + during right-scanning.
14143 +
14144 + scan_right is only called if the scan_left operation does not count at least
14145 + FLUSH_RELOCATE_THRESHOLD nodes for flushing. In that case, the limit parameter is
14146 + set to the difference between FLUSH_RELOCATE_THRESHOLD and scan_left's count, so
14147 + that the combined count climbs to FLUSH_RELOCATE_THRESHOLD and then stops. */
14148 +static int scan_right(flush_scan * scan, jnode * node, unsigned limit)
14149 +{
14150 + int ret;
14151 +
14152 + scan->max_count = limit;
14153 + scan->direction = RIGHT_SIDE;
14154 +
14155 + ret = scan_set_current(scan, jref(node), 0, NULL);
14156 + if (ret != 0) {
14157 + return ret;
14158 + }
14159 +
14160 + return scan_common(scan, NULL);
14161 +}
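/* A minimal sketch (not part of this patch) of how a caller might combine the
 * two scans to reach the relocate decision described above. The helper name
 * decide_relocate() is hypothetical; scan_left(), scan_right(), flush_scan and
 * FLUSH_RELOCATE_THRESHOLD are the interfaces defined in this file and in
 * flush.h, and FLUSH_SCAN_MAXNODES bounds the leftward scan as noted there. */
static int decide_relocate(flush_scan * left, flush_scan * right, jnode * start)
{
	int ret;

	/* scan_left() counts the starting node and the dirty region to its
	   left, up to FLUSH_SCAN_MAXNODES. */
	ret = scan_left(left, right, start, FLUSH_SCAN_MAXNODES);
	if (ret != 0)
		return ret;

	/* scan_right() only tops the count up to the threshold. */
	if (left->count < FLUSH_RELOCATE_THRESHOLD) {
		ret = scan_right(right, start,
				 FLUSH_RELOCATE_THRESHOLD - left->count);
		if (ret != 0)
			return ret;
	}
	/* suggest relocation once enough adjacent dirty nodes were found */
	return left->count + right->count >= FLUSH_RELOCATE_THRESHOLD;
}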
14162 +
14163 +/* Common code to perform left or right scanning. */
14164 +static int scan_common(flush_scan * scan, flush_scan * other)
14165 +{
14166 + int ret;
14167 +
14168 + assert("nikita-2376", scan->node != NULL);
14169 + assert("edward-54", jnode_is_unformatted(scan->node)
14170 + || jnode_is_znode(scan->node));
14171 +
14172 + /* Special case for starting at an unformatted node. Optimization: we only want
14173 + to search for the parent (which requires a tree traversal) once. Obviously, we
14174 + shouldn't have to call it once for the left scan and once for the right scan.
14175 + For this reason, if we search for the parent during scan-left we then duplicate
14176 + the coord/lock/load into the scan-right object. */
14177 + if (jnode_is_unformatted(scan->node)) {
14178 + ret = scan_unformatted(scan, other);
14179 + if (ret != 0)
14180 + return ret;
14181 + }
14182 + /* This loop expects to start at a formatted position and performs chaining of
14183 + formatted regions */
14184 + while (!reiser4_scan_finished(scan)) {
14185 +
14186 + ret = scan_formatted(scan);
14187 + if (ret != 0) {
14188 + return ret;
14189 + }
14190 + }
14191 +
14192 + return 0;
14193 +}
14194 +
14195 +static int scan_unformatted(flush_scan * scan, flush_scan * other)
14196 +{
14197 + int ret = 0;
14198 + int try = 0;
14199 +
14200 + if (!coord_is_invalid(&scan->parent_coord))
14201 + goto scan;
14202 +
14203 + /* set the parent coord from the starting position */
14204 + if (!jnode_is_unformatted(scan->node)) {
14205 + /* formatted position */
14206 +
14207 + lock_handle lock;
14208 + assert("edward-301", jnode_is_znode(scan->node));
14209 + init_lh(&lock);
14210 +
14211 + /*
14212 + * when flush starts from unformatted node, first thing it
14213 + * does is tree traversal to find formatted parent of starting
14214 + * node. This parent is then kept locked across the scans to the
14215 + * left and to the right. This means that during the scan to the
14216 + * left we cannot take a left-ward lock, because this is
14217 + * deadlock-prone. So, if we are scanning to the left and
14218 + * there is already lock held by this thread,
14219 + * jnode_lock_parent_coord() should use try-lock.
14220 + */
14221 + try = reiser4_scanning_left(scan)
14222 + && !lock_stack_isclean(get_current_lock_stack());
14223 + /* Need the node locked to get the parent lock. We have to
14224 + take a write lock since there is at least one call path
14225 + where this znode is already write-locked by us. */
14226 + ret =
14227 + longterm_lock_znode(&lock, JZNODE(scan->node),
14228 + ZNODE_WRITE_LOCK,
14229 + reiser4_scanning_left(scan) ?
14230 + ZNODE_LOCK_LOPRI :
14231 + ZNODE_LOCK_HIPRI);
14232 + if (ret != 0)
14233 + /* EINVAL or E_DEADLOCK here mean... try again! At this point we've
14234 + scanned too far and can't back out, just start over. */
14235 + return ret;
14236 +
14237 + ret = jnode_lock_parent_coord(scan->node,
14238 + &scan->parent_coord,
14239 + &scan->parent_lock,
14240 + &scan->parent_load,
14241 + ZNODE_WRITE_LOCK, try);
14242 +
14243 + /* FIXME(C): check EINVAL, E_DEADLOCK */
14244 + done_lh(&lock);
14245 + if (ret == -E_REPEAT) {
14246 + scan->stop = 1;
14247 + return 0;
14248 + }
14249 + if (ret)
14250 + return ret;
14251 +
14252 + } else {
14253 + /* unformatted position */
14254 +
14255 + ret =
14256 + jnode_lock_parent_coord(scan->node, &scan->parent_coord,
14257 + &scan->parent_lock,
14258 + &scan->parent_load,
14259 + ZNODE_WRITE_LOCK, try);
14260 +
14261 + if (IS_CBKERR(ret))
14262 + return ret;
14263 +
14264 + if (ret == CBK_COORD_NOTFOUND)
14265 + /* FIXME(C): check EINVAL, E_DEADLOCK */
14266 + return ret;
14267 +
14268 + /* parent was found */
14269 + assert("jmacd-8661", other != NULL);
14270 + /* Duplicate the reference into the other flush_scan. */
14271 + coord_dup(&other->parent_coord, &scan->parent_coord);
14272 + copy_lh(&other->parent_lock, &scan->parent_lock);
14273 + copy_load_count(&other->parent_load, &scan->parent_load);
14274 + }
14275 + scan:
14276 + return scan_by_coord(scan);
14277 +}
14278 +
14279 +/* Performs left- or rightward scanning starting from a formatted node. Follow left
14280 + pointers under tree lock as long as:
14281 +
14282 + - node->left/right is non-NULL
14283 + - node->left/right is connected, dirty
14284 + - node->left/right belongs to the same atom
14285 + - scan has not reached maximum count
14286 +*/
14287 +static int scan_formatted(flush_scan * scan)
14288 +{
14289 + int ret;
14290 + znode *neighbor = NULL;
14291 +
14292 + assert("jmacd-1401", !reiser4_scan_finished(scan));
14293 +
14294 + do {
14295 + znode *node = JZNODE(scan->node);
14296 +
14297 + /* Node should be connected, but if not, stop the scan. */
14298 + if (!znode_is_connected(node)) {
14299 + scan->stop = 1;
14300 + break;
14301 + }
14302 +
14303 + /* Lock the tree, check for and reference the next sibling. */
14304 + read_lock_tree(znode_get_tree(node));
14305 +
14306 + /* It may be that a node is inserted or removed between a node and its
14307 + left sibling while the tree lock is released, but the flush-scan count
14308 + does not need to be precise. Thus, we release the tree lock as soon as
14309 + we get the neighboring node. */
14310 + neighbor =
14311 + reiser4_scanning_left(scan) ? node->left : node->right;
14312 + if (neighbor != NULL) {
14313 + zref(neighbor);
14314 + }
14315 +
14316 + read_unlock_tree(znode_get_tree(node));
14317 +
14318 + /* If neighbor is NULL at the leaf level, we need to check for an
14319 + unformatted sibling using the parent; break in any case. */
14320 + if (neighbor == NULL) {
14321 + break;
14322 + }
14323 +
14324 + /* Check the condition for going left, break if it is not met. This also
14325 + releases (jputs) the neighbor if false. */
14326 + if (!reiser4_scan_goto(scan, ZJNODE(neighbor))) {
14327 + break;
14328 + }
14329 +
14330 + /* Advance the flush_scan state to the left, repeat. */
14331 + ret = scan_set_current(scan, ZJNODE(neighbor), 1, NULL);
14332 + if (ret != 0) {
14333 + return ret;
14334 + }
14335 +
14336 + } while (!reiser4_scan_finished(scan));
14337 +
14338 + /* If neighbor is NULL then we have reached the end of a formatted region, or
14339 + else the sibling is out of memory; now check for an extent to the left (only
14340 + at LEAF_LEVEL). */
14341 + if (neighbor != NULL || jnode_get_level(scan->node) != LEAF_LEVEL
14342 + || reiser4_scan_finished(scan)) {
14343 + scan->stop = 1;
14344 + return 0;
14345 + }
14346 + /* Otherwise, call scan_by_coord for the right(left)most item of the
14347 + left(right) neighbor on the parent level, and then possibly continue. */
14348 +
14349 + coord_init_invalid(&scan->parent_coord, NULL);
14350 + return scan_unformatted(scan, NULL);
14351 +}
14352 +
14353 +/* NOTE-EDWARD:
14354 + This scans adjacent items of the same type and calls scan flush plugin for each one.
14355 + Performs left(right)ward scanning starting from a (possibly) unformatted node. If we
14356 + start from an unformatted node, then we continue only if the next neighbor is also
14357 + unformatted. When called from scan_formatted, we skip the first iteration (to make
14358 + sure that the right(left)most item of the left(right) neighbor on the parent level
14359 + is of the same type, and to set the appropriate coord). */
14360 +static int scan_by_coord(flush_scan * scan)
14361 +{
14362 + int ret = 0;
14363 + int scan_this_coord;
14364 + lock_handle next_lock;
14365 + load_count next_load;
14366 + coord_t next_coord;
14367 + jnode *child;
14368 + item_plugin *iplug;
14369 +
14370 + init_lh(&next_lock);
14371 + init_load_count(&next_load);
14372 + scan_this_coord = (jnode_is_unformatted(scan->node) ? 1 : 0);
14373 +
14374 + /* set initial item id */
14375 + iplug = item_plugin_by_coord(&scan->parent_coord);
14376 +
14377 + for (; !reiser4_scan_finished(scan); scan_this_coord = 1) {
14378 + if (scan_this_coord) {
14379 + /* Here we expect that the unit is scannable. It might not
14380 + * be so due to a race with extent->tail conversion. */
14381 + if (iplug->f.scan == NULL) {
14382 + scan->stop = 1;
14383 + ret = -E_REPEAT;
14384 + /* skip the check at the end. */
14385 + goto race;
14386 + }
14387 +
14388 + ret = iplug->f.scan(scan);
14389 + if (ret != 0)
14390 + goto exit;
14391 +
14392 + if (reiser4_scan_finished(scan)) {
14393 + checkchild(scan);
14394 + break;
14395 + }
14396 + } else {
14397 + /* the same race against truncate as above is possible
14398 + * here, it seems */
14399 +
14400 + /* NOTE-JMACD: In this case, apply the same end-of-node logic but don't scan
14401 + the first coordinate. */
14402 + assert("jmacd-1231",
14403 + item_is_internal(&scan->parent_coord));
14404 + }
14405 +
14406 + if (iplug->f.utmost_child == NULL
14407 + || znode_get_level(scan->parent_coord.node) != TWIG_LEVEL) {
14408 + /* stop at this coord and continue on the parent level */
14409 + ret =
14410 + scan_set_current(scan,
14411 + ZJNODE(zref
14412 + (scan->parent_coord.node)),
14413 + 1, NULL);
14414 + if (ret != 0)
14415 + goto exit;
14416 + break;
14417 + }
14418 +
14419 + /* Either way, the invariant is that scan->parent_coord is set to the
14420 + parent of scan->node. Now get the next unit. */
14421 + coord_dup(&next_coord, &scan->parent_coord);
14422 + coord_sideof_unit(&next_coord, scan->direction);
14423 +
14424 + /* If off-the-end of the twig, try the next twig. */
14425 + if (coord_is_after_sideof_unit(&next_coord, scan->direction)) {
14426 + /* We take the write lock because we may start flushing from this
14427 + * coordinate. */
14428 + ret =
14429 + neighbor_in_slum(next_coord.node, &next_lock,
14430 + scan->direction, ZNODE_WRITE_LOCK,
14431 + 1 /* check dirty */ );
14432 + if (ret == -E_NO_NEIGHBOR) {
14433 + scan->stop = 1;
14434 + ret = 0;
14435 + break;
14436 + }
14437 +
14438 + if (ret != 0) {
14439 + goto exit;
14440 + }
14441 +
14442 + ret = incr_load_count_znode(&next_load, next_lock.node);
14443 + if (ret != 0) {
14444 + goto exit;
14445 + }
14446 +
14447 + coord_init_sideof_unit(&next_coord, next_lock.node,
14448 + sideof_reverse(scan->direction));
14449 + }
14450 +
14451 + iplug = item_plugin_by_coord(&next_coord);
14452 +
14453 + /* Get the next child. */
14454 + ret =
14455 + iplug->f.utmost_child(&next_coord,
14456 + sideof_reverse(scan->direction),
14457 + &child);
14458 + if (ret != 0)
14459 + goto exit;
14460 + /* If the next child is not in memory, or, item_utmost_child
14461 + failed (due to race with unlink, most probably), stop
14462 + here. */
14463 + if (child == NULL || IS_ERR(child)) {
14464 + scan->stop = 1;
14465 + checkchild(scan);
14466 + break;
14467 + }
14468 +
14469 + assert("nikita-2374", jnode_is_unformatted(child)
14470 + || jnode_is_znode(child));
14471 +
14472 + /* See if it is dirty, part of the same atom. */
14473 + if (!reiser4_scan_goto(scan, child)) {
14474 + checkchild(scan);
14475 + break;
14476 + }
14477 +
14478 + /* If so, make this child current. */
14479 + ret = scan_set_current(scan, child, 1, &next_coord);
14480 + if (ret != 0)
14481 + goto exit;
14482 +
14483 + /* Now continue. If the child is formatted we break out, releasing the
14484 + parent lock on exit, and proceed with formatted scanning. */
14485 + if (jnode_is_znode(child))
14486 + break;
14487 +
14488 + /* Otherwise, repeat the above loop with next_coord. */
14489 + if (next_load.node != NULL) {
14490 + done_lh(&scan->parent_lock);
14491 + move_lh(&scan->parent_lock, &next_lock);
14492 + move_load_count(&scan->parent_load, &next_load);
14493 + }
14494 + }
14495 +
14496 + assert("jmacd-6233",
14497 + reiser4_scan_finished(scan) || jnode_is_znode(scan->node));
14498 + exit:
14499 + checkchild(scan);
14500 + race: /* skip the above check */
14501 + if (jnode_is_znode(scan->node)) {
14502 + done_lh(&scan->parent_lock);
14503 + done_load_count(&scan->parent_load);
14504 + }
14505 +
14506 + done_load_count(&next_load);
14507 + done_lh(&next_lock);
14508 + return ret;
14509 +}
14510 +
14511 +/* FLUSH POS HELPERS */
14512 +
14513 +/* Initialize the fields of a flush_position. */
14514 +static void pos_init(flush_pos_t * pos)
14515 +{
14516 + memset(pos, 0, sizeof *pos);
14517 +
14518 + pos->state = POS_INVALID;
14519 + coord_init_invalid(&pos->coord, NULL);
14520 + init_lh(&pos->lock);
14521 + init_load_count(&pos->load);
14522 +
14523 + reiser4_blocknr_hint_init(&pos->preceder);
14524 +}
14525 +
14526 +/* The flush loop inside squalloc periodically checks pos_valid to
14527 + determine when "enough flushing" has been performed. This will return true until one
14528 + of the following conditions is met:
14529 +
14530 + 1. the number of flush-queued nodes has reached the kernel-supplied "int *nr_to_flush"
14531 + parameter, meaning we have flushed as many blocks as the kernel requested. When
14532 + flushing to commit, this parameter is NULL.
14533 +
14534 + 2. pos_stop() is called because squalloc discovers that the "next" node in the
14535 + flush order is either non-existent, not dirty, or not in the same atom.
14536 +*/
14537 +
14538 +static int pos_valid(flush_pos_t * pos)
14539 +{
14540 + return pos->state != POS_INVALID;
14541 +}
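/* A hedged illustration (not part of this patch) of the loop shape the comment
 * above describes; squalloc_one_step() and more_to_flush() are hypothetical
 * stand-ins for the real squeeze-and-allocate body, while pos_valid() and
 * pos_stop() are the helpers defined here. */
static int squalloc_one_step(flush_pos_t * pos);	/* hypothetical */
static int more_to_flush(flush_pos_t * pos);		/* hypothetical */

static void squalloc_loop_sketch(flush_pos_t * pos)
{
	while (pos_valid(pos)) {
		/* stop once the next node is missing, clean, or captured
		   by another atom */
		if (squalloc_one_step(pos) != 0 || !more_to_flush(pos))
			pos_stop(pos);
	}
}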
14542 +
14543 +/* Release any resources of a flush_position. Called when jnode_flush finishes. */
14544 +static void pos_done(flush_pos_t * pos)
14545 +{
14546 + pos_stop(pos);
14547 + reiser4_blocknr_hint_done(&pos->preceder);
14548 + if (convert_data(pos))
14549 + free_convert_data(pos);
14550 +}
14551 +
14552 +/* Reset the point and parent. Called during flush subroutines to terminate the
14553 + squalloc loop. */
14554 +static int pos_stop(flush_pos_t * pos)
14555 +{
14556 + pos->state = POS_INVALID;
14557 + done_lh(&pos->lock);
14558 + done_load_count(&pos->load);
14559 + coord_init_invalid(&pos->coord, NULL);
14560 +
14561 + if (pos->child) {
14562 + jput(pos->child);
14563 + pos->child = NULL;
14564 + }
14565 +
14566 + return 0;
14567 +}
14568 +
14569 +/* Return the flush_position's block allocator hint. */
14570 +reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos)
14571 +{
14572 + return &pos->preceder;
14573 +}
14574 +
14575 +flush_queue_t * reiser4_pos_fq(flush_pos_t * pos)
14576 +{
14577 + return pos->fq;
14578 +}
14579 +
14580 +/* Make Linus happy.
14581 + Local variables:
14582 + c-indentation-style: "K&R"
14583 + mode-name: "LC"
14584 + c-basic-offset: 8
14585 + tab-width: 8
14586 + fill-column: 90
14587 + LocalWords: preceder
14588 + End:
14589 +*/
14590 diff --git a/fs/reiser4/flush.h b/fs/reiser4/flush.h
14591 new file mode 100644
14592 index 0000000..beab76b
14593 --- /dev/null
14594 +++ b/fs/reiser4/flush.h
14595 @@ -0,0 +1,274 @@
14596 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14597 +
14598 +/* DECLARATIONS: */
14599 +
14600 +#if !defined(__REISER4_FLUSH_H__)
14601 +#define __REISER4_FLUSH_H__
14602 +
14603 +#include "plugin/cluster.h"
14604 +
14605 +/* The flush_scan data structure maintains the state of an in-progress flush-scan on a
14606 + single level of the tree. A flush-scan is used for counting the number of adjacent
14607 + nodes to flush, which is used to determine whether we should relocate, and it is also
14608 + used to find a starting point for flush. A flush-scan object can scan in both right
14609 + and left directions via the scan_left() and scan_right() interfaces. The
14610 + right- and left-variations are similar but perform different functions. When scanning
14611 + left we (optionally perform rapid scanning and then) longterm-lock the endpoint node.
14612 + When scanning right we are simply counting the number of adjacent, dirty nodes. */
14613 +struct flush_scan {
14614 +
14615 + /* The current number of nodes scanned on this level. */
14616 + unsigned count;
14617 +
14618 + /* There may be a maximum number of nodes for a scan on any single level. When
14619 + going leftward, max_count is determined by FLUSH_SCAN_MAXNODES (see reiser4.h) */
14620 + unsigned max_count;
14621 +
14622 + /* Direction: Set to one of the sideof enumeration: { LEFT_SIDE, RIGHT_SIDE }. */
14623 + sideof direction;
14624 +
14625 + /* Initially @stop is set to false; it is set to true once some condition
14626 + stops the search (e.g., we found a clean node before reaching max_count or
14627 + we found a node belonging to another atom). */
14628 + int stop;
14629 +
14630 + /* The current scan position. If @node is non-NULL then its reference count has
14631 + been incremented to reflect this reference. */
14632 + jnode *node;
14633 +
14634 + /* A handle for zload/zrelse of current scan position node. */
14635 + load_count node_load;
14636 +
14637 + /* During left-scan, if the final position (a.k.a. endpoint node) is formatted the
14638 + node is locked using this lock handle. The endpoint needs to be locked for
14639 + transfer to the flush_position object after scanning finishes. */
14640 + lock_handle node_lock;
14641 +
14642 + /* When the position is unformatted, its parent, coordinate, and parent
14643 + zload/zrelse handle. */
14644 + lock_handle parent_lock;
14645 + coord_t parent_coord;
14646 + load_count parent_load;
14647 +
14648 + /* The block allocator preceder hint. Sometimes flush_scan determines what the
14649 + preceder is and if so it sets it here, after which it is copied into the
14650 + flush_position. Otherwise, the preceder is computed later. */
14651 + reiser4_block_nr preceder_blk;
14652 +};
14653 +
14654 +typedef struct convert_item_info {
14655 + dc_item_stat d_cur; /* disk cluster state of the current item */
14656 + dc_item_stat d_next; /* disk cluster state of the next slum item */
14657 + struct inode *inode;
14658 + flow_t flow;
14659 +} convert_item_info_t;
14660 +
14661 +typedef struct convert_info {
14662 + int count; /* for squalloc terminating */
14663 + reiser4_cluster_t clust; /* transform cluster */
14664 + item_plugin *iplug; /* current item plugin */
14665 + convert_item_info_t *itm; /* current item info */
14666 +} convert_info_t;
14667 +
14668 +typedef enum flush_position_state {
14669 + POS_INVALID, /* Invalid or stopped pos, do not continue slum
14670 + * processing */
14671 + POS_ON_LEAF, /* pos points to already prepped, locked formatted node at
14672 + * leaf level */
14673 + POS_ON_EPOINT, /* pos keeps a lock on twig level, "coord" field is used
14674 + * to traverse unformatted nodes */
14675 + POS_TO_LEAF, /* pos is being moved to leaf level */
14676 + POS_TO_TWIG, /* pos is being moved to twig level */
14677 + POS_END_OF_TWIG, /* special case of POS_ON_TWIG, when coord is after
14678 + * rightmost unit of the current twig */
14679 + POS_ON_INTERNAL /* same as POS_ON_LEAF, but points to internal node */
14680 +} flushpos_state_t;
14681 +
14682 +/* An encapsulation of the current flush point and all the parameters that are passed
14683 + through the entire squeeze-and-allocate stage of the flush routine. A single
14684 + flush_position object is constructed after left- and right-scanning finishes. */
14685 +struct flush_position {
14686 + flushpos_state_t state;
14687 +
14688 + coord_t coord; /* coord to traverse unformatted nodes */
14689 + lock_handle lock; /* current lock we hold */
14690 + load_count load; /* load status for current locked formatted node */
14691 +
14692 + jnode *child; /* for passing a reference to unformatted child
14693 + * across pos state changes */
14694 +
14695 + reiser4_blocknr_hint preceder; /* The flush 'hint' state. */
14696 + int leaf_relocate; /* True if enough leaf-level nodes were
14697 + * found to suggest a relocate policy. */
14698 + int alloc_cnt; /* The number of nodes allocated during squeeze and allocate. */
14699 + int prep_or_free_cnt; /* The number of nodes prepared for write (allocate) or squeezed and freed. */
14700 + flush_queue_t *fq;
14701 + long *nr_written; /* number of nodes submitted to disk */
14702 + int flags; /* a copy of jnode_flush flags argument */
14703 +
14704 + znode *prev_twig; /* previous parent pointer value, used to catch
14705 + * processing of new twig node */
14706 + convert_info_t *sq; /* convert info */
14707 +
14708 + unsigned long pos_in_unit; /* for extents only. Position
14709 + within an extent unit of first
14710 + jnode of slum */
14711 + long nr_to_write; /* number of unformatted nodes to handle on flush */
14712 +};
14713 +
14714 +static inline int item_convert_count(flush_pos_t * pos)
14715 +{
14716 + return pos->sq->count;
14717 +}
14718 +static inline void inc_item_convert_count(flush_pos_t * pos)
14719 +{
14720 + pos->sq->count++;
14721 +}
14722 +static inline void set_item_convert_count(flush_pos_t * pos, int count)
14723 +{
14724 + pos->sq->count = count;
14725 +}
14726 +static inline item_plugin *item_convert_plug(flush_pos_t * pos)
14727 +{
14728 + return pos->sq->iplug;
14729 +}
14730 +
14731 +static inline convert_info_t *convert_data(flush_pos_t * pos)
14732 +{
14733 + return pos->sq;
14734 +}
14735 +
14736 +static inline convert_item_info_t *item_convert_data(flush_pos_t * pos)
14737 +{
14738 + assert("edward-955", convert_data(pos));
14739 + return pos->sq->itm;
14740 +}
14741 +
14742 +static inline tfm_cluster_t *tfm_cluster_sq(flush_pos_t * pos)
14743 +{
14744 + return &pos->sq->clust.tc;
14745 +}
14746 +
14747 +static inline tfm_stream_t *tfm_stream_sq(flush_pos_t * pos, tfm_stream_id id)
14748 +{
14749 + assert("edward-854", pos->sq != NULL);
14750 + return tfm_stream(tfm_cluster_sq(pos), id);
14751 +}
14752 +
14753 +static inline int chaining_data_present(flush_pos_t * pos)
14754 +{
14755 + return convert_data(pos) && item_convert_data(pos);
14756 +}
14757 +
14758 +/* Returns true if the next node contains the next item of the disk cluster,
14759 + so the item convert data should be moved to the right slum neighbor.
14760 +*/
14761 +static inline int should_chain_next_node(flush_pos_t * pos)
14762 +{
14763 + int result = 0;
14764 +
14765 + assert("edward-1007", chaining_data_present(pos));
14766 +
14767 + switch (item_convert_data(pos)->d_next) {
14768 + case DC_CHAINED_ITEM:
14769 + result = 1;
14770 + break;
14771 + case DC_AFTER_CLUSTER:
14772 + break;
14773 + default:
14774 + impossible("edward-1009", "bad state of next slum item");
14775 + }
14776 + return result;
14777 +}
14778 +
14779 +/* update item state in a disk cluster to assign conversion mode */
14780 +static inline void
14781 +move_chaining_data(flush_pos_t * pos, int this_node /* where is next item */ )
14782 +{
14783 +
14784 + assert("edward-1010", chaining_data_present(pos));
14785 +
14786 + if (this_node == 0) {
14787 + /* next item is on the right neighbor */
14788 + assert("edward-1011",
14789 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14790 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14791 + assert("edward-1012",
14792 + item_convert_data(pos)->d_next == DC_CHAINED_ITEM);
14793 +
14794 + item_convert_data(pos)->d_cur = DC_CHAINED_ITEM;
14795 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14796 + } else {
14797 + /* next item is on the same node */
14798 + assert("edward-1013",
14799 + item_convert_data(pos)->d_cur == DC_FIRST_ITEM ||
14800 + item_convert_data(pos)->d_cur == DC_CHAINED_ITEM);
14801 + assert("edward-1227",
14802 + item_convert_data(pos)->d_next == DC_AFTER_CLUSTER ||
14803 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
14804 +
14805 + item_convert_data(pos)->d_cur = DC_AFTER_CLUSTER;
14806 + item_convert_data(pos)->d_next = DC_INVALID_STATE;
14807 + }
14808 +}
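/* To summarize the transitions implemented above, (d_cur, d_next) move as
 * follows, using the DC_* states declared in the cluster plugin:
 *
 *   next item on the right neighbor (this_node == 0):
 *     d_cur:  DC_FIRST_ITEM/DC_CHAINED_ITEM     -> DC_CHAINED_ITEM
 *     d_next: DC_CHAINED_ITEM                   -> DC_INVALID_STATE
 *
 *   next item on the same node (this_node != 0):
 *     d_cur:  DC_FIRST_ITEM/DC_CHAINED_ITEM     -> DC_AFTER_CLUSTER
 *     d_next: DC_AFTER_CLUSTER/DC_INVALID_STATE -> DC_INVALID_STATE
 */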
14809 +
14810 +static inline int should_convert_node(flush_pos_t * pos, znode * node)
14811 +{
14812 + return znode_convertible(node);
14813 +}
14814 +
14815 +/* true if there is attached convert item info */
14816 +static inline int should_convert_next_node(flush_pos_t * pos, znode * node)
14817 +{
14818 + return convert_data(pos) && item_convert_data(pos);
14819 +}
14820 +
14821 +#define SQUALLOC_THRESHOLD 256
14822 +
14823 +static inline int should_terminate_squalloc(flush_pos_t * pos)
14824 +{
14825 + return convert_data(pos) &&
14826 + !item_convert_data(pos) &&
14827 + item_convert_count(pos) >= SQUALLOC_THRESHOLD;
14828 +}
14829 +
14830 +void free_convert_data(flush_pos_t * pos);
14831 +/* used in extent.c */
14832 +int scan_set_current(flush_scan * scan, jnode * node, unsigned add_size,
14833 + const coord_t * parent);
14834 +int reiser4_scan_finished(flush_scan * scan);
14835 +int reiser4_scanning_left(flush_scan * scan);
14836 +int reiser4_scan_goto(flush_scan * scan, jnode * tonode);
14837 +txn_atom *atom_locked_by_fq(flush_queue_t * fq);
14838 +int reiser4_alloc_extent(flush_pos_t *flush_pos);
14839 +squeeze_result squalloc_extent(znode *left, const coord_t *, flush_pos_t *,
14840 + reiser4_key *stop_key);
14841 +extern int reiser4_init_fqs(void);
14842 +extern void reiser4_done_fqs(void);
14843 +
14844 +#if REISER4_DEBUG
14845 +
14846 +extern void reiser4_check_fq(const txn_atom *atom);
14847 +extern atomic_t flush_cnt;
14848 +
14849 +#define check_preceder(blk) \
14850 +assert("nikita-2588", blk < reiser4_block_count(reiser4_get_current_sb()));
14851 +extern void check_pos(flush_pos_t * pos);
14852 +#else
14853 +#define check_preceder(b) noop
14854 +#define check_pos(pos) noop
14855 +#endif
14856 +
14857 +/* __REISER4_FLUSH_H__ */
14858 +#endif
14859 +
14860 +/* Make Linus happy.
14861 + Local variables:
14862 + c-indentation-style: "K&R"
14863 + mode-name: "LC"
14864 + c-basic-offset: 8
14865 + tab-width: 8
14866 + fill-column: 90
14867 + LocalWords: preceder
14868 + End:
14869 +*/
14870 diff --git a/fs/reiser4/flush_queue.c b/fs/reiser4/flush_queue.c
14871 new file mode 100644
14872 index 0000000..f6c5d9a
14873 --- /dev/null
14874 +++ b/fs/reiser4/flush_queue.c
14875 @@ -0,0 +1,680 @@
14876 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
14877 +
14878 +#include "debug.h"
14879 +#include "super.h"
14880 +#include "txnmgr.h"
14881 +#include "jnode.h"
14882 +#include "znode.h"
14883 +#include "page_cache.h"
14884 +#include "wander.h"
14885 +#include "vfs_ops.h"
14886 +#include "writeout.h"
14887 +#include "flush.h"
14888 +
14889 +#include <linux/bio.h>
14890 +#include <linux/mm.h>
14891 +#include <linux/pagemap.h>
14892 +#include <linux/blkdev.h>
14893 +#include <linux/writeback.h>
14894 +
14895 +/* A flush queue object is an accumulator for keeping jnodes prepared
14896 + by the jnode_flush() function for writing to disk. Those "queued" jnodes are
14897 + kept on the flush queue until memory pressure or atom commit asks
14898 + flush queues to write some or all of their jnodes. */
14899 +
14900 +/*
14901 + LOCKING:
14902 +
14903 + fq->guard spin lock protects the fq->atom pointer and nothing else. The
14904 + fq->prepped list is protected by the atom spin lock and uses the following
14905 + locking:
14906 +
14907 + two ways to protect fq->prepped list for read-only list traversal:
14908 +
14909 + 1. spin-lock the atom.
14910 + 2. fq is IN_USE, atom->nr_running_queues increased.
14911 +
14912 + and one for list modification:
14913 +
14914 + 1. atom is spin-locked and one condition is true: fq is IN_USE or
14915 + atom->nr_running_queues == 0.
14916 +
14917 + The deadlock-safe order for flush queues and atoms is: first lock atom, then
14918 + lock flush queue, then lock jnode.
14919 +*/
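/* For illustration only, the deadlock-safe order above looks like this when
 * written back to back (callers in this file actually take these locks at
 * different places rather than in one sequence):
 *
 *	spin_lock_atom(atom);		1. atom first
 *	spin_lock(&fq->guard);		2. then the flush queue
 *	spin_lock_jnode(node);		3. then the jnode
 *	...
 *	spin_unlock_jnode(node);
 *	spin_unlock(&fq->guard);
 *	spin_unlock_atom(atom);
 */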
14920 +
14921 +#define fq_in_use(fq) ((fq)->state & FQ_IN_USE)
14922 +#define fq_ready(fq) (!fq_in_use(fq))
14923 +
14924 +#define mark_fq_in_use(fq) do { (fq)->state |= FQ_IN_USE; } while (0)
14925 +#define mark_fq_ready(fq) do { (fq)->state &= ~FQ_IN_USE; } while (0)
14926 +
14927 +/* get lock on atom from locked flush queue object */
14928 +static txn_atom *atom_locked_by_fq_nolock(flush_queue_t * fq)
14929 +{
14930 + /* This code is similar to jnode_get_atom(), look at it for the
14931 + * explanation. */
14932 + txn_atom *atom;
14933 +
14934 + assert_spin_locked(&(fq->guard));
14935 +
14936 + while (1) {
14937 + atom = fq->atom;
14938 + if (atom == NULL)
14939 + break;
14940 +
14941 + if (spin_trylock_atom(atom))
14942 + break;
14943 +
14944 + atomic_inc(&atom->refcount);
14945 + spin_unlock(&(fq->guard));
14946 + spin_lock_atom(atom);
14947 + spin_lock(&(fq->guard));
14948 +
14949 + if (fq->atom == atom) {
14950 + atomic_dec(&atom->refcount);
14951 + break;
14952 + }
14953 +
14954 + spin_unlock(&(fq->guard));
14955 + atom_dec_and_unlock(atom);
14956 + spin_lock(&(fq->guard));
14957 + }
14958 +
14959 + return atom;
14960 +}
14961 +
14962 +txn_atom *atom_locked_by_fq(flush_queue_t * fq)
14963 +{
14964 + txn_atom *atom;
14965 +
14966 + spin_lock(&(fq->guard));
14967 + atom = atom_locked_by_fq_nolock(fq);
14968 + spin_unlock(&(fq->guard));
14969 + return atom;
14970 +}
14971 +
14972 +static void init_fq(flush_queue_t * fq)
14973 +{
14974 + memset(fq, 0, sizeof *fq);
14975 +
14976 + atomic_set(&fq->nr_submitted, 0);
14977 +
14978 + INIT_LIST_HEAD(ATOM_FQ_LIST(fq));
14979 +
14980 + init_waitqueue_head(&fq->wait);
14981 + spin_lock_init(&fq->guard);
14982 +}
14983 +
14984 +/* slab for flush queues */
14985 +static struct kmem_cache *fq_slab;
14986 +
14987 +/**
14988 + * reiser4_init_fqs - create flush queue cache
14989 + *
14990 + * Initializes slab cache of flush queues. It is part of reiser4 module
14991 + * initialization.
14992 + */
14993 +int reiser4_init_fqs(void)
14994 +{
14995 + fq_slab = kmem_cache_create("fq",
14996 + sizeof(flush_queue_t),
14997 + 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
14998 + if (fq_slab == NULL)
14999 + return RETERR(-ENOMEM);
15000 + return 0;
15001 +}
15002 +
15003 +/**
15004 + * reiser4_done_fqs - delete flush queue cache
15005 + *
15006 + * This is called on reiser4 module unloading or system shutdown.
15007 + */
15008 +void reiser4_done_fqs(void)
15009 +{
15010 + destroy_reiser4_cache(&fq_slab);
15011 +}
15012 +
15013 +/* create new flush queue object */
15014 +static flush_queue_t *create_fq(gfp_t gfp)
15015 +{
15016 + flush_queue_t *fq;
15017 +
15018 + fq = kmem_cache_alloc(fq_slab, gfp);
15019 + if (fq)
15020 + init_fq(fq);
15021 +
15022 + return fq;
15023 +}
15024 +
15025 +/* adjust atom's and flush queue's counters of queued nodes */
15026 +static void count_enqueued_node(flush_queue_t * fq)
15027 +{
15028 + ON_DEBUG(fq->atom->num_queued++);
15029 +}
15030 +
15031 +static void count_dequeued_node(flush_queue_t * fq)
15032 +{
15033 + assert("zam-993", fq->atom->num_queued > 0);
15034 + ON_DEBUG(fq->atom->num_queued--);
15035 +}
15036 +
15037 +/* attach flush queue object to the atom */
15038 +static void attach_fq(txn_atom *atom, flush_queue_t *fq)
15039 +{
15040 + assert_spin_locked(&(atom->alock));
15041 + list_add(&fq->alink, &atom->flush_queues);
15042 + fq->atom = atom;
15043 + ON_DEBUG(atom->nr_flush_queues++);
15044 +}
15045 +
15046 +static void detach_fq(flush_queue_t * fq)
15047 +{
15048 + assert_spin_locked(&(fq->atom->alock));
15049 +
15050 + spin_lock(&(fq->guard));
15051 + list_del_init(&fq->alink);
15052 + assert("vs-1456", fq->atom->nr_flush_queues > 0);
15053 + ON_DEBUG(fq->atom->nr_flush_queues--);
15054 + fq->atom = NULL;
15055 + spin_unlock(&(fq->guard));
15056 +}
15057 +
15058 +/* destroy flush queue object */
15059 +static void done_fq(flush_queue_t * fq)
15060 +{
15061 + assert("zam-763", list_empty_careful(ATOM_FQ_LIST(fq)));
15062 + assert("zam-766", atomic_read(&fq->nr_submitted) == 0);
15063 +
15064 + kmem_cache_free(fq_slab, fq);
15065 +}
15066 +
15067 +/* mark the jnode queued and update the (debugging) count of queued nodes */
15068 +static void mark_jnode_queued(flush_queue_t * fq, jnode * node)
15069 +{
15070 + JF_SET(node, JNODE_FLUSH_QUEUED);
15071 + count_enqueued_node(fq);
15072 +}
15073 +
15074 +/* Putting jnode into the flush queue. Both atom and jnode should be
15075 + spin-locked. */
15076 +void queue_jnode(flush_queue_t * fq, jnode * node)
15077 +{
15078 + assert_spin_locked(&(node->guard));
15079 + assert("zam-713", node->atom != NULL);
15080 + assert_spin_locked(&(node->atom->alock));
15081 + assert("zam-716", fq->atom != NULL);
15082 + assert("zam-717", fq->atom == node->atom);
15083 + assert("zam-907", fq_in_use(fq));
15084 +
15085 + assert("zam-714", JF_ISSET(node, JNODE_DIRTY));
15086 + assert("zam-826", JF_ISSET(node, JNODE_RELOC));
15087 + assert("vs-1481", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
15088 + assert("vs-1481", NODE_LIST(node) != FQ_LIST);
15089 +
15090 + mark_jnode_queued(fq, node);
15091 + list_move_tail(&node->capture_link, ATOM_FQ_LIST(fq));
15092 +
15093 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
15094 + FQ_LIST, 1));
15095 +}
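/* A hedged caller sketch (not from this patch): queue_jnode() demands that
 * both the jnode and its atom are spin-locked, that the node is dirty and
 * JNODE_RELOC, and that @fq is in use and belongs to the same atom.
 * jnode_get_atom() is the txnmgr helper that returns the node's atom locked
 * when called with the jnode spin-locked. */
static void queue_one_node_sketch(flush_queue_t * fq, jnode * node)
{
	txn_atom *atom;

	spin_lock_jnode(node);
	atom = jnode_get_atom(node);	/* returns with the atom locked */
	if (atom != NULL) {
		queue_jnode(fq, node);
		spin_unlock_atom(atom);
	}
	spin_unlock_jnode(node);
}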
15096 +
15097 +/* repeatable process for waiting for i/o completion on a flush queue object */
15098 +static int wait_io(flush_queue_t * fq, int *nr_io_errors)
15099 +{
15100 + assert("zam-738", fq->atom != NULL);
15101 + assert_spin_locked(&(fq->atom->alock));
15102 + assert("zam-736", fq_in_use(fq));
15103 + assert("zam-911", list_empty_careful(ATOM_FQ_LIST(fq)));
15104 +
15105 + if (atomic_read(&fq->nr_submitted) != 0) {
15106 + struct super_block *super;
15107 +
15108 + spin_unlock_atom(fq->atom);
15109 +
15110 + assert("nikita-3013", reiser4_schedulable());
15111 +
15112 + super = reiser4_get_current_sb();
15113 +
15114 + /* FIXME: this is instead of blk_run_queues() */
15115 + blk_run_address_space(reiser4_get_super_fake(super)->i_mapping);
15116 +
15117 + if (!(super->s_flags & MS_RDONLY))
15118 + wait_event(fq->wait, atomic_read(&fq->nr_submitted) == 0);
15119 +
15120 + /* Ask the caller to re-acquire the locks and call this
15121 + function again. Note: this technique is commonly used in
15122 + the txnmgr code. */
15123 + return -E_REPEAT;
15124 + }
15125 +
15126 + *nr_io_errors += atomic_read(&fq->nr_errors);
15127 + return 0;
15128 +}
15129 +
15130 +/* wait on I/O completion, re-submit dirty nodes to write */
15131 +static int finish_fq(flush_queue_t * fq, int *nr_io_errors)
15132 +{
15133 + int ret;
15134 + txn_atom *atom = fq->atom;
15135 +
15136 + assert("zam-801", atom != NULL);
15137 + assert_spin_locked(&(atom->alock));
15138 + assert("zam-762", fq_in_use(fq));
15139 +
15140 + ret = wait_io(fq, nr_io_errors);
15141 + if (ret)
15142 + return ret;
15143 +
15144 + detach_fq(fq);
15145 + done_fq(fq);
15146 +
15147 + reiser4_atom_send_event(atom);
15148 +
15149 + return 0;
15150 +}
15151 +
15152 +/* wait for all i/o for the given atom to be completed; actually do one iteration
15153 + of that and return -E_REPEAT if more iterations are needed */
15154 +static int finish_all_fq(txn_atom * atom, int *nr_io_errors)
15155 +{
15156 + flush_queue_t *fq;
15157 +
15158 + assert_spin_locked(&(atom->alock));
15159 +
15160 + if (list_empty_careful(&atom->flush_queues))
15161 + return 0;
15162 +
15163 + list_for_each_entry(fq, &atom->flush_queues, alink) {
15164 + if (fq_ready(fq)) {
15165 + int ret;
15166 +
15167 + mark_fq_in_use(fq);
15168 + assert("vs-1247", fq->owner == NULL);
15169 + ON_DEBUG(fq->owner = current);
15170 + ret = finish_fq(fq, nr_io_errors);
15171 +
15172 + if (*nr_io_errors)
15173 + reiser4_handle_error();
15174 +
15175 + if (ret) {
15176 + reiser4_fq_put(fq);
15177 + return ret;
15178 + }
15179 +
15180 + spin_unlock_atom(atom);
15181 +
15182 + return -E_REPEAT;
15183 + }
15184 + }
15185 +
15186 + /* All flush queues are in use; atom remains locked */
15187 + return -EBUSY;
15188 +}
15189 +
15190 +/* wait all i/o for current atom */
15191 +int current_atom_finish_all_fq(void)
15192 +{
15193 + txn_atom *atom;
15194 + int nr_io_errors = 0;
15195 + int ret = 0;
15196 +
15197 + do {
15198 + while (1) {
15199 + atom = get_current_atom_locked();
15200 + ret = finish_all_fq(atom, &nr_io_errors);
15201 + if (ret != -EBUSY)
15202 + break;
15203 + reiser4_atom_wait_event(atom);
15204 + }
15205 + } while (ret == -E_REPEAT);
15206 +
15207 + /* we do not need the atom locked after this function finishes; SUCCESS and
15208 + -EBUSY are the two return codes for which the atom remains locked after
15209 + finish_all_fq */
15210 + if (!ret)
15211 + spin_unlock_atom(atom);
15212 +
15213 + assert_spin_not_locked(&(atom->alock));
15214 +
15215 + if (ret)
15216 + return ret;
15217 +
15218 + if (nr_io_errors)
15219 + return RETERR(-EIO);
15220 +
15221 + return 0;
15222 +}
15223 +
15224 +/* change the node->atom field for all jnodes on the given list */
15225 +static void
15226 +scan_fq_and_update_atom_ref(struct list_head *list, txn_atom *atom)
15227 +{
15228 + jnode *cur;
15229 +
15230 + list_for_each_entry(cur, list, capture_link) {
15231 + spin_lock_jnode(cur);
15232 + cur->atom = atom;
15233 + spin_unlock_jnode(cur);
15234 + }
15235 +}
15236 +
15237 +/* support for atom fusion operation */
15238 +void reiser4_fuse_fq(txn_atom *to, txn_atom *from)
15239 +{
15240 + flush_queue_t *fq;
15241 +
15242 + assert_spin_locked(&(to->alock));
15243 + assert_spin_locked(&(from->alock));
15244 +
15245 + list_for_each_entry(fq, &from->flush_queues, alink) {
15246 + scan_fq_and_update_atom_ref(ATOM_FQ_LIST(fq), to);
15247 + spin_lock(&(fq->guard));
15248 + fq->atom = to;
15249 + spin_unlock(&(fq->guard));
15250 + }
15251 +
15252 + list_splice_init(&from->flush_queues, to->flush_queues.prev);
15253 +
15254 +#if REISER4_DEBUG
15255 + to->num_queued += from->num_queued;
15256 + to->nr_flush_queues += from->nr_flush_queues;
15257 + from->nr_flush_queues = 0;
15258 +#endif
15259 +}
15260 +
15261 +#if REISER4_DEBUG
15262 +int atom_fq_parts_are_clean(txn_atom * atom)
15263 +{
15264 + assert("zam-915", atom != NULL);
15265 + return list_empty_careful(&atom->flush_queues);
15266 +}
15267 +#endif
15268 +/* Bio i/o completion routine for reiser4 write operations. */
15269 +static int
15270 +end_io_handler(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
15271 + int err)
15272 +{
15273 + int i;
15274 + int nr_errors = 0;
15275 + flush_queue_t *fq;
15276 +
15277 + assert("zam-958", bio->bi_rw & WRITE);
15278 +
15279 + /* i/o op. is not fully completed */
15280 + if (bio->bi_size != 0)
15281 + return 1;
15282 +
15283 + if (err == -EOPNOTSUPP)
15284 + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
15285 +
15286 + /* we expect that bio->bi_private is set to NULL or to an fq object which
15287 + * is used for synchronization and error counting. */
15288 + fq = bio->bi_private;
15289 + /* Check all elements of io_vec for correct write completion. */
15290 + for (i = 0; i < bio->bi_vcnt; i += 1) {
15291 + struct page *pg = bio->bi_io_vec[i].bv_page;
15292 +
15293 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
15294 + SetPageError(pg);
15295 + nr_errors++;
15296 + }
15297 +
15298 + {
15299 + /* the jnode WRITEBACK ("write is in progress") bit is
15300 + * atomically cleared here. */
15301 + jnode *node;
15302 +
15303 + assert("zam-736", pg != NULL);
15304 + assert("zam-736", PagePrivate(pg));
15305 + node = jprivate(pg);
15306 +
15307 + JF_CLR(node, JNODE_WRITEBACK);
15308 + }
15309 +
15310 + end_page_writeback(pg);
15311 + page_cache_release(pg);
15312 + }
15313 +
15314 + if (fq) {
15315 + /* count i/o error in fq object */
15316 + atomic_add(nr_errors, &fq->nr_errors);
15317 +
15318 + /* If all write requests registered in this "fq" are done we wake
15319 + * up the waiter. */
15320 + if (atomic_sub_and_test(bio->bi_vcnt, &fq->nr_submitted))
15321 + wake_up(&fq->wait);
15322 + }
15323 +
15324 + bio_put(bio);
15325 + return 0;
15326 +}
15327 +
15328 +/* Count the I/O requests which will be submitted by @bio in the given flush
15329 + queue @fq */
15330 +void add_fq_to_bio(flush_queue_t * fq, struct bio *bio)
15331 +{
15332 + bio->bi_private = fq;
15333 + bio->bi_end_io = end_io_handler;
15334 +
15335 + if (fq)
15336 + atomic_add(bio->bi_vcnt, &fq->nr_submitted);
15337 +}
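/* A hypothetical usage sketch (not from this patch): a writer fills a bio,
 * registers it with the fq so that completions are counted against
 * fq->nr_submitted, then submits it; end_io_handler() above subtracts
 * bi_vcnt and wakes fq->wait when the count reaches zero. */
static void submit_one_bio_sketch(flush_queue_t * fq, struct bio *bio)
{
	add_fq_to_bio(fq, bio);
	submit_bio(WRITE, bio);
}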
15338 +
15339 +/* Move all queued nodes out from @fq->prepped list. */
15340 +static void release_prepped_list(flush_queue_t * fq)
15341 +{
15342 + txn_atom *atom;
15343 +
15344 + assert("zam-904", fq_in_use(fq));
15345 + atom = atom_locked_by_fq(fq);
15346 +
15347 + while (!list_empty(ATOM_FQ_LIST(fq))) {
15348 + jnode *cur;
15349 +
15350 + cur = list_entry(ATOM_FQ_LIST(fq)->next, jnode, capture_link);
15351 + list_del_init(&cur->capture_link);
15352 +
15353 + count_dequeued_node(fq);
15354 + spin_lock_jnode(cur);
15355 + assert("nikita-3154", !JF_ISSET(cur, JNODE_OVRWR));
15356 + assert("nikita-3154", JF_ISSET(cur, JNODE_RELOC));
15357 + assert("nikita-3154", JF_ISSET(cur, JNODE_FLUSH_QUEUED));
15358 + JF_CLR(cur, JNODE_FLUSH_QUEUED);
15359 +
15360 + if (JF_ISSET(cur, JNODE_DIRTY)) {
15361 + list_add_tail(&cur->capture_link,
15362 + ATOM_DIRTY_LIST(atom, jnode_get_level(cur)));
15363 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15364 + DIRTY_LIST, 1));
15365 + } else {
15366 + list_add_tail(&cur->capture_link, ATOM_CLEAN_LIST(atom));
15367 + ON_DEBUG(count_jnode(atom, cur, FQ_LIST,
15368 + CLEAN_LIST, 1));
15369 + }
15370 +
15371 + spin_unlock_jnode(cur);
15372 + }
15373 +
15374 + if (--atom->nr_running_queues == 0)
15375 + reiser4_atom_send_event(atom);
15376 +
15377 + spin_unlock_atom(atom);
15378 +}
15379 +
15380 +/* Submit write requests for nodes on the already filled flush queue @fq.
15381 +
15382 + @fq: flush queue object which contains jnodes we can (and will) write.
15383 + @return: number of submitted blocks (>=0) if success, otherwise -- an error
15384 + code (<0). */
15385 +int reiser4_write_fq(flush_queue_t * fq, long *nr_submitted, int flags)
15386 +{
15387 + int ret;
15388 + txn_atom *atom;
15389 +
15390 + while (1) {
15391 + atom = atom_locked_by_fq(fq);
15392 + assert("zam-924", atom);
15393 + /* do not write fq in parallel. */
15394 + if (atom->nr_running_queues == 0
15395 + || !(flags & WRITEOUT_SINGLE_STREAM))
15396 + break;
15397 + reiser4_atom_wait_event(atom);
15398 + }
15399 +
15400 + atom->nr_running_queues++;
15401 + spin_unlock_atom(atom);
15402 +
15403 + ret = write_jnode_list(ATOM_FQ_LIST(fq), fq, nr_submitted, flags);
15404 + release_prepped_list(fq);
15405 +
15406 + return ret;
15407 +}
15408 +
15409 +/* Get a flush queue object for exclusive use by one thread. May require
15410 + several iterations, which is indicated by the -E_REPEAT return code.
15411 +
15412 + This function does not contain code for obtaining an atom lock because an
15413 + atom lock is obtained in different ways in different parts of reiser4;
15414 + usually it is the current atom, but we also need the ability to get an fq
15415 + for the atom of a given jnode. */
15416 +static int fq_by_atom_gfp(txn_atom *atom, flush_queue_t **new_fq, gfp_t gfp)
15417 +{
15418 + flush_queue_t *fq;
15419 +
15420 + assert_spin_locked(&(atom->alock));
15421 +
15422 + fq = list_entry(atom->flush_queues.next, flush_queue_t, alink);
15423 + while (&atom->flush_queues != &fq->alink) {
15424 + spin_lock(&(fq->guard));
15425 +
15426 + if (fq_ready(fq)) {
15427 + mark_fq_in_use(fq);
15428 + assert("vs-1246", fq->owner == NULL);
15429 + ON_DEBUG(fq->owner = current);
15430 + spin_unlock(&(fq->guard));
15431 +
15432 + if (*new_fq)
15433 + done_fq(*new_fq);
15434 +
15435 + *new_fq = fq;
15436 +
15437 + return 0;
15438 + }
15439 +
15440 + spin_unlock(&(fq->guard));
15441 +
15442 + fq = list_entry(fq->alink.next, flush_queue_t, alink);
15443 + }
15444 +
15445 + /* Use previously allocated fq object */
15446 + if (*new_fq) {
15447 + mark_fq_in_use(*new_fq);
15448 + assert("vs-1248", (*new_fq)->owner == 0);
15449 + ON_DEBUG((*new_fq)->owner = current);
15450 + attach_fq(atom, *new_fq);
15451 +
15452 + return 0;
15453 + }
15454 +
15455 + spin_unlock_atom(atom);
15456 +
15457 + *new_fq = create_fq(gfp);
15458 +
15459 + if (*new_fq == NULL)
15460 + return RETERR(-ENOMEM);
15461 +
15462 + return RETERR(-E_REPEAT);
15463 +}
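/* Note on the -E_REPEAT protocol above: this is the usual "allocate outside
 * the spin lock, then retry" pattern. The atom lock is dropped, a fresh fq is
 * allocated with the caller's gfp mask, and the caller is expected to
 * re-acquire the atom lock and call again; get_fq_for_current_atom() below is
 * the canonical retry loop. */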
15464 +
15465 +int reiser4_fq_by_atom(txn_atom * atom, flush_queue_t ** new_fq)
15466 +{
15467 + return fq_by_atom_gfp(atom, new_fq, reiser4_ctx_gfp_mask_get());
15468 +}
15469 +
15470 +/* A wrapper around reiser4_fq_by_atom for getting a flush queue
15471 + object for the current atom; on success fq->atom remains locked. */
15472 +flush_queue_t *get_fq_for_current_atom(void)
15473 +{
15474 + flush_queue_t *fq = NULL;
15475 + txn_atom *atom;
15476 + int ret;
15477 +
15478 + do {
15479 + atom = get_current_atom_locked();
15480 + ret = reiser4_fq_by_atom(atom, &fq);
15481 + } while (ret == -E_REPEAT);
15482 +
15483 + if (ret)
15484 + return ERR_PTR(ret);
15485 + return fq;
15486 +}
15487 +
15488 +/* Releasing flush queue object after exclusive use */
15489 +void reiser4_fq_put_nolock(flush_queue_t *fq)
15490 +{
15491 + assert("zam-747", fq->atom != NULL);
15492 + assert("zam-902", list_empty_careful(ATOM_FQ_LIST(fq)));
15493 + mark_fq_ready(fq);
15494 + assert("vs-1245", fq->owner == current);
15495 + ON_DEBUG(fq->owner = NULL);
15496 +}
15497 +
15498 +void reiser4_fq_put(flush_queue_t * fq)
15499 +{
15500 + txn_atom *atom;
15501 +
15502 + spin_lock(&(fq->guard));
15503 + atom = atom_locked_by_fq_nolock(fq);
15504 +
15505 + assert("zam-746", atom != NULL);
15506 +
15507 + reiser4_fq_put_nolock(fq);
15508 + reiser4_atom_send_event(atom);
15509 +
15510 + spin_unlock(&(fq->guard));
15511 + spin_unlock_atom(atom);
15512 +}
15513 +
15514 +/* A part of atom object initialization related to the embedded flush queue
15515 + list head */
15516 +
15517 +void init_atom_fq_parts(txn_atom *atom)
15518 +{
15519 + INIT_LIST_HEAD(&atom->flush_queues);
15520 +}
15521 +
15522 +#if REISER4_DEBUG
15523 +
15524 +void reiser4_check_fq(const txn_atom *atom)
15525 +{
15526 + /* check number of nodes on all atom's flush queues */
15527 + flush_queue_t *fq;
15528 + int count;
15529 + struct list_head *pos;
15530 +
15531 + count = 0;
15532 + list_for_each_entry(fq, &atom->flush_queues, alink) {
15533 + spin_lock(&(fq->guard));
15534 + /* calculate the number of jnodes on the fq's list of prepped jnodes */
15535 + list_for_each(pos, ATOM_FQ_LIST(fq))
15536 + count++;
15537 + spin_unlock(&(fq->guard));
15538 + }
15539 + if (count != atom->fq)
15540 + warning("", "fq counter %d, real %d\n", atom->fq, count);
15541 +
15542 +}
15543 +
15544 +#endif
15545 +
15546 +/*
15547 + * Local variables:
15548 + * c-indentation-style: "K&R"
15549 + * mode-name: "LC"
15550 + * c-basic-offset: 8
15551 + * tab-width: 8
15552 + * fill-column: 79
15553 + * scroll-step: 1
15554 + * End:
15555 + */
15556 diff --git a/fs/reiser4/forward.h b/fs/reiser4/forward.h
15557 new file mode 100644
15558 index 0000000..8536833
15559 --- /dev/null
15560 +++ b/fs/reiser4/forward.h
15561 @@ -0,0 +1,256 @@
15562 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
15563 +
15564 +/* Forward declarations. Thank you Kernighan. */
15565 +
15566 +#if !defined( __REISER4_FORWARD_H__ )
15567 +#define __REISER4_FORWARD_H__
15568 +
15569 +#include <asm/errno.h>
15570 +#include <linux/types.h>
15571 +
15572 +typedef struct zlock zlock;
15573 +typedef struct lock_stack lock_stack;
15574 +typedef struct lock_handle lock_handle;
15575 +typedef struct znode znode;
15576 +typedef struct flow flow_t;
15577 +typedef struct coord coord_t;
15578 +typedef struct tree_access_pointer tap_t;
15579 +typedef struct item_coord item_coord;
15580 +typedef struct shift_params shift_params;
15581 +typedef struct reiser4_object_create_data reiser4_object_create_data;
15582 +typedef union reiser4_plugin reiser4_plugin;
15583 +typedef __u16 reiser4_plugin_id;
15584 +typedef __u64 reiser4_plugin_groups;
15585 +typedef struct item_plugin item_plugin;
15586 +typedef struct jnode_plugin jnode_plugin;
15587 +typedef struct reiser4_item_data reiser4_item_data;
15588 +typedef union reiser4_key reiser4_key;
15589 +typedef struct reiser4_tree reiser4_tree;
15590 +typedef struct carry_cut_data carry_cut_data;
15591 +typedef struct carry_kill_data carry_kill_data;
15592 +typedef struct carry_tree_op carry_tree_op;
15593 +typedef struct carry_tree_node carry_tree_node;
15594 +typedef struct carry_plugin_info carry_plugin_info;
15595 +typedef struct reiser4_journal reiser4_journal;
15596 +typedef struct txn_atom txn_atom;
15597 +typedef struct txn_handle txn_handle;
15598 +typedef struct txn_mgr txn_mgr;
15599 +typedef struct reiser4_dir_entry_desc reiser4_dir_entry_desc;
15600 +typedef struct reiser4_context reiser4_context;
15601 +typedef struct carry_level carry_level;
15602 +typedef struct blocknr_set_entry blocknr_set_entry;
15603 +/* super_block->s_fs_info points to this */
15604 +typedef struct reiser4_super_info_data reiser4_super_info_data;
15605 +/* next two objects are fields of reiser4_super_info_data */
15606 +typedef struct reiser4_oid_allocator reiser4_oid_allocator;
15607 +typedef struct reiser4_space_allocator reiser4_space_allocator;
15608 +
15609 +typedef struct flush_scan flush_scan;
15610 +typedef struct flush_position flush_pos_t;
15611 +
15612 +typedef unsigned short pos_in_node_t;
15613 +#define MAX_POS_IN_NODE 65535
15614 +
15615 +typedef struct jnode jnode;
15616 +typedef struct reiser4_blocknr_hint reiser4_blocknr_hint;
15617 +
15618 +typedef struct uf_coord uf_coord_t;
15619 +typedef struct hint hint_t;
15620 +
15621 +typedef struct ktxnmgrd_context ktxnmgrd_context;
15622 +
15623 +typedef struct reiser4_xattr_plugin reiser4_xattr_plugin;
15624 +
15625 +struct inode;
15626 +struct page;
15627 +struct file;
15628 +struct dentry;
15629 +struct super_block;
15630 +
15631 +/* return values of coord_by_key(). cbk == coord_by_key */
15632 +typedef enum {
15633 + CBK_COORD_FOUND = 0,
15634 + CBK_COORD_NOTFOUND = -ENOENT,
15635 +} lookup_result;
15636 +
15637 +/* results of lookup with directory file */
15638 +typedef enum {
15639 + FILE_NAME_FOUND = 0,
15640 + FILE_NAME_NOTFOUND = -ENOENT,
15641 + FILE_IO_ERROR = -EIO, /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15642 + FILE_OOM = -ENOMEM /* FIXME: it seems silly to have special OOM, IO_ERROR return codes for each search. */
15643 +} file_lookup_result;
15644 +
15645 +/* behaviors of lookup. If the coord we are looking for is actually in the
15646 + tree, both coincide. */
15647 +typedef enum {
15648 + /* search exactly for the coord with key given */
15649 + FIND_EXACT,
15650 + /* search for coord with the maximal key not greater than one
15651 + given */
15652 + FIND_MAX_NOT_MORE_THAN /*LEFT_SLANT_BIAS */
15653 +} lookup_bias;
15654 +
15655 +typedef enum {
15656 + /* number of leaf level of the tree
15657 + The fake root has (tree_level=0). */
15658 + LEAF_LEVEL = 1,
15659 +
15660 + /* number of level one above leaf level of the tree.
15661 +
15662 + It is supposed that internal tree used by reiser4 to store file
15663 + system data and meta data will have height 2 initially (when
15664 + created by mkfs).
15665 + */
15666 + TWIG_LEVEL = 2,
15667 +} tree_level;
15668 +
15669 +/* The "real" maximum ztree height is the 0-origin size of any per-level
15670 + array, since the zero'th level is not used. */
15671 +#define REAL_MAX_ZTREE_HEIGHT (REISER4_MAX_ZTREE_HEIGHT-LEAF_LEVEL)
15672 +
15673 +/* enumeration of possible mutual position of item and coord. This enum is
15674 + return type of ->is_in_item() item plugin method which see. */
15675 +typedef enum {
15676 + /* coord is on the left of an item */
15677 + IP_ON_THE_LEFT,
15678 + /* coord is inside item */
15679 + IP_INSIDE,
15680 + /* coord is inside item, but to the right of the rightmost unit of
15681 + this item */
15682 + IP_RIGHT_EDGE,
15683 + /* coord is on the right of an item */
15684 + IP_ON_THE_RIGHT
15685 +} interposition;
15686 +
15687 +/* type of lock to acquire on znode before returning it to caller */
15688 +typedef enum {
15689 + ZNODE_NO_LOCK = 0,
15690 + ZNODE_READ_LOCK = 1,
15691 + ZNODE_WRITE_LOCK = 2,
15692 +} znode_lock_mode;
15693 +
15694 +/* type of lock request */
15695 +typedef enum {
15696 + ZNODE_LOCK_LOPRI = 0,
15697 + ZNODE_LOCK_HIPRI = (1 << 0),
15698 +
15699 + /* By setting the ZNODE_LOCK_NONBLOCK flag in a lock request the call to longterm_lock_znode will not sleep
15700 + waiting for the lock to become available. If the lock is unavailable, reiser4_znode_lock will immediately
15701 + return the value -E_REPEAT. */
15702 + ZNODE_LOCK_NONBLOCK = (1 << 1),
15703 + /* An option for longterm_lock_znode which prevents atom fusion */
15704 + ZNODE_LOCK_DONT_FUSE = (1 << 2)
15705 +} znode_lock_request;
15706 +
15707 +typedef enum { READ_OP = 0, WRITE_OP = 1 } rw_op;
15708 +
15709 +/* used to specify direction of shift. These must be -1 and 1 */
15710 +typedef enum {
15711 + SHIFT_LEFT = 1,
15712 + SHIFT_RIGHT = -1
15713 +} shift_direction;
15714 +
15715 +typedef enum {
15716 + LEFT_SIDE,
15717 + RIGHT_SIDE
15718 +} sideof;
15719 +
15720 +#define round_up( value, order ) \
15721 + ( ( typeof( value ) )( ( ( long ) ( value ) + ( order ) - 1U ) & \
15722 + ~( ( order ) - 1 ) ) )
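/* Worked example: round_up(10, 8) == ((10 + 7) & ~7) == 16 and
 * round_up(16, 8) == 16; the order argument must be a power of two for the
 * mask arithmetic to hold. */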
15723 +
15724 +/* values returned by squalloc_right_neighbor and its auxiliary functions */
15725 +typedef enum {
15726 + /* unit of internal item is moved */
15727 + SUBTREE_MOVED = 0,
15728 + /* nothing else can be squeezed into left neighbor */
15729 + SQUEEZE_TARGET_FULL = 1,
15730 + /* all content of node is squeezed into its left neighbor */
15731 + SQUEEZE_SOURCE_EMPTY = 2,
15732 + /* one more item is copied (this is only returned by
15733 + allocate_and_copy_extent to squalloc_twig)) */
15734 + SQUEEZE_CONTINUE = 3
15735 +} squeeze_result;
15736 +
15737 +/* Do not change items ids. If you do - there will be format change */
15738 +typedef enum {
15739 + STATIC_STAT_DATA_ID = 0x0,
15740 + SIMPLE_DIR_ENTRY_ID = 0x1,
15741 + COMPOUND_DIR_ID = 0x2,
15742 + NODE_POINTER_ID = 0x3,
15743 + EXTENT_POINTER_ID = 0x5,
15744 + FORMATTING_ID = 0x6,
15745 + CTAIL_ID = 0x7,
15746 + BLACK_BOX_ID = 0x8,
15747 + LAST_ITEM_ID = 0x9
15748 +} item_id;
15749 +
15750 +/* Flags passed to jnode_flush() to allow it to distinguish default settings based on
15751 + whether commit() was called or VM memory pressure was applied. */
15752 +typedef enum {
15753 + /* submit flush queue to disk at jnode_flush completion */
15754 + JNODE_FLUSH_WRITE_BLOCKS = 1,
15755 +
15756 + /* flush is called for commit */
15757 + JNODE_FLUSH_COMMIT = 2,
15758 + /* not implemented */
15759 + JNODE_FLUSH_MEMORY_FORMATTED = 4,
15760 +
15761 + /* not implemented */
15762 + JNODE_FLUSH_MEMORY_UNFORMATTED = 8,
15763 +} jnode_flush_flags;
15764 +
15765 +/* Flags to insert/paste carry operations. Currently they are only used in
15766 + the flushing code, but in the future they can be used to optimize for
15767 + repetitive accesses. */
15768 +typedef enum {
15769 + /* carry is not allowed to shift data to the left when trying to find
15770 + free space */
15771 + COPI_DONT_SHIFT_LEFT = (1 << 0),
15772 + /* carry is not allowed to shift data to the right when trying to find
15773 + free space */
15774 + COPI_DONT_SHIFT_RIGHT = (1 << 1),
15775 + /* carry is not allowed to allocate new node(s) when trying to find
15776 + free space */
15777 + COPI_DONT_ALLOCATE = (1 << 2),
15778 + /* try to load the left neighbor if it is not in the cache */
15779 + COPI_LOAD_LEFT = (1 << 3),
15780 + /* try to load the right neighbor if it is not in the cache */
15781 + COPI_LOAD_RIGHT = (1 << 4),
15782 + /* shift insertion point to the left neighbor */
15783 + COPI_GO_LEFT = (1 << 5),
15784 + /* shift insertion point to the right neighbor */
15785 + COPI_GO_RIGHT = (1 << 6),
15786 + /* try to step back into original node if insertion into new node
15787 + fails after shifting data there. */
15788 + COPI_STEP_BACK = (1 << 7)
15789 +} cop_insert_flag;
15790 +
15791 +typedef enum {
15792 + SAFE_UNLINK, /* safe-link for unlink */
15793 + SAFE_TRUNCATE /* safe-link for truncate */
15794 +} reiser4_safe_link_t;
15795 +
15796 +/* this is to show on which list of atom jnode is */
15797 +typedef enum {
15798 + NOT_CAPTURED,
15799 + DIRTY_LIST,
15800 + CLEAN_LIST,
15801 + FQ_LIST,
15802 + WB_LIST,
15803 + OVRWR_LIST
15804 +} atom_list;
15805 +
15806 +/* __REISER4_FORWARD_H__ */
15807 +#endif
15808 +
15809 +/* Make Linus happy.
15810 + Local variables:
15811 + c-indentation-style: "K&R"
15812 + mode-name: "LC"
15813 + c-basic-offset: 8
15814 + tab-width: 8
15815 + fill-column: 120
15816 + End:
15817 +*/
15818 diff --git a/fs/reiser4/fsdata.c b/fs/reiser4/fsdata.c
15819 new file mode 100644
15820 index 0000000..47da01c
15821 --- /dev/null
15822 +++ b/fs/reiser4/fsdata.c
15823 @@ -0,0 +1,804 @@
15824 +/* Copyright 2001, 2002, 2003, 2004, 2005 by Hans Reiser, licensing governed by
15825 + * reiser4/README */
15826 +
15827 +#include "fsdata.h"
15828 +#include "inode.h"
15829 +
15830 +
15831 +/* cache of dir_cursors */
15832 +static struct kmem_cache *d_cursor_cache;
15833 +static struct shrinker *d_cursor_shrinker;
15834 +
15835 +/* list of unused cursors */
15836 +static LIST_HEAD(cursor_cache);
15837 +
15838 +/* number of cursors in the list of unused cursors */
15839 +static unsigned long d_cursor_unused = 0;
15840 +
15841 +/* spinlock protecting manipulations with dir_cursor's hash table and lists */
15842 +DEFINE_SPINLOCK(d_lock);
15843 +
15844 +static reiser4_file_fsdata *create_fsdata(struct file *file);
15845 +static int file_is_stateless(struct file *file);
15846 +static void free_fsdata(reiser4_file_fsdata *fsdata);
15847 +static void kill_cursor(dir_cursor *);
15848 +
15849 +/**
15850 + * d_cursor_shrink - shrink callback for cache of dir_cursor-s
15851 + * @nr: number of objects to free
15852 + * @mask: GFP mask
15853 + *
15854 + * Shrinks d_cursor_cache. Scans the LRU list of unused cursors, freeing the
15855 + * requested number. Returns the number of still freeable cursors.
15856 + */
15857 +static int d_cursor_shrink(int nr, gfp_t mask)
15858 +{
15859 + if (nr != 0) {
15860 + dir_cursor *scan;
15861 + int killed;
15862 +
15863 + killed = 0;
15864 + spin_lock(&d_lock);
15865 + while (!list_empty(&cursor_cache)) {
15866 + scan = list_entry(cursor_cache.next, dir_cursor, alist);
15867 + assert("nikita-3567", scan->ref == 0);
15868 + kill_cursor(scan);
15869 + ++killed;
15870 + --nr;
15871 + if (nr == 0)
15872 + break;
15873 + }
15874 + spin_unlock(&d_lock);
15875 + }
15876 + return d_cursor_unused;
15877 +}
15878 +
15879 +/**
15880 + * reiser4_init_d_cursor - create d_cursor cache
15881 + *
15882 + * Initializes slab cache of d_cursors. It is part of reiser4 module
15883 + * initialization.
15884 + */
15885 +int reiser4_init_d_cursor(void)
15886 +{
15887 + d_cursor_cache = kmem_cache_create("d_cursor", sizeof(dir_cursor), 0,
15888 + SLAB_HWCACHE_ALIGN, NULL, NULL);
15889 + if (d_cursor_cache == NULL)
15890 + return RETERR(-ENOMEM);
15891 +
15892 + /*
15893 + * actually, d_cursors are "priceless", because there is no way to
15894 + * recover information stored in them. On the other hand, we don't
15895 + * want to consume all kernel memory by them. As a compromise, just
15896 + * assign higher "seeks" value to d_cursor cache, so that it will be
15897 + * shrunk only if system is really tight on memory.
15898 + */
15899 + d_cursor_shrinker = set_shrinker(DEFAULT_SEEKS << 3,
15900 + d_cursor_shrink);
15901 + if (d_cursor_shrinker == NULL) {
15902 + destroy_reiser4_cache(&d_cursor_cache);
15903 + d_cursor_cache = NULL;
15904 + return RETERR(-ENOMEM);
15905 + }
15906 + return 0;
15907 +}
15908 +
15909 +/**
15910 + * reiser4_done_d_cursor - delete d_cursor cache and d_cursor shrinker
15911 + *
15912 + * This is called on reiser4 module unloading or system shutdown.
15913 + */
15914 +void reiser4_done_d_cursor(void)
15915 +{
15916 + BUG_ON(d_cursor_shrinker == NULL);
15917 + remove_shrinker(d_cursor_shrinker);
15918 + d_cursor_shrinker = NULL;
15919 +
15920 + destroy_reiser4_cache(&d_cursor_cache);
15921 +}
15922 +
15923 +#define D_CURSOR_TABLE_SIZE (256)
15924 +
15925 +static inline unsigned long
15926 +d_cursor_hash(d_cursor_hash_table *table, const d_cursor_key *key)
15927 +{
15928 + assert("nikita-3555", IS_POW(D_CURSOR_TABLE_SIZE));
15929 + return (key->oid + key->cid) & (D_CURSOR_TABLE_SIZE - 1);
15930 +}
15931 +
15932 +static inline int d_cursor_eq(const d_cursor_key *k1, const d_cursor_key *k2)
15933 +{
15934 + return k1->cid == k2->cid && k1->oid == k2->oid;
15935 +}
15936 +
15937 +/*
15938 + * define functions to manipulate reiser4 super block's hash table of
15939 + * dir_cursors
15940 + */
15941 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
15942 +#define KFREE(ptr, size) kfree(ptr)
15943 +TYPE_SAFE_HASH_DEFINE(d_cursor,
15944 + dir_cursor,
15945 + d_cursor_key, key, hash, d_cursor_hash, d_cursor_eq);
15946 +#undef KFREE
15947 +#undef KMALLOC
15948 +
15949 +/**
15950 + * reiser4_init_super_d_info - initialize per-super-block d_cursor resources
15951 + * @super: super block to initialize
15952 + *
15953 + * Initializes per-super-block d_cursor's hash table and radix tree. It is part
15954 + * of mount.
15955 + */
15956 +int reiser4_init_super_d_info(struct super_block *super)
15957 +{
15958 + d_cursor_info *p;
15959 +
15960 + p = &get_super_private(super)->d_info;
15961 +
15962 + INIT_RADIX_TREE(&p->tree, reiser4_ctx_gfp_mask_get());
15963 + return d_cursor_hash_init(&p->table, D_CURSOR_TABLE_SIZE);
15964 +}
15965 +
15966 +/**
15967 + * reiser4_done_super_d_info - release per-super-block d_cursor resources
15968 + * @super: super block being umounted
15969 + *
15970 + * It is called on umount. Kills all directory cursors attached to the super block.
15971 + */
15972 +void reiser4_done_super_d_info(struct super_block *super)
15973 +{
15974 + d_cursor_info *d_info;
15975 + dir_cursor *cursor, *next;
15976 +
15977 + d_info = &get_super_private(super)->d_info;
15978 + for_all_in_htable(&d_info->table, d_cursor, cursor, next)
15979 + kill_cursor(cursor);
15980 +
15981 + BUG_ON(d_info->tree.rnode != NULL);
15982 + d_cursor_hash_done(&d_info->table);
15983 +}
15984 +
15985 +/**
15986 + * kill_cursor - free dir_cursor and reiser4_file_fsdata attached to it
15987 + * @cursor: cursor to free
15988 + *
15989 + * Removes the reiser4_file_fsdata attached to @cursor from the readdir list
15990 + * of its reiser4_inode and frees that fsdata. Then removes @cursor from the
15991 + * radix tree, the hash table and the list of unused cursors, and frees it.
15992 + */
15993 +static void kill_cursor(dir_cursor *cursor)
15994 +{
15995 + unsigned long index;
15996 +
15997 + assert("nikita-3566", cursor->ref == 0);
15998 + assert("nikita-3572", cursor->fsdata != NULL);
15999 +
16000 + index = (unsigned long)cursor->key.oid;
16001 + list_del_init(&cursor->fsdata->dir.linkage);
16002 + free_fsdata(cursor->fsdata);
16003 + cursor->fsdata = NULL;
16004 +
16005 + if (list_empty_careful(&cursor->list))
16006 + /* this is last cursor for a file. Kill radix-tree entry */
16007 + radix_tree_delete(&cursor->info->tree, index);
16008 + else {
16009 + void **slot;
16010 +
16011 + /*
16012 + * there are other cursors for the same oid.
16013 + */
16014 +
16015 + /*
16016 +		 * if the radix tree slot points to the cursor being
16017 +		 * removed, re-target it to the next cursor in the
16018 +		 * (non-empty, as was checked above) circular list of all
16019 +		 * cursors for this oid.
16020 + */
16021 + slot = radix_tree_lookup_slot(&cursor->info->tree, index);
16022 + assert("nikita-3571", *slot != NULL);
16023 + if (*slot == cursor)
16024 + *slot = list_entry(cursor->list.next, dir_cursor, list);
16025 + /* remove cursor from circular list */
16026 + list_del_init(&cursor->list);
16027 + }
16028 + /* remove cursor from the list of unused cursors */
16029 + list_del_init(&cursor->alist);
16030 + /* remove cursor from the hash table */
16031 + d_cursor_hash_remove(&cursor->info->table, cursor);
16032 + /* and free it */
16033 + kmem_cache_free(d_cursor_cache, cursor);
16034 + --d_cursor_unused;
16035 +}
16036 +
16037 +/* possible actions that can be performed on all cursors for the given file */
16038 +enum cursor_action {
16039 + /*
16040 + * load all detached state: this is called when stat-data is loaded
16041 + * from the disk to recover information about all pending readdirs
16042 + */
16043 + CURSOR_LOAD,
16044 + /*
16045 +	 * detach all state from the inode, leaving it in the cache. This is
16046 +	 * called when the inode is removed from memory by memory pressure
16047 + */
16048 + CURSOR_DISPOSE,
16049 + /*
16050 + * detach cursors from the inode, and free them. This is called when
16051 + * inode is destroyed
16052 + */
16053 + CURSOR_KILL
16054 +};
16055 +
16056 +/*
16057 + * return d_cursor data for the file system @inode is in.
16058 + */
16059 +static inline d_cursor_info *d_info(struct inode *inode)
16060 +{
16061 + return &get_super_private(inode->i_sb)->d_info;
16062 +}
16063 +
16064 +/*
16065 + * lookup d_cursor in the per-super-block radix tree.
16066 + */
16067 +static inline dir_cursor *lookup(d_cursor_info * info, unsigned long index)
16068 +{
16069 + return (dir_cursor *) radix_tree_lookup(&info->tree, index);
16070 +}
16071 +
16072 +/*
16073 + * attach @cursor to the radix tree. There may be multiple cursors for the
16074 + * same oid; they are chained into a circular list.
16075 + */
16076 +static void bind_cursor(dir_cursor * cursor, unsigned long index)
16077 +{
16078 + dir_cursor *head;
16079 +
16080 + head = lookup(cursor->info, index);
16081 + if (head == NULL) {
16082 + /* this is the first cursor for this index */
16083 + INIT_LIST_HEAD(&cursor->list);
16084 + radix_tree_insert(&cursor->info->tree, index, cursor);
16085 + } else {
16086 + /* some cursor already exists. Chain ours */
16087 + list_add(&cursor->list, &head->list);
16088 + }
16089 +}
16090 +
16091 +/*
16092 + * detach fsdata (if detachable) from the file descriptor, and put the cursor
16093 + * on the "unused" list. Called when the file descriptor is no longer in use.
16094 + */
16095 +static void clean_fsdata(struct file *file)
16096 +{
16097 + dir_cursor *cursor;
16098 + reiser4_file_fsdata *fsdata;
16099 +
16100 + assert("nikita-3570", file_is_stateless(file));
16101 +
16102 + fsdata = (reiser4_file_fsdata *) file->private_data;
16103 + if (fsdata != NULL) {
16104 + cursor = fsdata->cursor;
16105 + if (cursor != NULL) {
16106 + spin_lock(&d_lock);
16107 + --cursor->ref;
16108 + if (cursor->ref == 0) {
16109 + list_add_tail(&cursor->alist, &cursor_cache);
16110 + ++d_cursor_unused;
16111 + }
16112 + spin_unlock(&d_lock);
16113 + file->private_data = NULL;
16114 + }
16115 + }
16116 +}
16117 +
16118 +/*
16119 + * global counter used to generate "client ids". These ids are encoded into
16120 + * high bits of fpos.
16121 + */
16122 +static __u32 cid_counter = 0;
16123 +#define CID_SHIFT (20)
16124 +#define CID_MASK (0xfffffull)
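/*
 * Illustrative sketch (not part of the original patch): a readdir cookie
 * handed out for a stateless (NFS) directory read packs the client id into
 * the bits above CID_SHIFT, while the low CID_SHIFT bits carry the position
 * within the directory. These helpers are hypothetical; insert_cursor() and
 * reiser4_get_dir_fpos() below perform the equivalent operations inline.
 */
static inline loff_t cookie_encode_demo(__u16 cid, loff_t pos)
{
	return (loff_t)(((__u64)cid) << CID_SHIFT) | (pos & CID_MASK);
}

static inline loff_t cookie_decode_pos_demo(loff_t cookie)
{
	return cookie & CID_MASK;
}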
16125 +
16126 +static void free_file_fsdata_nolock(struct file *);
16127 +
16128 +/**
16129 + * insert_cursor - allocate file_fsdata, insert cursor to tree and hash table
16130 + * @cursor: cursor to insert
16131 + * @file: file to attach detachable readdir state to
16132 + * @inode: inode of the directory being read
16133 + *
16134 + * Allocates reiser4_file_fsdata, attaches it to @cursor, inserts the cursor
16135 + * into the reiser4 super block's hash table and radix tree, and adds the
16136 + * detachable readdir state to @file.
16138 + */
16139 +static int insert_cursor(dir_cursor *cursor, struct file *file,
16140 + struct inode *inode)
16141 +{
16142 + int result;
16143 + reiser4_file_fsdata *fsdata;
16144 +
16145 + memset(cursor, 0, sizeof *cursor);
16146 +
16147 +	/* this is either the first call to readdir, or a rewind. Either way,
16148 +	 * create a new cursor. */
16149 + fsdata = create_fsdata(NULL);
16150 + if (fsdata != NULL) {
16151 + result = radix_tree_preload(reiser4_ctx_gfp_mask_get());
16152 + if (result == 0) {
16153 + d_cursor_info *info;
16154 + oid_t oid;
16155 +
16156 + info = d_info(inode);
16157 + oid = get_inode_oid(inode);
16158 + /* cid occupies higher 12 bits of f->f_pos. Don't
16159 + * allow it to become negative: this confuses
16160 + * nfsd_readdir() */
16161 + cursor->key.cid = (++cid_counter) & 0x7ff;
16162 + cursor->key.oid = oid;
16163 + cursor->fsdata = fsdata;
16164 + cursor->info = info;
16165 + cursor->ref = 1;
16166 +
16167 + spin_lock_inode(inode);
16168 + /* install cursor as @f's private_data, discarding old
16169 + * one if necessary */
16170 +#if REISER4_DEBUG
16171 + if (file->private_data)
16172 + warning("", "file has fsdata already");
16173 +#endif
16174 + clean_fsdata(file);
16175 + free_file_fsdata_nolock(file);
16176 + file->private_data = fsdata;
16177 + fsdata->cursor = cursor;
16178 + spin_unlock_inode(inode);
16179 + spin_lock(&d_lock);
16180 + /* insert cursor into hash table */
16181 + d_cursor_hash_insert(&info->table, cursor);
16182 + /* and chain it into radix-tree */
16183 + bind_cursor(cursor, (unsigned long)oid);
16184 + spin_unlock(&d_lock);
16185 + radix_tree_preload_end();
16186 + file->f_pos = ((__u64) cursor->key.cid) << CID_SHIFT;
16187 + }
16188 + } else
16189 + result = RETERR(-ENOMEM);
16190 + return result;
16191 +}
16192 +
16193 +/**
16194 + * process_cursors - do action on each cursor attached to inode
16195 + * @inode:
16196 + * @act: action to do
16197 + *
16198 + * Finds all cursors of @inode in reiser4's super block radix tree of cursors
16199 + * and performs action specified by @act on each of cursors.
16200 + */
16201 +static void process_cursors(struct inode *inode, enum cursor_action act)
16202 +{
16203 + oid_t oid;
16204 + dir_cursor *start;
16205 + struct list_head *head;
16206 + reiser4_context *ctx;
16207 + d_cursor_info *info;
16208 +
16209 + /* this can be called by
16210 + *
16211 + * kswapd->...->prune_icache->..reiser4_destroy_inode
16212 + *
16213 + * without reiser4_context
16214 + */
16215 + ctx = reiser4_init_context(inode->i_sb);
16216 + if (IS_ERR(ctx)) {
16217 + warning("vs-23", "failed to init context");
16218 + return;
16219 + }
16220 +
16221 + assert("nikita-3558", inode != NULL);
16222 +
16223 + info = d_info(inode);
16224 + oid = get_inode_oid(inode);
16225 + spin_lock_inode(inode);
16226 + head = get_readdir_list(inode);
16227 + spin_lock(&d_lock);
16228 +	/* find any cursor for this oid: a reference to it is hanging off the
16229 +	 * radix tree */
16230 + start = lookup(info, (unsigned long)oid);
16231 + if (start != NULL) {
16232 + dir_cursor *scan;
16233 + reiser4_file_fsdata *fsdata;
16234 +
16235 + /* process circular list of cursors for this oid */
16236 + scan = start;
16237 + do {
16238 + dir_cursor *next;
16239 +
16240 + next = list_entry(scan->list.next, dir_cursor, list);
16241 + fsdata = scan->fsdata;
16242 + assert("nikita-3557", fsdata != NULL);
16243 + if (scan->key.oid == oid) {
16244 + switch (act) {
16245 + case CURSOR_DISPOSE:
16246 + list_del_init(&fsdata->dir.linkage);
16247 + break;
16248 + case CURSOR_LOAD:
16249 + list_add(&fsdata->dir.linkage, head);
16250 + break;
16251 + case CURSOR_KILL:
16252 + kill_cursor(scan);
16253 + break;
16254 + }
16255 + }
16256 + if (scan == next)
16257 + /* last cursor was just killed */
16258 + break;
16259 + scan = next;
16260 + } while (scan != start);
16261 + }
16262 + spin_unlock(&d_lock);
16263 + /* check that we killed 'em all */
16264 + assert("nikita-3568",
16265 + ergo(act == CURSOR_KILL,
16266 + list_empty_careful(get_readdir_list(inode))));
16267 + assert("nikita-3569",
16268 + ergo(act == CURSOR_KILL, lookup(info, oid) == NULL));
16269 + spin_unlock_inode(inode);
16270 + reiser4_exit_context(ctx);
16271 +}
16272 +
16273 +/**
16274 + * reiser4_dispose_cursors - removes cursors from inode's list
16275 + * @inode: inode to dispose cursors of
16276 + *
16277 + * For each cursor corresponding to @inode, removes the reiser4_file_fsdata
16278 + * attached to the cursor from the inode's readdir list. This is called when
16279 + * the inode is evicted from memory by memory pressure.
16280 + */
16281 +void reiser4_dispose_cursors(struct inode *inode)
16282 +{
16283 + process_cursors(inode, CURSOR_DISPOSE);
16284 +}
16285 +
16286 +/**
16287 + * reiser4_load_cursors - attach cursors to inode
16288 + * @inode: inode to load cursors to
16289 + *
16290 + * For each cursor corresponding to @inode, attaches the reiser4_file_fsdata
16291 + * of that cursor to the inode's readdir list. This is done when the inode is
16292 + * loaded into memory.
16293 + */
16294 +void reiser4_load_cursors(struct inode *inode)
16295 +{
16296 + process_cursors(inode, CURSOR_LOAD);
16297 +}
16298 +
16299 +/**
16300 + * reiser4_kill_cursors - kill all inode cursors
16301 + * @inode: inode to kill cursors of
16302 + *
16303 + * Frees all cursors for this inode. This is called when inode is destroyed.
16304 + */
16305 +void reiser4_kill_cursors(struct inode *inode)
16306 +{
16307 + process_cursors(inode, CURSOR_KILL);
16308 +}
16309 +
16310 +/**
16311 + * file_is_stateless - check whether a file descriptor is stateless
16312 + * @file: file to check
16313 + *
16314 + * Returns true if file descriptor @file was created by the NFS server on
16315 + * demand to serve one file system operation. This means that there may be
16316 + * "detached state" for the underlying inode.
16317 + */
16318 +static int file_is_stateless(struct file *file)
16319 +{
16320 + return reiser4_get_dentry_fsdata(file->f_dentry)->stateless;
16321 +}
16322 +
16323 +/**
16324 + * reiser4_get_dir_fpos - calculate ->fpos of a directory file
16325 + * @dir: directory file
16326 + *
16327 + * Calculates ->fpos from the user-supplied cookie. Normally it is dir->f_pos,
16328 + * but in the case of a stateless directory operation (readdir-over-NFS), the
16329 + * client id is encoded in the high bits of the cookie and must be masked off.
16330 + */
16331 +loff_t reiser4_get_dir_fpos(struct file *dir)
16332 +{
16333 + if (file_is_stateless(dir))
16334 + return dir->f_pos & CID_MASK;
16335 + else
16336 + return dir->f_pos;
16337 +}
16338 +
16339 +/**
16340 + * reiser4_attach_fsdata - try to attach fsdata
16341 + * @file:
16342 + * @inode:
16343 + *
16344 + * Finds or creates cursor for readdir-over-nfs.
16345 + */
16346 +int reiser4_attach_fsdata(struct file *file, struct inode *inode)
16347 +{
16348 + loff_t pos;
16349 + int result;
16350 + dir_cursor *cursor;
16351 +
16352 + /*
16353 + * we are serialized by inode->i_mutex
16354 + */
16355 + if (!file_is_stateless(file))
16356 + return 0;
16357 +
16358 + pos = file->f_pos;
16359 + result = 0;
16360 + if (pos == 0) {
16361 + /*
16362 + * first call to readdir (or rewind to the beginning of
16363 + * directory)
16364 + */
16365 + cursor = kmem_cache_alloc(d_cursor_cache,
16366 + reiser4_ctx_gfp_mask_get());
16367 + if (cursor != NULL)
16368 + result = insert_cursor(cursor, file, inode);
16369 + else
16370 + result = RETERR(-ENOMEM);
16371 + } else {
16372 + /* try to find existing cursor */
16373 + d_cursor_key key;
16374 +
16375 + key.cid = pos >> CID_SHIFT;
16376 + key.oid = get_inode_oid(inode);
16377 + spin_lock(&d_lock);
16378 + cursor = d_cursor_hash_find(&d_info(inode)->table, &key);
16379 + if (cursor != NULL) {
16380 + /* cursor was found */
16381 + if (cursor->ref == 0) {
16382 + /* move it from unused list */
16383 + list_del_init(&cursor->alist);
16384 + --d_cursor_unused;
16385 + }
16386 + ++cursor->ref;
16387 + }
16388 + spin_unlock(&d_lock);
16389 + if (cursor != NULL) {
16390 + spin_lock_inode(inode);
16391 + assert("nikita-3556", cursor->fsdata->back == NULL);
16392 + clean_fsdata(file);
16393 + free_file_fsdata_nolock(file);
16394 + file->private_data = cursor->fsdata;
16395 + spin_unlock_inode(inode);
16396 + }
16397 + }
16398 + return result;
16399 +}
16400 +
16401 +/**
16402 + * reiser4_detach_fsdata - detach fsdata from a stateless file
16403 + * @file: file to detach fsdata from
16404 + *
16405 + * Detaches fsdata, if necessary.
16406 + */
16407 +void reiser4_detach_fsdata(struct file *file)
16408 +{
16409 + struct inode *inode;
16410 +
16411 + if (!file_is_stateless(file))
16412 + return;
16413 +
16414 + inode = file->f_dentry->d_inode;
16415 + spin_lock_inode(inode);
16416 + clean_fsdata(file);
16417 + spin_unlock_inode(inode);
16418 +}
16419 +
16420 +/* slab for reiser4_dentry_fsdata */
16421 +static struct kmem_cache *dentry_fsdata_cache;
16422 +
16423 +/**
16424 + * reiser4_init_dentry_fsdata - create cache of dentry_fsdata
16425 + *
16426 + * Initializes slab cache of structures attached to dentry->d_fsdata. It is
16427 + * part of reiser4 module initialization.
16428 + */
16429 +int reiser4_init_dentry_fsdata(void)
16430 +{
16431 + dentry_fsdata_cache = kmem_cache_create("dentry_fsdata",
16432 + sizeof(reiser4_dentry_fsdata),
16433 + 0,
16434 + SLAB_HWCACHE_ALIGN |
16435 + SLAB_RECLAIM_ACCOUNT, NULL,
16436 + NULL);
16437 + if (dentry_fsdata_cache == NULL)
16438 + return RETERR(-ENOMEM);
16439 + return 0;
16440 +}
16441 +
16442 +/**
16443 + * reiser4_done_dentry_fsdata - delete cache of dentry_fsdata
16444 + *
16445 + * This is called on reiser4 module unloading or system shutdown.
16446 + */
16447 +void reiser4_done_dentry_fsdata(void)
16448 +{
16449 + destroy_reiser4_cache(&dentry_fsdata_cache);
16450 +}
16451 +
16452 +/**
16453 + * reiser4_get_dentry_fsdata - get fs-specific dentry data
16454 + * @dentry: queried dentry
16455 + *
16456 + * Allocates if necessary and returns per-dentry data that we attach to each
16457 + * dentry.
16458 + */
16459 +reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *dentry)
16460 +{
16461 + assert("nikita-1365", dentry != NULL);
16462 +
16463 + if (dentry->d_fsdata == NULL) {
16464 + dentry->d_fsdata = kmem_cache_alloc(dentry_fsdata_cache,
16465 + reiser4_ctx_gfp_mask_get());
16466 + if (dentry->d_fsdata == NULL)
16467 + return ERR_PTR(RETERR(-ENOMEM));
16468 + memset(dentry->d_fsdata, 0, sizeof(reiser4_dentry_fsdata));
16469 + }
16470 + return dentry->d_fsdata;
16471 +}
16472 +
16473 +/**
16474 + * reiser4_free_dentry_fsdata - detach and free dentry_fsdata
16475 + * @dentry: dentry to free fsdata of
16476 + *
16477 + * Detaches and frees fs-specific dentry data
16478 + */
16479 +void reiser4_free_dentry_fsdata(struct dentry *dentry)
16480 +{
16481 + if (dentry->d_fsdata != NULL) {
16482 + kmem_cache_free(dentry_fsdata_cache, dentry->d_fsdata);
16483 + dentry->d_fsdata = NULL;
16484 + }
16485 +}
16486 +
16487 +/* slab for reiser4_file_fsdata */
16488 +static struct kmem_cache *file_fsdata_cache;
16489 +
16490 +/**
16491 + * reiser4_init_file_fsdata - create cache of reiser4_file_fsdata
16492 + *
16493 + * Initializes slab cache of structures attached to file->private_data. It is
16494 + * part of reiser4 module initialization.
16495 + */
16496 +int reiser4_init_file_fsdata(void)
16497 +{
16498 + file_fsdata_cache = kmem_cache_create("file_fsdata",
16499 + sizeof(reiser4_file_fsdata),
16500 + 0,
16501 + SLAB_HWCACHE_ALIGN |
16502 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
16503 + if (file_fsdata_cache == NULL)
16504 + return RETERR(-ENOMEM);
16505 + return 0;
16506 +}
16507 +
16508 +/**
16509 + * reiser4_done_file_fsdata - delete cache of reiser4_file_fsdata
16510 + *
16511 + * This is called on reiser4 module unloading or system shutdown.
16512 + */
16513 +void reiser4_done_file_fsdata(void)
16514 +{
16515 + destroy_reiser4_cache(&file_fsdata_cache);
16516 +}
16517 +
16518 +/**
16519 + * create_fsdata - allocate and initialize reiser4_file_fsdata
16520 + * @file: what to create file_fsdata for, may be NULL
16521 + *
16522 + * Allocates and initializes reiser4_file_fsdata structure.
16523 + */
16524 +static reiser4_file_fsdata *create_fsdata(struct file *file)
16525 +{
16526 + reiser4_file_fsdata *fsdata;
16527 +
16528 + fsdata = kmem_cache_alloc(file_fsdata_cache,
16529 + reiser4_ctx_gfp_mask_get());
16530 + if (fsdata != NULL) {
16531 + memset(fsdata, 0, sizeof *fsdata);
16532 + fsdata->ra1.max_window_size = VM_MAX_READAHEAD * 1024;
16533 + fsdata->back = file;
16534 + INIT_LIST_HEAD(&fsdata->dir.linkage);
16535 + }
16536 + return fsdata;
16537 +}
16538 +
16539 +/**
16540 + * free_fsdata - free reiser4_file_fsdata
16541 + * @fsdata: object to free
16542 + *
16543 + * Dual to create_fsdata(). Free reiser4_file_fsdata.
16544 + */
16545 +static void free_fsdata(reiser4_file_fsdata *fsdata)
16546 +{
16547 + BUG_ON(fsdata == NULL);
16548 + kmem_cache_free(file_fsdata_cache, fsdata);
16549 +}
16550 +
16551 +/**
16552 + * reiser4_get_file_fsdata - get fs-specific file data
16553 + * @file: queried file
16554 + *
16555 + * Returns fs-specific data of @file. If it is NULL, allocates it and attaches
16556 + * to @file.
16557 + */
16558 +reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *file)
16559 +{
16560 + assert("nikita-1603", file != NULL);
16561 +
16562 + if (file->private_data == NULL) {
16563 + reiser4_file_fsdata *fsdata;
16564 + struct inode *inode;
16565 +
16566 + fsdata = create_fsdata(file);
16567 + if (fsdata == NULL)
16568 + return ERR_PTR(RETERR(-ENOMEM));
16569 +
16570 + inode = file->f_dentry->d_inode;
16571 + spin_lock_inode(inode);
16572 + if (file->private_data == NULL) {
16573 + file->private_data = fsdata;
16574 + fsdata = NULL;
16575 + }
16576 + spin_unlock_inode(inode);
16577 + if (fsdata != NULL)
16578 + /* other thread initialized ->fsdata */
16579 + kmem_cache_free(file_fsdata_cache, fsdata);
16580 + }
16581 + assert("nikita-2665", file->private_data != NULL);
16582 + return file->private_data;
16583 +}
16584 +
16585 +/**
16586 + * free_file_fsdata_nolock - detach and free reiser4_file_fsdata
16587 + * @file:
16588 + *
16589 + * Detaches reiser4_file_fsdata from @file, removes reiser4_file_fsdata from
16590 + * the readdir list, and frees it if it is not linked to a d_cursor object.
16591 + */
16592 +static void free_file_fsdata_nolock(struct file *file)
16593 +{
16594 + reiser4_file_fsdata *fsdata;
16595 +
16596 + assert("", spin_inode_is_locked(file->f_dentry->d_inode));
16597 + fsdata = file->private_data;
16598 + if (fsdata != NULL) {
16599 + list_del_init(&fsdata->dir.linkage);
16600 + if (fsdata->cursor == NULL)
16601 + free_fsdata(fsdata);
16602 + }
16603 + file->private_data = NULL;
16604 +}
16605 +
16606 +/**
16607 + * reiser4_free_file_fsdata - detach from struct file and free reiser4_file_fsdata
16608 + * @file:
16609 + *
16610 + * Spinlocks inode and calls free_file_fsdata_nolock to do the work.
16611 + */
16612 +void reiser4_free_file_fsdata(struct file *file)
16613 +{
16614 + spin_lock_inode(file->f_dentry->d_inode);
16615 + free_file_fsdata_nolock(file);
16616 + spin_unlock_inode(file->f_dentry->d_inode);
16617 +}
16618 +
16619 +/*
16620 + * Local variables:
16621 + * c-indentation-style: "K&R"
16622 + * mode-name: "LC"
16623 + * c-basic-offset: 8
16624 + * tab-width: 8
16625 + * fill-column: 79
16626 + * End:
16627 + */
16628 diff --git a/fs/reiser4/fsdata.h b/fs/reiser4/fsdata.h
16629 new file mode 100644
16630 index 0000000..49e8ebf
16631 --- /dev/null
16632 +++ b/fs/reiser4/fsdata.h
16633 @@ -0,0 +1,207 @@
16634 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
16635 + * reiser4/README */
16636 +
16637 +#if !defined( __REISER4_FSDATA_H__ )
16638 +#define __REISER4_FSDATA_H__
16639 +
16640 +#include "debug.h"
16641 +#include "kassign.h"
16642 +#include "seal.h"
16643 +#include "type_safe_hash.h"
16644 +#include "plugin/file/file.h"
16645 +#include "readahead.h"
16646 +
16647 +/*
16648 + * comment about reiser4_dentry_fsdata
16649 + *
16650 + *
16651 + */
16652 +
16653 +/*
16654 + * locking: the per-file-descriptor readdir_pos fields and ->f_pos are
16655 + * protected by ->i_mutex on the inode. Under this lock the following
16656 + * invariant holds:
16657 + *
16658 + * the file descriptor is "looking" at the entry_no-th directory entry from
16659 + * the beginning of the directory. This entry has key dir_entry_key and is
16660 + * the pos-th entry in its duplicate-key sequence.
16661 + *
16662 + */
16663 +
16664 +/* logical position within directory */
16665 +typedef struct {
16666 + /* key of directory entry (actually, part of a key sufficient to
16667 + identify directory entry) */
16668 + de_id dir_entry_key;
16669 + /* ordinal number of directory entry among all entries with the same
16670 + key. (Starting from 0.) */
16671 + unsigned pos;
16672 +} dir_pos;
16673 +
16674 +typedef struct {
16675 + /* f_pos corresponding to this readdir position */
16676 + __u64 fpos;
16677 + /* logical position within directory */
16678 + dir_pos position;
16679 + /* logical number of directory entry within
16680 + directory */
16681 + __u64 entry_no;
16682 +} readdir_pos;
16683 +
16684 +/*
16685 + * this is used to speed up lookups for directory entry: on initial call to
16686 + * ->lookup() seal and coord of directory entry (if found, that is) are stored
16687 + * in struct dentry and reused later to avoid tree traversals.
16688 + */
16689 +typedef struct de_location {
16690 + /* seal covering directory entry */
16691 + seal_t entry_seal;
16692 + /* coord of directory entry */
16693 + coord_t entry_coord;
16694 + /* ordinal number of directory entry among all entries with the same
16695 + key. (Starting from 0.) */
16696 + int pos;
16697 +} de_location;
16698 +
16699 +/**
16700 + * reiser4_dentry_fsdata - reiser4-specific data attached to dentries
16701 + *
16702 + * This is allocated dynamically and released in d_op->d_release()
16703 + *
16704 + * Currently it only contains the cached location (hint) of a directory entry,
16705 + * but it is expected that other information will be accumulated here.
16706 + */
16707 +typedef struct reiser4_dentry_fsdata {
16708 + /*
16709 +	 * here will go fields filled by ->lookup() to speed up the next
16710 +	 * create/unlink, like the blocknr of the znode with the stat-data, or
16711 +	 * the key of the stat-data.
16712 + */
16713 + de_location dec;
16714 + int stateless; /* created through reiser4_decode_fh, needs special
16715 + * treatment in readdir. */
16716 +} reiser4_dentry_fsdata;
16717 +
16718 +extern int reiser4_init_dentry_fsdata(void);
16719 +extern void reiser4_done_dentry_fsdata(void);
16720 +extern reiser4_dentry_fsdata *reiser4_get_dentry_fsdata(struct dentry *);
16721 +extern void reiser4_free_dentry_fsdata(struct dentry *dentry);
16722 +
16723 +/**
16724 + * reiser4_file_fsdata - reiser4-specific data attached to file->private_data
16725 + *
16726 + * This is allocated dynamically and released in inode->i_fop->release
16727 + */
16728 +typedef struct reiser4_file_fsdata {
16729 + /*
16730 + * pointer back to the struct file which this reiser4_file_fsdata is
16731 + * part of
16732 + */
16733 + struct file *back;
16734 + /* detached cursor for stateless readdir. */
16735 + struct dir_cursor *cursor;
16736 + /*
16737 + * We need both directory and regular file parts here, because there
16738 +	 * are file system objects that are both files and directories.
16739 + */
16740 + struct {
16741 + /*
16742 + * position in directory. It is updated each time directory is
16743 + * modified
16744 + */
16745 + readdir_pos readdir;
16746 + /* head of this list is reiser4_inode->lists.readdir_list */
16747 + struct list_head linkage;
16748 + } dir;
16749 + /* hints to speed up operations with regular files: read and write. */
16750 + struct {
16751 + hint_t hint;
16752 + } reg;
16753 + struct reiser4_file_ra_state ra1;
16754 +
16755 +} reiser4_file_fsdata;
16756 +
16757 +extern int reiser4_init_file_fsdata(void);
16758 +extern void reiser4_done_file_fsdata(void);
16759 +extern reiser4_file_fsdata *reiser4_get_file_fsdata(struct file *);
16760 +extern void reiser4_free_file_fsdata(struct file *);
16761 +
16762 +/*
16763 + * d_cursor is reiser4_file_fsdata not attached to struct file. d_cursors are
16764 + * used to address a problem reiser4 has with readdir accesses via NFS. See
16765 + * plugin/file_ops_readdir.c for more details.
16766 + */
16767 +typedef struct {
16768 + __u16 cid;
16769 + __u64 oid;
16770 +} d_cursor_key;
16771 +
16772 +/*
16773 + * define structures d_cursor_hash_table and d_cursor_hash_link which are used to
16774 + * maintain hash table of dir_cursor-s in reiser4's super block
16775 + */
16776 +typedef struct dir_cursor dir_cursor;
16777 +TYPE_SAFE_HASH_DECLARE(d_cursor, dir_cursor);
16778 +
16779 +typedef struct d_cursor_info d_cursor_info;
16780 +
16781 +struct dir_cursor {
16782 + int ref;
16783 + reiser4_file_fsdata *fsdata;
16784 +
16785 + /* link to reiser4 super block hash table of cursors */
16786 + d_cursor_hash_link hash;
16787 +
16788 + /*
16789 + * this is to link cursors to reiser4 super block's radix tree of
16790 + * cursors if there are more than one cursor of the same objectid
16791 + */
16792 + struct list_head list;
16793 + d_cursor_key key;
16794 + d_cursor_info *info;
16795 + /* list of unused cursors */
16796 + struct list_head alist;
16797 +};
16798 +
16799 +extern int reiser4_init_d_cursor(void);
16800 +extern void reiser4_done_d_cursor(void);
16801 +
16802 +extern int reiser4_init_super_d_info(struct super_block *);
16803 +extern void reiser4_done_super_d_info(struct super_block *);
16804 +
16805 +extern loff_t reiser4_get_dir_fpos(struct file *);
16806 +extern int reiser4_attach_fsdata(struct file *, struct inode *);
16807 +extern void reiser4_detach_fsdata(struct file *);
16808 +
16809 +/* these are needed for "stateless" readdir. See plugin/file_ops_readdir.c for
16810 + more details */
16811 +void reiser4_dispose_cursors(struct inode *inode);
16812 +void reiser4_load_cursors(struct inode *inode);
16813 +void reiser4_kill_cursors(struct inode *inode);
16814 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
16815 + int offset, int adj);
16816 +
16817 +/*
16818 + * this structure is embedded into reiser4_super_info_data. It maintains d_cursors
16819 + * (detached readdir state). See plugin/file_ops_readdir.c for more details.
16820 + */
16821 +struct d_cursor_info {
16822 + d_cursor_hash_table table;
16823 + struct radix_tree_root tree;
16824 +};
16825 +
16826 +/* spinlock protecting readdir cursors */
16827 +extern spinlock_t d_lock;
16828 +
16829 +/* __REISER4_FSDATA_H__ */
16830 +#endif
16831 +
16832 +/*
16833 + * Local variables:
16834 + * c-indentation-style: "K&R"
16835 + * mode-name: "LC"
16836 + * c-basic-offset: 8
16837 + * tab-width: 8
16838 + * fill-column: 120
16839 + * End:
16840 + */
16841 diff --git a/fs/reiser4/init_super.c b/fs/reiser4/init_super.c
16842 new file mode 100644
16843 index 0000000..3513d5f
16844 --- /dev/null
16845 +++ b/fs/reiser4/init_super.c
16846 @@ -0,0 +1,750 @@
16847 +/* Copyright by Hans Reiser, 2003 */
16848 +
16849 +#include "super.h"
16850 +#include "inode.h"
16851 +#include "plugin/plugin_set.h"
16852 +
16853 +#include <linux/swap.h>
16854 +
16855 +/**
16856 + * reiser4_init_fs_info - allocate reiser4-specific super block info
16857 + * @super: super block of filesystem
16858 + *
16859 + * Allocates and initializes reiser4_super_info_data, attaches it to
16860 + * super->s_fs_info, and initializes structures maintaining d_cursor-s.
16861 + */
16862 +int reiser4_init_fs_info(struct super_block *super)
16863 +{
16864 + reiser4_super_info_data *sbinfo;
16865 +
16866 + sbinfo = kmalloc(sizeof(reiser4_super_info_data),
16867 + reiser4_ctx_gfp_mask_get());
16868 + if (!sbinfo)
16869 + return RETERR(-ENOMEM);
16870 +
16871 + super->s_fs_info = sbinfo;
16872 + super->s_op = NULL;
16873 + memset(sbinfo, 0, sizeof(*sbinfo));
16874 +
16875 + ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
16876 + ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
16877 +
16878 + mutex_init(&sbinfo->delete_mutex);
16879 + spin_lock_init(&(sbinfo->guard));
16880 +
16881 + /* initialize per-super-block d_cursor resources */
16882 + reiser4_init_super_d_info(super);
16883 +
16884 + return 0;
16885 +}
16886 +
16887 +/**
16888 + * reiser4_done_fs_info - free reiser4 specific super block
16889 + * @super: super block of filesystem
16890 + *
16891 + * Performs some sanity checks, releases structures maintaining d_cursor-s,
16892 + * frees reiser4_super_info_data.
16893 + */
16894 +void reiser4_done_fs_info(struct super_block *super)
16895 +{
16896 + assert("zam-990", super->s_fs_info != NULL);
16897 +
16898 + /* release per-super-block d_cursor resources */
16899 + reiser4_done_super_d_info(super);
16900 +
16901 +	/* make sure that there are no jnodes left */
16902 + assert("", list_empty(&get_super_private(super)->all_jnodes));
16903 + assert("", get_current_context()->trans->atom == NULL);
16904 + reiser4_check_block_counters(super);
16905 + kfree(super->s_fs_info);
16906 + super->s_fs_info = NULL;
16907 +}
16908 +
16909 +/* type of option parseable by parse_option() */
16910 +typedef enum {
16911 + /* value of option is arbitrary string */
16912 + OPT_STRING,
16913 +
16914 + /*
16915 + * option specifies bit in a bitmask. When option is set - bit in
16916 + * sbinfo->fs_flags is set. Examples are bsdgroups, 32bittimes, mtflush,
16917 + * dont_load_bitmap, atomic_write.
16918 + */
16919 + OPT_BIT,
16920 +
16921 + /*
16922 +	 * value of option should conform to an sscanf() format. Examples are
16923 + * tmgr.atom_max_size=N, tmgr.atom_max_age=N
16924 + */
16925 + OPT_FORMAT,
16926 +
16927 + /*
16928 + * option can take one of predefined values. Example is onerror=panic or
16929 + * onerror=remount-ro
16930 + */
16931 + OPT_ONEOF,
16932 +} opt_type_t;
16933 +
16934 +typedef struct opt_bitmask_bit {
16935 + const char *bit_name;
16936 + int bit_nr;
16937 +} opt_bitmask_bit;
16938 +
16939 +/* description of option parseable by parse_option() */
16940 +typedef struct opt_desc {
16941 + /* option name.
16942 +
16943 + parsed portion of string has a form "name=value".
16944 + */
16945 + const char *name;
16946 + /* type of option */
16947 + opt_type_t type;
16948 + union {
16949 + /* where to store value of string option (type == OPT_STRING) */
16950 + char **string;
16951 + /* description of bits for bit option (type == OPT_BIT) */
16952 + struct {
16953 + int nr;
16954 + void *addr;
16955 + } bit;
16956 + /* description of format and targets for format option (type
16957 + == OPT_FORMAT) */
16958 + struct {
16959 + const char *format;
16960 + int nr_args;
16961 + void *arg1;
16962 + void *arg2;
16963 + void *arg3;
16964 + void *arg4;
16965 + } f;
16966 + struct {
16967 + int *result;
16968 + const char *list[10];
16969 + } oneof;
16970 + struct {
16971 + void *addr;
16972 + int nr_bits;
16973 + opt_bitmask_bit *bits;
16974 + } bitmask;
16975 + } u;
16976 +} opt_desc_t;
16977 +
16978 +/**
16979 + * parse_option - parse one option
16980 + * @opt_string: starting point of parsing
16981 + * @opt: option description
16982 + *
16983 + * foo=bar,
16984 + * ^ ^ ^
16985 + * | | +-- replaced to '\0'
16986 + * | +-- val_start
16987 + * +-- opt_string
16988 + * Figures out option type and handles option correspondingly.
16989 + */
16990 +static int parse_option(char *opt_string, opt_desc_t *opt)
16991 +{
16992 + char *val_start;
16993 + int result;
16994 + const char *err_msg;
16995 +
16996 + /* NOTE-NIKITA think about using lib/cmdline.c functions here. */
16997 +
16998 + val_start = strchr(opt_string, '=');
16999 + if (val_start != NULL) {
17000 + *val_start = '\0';
17001 + ++val_start;
17002 + }
17003 +
17004 + err_msg = NULL;
17005 + result = 0;
17006 + switch (opt->type) {
17007 + case OPT_STRING:
17008 + if (val_start == NULL) {
17009 + err_msg = "String arg missing";
17010 + result = RETERR(-EINVAL);
17011 + } else
17012 + *opt->u.string = val_start;
17013 + break;
17014 + case OPT_BIT:
17015 + if (val_start != NULL)
17016 + err_msg = "Value ignored";
17017 + else
17018 + set_bit(opt->u.bit.nr, opt->u.bit.addr);
17019 + break;
17020 + case OPT_FORMAT:
17021 + if (val_start == NULL) {
17022 + err_msg = "Formatted arg missing";
17023 + result = RETERR(-EINVAL);
17024 + break;
17025 + }
17026 + if (sscanf(val_start, opt->u.f.format,
17027 + opt->u.f.arg1, opt->u.f.arg2, opt->u.f.arg3,
17028 + opt->u.f.arg4) != opt->u.f.nr_args) {
17029 + err_msg = "Wrong conversion";
17030 + result = RETERR(-EINVAL);
17031 + }
17032 + break;
17033 + case OPT_ONEOF:
17034 + {
17035 + int i = 0;
17036 +
17037 + if (val_start == NULL) {
17038 + err_msg = "Value is missing";
17039 + result = RETERR(-EINVAL);
17040 + break;
17041 + }
17042 + err_msg = "Wrong option value";
17043 + result = RETERR(-EINVAL);
17044 + while (opt->u.oneof.list[i]) {
17045 + if (!strcmp(opt->u.oneof.list[i], val_start)) {
17046 + result = 0;
17047 + err_msg = NULL;
17048 + *opt->u.oneof.result = i;
17049 + break;
17050 + }
17051 + i++;
17052 + }
17053 + break;
17054 + }
17055 + default:
17056 + wrong_return_value("nikita-2100", "opt -> type");
17057 + break;
17058 + }
17059 + if (err_msg != NULL) {
17060 + warning("nikita-2496", "%s when parsing option \"%s%s%s\"",
17061 + err_msg, opt->name, val_start ? "=" : "",
17062 + val_start ? : "");
17063 + }
17064 + return result;
17065 +}
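/*
 * Illustrative sketch (not part of the original patch): parsing a single
 * formatted option. parse_option() splits "name=value" at '=' (modifying the
 * string in place, so the buffer must be writable) and, for OPT_FORMAT,
 * feeds the value to sscanf(). The function and variable names below are
 * hypothetical.
 */
static int parse_atom_max_size_demo(char *opt /* e.g. "tmgr.atom_max_size=2048" */)
{
	unsigned int atom_max_size;
	opt_desc_t desc = {
		.name = "tmgr.atom_max_size",
		.type = OPT_FORMAT,
		.u = {
			.f = {
				.format = "%u",
				.nr_args = 1,
				.arg1 = &atom_max_size,
				.arg2 = NULL,
				.arg3 = NULL,
				.arg4 = NULL
			}
		}
	};
	/* on success atom_max_size == 2048 and 0 is returned */
	return parse_option(opt, &desc);
}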
17066 +
17067 +/**
17068 + * parse_options - parse reiser4 mount options
17069 + * @opt_string: starting point
17070 + * @opts: array of option description
17071 + * @nr_opts: number of elements in @opts
17072 + *
17073 + * Parses comma separated list of reiser4 mount options.
17074 + */
17075 +static int parse_options(char *opt_string, opt_desc_t *opts, int nr_opts)
17076 +{
17077 + int result;
17078 +
17079 + result = 0;
17080 + while ((result == 0) && opt_string && *opt_string) {
17081 + int j;
17082 + char *next;
17083 +
17084 + next = strchr(opt_string, ',');
17085 + if (next != NULL) {
17086 + *next = '\0';
17087 + ++next;
17088 + }
17089 + for (j = 0; j < nr_opts; ++j) {
17090 + if (!strncmp(opt_string, opts[j].name,
17091 + strlen(opts[j].name))) {
17092 + result = parse_option(opt_string, &opts[j]);
17093 + break;
17094 + }
17095 + }
17096 + if (j == nr_opts) {
17097 + warning("nikita-2307", "Unrecognized option: \"%s\"",
17098 + opt_string);
17099 + /* traditionally, -EINVAL is returned on wrong mount
17100 + option */
17101 + result = RETERR(-EINVAL);
17102 + }
17103 + opt_string = next;
17104 + }
17105 + return result;
17106 +}
17107 +
17108 +#define NUM_OPT( label, fmt, addr ) \
17109 + { \
17110 + .name = ( label ), \
17111 + .type = OPT_FORMAT, \
17112 + .u = { \
17113 + .f = { \
17114 + .format = ( fmt ), \
17115 + .nr_args = 1, \
17116 + .arg1 = ( addr ), \
17117 + .arg2 = NULL, \
17118 + .arg3 = NULL, \
17119 + .arg4 = NULL \
17120 + } \
17121 + } \
17122 + }
17123 +
17124 +#define SB_FIELD_OPT( field, fmt ) NUM_OPT( #field, fmt, &sbinfo -> field )
17125 +
17126 +#define BIT_OPT(label, bitnr) \
17127 + { \
17128 + .name = label, \
17129 + .type = OPT_BIT, \
17130 + .u = { \
17131 + .bit = { \
17132 + .nr = bitnr, \
17133 + .addr = &sbinfo->fs_flags \
17134 + } \
17135 + } \
17136 + }
17137 +
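/*
 * Illustrative sketch (not part of the original patch): BIT_OPT() (like
 * SB_FIELD_OPT()) relies on a variable named sbinfo being in scope; in
 * reiser4_init_super_data() below that is the local super block info
 * pointer. This hypothetical function shows the equivalent expansion.
 */
static inline void push_bit_opt_demo(reiser4_super_info_data *sbinfo,
				     opt_desc_t *slot)
{
	/*
	 * equivalent to:
	 *	slot->name = "bsdgroups";
	 *	slot->type = OPT_BIT;
	 *	slot->u.bit.nr = REISER4_BSD_GID;
	 *	slot->u.bit.addr = &sbinfo->fs_flags;
	 */
	*slot = (opt_desc_t) BIT_OPT("bsdgroups", REISER4_BSD_GID);
}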
17138 +#define MAX_NR_OPTIONS (30)
17139 +
17140 +/**
17141 + * reiser4_init_super_data - initialize reiser4 private super block
17142 + * @super: super block to initialize
17143 + * @opt_string: list of reiser4 mount options
17144 + *
17145 + * Sets various reiser4 parameters to default values. Parses mount options and
17146 + * overwrites default settings.
17147 + */
17148 +int reiser4_init_super_data(struct super_block *super, char *opt_string)
17149 +{
17150 + int result;
17151 + opt_desc_t *opts, *p;
17152 + reiser4_super_info_data *sbinfo = get_super_private(super);
17153 +
17154 + /* initialize super, export, dentry operations */
17155 + sbinfo->ops.super = reiser4_super_operations;
17156 + sbinfo->ops.export = reiser4_export_operations;
17157 + sbinfo->ops.dentry = reiser4_dentry_operations;
17158 + super->s_op = &sbinfo->ops.super;
17159 + super->s_export_op = &sbinfo->ops.export;
17160 +
17161 + /* initialize transaction manager parameters to default values */
17162 + sbinfo->tmgr.atom_max_size = totalram_pages / 4;
17163 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE / HZ;
17164 + sbinfo->tmgr.atom_min_size = 256;
17165 + sbinfo->tmgr.atom_max_flushers = ATOM_MAX_FLUSHERS;
17166 +
17167 + /* initialize cbk cache parameter */
17168 + sbinfo->tree.cbk_cache.nr_slots = CBK_CACHE_SLOTS;
17169 +
17170 + /* initialize flush parameters */
17171 + sbinfo->flush.relocate_threshold = FLUSH_RELOCATE_THRESHOLD;
17172 + sbinfo->flush.relocate_distance = FLUSH_RELOCATE_DISTANCE;
17173 + sbinfo->flush.written_threshold = FLUSH_WRITTEN_THRESHOLD;
17174 + sbinfo->flush.scan_maxnodes = FLUSH_SCAN_MAXNODES;
17175 +
17176 + sbinfo->optimal_io_size = REISER4_OPTIMAL_IO_SIZE;
17177 +
17178 + /* preliminary tree initializations */
17179 + sbinfo->tree.super = super;
17180 + sbinfo->tree.carry.new_node_flags = REISER4_NEW_NODE_FLAGS;
17181 + sbinfo->tree.carry.new_extent_flags = REISER4_NEW_EXTENT_FLAGS;
17182 + sbinfo->tree.carry.paste_flags = REISER4_PASTE_FLAGS;
17183 + sbinfo->tree.carry.insert_flags = REISER4_INSERT_FLAGS;
17184 + rwlock_init(&(sbinfo->tree.tree_lock));
17185 + spin_lock_init(&(sbinfo->tree.epoch_lock));
17186 +
17187 + /* initialize default readahead params */
17188 + sbinfo->ra_params.max = num_physpages / 4;
17189 + sbinfo->ra_params.flags = 0;
17190 +
17191 + /* allocate memory for structure describing reiser4 mount options */
17192 + opts = kmalloc(sizeof(opt_desc_t) * MAX_NR_OPTIONS,
17193 + reiser4_ctx_gfp_mask_get());
17194 + if (opts == NULL)
17195 + return RETERR(-ENOMEM);
17196 +
17197 + /* initialize structure describing reiser4 mount options */
17198 + p = opts;
17199 +
17200 +#if REISER4_DEBUG
17201 +# define OPT_ARRAY_CHECK if ((p) > (opts) + MAX_NR_OPTIONS) { \
17202 + warning ("zam-1046", "opt array is overloaded"); break; \
17203 + }
17204 +#else
17205 +# define OPT_ARRAY_CHECK noop
17206 +#endif
17207 +
17208 +#define PUSH_OPT(...) \
17209 +do { \
17210 + opt_desc_t o = __VA_ARGS__; \
17211 + OPT_ARRAY_CHECK; \
17212 + *p ++ = o; \
17213 +} while (0)
17214 +
17215 +#define PUSH_SB_FIELD_OPT(field, format) PUSH_OPT(SB_FIELD_OPT(field, format))
17216 +#define PUSH_BIT_OPT(name, bit) PUSH_OPT(BIT_OPT(name, bit))
17217 +
17218 + /*
17219 + * tmgr.atom_max_size=N
17220 + * Atoms containing more than N blocks will be forced to commit. N is
17221 + * decimal.
17222 + */
17223 + PUSH_SB_FIELD_OPT(tmgr.atom_max_size, "%u");
17224 + /*
17225 + * tmgr.atom_max_age=N
17226 + * Atoms older than N seconds will be forced to commit. N is decimal.
17227 + */
17228 + PUSH_SB_FIELD_OPT(tmgr.atom_max_age, "%u");
17229 + /*
17230 + * tmgr.atom_min_size=N
17231 +	 * When committing an atom to free dirty pages, force an atom smaller
17232 +	 * than N blocks to fuse with another one.
17233 + */
17234 + PUSH_SB_FIELD_OPT(tmgr.atom_min_size, "%u");
17235 + /*
17236 + * tmgr.atom_max_flushers=N
17237 + * limit of concurrent flushers for one atom. 0 means no limit.
17238 + */
17239 + PUSH_SB_FIELD_OPT(tmgr.atom_max_flushers, "%u");
17240 + /*
17241 + * tree.cbk_cache_slots=N
17242 + * Number of slots in the cbk cache.
17243 + */
17244 + PUSH_SB_FIELD_OPT(tree.cbk_cache.nr_slots, "%u");
17245 + /*
17246 + * If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty
17247 + * leaf-level blocks it will force them to be relocated.
17248 + */
17249 + PUSH_SB_FIELD_OPT(flush.relocate_threshold, "%u");
17250 + /*
17251 +	 * If flush can find a block allocation within at most
17252 +	 * FLUSH_RELOCATE_DISTANCE of the preceder, it will relocate to that
17253 +	 * position.
17254 + */
17255 + PUSH_SB_FIELD_OPT(flush.relocate_distance, "%u");
17256 + /*
17257 +	 * If we have written this many or more blocks before encountering a
17258 +	 * busy jnode in the flush list, abort flushing in the hope that next
17259 +	 * time we are called this jnode will already be clean, and we will
17260 +	 * save some seeks.
17261 + */
17262 + PUSH_SB_FIELD_OPT(flush.written_threshold, "%u");
17263 + /* The maximum number of nodes to scan left on a level during flush. */
17264 + PUSH_SB_FIELD_OPT(flush.scan_maxnodes, "%u");
17265 + /* preferred IO size */
17266 + PUSH_SB_FIELD_OPT(optimal_io_size, "%u");
17267 + /* carry flags used for insertion of new nodes */
17268 + PUSH_SB_FIELD_OPT(tree.carry.new_node_flags, "%u");
17269 + /* carry flags used for insertion of new extents */
17270 + PUSH_SB_FIELD_OPT(tree.carry.new_extent_flags, "%u");
17271 + /* carry flags used for paste operations */
17272 + PUSH_SB_FIELD_OPT(tree.carry.paste_flags, "%u");
17273 + /* carry flags used for insert operations */
17274 + PUSH_SB_FIELD_OPT(tree.carry.insert_flags, "%u");
17275 +
17276 +#ifdef CONFIG_REISER4_BADBLOCKS
17277 + /*
17278 +	 * Alternative master superblock location in case its original
17279 +	 * location is not writable/accessible. This is an offset in BYTES.
17280 + */
17281 + PUSH_SB_FIELD_OPT(altsuper, "%lu");
17282 +#endif
17283 +
17284 + /* turn on BSD-style gid assignment */
17285 + PUSH_BIT_OPT("bsdgroups", REISER4_BSD_GID);
17286 + /* turn on 32 bit times */
17287 + PUSH_BIT_OPT("32bittimes", REISER4_32_BIT_TIMES);
17288 + /*
17289 +	 * Don't load all bitmap blocks at mount time; this is useful for
17290 + * machines with tiny RAM and large disks.
17291 + */
17292 + PUSH_BIT_OPT("dont_load_bitmap", REISER4_DONT_LOAD_BITMAP);
17293 + /* disable transaction commits during write() */
17294 + PUSH_BIT_OPT("atomic_write", REISER4_ATOMIC_WRITE);
17295 + /* disable use of write barriers in the reiser4 log writer. */
17296 + PUSH_BIT_OPT("no_write_barrier", REISER4_NO_WRITE_BARRIER);
17297 +
17298 + PUSH_OPT(
17299 + {
17300 + /*
17301 + * tree traversal readahead parameters:
17302 + * -o readahead:MAXNUM:FLAGS
17303 +	 * MAXNUM - max number of nodes to request readahead for: -1UL
17304 + * will set it to max_sane_readahead()
17305 + * FLAGS - combination of bits: RA_ADJCENT_ONLY, RA_ALL_LEVELS,
17306 + * CONTINUE_ON_PRESENT
17307 + */
17308 + .name = "readahead",
17309 + .type = OPT_FORMAT,
17310 + .u = {
17311 + .f = {
17312 + .format = "%u:%u",
17313 + .nr_args = 2,
17314 + .arg1 = &sbinfo->ra_params.max,
17315 + .arg2 = &sbinfo->ra_params.flags,
17316 + .arg3 = NULL,
17317 + .arg4 = NULL
17318 + }
17319 + }
17320 + }
17321 + );
17322 +
17323 + /* What to do in case of fs error */
17324 + PUSH_OPT(
17325 + {
17326 + .name = "onerror",
17327 + .type = OPT_ONEOF,
17328 + .u = {
17329 + .oneof = {
17330 + .result = &sbinfo->onerror,
17331 + .list = {
17332 + "panic", "remount-ro", NULL
17333 + },
17334 + }
17335 + }
17336 + }
17337 + );
17338 +
17339 + /* modify default settings to values set by mount options */
17340 + result = parse_options(opt_string, opts, p - opts);
17341 + kfree(opts);
17342 + if (result != 0)
17343 + return result;
17344 +
17345 + /* correct settings to sanity values */
17346 + sbinfo->tmgr.atom_max_age *= HZ;
17347 + if (sbinfo->tmgr.atom_max_age <= 0)
17348 + /* overflow */
17349 + sbinfo->tmgr.atom_max_age = REISER4_ATOM_MAX_AGE;
17350 +
17351 +	/* round optimal_io_size down to a multiple of 512 bytes */
17352 + sbinfo->optimal_io_size >>= VFS_BLKSIZE_BITS;
17353 + sbinfo->optimal_io_size <<= VFS_BLKSIZE_BITS;
17354 + if (sbinfo->optimal_io_size == 0) {
17355 + warning("nikita-2497", "optimal_io_size is too small");
17356 + return RETERR(-EINVAL);
17357 + }
17358 + return result;
17359 +}
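/*
 * Worked arithmetic (not part of the original patch), assuming
 * VFS_BLKSIZE_BITS == 9 (512-byte VFS blocks): the shift pair above clears
 * the low bits, i.e. rounds optimal_io_size down to a multiple of 512:
 *
 *	1500 >> 9 == 2,  2 << 9 == 1024   (1500 -> 1024)
 *	 511 >> 9 == 0,  0 << 9 ==    0   (0 is rejected with -EINVAL)
 */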
17360 +
17361 +/**
17362 + * reiser4_init_read_super - read reiser4 master super block
17363 + * @super: super block to fill
17364 + * @silent: if 0 - print warnings
17365 + *
17366 + * Reads reiser4 master super block either from predefined location or from
17367 + * location specified by altsuper mount option, initializes disk format plugin.
17368 + */
17369 +int reiser4_init_read_super(struct super_block *super, int silent)
17370 +{
17371 + struct buffer_head *super_bh;
17372 + struct reiser4_master_sb *master_sb;
17373 + reiser4_super_info_data *sbinfo = get_super_private(super);
17374 + unsigned long blocksize;
17375 +
17376 + read_super_block:
17377 +#ifdef CONFIG_REISER4_BADBLOCKS
17378 + if (sbinfo->altsuper)
17379 + /*
17380 + * read reiser4 master super block at position specified by
17381 + * mount option
17382 + */
17383 + super_bh = sb_bread(super,
17384 + (sector_t)(sbinfo->altsuper / super->s_blocksize));
17385 + else
17386 +#endif
17387 +	/* read reiser4 master super block at the 16th 4096-byte block */
17388 + super_bh = sb_bread(super,
17389 + (sector_t)(REISER4_MAGIC_OFFSET / super->s_blocksize));
17390 + if (!super_bh)
17391 + return RETERR(-EIO);
17392 +
17393 + master_sb = (struct reiser4_master_sb *)super_bh->b_data;
17394 + /* check reiser4 magic string */
17395 + if (!strncmp(master_sb->magic, REISER4_SUPER_MAGIC_STRING,
17396 + sizeof(REISER4_SUPER_MAGIC_STRING))) {
17397 + /* reiser4 master super block contains filesystem blocksize */
17398 + blocksize = le16_to_cpu(get_unaligned(&master_sb->blocksize));
17399 +
17400 + if (blocksize != PAGE_CACHE_SIZE) {
17401 + /*
17402 +			 * currently reiser4's blocksize must be equal to
17403 +			 * the page size
17404 + */
17405 + if (!silent)
17406 + warning("nikita-2609",
17407 + "%s: wrong block size %ld\n", super->s_id,
17408 + blocksize);
17409 + brelse(super_bh);
17410 + return RETERR(-EINVAL);
17411 +			 * the filesystem uses a different blocksize. Reread
17412 +			 * the master super block with the correct blocksize
17413 + /*
17414 + * filesystem uses different blocksize. Reread master
17415 + * super block with correct blocksize
17416 + */
17417 + brelse(super_bh);
17418 + if (!sb_set_blocksize(super, (int)blocksize))
17419 + return RETERR(-EINVAL);
17420 + goto read_super_block;
17421 + }
17422 +
17423 + sbinfo->df_plug =
17424 + disk_format_plugin_by_id(
17425 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17426 + if (sbinfo->df_plug == NULL) {
17427 + if (!silent)
17428 + warning("nikita-26091",
17429 + "%s: unknown disk format plugin %d\n",
17430 + super->s_id,
17431 + le16_to_cpu(get_unaligned(&master_sb->disk_plugin_id)));
17432 + brelse(super_bh);
17433 + return RETERR(-EINVAL);
17434 + }
17435 + sbinfo->diskmap_block = le64_to_cpu(get_unaligned(&master_sb->diskmap));
17436 + brelse(super_bh);
17437 + return 0;
17438 + }
17439 +
17440 + /* there is no reiser4 on the device */
17441 + if (!silent)
17442 + warning("nikita-2608",
17443 + "%s: wrong master super block magic", super->s_id);
17444 + brelse(super_bh);
17445 + return RETERR(-EINVAL);
17446 +}
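/*
 * Worked arithmetic (not part of the original patch): per the comment above,
 * REISER4_MAGIC_OFFSET is the 16th 4096-byte block, i.e. 16 * 4096 == 65536
 * bytes. The block number passed to sb_bread() is therefore:
 *
 *	s_blocksize ==  512:  65536 /  512 == 128
 *	s_blocksize == 1024:  65536 / 1024 ==  64
 *	s_blocksize == 4096:  65536 / 4096 ==  16
 *
 * so the initial read and the re-read after sb_set_blocksize() land on the
 * same byte offset.
 */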
17447 +
17448 +static struct {
17449 + reiser4_plugin_type type;
17450 + reiser4_plugin_id id;
17451 +} default_plugins[PSET_LAST] = {
17452 + [PSET_FILE] = {
17453 + .type = REISER4_FILE_PLUGIN_TYPE,
17454 + .id = UNIX_FILE_PLUGIN_ID
17455 + },
17456 + [PSET_DIR] = {
17457 + .type = REISER4_DIR_PLUGIN_TYPE,
17458 + .id = HASHED_DIR_PLUGIN_ID
17459 + },
17460 + [PSET_HASH] = {
17461 + .type = REISER4_HASH_PLUGIN_TYPE,
17462 + .id = R5_HASH_ID
17463 + },
17464 + [PSET_FIBRATION] = {
17465 + .type = REISER4_FIBRATION_PLUGIN_TYPE,
17466 + .id = FIBRATION_DOT_O
17467 + },
17468 + [PSET_PERM] = {
17469 + .type = REISER4_PERM_PLUGIN_TYPE,
17470 + .id = NULL_PERM_ID
17471 + },
17472 + [PSET_FORMATTING] = {
17473 + .type = REISER4_FORMATTING_PLUGIN_TYPE,
17474 + .id = SMALL_FILE_FORMATTING_ID
17475 + },
17476 + [PSET_SD] = {
17477 + .type = REISER4_ITEM_PLUGIN_TYPE,
17478 + .id = STATIC_STAT_DATA_ID
17479 + },
17480 + [PSET_DIR_ITEM] = {
17481 + .type = REISER4_ITEM_PLUGIN_TYPE,
17482 + .id = COMPOUND_DIR_ID
17483 + },
17484 + [PSET_CIPHER] = {
17485 + .type = REISER4_CIPHER_PLUGIN_TYPE,
17486 + .id = NONE_CIPHER_ID
17487 + },
17488 + [PSET_DIGEST] = {
17489 + .type = REISER4_DIGEST_PLUGIN_TYPE,
17490 + .id = SHA256_32_DIGEST_ID
17491 + },
17492 + [PSET_COMPRESSION] = {
17493 + .type = REISER4_COMPRESSION_PLUGIN_TYPE,
17494 + .id = LZO1_COMPRESSION_ID
17495 + },
17496 + [PSET_COMPRESSION_MODE] = {
17497 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
17498 + .id = CONVX_COMPRESSION_MODE_ID
17499 + },
17500 + [PSET_CLUSTER] = {
17501 + .type = REISER4_CLUSTER_PLUGIN_TYPE,
17502 + .id = CLUSTER_64K_ID
17503 + },
17504 + [PSET_CREATE] = {
17505 + .type = REISER4_FILE_PLUGIN_TYPE,
17506 + .id = UNIX_FILE_PLUGIN_ID
17507 + }
17508 +};
17509 +
17510 +/* access to default plugin table */
17511 +reiser4_plugin *get_default_plugin(pset_member memb)
17512 +{
17513 + return plugin_by_id(default_plugins[memb].type,
17514 + default_plugins[memb].id);
17515 +}
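/*
 * Illustrative sketch (not part of the original patch): a hypothetical
 * caller asking for the default hash plugin gets the R5 hash, per the
 * default_plugins[] table above.
 */
static inline reiser4_plugin *default_hash_plugin_demo(void)
{
	/* equivalent to plugin_by_id(REISER4_HASH_PLUGIN_TYPE, R5_HASH_ID) */
	return get_default_plugin(PSET_HASH);
}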
17516 +
17517 +/**
17518 + * reiser4_init_root_inode - obtain inode of root directory
17519 + * @super: super block of filesystem
17520 + *
17521 + * Obtains the inode of the root directory (reading it from disk) and
17522 + * initializes its plugin set if it was not initialized.
17523 + */
17524 +int reiser4_init_root_inode(struct super_block *super)
17525 +{
17526 + reiser4_super_info_data *sbinfo = get_super_private(super);
17527 + struct inode *inode;
17528 + int result = 0;
17529 +
17530 + inode = reiser4_iget(super, sbinfo->df_plug->root_dir_key(super), 0);
17531 + if (IS_ERR(inode))
17532 + return RETERR(PTR_ERR(inode));
17533 +
17534 + super->s_root = d_alloc_root(inode);
17535 + if (!super->s_root) {
17536 + iput(inode);
17537 + return RETERR(-ENOMEM);
17538 + }
17539 +
17540 + super->s_root->d_op = &sbinfo->ops.dentry;
17541 +
17542 + if (!is_inode_loaded(inode)) {
17543 + pset_member memb;
17544 + plugin_set *pset;
17545 +
17546 + pset = reiser4_inode_data(inode)->pset;
17547 + for (memb = 0; memb < PSET_LAST; ++memb) {
17548 +
17549 + if (aset_get(pset, memb) != NULL)
17550 + continue;
17551 +
17552 + result = grab_plugin_pset(inode, NULL, memb);
17553 + if (result != 0)
17554 + break;
17555 +
17556 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
17557 + }
17558 +
17559 + if (result == 0) {
17560 + if (REISER4_DEBUG) {
17561 + for (memb = 0; memb < PSET_LAST; ++memb)
17562 + assert("nikita-3500",
17563 + aset_get(pset, memb) != NULL);
17564 + }
17565 + } else
17566 + warning("nikita-3448", "Cannot set plugins of root: %i",
17567 + result);
17568 + reiser4_iget_complete(inode);
17569 +
17570 +		/* As the default pset kept in the root dir may have been
17571 +		   changed (its length is unknown), call update_sd. */
17572 + if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
17573 + result = reiser4_grab_space(
17574 + inode_file_plugin(inode)->estimate.update(inode),
17575 + BA_CAN_COMMIT);
17576 +
17577 + if (result == 0)
17578 + result = reiser4_update_sd(inode);
17579 +
17580 + all_grabbed2free();
17581 + }
17582 + }
17583 +
17584 + super->s_maxbytes = MAX_LFS_FILESIZE;
17585 + return result;
17586 +}
17587 +
17588 +/*
17589 + * Local variables:
17590 + * c-indentation-style: "K&R"
17591 + * mode-name: "LC"
17592 + * c-basic-offset: 8
17593 + * tab-width: 8
17594 + * fill-column: 79
17595 + * End:
17596 + */
17597 diff --git a/fs/reiser4/inode.c b/fs/reiser4/inode.c
17598 new file mode 100644
17599 index 0000000..2429ac1
17600 --- /dev/null
17601 +++ b/fs/reiser4/inode.c
17602 @@ -0,0 +1,709 @@
17603 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
17604 +
17605 +/* Inode specific operations. */
17606 +
17607 +#include "forward.h"
17608 +#include "debug.h"
17609 +#include "key.h"
17610 +#include "kassign.h"
17611 +#include "coord.h"
17612 +#include "seal.h"
17613 +#include "dscale.h"
17614 +#include "plugin/item/item.h"
17615 +#include "plugin/security/perm.h"
17616 +#include "plugin/plugin.h"
17617 +#include "plugin/object.h"
17618 +#include "znode.h"
17619 +#include "vfs_ops.h"
17620 +#include "inode.h"
17621 +#include "super.h"
17622 +#include "reiser4.h"
17623 +
17624 +#include <linux/fs.h> /* for struct super_block, address_space */
17625 +
17626 +/* return reiser4 internal tree which inode belongs to */
17627 +/* Audited by: green(2002.06.17) */
17628 +reiser4_tree *reiser4_tree_by_inode(const struct inode *inode /* inode queried */ )
17629 +{
17630 + assert("nikita-256", inode != NULL);
17631 + assert("nikita-257", inode->i_sb != NULL);
17632 + return reiser4_get_tree(inode->i_sb);
17633 +}
17634 +
17635 +/* return reiser4-specific inode flags */
17636 +static inline unsigned long *inode_flags(const struct inode *const inode)
17637 +{
17638 + assert("nikita-2842", inode != NULL);
17639 + return &reiser4_inode_data(inode)->flags;
17640 +}
17641 +
17642 +/* set reiser4-specific flag @f in @inode */
17643 +void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f)
17644 +{
17645 + assert("nikita-2248", inode != NULL);
17646 + set_bit((int)f, inode_flags(inode));
17647 +}
17648 +
17649 +/* clear reiser4-specific flag @f in @inode */
17650 +void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f)
17651 +{
17652 + assert("nikita-2250", inode != NULL);
17653 + clear_bit((int)f, inode_flags(inode));
17654 +}
17655 +
17656 +/* true if reiser4-specific flag @f is set in @inode */
17657 +int reiser4_inode_get_flag(const struct inode *inode,
17658 + reiser4_file_plugin_flags f)
17659 +{
17660 + assert("nikita-2251", inode != NULL);
17661 + return test_bit((int)f, inode_flags(inode));
17662 +}
17663 +
17664 +/* convert oid to inode number */
17665 +ino_t oid_to_ino(oid_t oid)
17666 +{
17667 + return (ino_t) oid;
17668 +}
17669 +
17670 +/* convert oid to user visible inode number */
17671 +ino_t oid_to_uino(oid_t oid)
17672 +{
17673 + /* reiser4 object is uniquely identified by oid, which is a 64 bit
17674 + quantity. Kernel in-memory inode is indexed (in the hash table) by
17675 + 32 bit i_ino field, but this is not a problem, because there is a
17676 + way to further distinguish inodes with identical inode numbers
17677 + (find_actor supplied to iget()).
17678 +
17679 + But user space expects a unique 32 bit inode number. Obviously this
17680 + is impossible. The work-around is to hash the oid into a user visible
17681 + inode number.
17682 + */
17683 + oid_t max_ino = (ino_t) ~ 0;
17684 +
17685 + if (REISER4_INO_IS_OID || (oid <= max_ino))
17686 + return oid;
17687 + else
17688 + /* this is remotely similar to the algorithm used to find the
17689 + next pid for a process: after wrap-around, start from some
17690 + offset rather than from 0. The idea is that there are some long
17691 + living objects with which we don't want to collide.
17692 + */
17693 + return REISER4_UINO_SHIFT + ((oid - max_ino) & (max_ino >> 1));
17694 +}
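+
+/*
+ * Editorial worked example, not part of the original patch: the folding
+ * above with a 32-bit ino_t, where max_ino == 0xffffffff. An oid that
+ * fits in 32 bits maps to itself; a larger one lands past
+ * REISER4_UINO_SHIFT, masked into the lower 31 bits:
+ *
+ * oid 0x00000000ffffffff -> uino 0xffffffff
+ * oid 0x0000000100000005 -> oid - max_ino == 6, so
+ * uino == REISER4_UINO_SHIFT + (6 & 0x7fffffff)
+ */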
17695 +
17696 +/* check that "inode" is on reiser4 file-system */
17697 +int is_reiser4_inode(const struct inode *inode /* inode queried */ )
17698 +{
17699 + return inode != NULL && is_reiser4_super(inode->i_sb);
17700 +}
17701 +
17702 +/* Maximal length of a name that can be stored in directory @inode.
17703 +
17704 + This is used in checks during file creation and lookup. */
17705 +int reiser4_max_filename_len(const struct inode *inode /* inode queried */ )
17706 +{
17707 + assert("nikita-287", is_reiser4_inode(inode));
17708 + assert("nikita-1710", inode_dir_item_plugin(inode));
17709 + if (inode_dir_item_plugin(inode)->s.dir.max_name_len)
17710 + return inode_dir_item_plugin(inode)->s.dir.max_name_len(inode);
17711 + else
17712 + return 255;
17713 +}
17714 +
17715 +#if REISER4_USE_COLLISION_LIMIT
17716 +/* Maximal number of hash collisions for this directory. */
17717 +int max_hash_collisions(const struct inode *dir /* inode queried */ )
17718 +{
17719 + assert("nikita-1711", dir != NULL);
17720 + return reiser4_inode_data(dir)->plugin.max_collisions;
17721 +}
17722 +#endif /* REISER4_USE_COLLISION_LIMIT */
17723 +
17724 +/* Install file, inode, and address_space operation on @inode, depending on
17725 + its mode. */
17726 +int setup_inode_ops(struct inode *inode /* inode to initialize */ ,
17727 + reiser4_object_create_data * data /* parameters to create
17728 + * object */ )
17729 +{
17730 + reiser4_super_info_data *sinfo;
17731 + file_plugin *fplug;
17732 + dir_plugin *dplug;
17733 +
17734 + fplug = inode_file_plugin(inode);
17735 + dplug = inode_dir_plugin(inode);
17736 +
17737 + sinfo = get_super_private(inode->i_sb);
17738 +
17739 + switch (inode->i_mode & S_IFMT) {
17740 + case S_IFSOCK:
17741 + case S_IFBLK:
17742 + case S_IFCHR:
17743 + case S_IFIFO:
17744 + {
17745 + dev_t rdev; /* to keep gcc happy */
17746 +
17747 + assert("vs-46", fplug != NULL);
17748 + /* ugly hack with rdev */
17749 + if (data == NULL) {
17750 + rdev = inode->i_rdev;
17751 + inode->i_rdev = 0;
17752 + } else
17753 + rdev = data->rdev;
17754 + inode->i_blocks = 0;
17755 + assert("vs-42", fplug->h.id == SPECIAL_FILE_PLUGIN_ID);
17756 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17757 + /* initialize inode->i_fop and inode->i_rdev for block and char
17758 + devices */
17759 + init_special_inode(inode, inode->i_mode, rdev);
17760 + /* all address space operations are null */
17761 + inode->i_mapping->a_ops =
17762 + &file_plugins[fplug->h.id].as_ops;
17763 + break;
17764 + }
17765 + case S_IFLNK:
17766 + assert("vs-46", fplug != NULL);
17767 + assert("vs-42", fplug->h.id == SYMLINK_FILE_PLUGIN_ID);
17768 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17769 + inode->i_fop = NULL;
17770 + /* all address space operations are null */
17771 + inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17772 + break;
17773 + case S_IFDIR:
17774 + assert("vs-46", dplug != NULL);
17775 + assert("vs-43", (dplug->h.id == HASHED_DIR_PLUGIN_ID ||
17776 + dplug->h.id == SEEKABLE_HASHED_DIR_PLUGIN_ID));
17777 + inode->i_op = &dir_plugins[dplug->h.id].inode_ops;
17778 + inode->i_fop = &dir_plugins[dplug->h.id].file_ops;
17779 + inode->i_mapping->a_ops = &dir_plugins[dplug->h.id].as_ops;
17780 + break;
17781 + case S_IFREG:
17782 + assert("vs-46", fplug != NULL);
17783 + assert("vs-43", (fplug->h.id == UNIX_FILE_PLUGIN_ID ||
17784 + fplug->h.id == CRYPTCOMPRESS_FILE_PLUGIN_ID));
17785 + inode->i_op = &file_plugins[fplug->h.id].inode_ops;
17786 + inode->i_fop = &file_plugins[fplug->h.id].file_ops;
17787 + inode->i_mapping->a_ops = &file_plugins[fplug->h.id].as_ops;
17788 + break;
17789 + default:
17790 + warning("nikita-291", "wrong file mode: %o for %llu",
17791 + inode->i_mode,
17792 + (unsigned long long)get_inode_oid(inode));
17793 + reiser4_make_bad_inode(inode);
17794 + return RETERR(-EINVAL);
17795 + }
17796 + return 0;
17797 +}
17798 +
17799 +/* Initialize inode from disk data. Called with inode locked.
17800 + Return inode locked. */
17801 +static int init_inode(struct inode *inode /* inode to initialise */ ,
17802 + coord_t * coord /* coord of stat data */ )
17803 +{
17804 + int result;
17805 + item_plugin *iplug;
17806 + void *body;
17807 + int length;
17808 + reiser4_inode *state;
17809 +
17810 + assert("nikita-292", coord != NULL);
17811 + assert("nikita-293", inode != NULL);
17812 +
17813 + coord_clear_iplug(coord);
17814 + result = zload(coord->node);
17815 + if (result)
17816 + return result;
17817 + iplug = item_plugin_by_coord(coord);
17818 + body = item_body_by_coord(coord);
17819 + length = item_length_by_coord(coord);
17820 +
17821 + assert("nikita-295", iplug != NULL);
17822 + assert("nikita-296", body != NULL);
17823 + assert("nikita-297", length > 0);
17824 +
17825 + /* inode is under I_LOCK now */
17826 +
17827 + state = reiser4_inode_data(inode);
17828 + /* call stat-data plugin method to load sd content into inode */
17829 + result = iplug->s.sd.init_inode(inode, body, length);
17830 + set_plugin(&state->pset, PSET_SD, item_plugin_to_plugin(iplug));
17831 + if (result == 0) {
17832 + result = setup_inode_ops(inode, NULL);
17833 + if (result == 0 && inode->i_sb->s_root &&
17834 + inode->i_sb->s_root->d_inode)
17835 + result = finish_pset(inode);
17836 + }
17837 + zrelse(coord->node);
17838 + return result;
17839 +}
17840 +
17841 +/* read `inode' from the disk. This is what was previously in
17842 + reiserfs_read_inode2().
17843 +
17844 + Must be called with inode locked. Return inode still locked.
17845 +*/
17846 +static int read_inode(struct inode *inode /* inode to read from disk */ ,
17847 + const reiser4_key * key /* key of stat data */ ,
17848 + int silent)
17849 +{
17850 + int result;
17851 + lock_handle lh;
17852 + reiser4_inode *info;
17853 + coord_t coord;
17854 +
17855 + assert("nikita-298", inode != NULL);
17856 + assert("nikita-1945", !is_inode_loaded(inode));
17857 +
17858 + info = reiser4_inode_data(inode);
17859 + assert("nikita-300", info->locality_id != 0);
17860 +
17861 + coord_init_zero(&coord);
17862 + init_lh(&lh);
17863 + /* locate stat-data in a tree and return znode locked */
17864 + result = lookup_sd(inode, ZNODE_READ_LOCK, &coord, &lh, key, silent);
17865 + assert("nikita-301", !is_inode_loaded(inode));
17866 + if (result == 0) {
17867 + /* use stat-data plugin to load sd into inode. */
17868 + result = init_inode(inode, &coord);
17869 + if (result == 0) {
17870 + /* initialize stat-data seal */
17871 + spin_lock_inode(inode);
17872 + reiser4_seal_init(&info->sd_seal, &coord, key);
17873 + info->sd_coord = coord;
17874 + spin_unlock_inode(inode);
17875 +
17876 + /* call file plugin's method to initialize plugin
17877 + * specific part of inode */
17878 + if (inode_file_plugin(inode)->init_inode_data)
17879 + inode_file_plugin(inode)->init_inode_data(inode,
17880 + NULL,
17881 + 0);
17882 + /* load detached directory cursors for stateless
17883 + * directory readers (NFS). */
17884 + reiser4_load_cursors(inode);
17885 +
17886 + /* Check the opened inode for consistency. */
17887 + result =
17888 + get_super_private(inode->i_sb)->df_plug->
17889 + check_open(inode);
17890 + }
17891 + }
17892 + /* lookup_sd() doesn't release coord because we want znode
17893 + stay read-locked while stat-data fields are accessed in
17894 + init_inode() */
17895 + done_lh(&lh);
17896 +
17897 + if (result != 0)
17898 + reiser4_make_bad_inode(inode);
17899 + return result;
17900 +}
17901 +
17902 +/* initialise new reiser4 inode being inserted into hash table. */
17903 +static int init_locked_inode(struct inode *inode /* new inode */ ,
17904 + void *opaque /* key of stat data passed to the
17905 + * iget5_locked as cookie */ )
17906 +{
17907 + reiser4_key *key;
17908 +
17909 + assert("nikita-1995", inode != NULL);
17910 + assert("nikita-1996", opaque != NULL);
17911 + key = opaque;
17912 + set_inode_oid(inode, get_key_objectid(key));
17913 + reiser4_inode_data(inode)->locality_id = get_key_locality(key);
17914 + return 0;
17915 +}
17916 +
17917 +/* reiser4_inode_find_actor() - "find actor" supplied by reiser4 to iget5_locked().
17918 +
17919 + This function is called by iget5_locked() to distinguish reiser4 inodes
17920 + having the same inode numbers. Such inodes can only exist due to some error
17921 + condition. One of them should be bad. Inodes with identical inode numbers
17922 + (objectids) are distinguished by their packing locality.
17923 +
17924 +*/
17925 +static int reiser4_inode_find_actor(struct inode *inode /* inode from hash table to
17926 + * check */ ,
17927 + void *opaque /* "cookie" passed to
17928 + * iget5_locked(). This is stat data
17929 + * key */ )
17930 +{
17931 + reiser4_key *key;
17932 +
17933 + key = opaque;
17934 + return
17935 + /* oid is unique, so first term is enough, actually. */
17936 + get_inode_oid(inode) == get_key_objectid(key) &&
17937 + /*
17938 + * also, locality should be checked, but locality is stored in
17939 + * the reiser4-specific part of the inode, and actor can be
17940 + * called against arbitrary inode that happened to be in this
17941 + * hash chain. Hence we first have to check that this is
17942 + * reiser4 inode at least. is_reiser4_inode() is probably too
17943 + * early to call, as inode may have ->i_op not yet
17944 + * initialised.
17945 + */
17946 + is_reiser4_super(inode->i_sb) &&
17947 + /*
17948 + * usually objectid is unique, but pseudo files use counter to
17949 + * generate objectid. All pseudo files are placed into special
17950 + * (otherwise unused) locality.
17951 + */
17952 + reiser4_inode_data(inode)->locality_id == get_key_locality(key);
17953 +}
17954 +
17955 +/* hook for kmem_cache_create */
17956 +void loading_init_once(reiser4_inode * info)
17957 +{
17958 + mutex_init(&info->loading);
17959 +}
17960 +
17961 +/* for reiser4_alloc_inode */
17962 +void loading_alloc(reiser4_inode * info)
17963 +{
17964 + assert("vs-1717", !mutex_is_locked(&info->loading));
17965 +}
17966 +
17967 +/* for reiser4_destroy */
17968 +void loading_destroy(reiser4_inode * info)
17969 +{
17970 + assert("vs-1717a", !mutex_is_locked(&info->loading));
17971 +}
17972 +
17973 +static void loading_begin(reiser4_inode * info)
17974 +{
17975 + mutex_lock(&info->loading);
17976 +}
17977 +
17978 +static void loading_end(reiser4_inode * info)
17979 +{
17980 + mutex_unlock(&info->loading);
17981 +}
17982 +
17983 +/**
17984 + * reiser4_iget - obtain inode via iget5_locked, read from disk if necessary
17985 + * @super: super block of filesystem
17986 + * @key: key of inode's stat-data
17987 + * @silent: when non-zero, suppress warnings while looking up stat-data
17988 + *
17989 + * This is our helper function a la iget(). It is called by
17990 + * lookup_common() and reiser4_read_super(). Returns the inode locked, or
17991 + * an error pointer.
17992 + */
17993 +struct inode *reiser4_iget(struct super_block *super, const reiser4_key *key,
17994 + int silent)
17995 +{
17996 + struct inode *inode;
17997 + int result;
17998 + reiser4_inode *info;
17999 +
18000 + assert("nikita-302", super != NULL);
18001 + assert("nikita-303", key != NULL);
18002 +
18003 + result = 0;
18004 +
18005 + /* call iget(). Our ->read_inode() is dummy, so this will either
18006 + find inode in cache or return uninitialised inode */
18007 + inode = iget5_locked(super,
18008 + (unsigned long)get_key_objectid(key),
18009 + reiser4_inode_find_actor,
18010 + init_locked_inode, (reiser4_key *) key);
18011 + if (inode == NULL)
18012 + return ERR_PTR(RETERR(-ENOMEM));
18013 + if (is_bad_inode(inode)) {
18014 + warning("nikita-304", "Bad inode found");
18015 + reiser4_print_key("key", key);
18016 + iput(inode);
18017 + return ERR_PTR(RETERR(-EIO));
18018 + }
18019 +
18020 + info = reiser4_inode_data(inode);
18021 +
18022 + /* Reiser4 inode state bit REISER4_LOADED is used to distinguish fully
18023 + loaded and initialized inode from just allocated inode. If
18024 + REISER4_LOADED bit is not set, reiser4_iget() completes loading under
18025 + info->loading. The place in reiser4 that uses a not yet initialized inode
18026 + is the reiser4 repacker, see repacker-related functions in
18027 + plugin/item/extent.c */
18028 + if (!is_inode_loaded(inode)) {
18029 + loading_begin(info);
18030 + if (!is_inode_loaded(inode)) {
18031 + /* locking: iget5_locked returns locked inode */
18032 + assert("nikita-1941", !is_inode_loaded(inode));
18033 + assert("nikita-1949",
18034 + reiser4_inode_find_actor(inode,
18035 + (reiser4_key *) key));
18036 + /* now, inode has objectid as ->i_ino and locality in
18037 + reiser4-specific part. This is enough for
18038 + read_inode() to read stat data from the disk */
18039 + result = read_inode(inode, key, silent);
18040 + } else
18041 + loading_end(info);
18042 + }
18043 +
18044 + if (inode->i_state & I_NEW)
18045 + unlock_new_inode(inode);
18046 +
18047 + if (is_bad_inode(inode)) {
18048 + assert("vs-1717", result != 0);
18049 + loading_end(info);
18050 + iput(inode);
18051 + inode = ERR_PTR(result);
18052 + } else if (REISER4_DEBUG) {
18053 + reiser4_key found_key;
18054 +
18055 + assert("vs-1717", result == 0);
18056 + build_sd_key(inode, &found_key);
18057 + if (!keyeq(&found_key, key)) {
18058 + warning("nikita-305", "Wrong key in sd");
18059 + reiser4_print_key("sought for", key);
18060 + reiser4_print_key("found", &found_key);
18061 + }
18062 + if (inode->i_nlink == 0) {
18063 + warning("nikita-3559", "Unlinked inode found: %llu\n",
18064 + (unsigned long long)get_inode_oid(inode));
18065 + }
18066 + }
18067 + return inode;
18068 +}
18069 +
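+/*
+ * Editorial sketch, not part of the original patch: the double-checked
+ * loading idiom used by reiser4_iget() above, reduced to its skeleton.
+ * The first is_inode_loaded() test is cheap and lockless for the common
+ * cached case; the re-check under info->loading makes the slow path
+ * race-free:
+ *
+ * if (!is_inode_loaded(inode)) {
+ * loading_begin(info);
+ * if (!is_inode_loaded(inode))
+ * result = read_inode(inode, key, silent);
+ * else
+ * loading_end(info);
+ * }
+ */
+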
18070 +/* reiser4_iget() may return a not fully initialized inode; this function
18071 + * should be called once reiser4 inode initialization is complete. */
18072 +void reiser4_iget_complete(struct inode *inode)
18073 +{
18074 + assert("zam-988", is_reiser4_inode(inode));
18075 +
18076 + if (!is_inode_loaded(inode)) {
18077 + reiser4_inode_set_flag(inode, REISER4_LOADED);
18078 + loading_end(reiser4_inode_data(inode));
18079 + }
18080 +}
18081 +
18082 +void reiser4_make_bad_inode(struct inode *inode)
18083 +{
18084 + assert("nikita-1934", inode != NULL);
18085 +
18086 + /* clear LOADED bit */
18087 + reiser4_inode_clr_flag(inode, REISER4_LOADED);
18088 + make_bad_inode(inode);
18089 + return;
18090 +}
18091 +
18092 +file_plugin *inode_file_plugin(const struct inode * inode)
18093 +{
18094 + assert("nikita-1997", inode != NULL);
18095 + return reiser4_inode_data(inode)->pset->file;
18096 +}
18097 +
18098 +dir_plugin *inode_dir_plugin(const struct inode * inode)
18099 +{
18100 + assert("nikita-1998", inode != NULL);
18101 + return reiser4_inode_data(inode)->pset->dir;
18102 +}
18103 +
18104 +formatting_plugin *inode_formatting_plugin(const struct inode * inode)
18105 +{
18106 + assert("nikita-2000", inode != NULL);
18107 + return reiser4_inode_data(inode)->pset->formatting;
18108 +}
18109 +
18110 +hash_plugin *inode_hash_plugin(const struct inode * inode)
18111 +{
18112 + assert("nikita-2001", inode != NULL);
18113 + return reiser4_inode_data(inode)->pset->hash;
18114 +}
18115 +
18116 +fibration_plugin *inode_fibration_plugin(const struct inode * inode)
18117 +{
18118 + assert("nikita-2001", inode != NULL);
18119 + return reiser4_inode_data(inode)->pset->fibration;
18120 +}
18121 +
18122 +cipher_plugin *inode_cipher_plugin(const struct inode * inode)
18123 +{
18124 + assert("edward-36", inode != NULL);
18125 + return reiser4_inode_data(inode)->pset->cipher;
18126 +}
18127 +
18128 +compression_plugin *inode_compression_plugin(const struct inode * inode)
18129 +{
18130 + assert("edward-37", inode != NULL);
18131 + return reiser4_inode_data(inode)->pset->compression;
18132 +}
18133 +
18134 +compression_mode_plugin *inode_compression_mode_plugin(const struct inode *
18135 + inode)
18136 +{
18137 + assert("edward-1330", inode != NULL);
18138 + return reiser4_inode_data(inode)->pset->compression_mode;
18139 +}
18140 +
18141 +cluster_plugin *inode_cluster_plugin(const struct inode * inode)
18142 +{
18143 + assert("edward-1328", inode != NULL);
18144 + return reiser4_inode_data(inode)->pset->cluster;
18145 +}
18146 +
18147 +file_plugin *inode_create_plugin(const struct inode * inode)
18148 +{
18149 + assert("edward-1329", inode != NULL);
18150 + return reiser4_inode_data(inode)->pset->create;
18151 +}
18152 +
18153 +digest_plugin *inode_digest_plugin(const struct inode * inode)
18154 +{
18155 + assert("edward-86", inode != NULL);
18156 + return reiser4_inode_data(inode)->pset->digest;
18157 +}
18158 +
18159 +item_plugin *inode_sd_plugin(const struct inode * inode)
18160 +{
18161 + assert("vs-534", inode != NULL);
18162 + return reiser4_inode_data(inode)->pset->sd;
18163 +}
18164 +
18165 +item_plugin *inode_dir_item_plugin(const struct inode * inode)
18166 +{
18167 + assert("vs-534", inode != NULL);
18168 + return reiser4_inode_data(inode)->pset->dir_item;
18169 +}
18170 +
18171 +file_plugin *child_create_plugin(const struct inode * inode)
18172 +{
18173 + assert("edward-1329", inode != NULL);
18174 + return reiser4_inode_data(inode)->hset->create;
18175 +}
18176 +
18177 +void inode_set_extension(struct inode *inode, sd_ext_bits ext)
18178 +{
18179 + reiser4_inode *state;
18180 +
18181 + assert("nikita-2716", inode != NULL);
18182 + assert("nikita-2717", ext < LAST_SD_EXTENSION);
18183 + assert("nikita-3491", spin_inode_is_locked(inode));
18184 +
18185 + state = reiser4_inode_data(inode);
18186 + state->extmask |= 1 << ext;
18187 + /* force re-calculation of stat-data length on next call to
18188 + update_sd(). */
18189 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18190 +}
18191 +
18192 +void inode_clr_extension(struct inode *inode, sd_ext_bits ext)
18193 +{
18194 + reiser4_inode *state;
18195 +
18196 + assert("vpf-1926", inode != NULL);
18197 + assert("vpf-1927", ext < LAST_SD_EXTENSION);
18198 + assert("vpf-1928", spin_inode_is_locked(inode));
18199 +
18200 + state = reiser4_inode_data(inode);
18201 + state->extmask &= ~(1 << ext);
18202 + /* force re-calculation of stat-data length on next call to
18203 + update_sd(). */
18204 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18205 +}
18206 +
18207 +void inode_check_scale_nolock(struct inode *inode, __u64 old, __u64 new)
18208 +{
18209 + assert("edward-1287", inode != NULL);
18210 + if (!dscale_fit(old, new))
18211 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
18212 + return;
18213 +}
18214 +
18215 +void inode_check_scale(struct inode *inode, __u64 old, __u64 new)
18216 +{
18217 + assert("nikita-2875", inode != NULL);
18218 + spin_lock_inode(inode);
18219 + inode_check_scale_nolock(inode, old, new);
18220 + spin_unlock_inode(inode);
18221 +}
18222 +
18223 +/*
18224 + * initialize ->ordering field of inode. This field defines how file stat-data
18225 + * and body is ordered within a tree with respect to other objects within the
18226 + * same parent directory.
18227 + */
18228 +void
18229 +init_inode_ordering(struct inode *inode,
18230 + reiser4_object_create_data * crd, int create)
18231 +{
18232 + reiser4_key key;
18233 +
18234 + if (create) {
18235 + struct inode *parent;
18236 +
18237 + parent = crd->parent;
18238 + assert("nikita-3224", inode_dir_plugin(parent) != NULL);
18239 + inode_dir_plugin(parent)->build_entry_key(parent,
18240 + &crd->dentry->d_name,
18241 + &key);
18242 + } else {
18243 + coord_t *coord;
18244 +
18245 + coord = &reiser4_inode_data(inode)->sd_coord;
18246 + coord_clear_iplug(coord);
18247 + /* safe to use ->sd_coord, because node is under long term
18248 + * lock */
18249 + WITH_DATA(coord->node, item_key_by_coord(coord, &key));
18250 + }
18251 +
18252 + set_inode_ordering(inode, get_key_ordering(&key));
18253 +}
18254 +
18255 +znode *inode_get_vroot(struct inode *inode)
18256 +{
18257 + reiser4_block_nr blk;
18258 + znode *result;
18259 +
18260 + spin_lock_inode(inode);
18261 + blk = reiser4_inode_data(inode)->vroot;
18262 + spin_unlock_inode(inode);
18263 + if (!disk_addr_eq(&UBER_TREE_ADDR, &blk))
18264 + result = zlook(reiser4_tree_by_inode(inode), &blk);
18265 + else
18266 + result = NULL;
18267 + return result;
18268 +}
18269 +
18270 +void inode_set_vroot(struct inode *inode, znode *vroot)
18271 +{
18272 + spin_lock_inode(inode);
18273 + reiser4_inode_data(inode)->vroot = *znode_get_block(vroot);
18274 + spin_unlock_inode(inode);
18275 +}
18276 +
18277 +#if REISER4_DEBUG
18278 +
18279 +void reiser4_inode_invariant(const struct inode *inode)
18280 +{
18281 + assert("nikita-3077", spin_inode_is_locked(inode));
18282 +}
18283 +
18284 +int inode_has_no_jnodes(reiser4_inode * r4_inode)
18285 +{
18286 + return jnode_tree_by_reiser4_inode(r4_inode)->rnode == NULL &&
18287 + r4_inode->nr_jnodes == 0;
18288 +}
18289 +
18290 +#endif
18291 +
18292 +/* returns 0 if directory is empty (only contains dot and dotdot), -ENOTEMPTY otherwise */
18293 +/* FIXME: shouldn't it be dir plugin method? */
18294 +int is_dir_empty(const struct inode *dir)
18295 +{
18296 + assert("nikita-1976", dir != NULL);
18297 +
18298 + /* rely on our method to maintain directory i_size being equal to the
18299 + number of entries. */
18300 + return dir->i_size <= 2 ? 0 : RETERR(-ENOTEMPTY);
18301 +}
18302 +
18303 +/* Make Linus happy.
18304 + Local variables:
18305 + c-indentation-style: "K&R"
18306 + mode-name: "LC"
18307 + c-basic-offset: 8
18308 + tab-width: 8
18309 + fill-column: 120
18310 + End:
18311 +*/
18312 diff --git a/fs/reiser4/inode.h b/fs/reiser4/inode.h
18313 new file mode 100644
18314 index 0000000..2cc1d82
18315 --- /dev/null
18316 +++ b/fs/reiser4/inode.h
18317 @@ -0,0 +1,438 @@
18318 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
18319 +
18320 +/* Inode functions. */
18321 +
18322 +#if !defined( __REISER4_INODE_H__ )
18323 +#define __REISER4_INODE_H__
18324 +
18325 +#include "forward.h"
18326 +#include "debug.h"
18327 +#include "key.h"
18328 +#include "seal.h"
18329 +#include "plugin/plugin.h"
18330 +#include "plugin/file/cryptcompress.h"
18331 +#include "plugin/file/file.h"
18332 +#include "plugin/dir/dir.h"
18333 +#include "plugin/plugin_set.h"
18334 +#include "plugin/security/perm.h"
18335 +#include "vfs_ops.h"
18336 +#include "jnode.h"
18337 +#include "fsdata.h"
18338 +
18339 +#include <linux/types.h> /* for __u?? , ino_t */
18340 +#include <linux/fs.h> /* for struct super_block, struct
18341 + * rw_semaphore, etc */
18342 +#include <linux/spinlock.h>
18343 +#include <asm/types.h>
18344 +
18345 +/* reiser4-specific inode flags. They are "transient" and are not
18346 + supposed to be stored on disk. Used to track the "state" of an
18347 + inode
18348 +*/
18349 +typedef enum {
18350 + /* this is light-weight inode, inheriting some state from its
18351 + parent */
18352 + REISER4_LIGHT_WEIGHT = 0,
18353 + /* stat data wasn't yet created */
18354 + REISER4_NO_SD = 1,
18355 + /* internal immutable flag. Currently is only used
18356 + to avoid race condition during file creation.
18357 + See comment in create_object(). */
18358 + REISER4_IMMUTABLE = 2,
18359 + /* inode was read from storage */
18360 + REISER4_LOADED = 3,
18361 + /* this bit is set for symlinks. inode->i_private points to target
18362 + name of symlink. */
18363 + REISER4_GENERIC_PTR_USED = 4,
18364 + /* set if size of stat-data item for this inode is known. If this is
18365 + * set we can avoid recalculating size of stat-data on each update. */
18366 + REISER4_SDLEN_KNOWN = 5,
18367 + /* reiser4_inode->crypt points to the crypto stat */
18368 + REISER4_CRYPTO_STAT_LOADED = 6,
18369 + /* cryptcompress_inode_data points to the secret key */
18370 + REISER4_SECRET_KEY_INSTALLED = 7,
18371 + /* File (possibly) has pages corresponding to the tail items that
18372 + * were created by ->readpage. It is set by mmap_unix_file() and
18373 + * sendfile_unix_file(). This bit is inspected by write_unix_file and
18374 + * kill-hook of tail items. It is never cleared once set. This bit is
18375 + * modified and inspected under i_mutex. */
18376 + REISER4_HAS_MMAP = 8,
18377 + REISER4_PART_MIXED = 9,
18378 + REISER4_PART_IN_CONV = 10,
18379 + /* This flag indicates that file plugin conversion is in progress */
18380 + REISER4_FILE_CONV_IN_PROGRESS = 11
18381 +} reiser4_file_plugin_flags;
18382 +
18383 +/* state associated with each inode.
18384 + reiser4 inode.
18385 +
18386 + NOTE-NIKITA In 2.5 kernels it is not necessary that all file-system inodes
18387 + be of the same size. File-system allocates inodes by itself through
18388 + s_op->allocate_inode() method. So, it is possible to adjust size of inode
18389 + at the time of its creation.
18390 +
18391 + Invariants involving parts of this data-type:
18392 +
18393 + [inode->eflushed]
18394 +
18395 +*/
18396 +
18397 +typedef struct reiser4_inode reiser4_inode;
18398 +/* return pointer to reiser4-specific part of inode */
18399 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18400 + /* inode queried */ );
18401 +
18402 +#if BITS_PER_LONG == 64
18403 +
18404 +#define REISER4_INO_IS_OID (1)
18405 +typedef struct {
18406 +} oid_hi_t;
18407 +
18408 +/* BITS_PER_LONG == 64 */
18409 +#else
18410 +
18411 +#define REISER4_INO_IS_OID (0)
18412 +typedef __u32 oid_hi_t;
18413 +
18414 +/* BITS_PER_LONG == 64 */
18415 +#endif
18416 +
18417 +struct reiser4_inode {
18418 + /* spin lock protecting fields of this structure. */
18419 + spinlock_t guard;
18420 + /* main plugin set that control the file
18421 + (see comments in plugin/plugin_set.c) */
18422 + plugin_set *pset;
18423 + /* plugin set for inheritance
18424 + (see comments in plugin/plugin_set.c) */
18425 + plugin_set *hset;
18426 + /* high 32 bits of object id */
18427 + oid_hi_t oid_hi;
18428 + /* seal for stat-data */
18429 + seal_t sd_seal;
18430 + /* locality id for this file */
18431 + oid_t locality_id;
18432 +#if REISER4_LARGE_KEY
18433 + __u64 ordering;
18434 +#endif
18435 + /* coord of stat-data in sealed node */
18436 + coord_t sd_coord;
18437 + /* bit-mask of stat-data extensions used by this file */
18438 + __u64 extmask;
18439 + /* bitmask of non-default plugins for this inode */
18440 + __u16 plugin_mask;
18441 + /* bitmask of set heir plugins for this inode. */
18442 + __u16 heir_mask;
18443 + union {
18444 + struct list_head readdir_list;
18445 + struct list_head not_used;
18446 + } lists;
18447 + /* per-inode flags. Filled by values of reiser4_file_plugin_flags */
18448 + unsigned long flags;
18449 + union {
18450 + /* fields specific to unix_file plugin */
18451 + unix_file_info_t unix_file_info;
18452 + /* fields specific to cryptcompress plugin */
18453 + cryptcompress_info_t cryptcompress_info;
18454 + } file_plugin_data;
18455 +
18456 + /* this semaphore is to serialize readers and writers of @pset->file
18457 + * when file plugin conversion is enabled
18458 + */
18459 + struct rw_semaphore conv_sem;
18460 +
18461 + /* tree of jnodes. Phantom jnodes (ones not attached to any atom) are
18462 + tagged in that tree by EFLUSH_TAG_ANONYMOUS */
18463 + struct radix_tree_root jnodes_tree;
18464 +#if REISER4_DEBUG
18465 + /* number of unformatted node jnodes of this file in jnode hash table */
18466 + unsigned long nr_jnodes;
18467 +#endif
18468 +
18469 + /* block number of virtual root for this object. See comment above
18470 + * fs/reiser4/search.c:handle_vroot() */
18471 + reiser4_block_nr vroot;
18472 + struct mutex loading;
18473 +};
18474 +
18475 +void loading_init_once(reiser4_inode *);
18476 +void loading_alloc(reiser4_inode *);
18477 +void loading_destroy(reiser4_inode *);
18478 +
18479 +typedef struct reiser4_inode_object {
18480 + /* private part */
18481 + reiser4_inode p;
18482 + /* generic fields not specific to reiser4, but used by VFS */
18483 + struct inode vfs_inode;
18484 +} reiser4_inode_object;
18485 +
18486 +/* return pointer to the reiser4 specific portion of @inode */
18487 +static inline reiser4_inode *reiser4_inode_data(const struct inode *inode
18488 + /* inode queried */ )
18489 +{
18490 + assert("nikita-254", inode != NULL);
18491 + return &container_of(inode, reiser4_inode_object, vfs_inode)->p;
18492 +}
18493 +
18494 +static inline struct inode *inode_by_reiser4_inode(const reiser4_inode *
18495 + r4_inode /* inode queried */
18496 + )
18497 +{
18498 + return &container_of(r4_inode, reiser4_inode_object, p)->vfs_inode;
18499 +}
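+
+/*
+ * Editorial sketch, not part of the original patch: the embedding above
+ * follows the usual Linux container_of() pattern. A super_operations
+ * ->alloc_inode() implementation allocates the containing object and
+ * hands the VFS only the embedded struct inode; "cache" and "gfp" below
+ * stand in for the filesystem's inode cache and allocation flags:
+ *
+ * reiser4_inode_object *obj = kmem_cache_alloc(cache, gfp);
+ * if (obj == NULL)
+ * return NULL;
+ * return &obj->vfs_inode;
+ *
+ * reiser4_inode_data() then recovers the private part from the VFS inode.
+ */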
18500 +
18501 +/*
18502 + * reiser4 inodes are identified by 64bit object-id (oid_t), but in struct
18503 + * inode ->i_ino field is of type ino_t (long) that can be either 32 or 64
18504 + * bits.
18505 + *
18506 + * If ->i_ino is 32 bits we store remaining 32 bits in reiser4 specific part
18507 + * of inode, otherwise whole oid is stored in i_ino.
18508 + *
18509 + * Wrappers below ([sg]et_inode_oid()) are used to hide this difference.
18510 + */
18511 +
18512 +#define OID_HI_SHIFT (sizeof(ino_t) * 8)
18513 +
18514 +#if REISER4_INO_IS_OID
18515 +
18516 +static inline oid_t get_inode_oid(const struct inode *inode)
18517 +{
18518 + return inode->i_ino;
18519 +}
18520 +
18521 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18522 +{
18523 + inode->i_ino = oid;
18524 +}
18525 +
18526 +/* REISER4_INO_IS_OID */
18527 +#else
18528 +
18529 +static inline oid_t get_inode_oid(const struct inode *inode)
18530 +{
18531 + return
18532 + ((__u64) reiser4_inode_data(inode)->oid_hi << OID_HI_SHIFT) |
18533 + inode->i_ino;
18534 +}
18535 +
18536 +static inline void set_inode_oid(struct inode *inode, oid_t oid)
18537 +{
18538 + assert("nikita-2519", inode != NULL);
18539 + inode->i_ino = (ino_t) (oid);
18540 + reiser4_inode_data(inode)->oid_hi = (oid) >> OID_HI_SHIFT;
18541 + assert("nikita-2521", get_inode_oid(inode) == (oid));
18542 +}
18543 +
18544 +/* REISER4_INO_IS_OID */
18545 +#endif
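+
+/*
+ * Editorial worked example, not part of the original patch: with a
+ * 32-bit ino_t, OID_HI_SHIFT is 32, so oid 0x0000000a00000005 is stored
+ * as i_ino == 0x00000005 and oid_hi == 0x0000000a; get_inode_oid()
+ * reassembles ((__u64)0xa << 32) | 0x5.
+ */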
18546 +
18547 +static inline oid_t get_inode_locality(const struct inode *inode)
18548 +{
18549 + return reiser4_inode_data(inode)->locality_id;
18550 +}
18551 +
18552 +#if REISER4_LARGE_KEY
18553 +static inline __u64 get_inode_ordering(const struct inode *inode)
18554 +{
18555 + return reiser4_inode_data(inode)->ordering;
18556 +}
18557 +
18558 +static inline void set_inode_ordering(const struct inode *inode, __u64 ordering)
18559 +{
18560 + reiser4_inode_data(inode)->ordering = ordering;
18561 +}
18562 +
18563 +#else
18564 +
18565 +#define get_inode_ordering(inode) (0)
18566 +#define set_inode_ordering(inode, val) noop
18567 +
18568 +#endif
18569 +
18570 +/* return inode in which @uf_info is embedded */
18571 +static inline struct inode *unix_file_info_to_inode(const unix_file_info_t *
18572 + uf_info)
18573 +{
18574 + return &container_of(uf_info, reiser4_inode_object,
18575 + p.file_plugin_data.unix_file_info)->vfs_inode;
18576 +}
18577 +
18578 +extern ino_t oid_to_ino(oid_t oid) __attribute__ ((const));
18579 +extern ino_t oid_to_uino(oid_t oid) __attribute__ ((const));
18580 +
18581 +extern reiser4_tree *reiser4_tree_by_inode(const struct inode *inode);
18582 +
18583 +#if REISER4_DEBUG
18584 +extern void reiser4_inode_invariant(const struct inode *inode);
18585 +extern int inode_has_no_jnodes(reiser4_inode *);
18586 +#else
18587 +#define reiser4_inode_invariant(inode) noop
18588 +#endif
18589 +
18590 +static inline int spin_inode_is_locked(const struct inode *inode)
18591 +{
18592 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18593 + return 1;
18594 +}
18595 +
18596 +/**
18597 + * spin_lock_inode - lock reiser4_inode's embedded spinlock
18598 + * @inode: inode to lock
18599 + *
18600 + * In debug mode it checks that lower priority locks are not held and
18601 + * increments reiser4_context's lock counters on which lock ordering checking
18602 + * is based.
18603 + */
18604 +static inline void spin_lock_inode(struct inode *inode)
18605 +{
18606 + assert("", LOCK_CNT_NIL(spin_locked));
18607 + /* check lock ordering */
18608 + assert_spin_not_locked(&d_lock);
18609 +
18610 + spin_lock(&reiser4_inode_data(inode)->guard);
18611 +
18612 + LOCK_CNT_INC(spin_locked_inode);
18613 + LOCK_CNT_INC(spin_locked);
18614 +
18615 + reiser4_inode_invariant(inode);
18616 +}
18617 +
18618 +/**
18619 + * spin_unlock_inode - unlock reiser4_inode's embedded spinlock
18620 + * @inode: inode to unlock
18621 + *
18622 + * In debug mode it checks that spinlock is held and decrements
18623 + * reiser4_context's lock counters on which lock ordering checking is based.
18624 + */
18625 +static inline void spin_unlock_inode(struct inode *inode)
18626 +{
18627 + assert_spin_locked(&reiser4_inode_data(inode)->guard);
18628 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_inode));
18629 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
18630 +
18631 + reiser4_inode_invariant(inode);
18632 +
18633 + LOCK_CNT_DEC(spin_locked_inode);
18634 + LOCK_CNT_DEC(spin_locked);
18635 +
18636 + spin_unlock(&reiser4_inode_data(inode)->guard);
18637 +}
18638 +
18639 +extern znode *inode_get_vroot(struct inode *inode);
18640 +extern void inode_set_vroot(struct inode *inode, znode * vroot);
18641 +
18642 +extern int reiser4_max_filename_len(const struct inode *inode);
18643 +extern int max_hash_collisions(const struct inode *dir);
18644 +extern void reiser4_unlock_inode(struct inode *inode);
18645 +extern int is_reiser4_inode(const struct inode *inode);
18646 +extern int setup_inode_ops(struct inode *inode, reiser4_object_create_data *);
18647 +extern struct inode *reiser4_iget(struct super_block *super,
18648 + const reiser4_key * key, int silent);
18649 +extern void reiser4_iget_complete(struct inode *inode);
18650 +extern void reiser4_inode_set_flag(struct inode *inode, reiser4_file_plugin_flags f);
18651 +extern void reiser4_inode_clr_flag(struct inode *inode, reiser4_file_plugin_flags f);
18652 +extern int reiser4_inode_get_flag(const struct inode *inode,
18653 + reiser4_file_plugin_flags f);
18654 +
18655 +/* has inode been initialized? */
18656 +static inline int
18657 +is_inode_loaded(const struct inode *inode /* inode queried */ )
18658 +{
18659 + assert("nikita-1120", inode != NULL);
18660 + return reiser4_inode_get_flag(inode, REISER4_LOADED);
18661 +}
18662 +
18663 +extern file_plugin *inode_file_plugin(const struct inode *inode);
18664 +extern dir_plugin *inode_dir_plugin(const struct inode *inode);
18665 +extern formatting_plugin *inode_formatting_plugin(const struct inode *inode);
18666 +extern hash_plugin *inode_hash_plugin(const struct inode *inode);
18667 +extern fibration_plugin *inode_fibration_plugin(const struct inode *inode);
18668 +extern cipher_plugin *inode_cipher_plugin(const struct inode *inode);
18669 +extern digest_plugin *inode_digest_plugin(const struct inode *inode);
18670 +extern compression_plugin *inode_compression_plugin(const struct inode *inode);
18671 +extern compression_mode_plugin *inode_compression_mode_plugin(const struct inode
18672 + *inode);
18673 +extern cluster_plugin *inode_cluster_plugin(const struct inode *inode);
18674 +extern file_plugin *inode_create_plugin(const struct inode *inode);
18675 +extern item_plugin *inode_sd_plugin(const struct inode *inode);
18676 +extern item_plugin *inode_dir_item_plugin(const struct inode *inode);
18677 +extern file_plugin *child_create_plugin(const struct inode *inode);
18678 +
18679 +extern void reiser4_make_bad_inode(struct inode *inode);
18680 +
18681 +extern void inode_set_extension(struct inode *inode, sd_ext_bits ext);
18682 +extern void inode_clr_extension(struct inode *inode, sd_ext_bits ext);
18683 +extern void inode_check_scale(struct inode *inode, __u64 old, __u64 new);
18684 +extern void inode_check_scale_nolock(struct inode * inode, __u64 old, __u64 new);
18685 +
18686 +/*
18687 + * update field @field in inode @i to contain value @value.
18688 + */
18689 +#define INODE_SET_FIELD(i, field, value) \
18690 +({ \
18691 + struct inode *__i; \
18692 + typeof(value) __v; \
18693 + \
18694 + __i = (i); \
18695 + __v = (value); \
18696 + inode_check_scale(__i, __i->field, __v); \
18697 + __i->field = __v; \
18698 +})
18699 +
18700 +#define INODE_INC_FIELD(i, field) \
18701 +({ \
18702 + struct inode *__i; \
18703 + \
18704 + __i = (i); \
18705 + inode_check_scale(__i, __i->field, __i->field + 1); \
18706 + ++ __i->field; \
18707 +})
18708 +
18709 +#define INODE_DEC_FIELD(i, field) \
18710 +({ \
18711 + struct inode *__i; \
18712 + \
18713 + __i = (i); \
18714 + inode_check_scale(__i, __i->field, __i->field - 1); \
18715 + -- __i->field; \
18716 +})
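+
+/*
+ * Editorial usage sketch, not part of the original patch: the macros
+ * above wrap ordinary field updates so that a change in the scaled
+ * encoding width of the value (see inode_check_scale()) invalidates the
+ * cached stat-data length, e.g.:
+ *
+ * INODE_SET_FIELD(inode, i_size, new_size);
+ * INODE_INC_FIELD(inode, i_nlink);
+ */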
18717 +
18718 +/* See comment before reiser4_readdir_common() for description. */
18719 +static inline struct list_head *get_readdir_list(const struct inode *inode)
18720 +{
18721 + return &reiser4_inode_data(inode)->lists.readdir_list;
18722 +}
18723 +
18724 +extern void init_inode_ordering(struct inode *inode,
18725 + reiser4_object_create_data * crd, int create);
18726 +
18727 +static inline struct radix_tree_root *jnode_tree_by_inode(struct inode *inode)
18728 +{
18729 + return &reiser4_inode_data(inode)->jnodes_tree;
18730 +}
18731 +
18732 +static inline struct radix_tree_root *jnode_tree_by_reiser4_inode(reiser4_inode
18733 + * r4_inode)
18734 +{
18735 + return &r4_inode->jnodes_tree;
18736 +}
18737 +
18738 +#if REISER4_DEBUG
18739 +extern void print_inode(const char *prefix, const struct inode *i);
18740 +#endif
18741 +
18742 +int is_dir_empty(const struct inode *);
18743 +
18744 +/* __REISER4_INODE_H__ */
18745 +#endif
18746 +
18747 +/* Make Linus happy.
18748 + Local variables:
18749 + c-indentation-style: "K&R"
18750 + mode-name: "LC"
18751 + c-basic-offset: 8
18752 + tab-width: 8
18753 + fill-column: 120
18754 + End:
18755 +*/
18756 diff --git a/fs/reiser4/ioctl.h b/fs/reiser4/ioctl.h
18757 new file mode 100644
18758 index 0000000..4d57737
18759 --- /dev/null
18760 +++ b/fs/reiser4/ioctl.h
18761 @@ -0,0 +1,41 @@
18762 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
18763 + * reiser4/README */
18764 +
18765 +#if !defined( __REISER4_IOCTL_H__ )
18766 +#define __REISER4_IOCTL_H__
18767 +
18768 +#include <linux/fs.h>
18769 +
18770 +/*
18771 + * ioctl(2) command used to "unpack" reiser4 file, that is, convert it into
18772 + * extents and fix in this state. This is used by applications that rely on
18773 + *
18774 + * . files being block aligned, and
18775 + *
18776 + * . files never migrating on disk
18777 + *
18778 + * for example, boot loaders (LILO) need this.
18779 + *
18780 + * This ioctl should be used as
18781 + *
18782 + * result = ioctl(fd, REISER4_IOC_UNPACK);
18783 + *
18784 + * File behind fd descriptor will be converted to the extents (if necessary),
18785 + * and its stat-data will be updated so that it will never be converted back
18786 + * into tails again.
18787 + */
18788 +#define REISER4_IOC_UNPACK _IOW(0xCD,1,long)
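+
+/*
+ * Editorial sketch, not part of the original patch: a minimal user-space
+ * caller following the usage shown above. The path is a placeholder and
+ * error handling is reduced to perror() for brevity:
+ *
+ * #include <fcntl.h>
+ * #include <sys/ioctl.h>
+ *
+ * int fd = open("/boot/vmlinuz", O_RDONLY);
+ * if (fd < 0 || ioctl(fd, REISER4_IOC_UNPACK) < 0)
+ * perror("REISER4_IOC_UNPACK");
+ */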
18789 +
18790 +/* __REISER4_IOCTL_H__ */
18791 +#endif
18792 +
18793 +/* Make Linus happy.
18794 + Local variables:
18795 + c-indentation-style: "K&R"
18796 + mode-name: "LC"
18797 + c-basic-offset: 8
18798 + tab-width: 8
18799 + fill-column: 120
18800 + scroll-step: 1
18801 + End:
18802 +*/
18803 diff --git a/fs/reiser4/jnode.c b/fs/reiser4/jnode.c
18804 new file mode 100644
18805 index 0000000..1d16d41
18806 --- /dev/null
18807 +++ b/fs/reiser4/jnode.c
18808 @@ -0,0 +1,1925 @@
18809 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
18810 + * reiser4/README */
18811 +/* Jnode manipulation functions. */
18812 +/* Jnode is an entity used to track blocks with data and meta-data in reiser4.
18813 +
18814 + In particular, jnodes are used to track transactional information
18815 + associated with each block. Each znode contains jnode as ->zjnode field.
18816 +
18817 + Jnode stands for either Josh or Journal node.
18818 +*/
18819 +
18820 +/*
18821 + * Taxonomy.
18822 + *
18823 + * Jnode represents block containing data or meta-data. There are jnodes
18824 + * for:
18825 + *
18826 + * unformatted blocks (jnodes proper). There are plans, however, to
18827 + * have a handle per extent unit rather than per unformatted
18828 + * block, because there are so many of them.
18829 + *
18830 + * For bitmaps. Each bitmap is actually represented by two jnodes--one
18831 + * for working and another for "commit" data, together forming bnode.
18832 + *
18833 + * For io-heads. These are used by log writer.
18834 + *
18835 + * For formatted nodes (znode). See comment at the top of znode.c for
18836 + * details specific to the formatted nodes (znodes).
18837 + *
18838 + * Node data.
18839 + *
18840 + * Jnode provides access to the data of node it represents. Data are
18841 + * stored in a page. Page is kept in a page cache. This means, that jnodes
18842 + * are highly interconnected with page cache and VM internals.
18843 + *
18844 + * jnode has a pointer to page (->pg) containing its data. Pointer to data
18845 + * themselves is cached in ->data field to avoid frequent calls to
18846 + * page_address().
18847 + *
18848 + * jnode and page are attached to each other by jnode_attach_page(). This
18849 + * function places pointer to jnode in set_page_private(), sets PG_private
18850 + * flag and increments page counter.
18851 + *
18852 + * Opposite operation is performed by page_clear_jnode().
18853 + *
18854 + * jnode->pg is protected by jnode spin lock, and page->private is
18855 + * protected by page lock. See comment at the top of page_cache.c for
18856 + * more.
18857 + *
18858 + * page can be detached from jnode for two reasons:
18859 + *
18860 + * . jnode is removed from a tree (file is truncated, or formatted
18861 + * node is removed by balancing).
18862 + *
18863 + * . during memory pressure, VM calls ->releasepage() method
18864 + * (reiser4_releasepage()) to evict page from memory.
18865 + *
18866 + * (there, of course, is also umount, but this is special case we are not
18867 + * concerned with here).
18868 + *
18869 + * To protect jnode page from eviction, one calls jload() function that
18870 + * "pins" page in memory (loading it if necessary), increments
18871 + * jnode->d_count, and kmap()s page. Page is unpinned through call to
18872 + * jrelse().
18873 + *
18874 + * Jnode life cycle.
18875 + *
18876 + * jnode is created, placed in hash table, and, optionally, in per-inode
18877 + * radix tree. Page can be attached to jnode, pinned, released, etc.
18878 + *
18879 + * When jnode is captured into atom its reference counter is
18880 + * increased. While being part of an atom, jnode can be "early
18881 + * flushed". This means that as part of flush procedure, jnode is placed
18882 + * into "relocate set", and its page is submitted to the disk. After io
18883 + * completes, page can be detached, then loaded again, re-dirtied, etc.
18884 + *
18885 + * A thread acquires a reference to a jnode by calling jref() and releases
18886 + * it by jput(). When the last reference is removed, jnode is still retained in
18887 + * memory (cached) if it has page attached, _unless_ it is scheduled for
18888 + * destruction (has JNODE_HEARD_BANSHEE bit set).
18889 + *
18890 + * Tree read-write lock was used as "existential" lock for jnodes. That is,
18891 + * jnode->x_count could be changed from 0 to 1 only under tree write lock,
18892 + * that is, tree lock protected unreferenced jnodes stored in the hash
18893 + * table, from recycling.
18894 + *
18895 + * This resulted in high contention on tree lock, because jref()/jput() is
18896 + * frequent operation. To ameliorate this problem, RCU is used: when jput()
18897 + * is just about to release last reference on jnode it sets JNODE_RIP bit
18898 + * on it, and then proceed with jnode destruction (removing jnode from hash
18899 + * table, cbk_cache, detaching page, etc.). All places that change jnode
18900 + * reference counter from 0 to 1 (jlookup(), zlook(), zget(), and
18901 + * cbk_cache_scan_slots()) check for JNODE_RIP bit (this is done by
18902 + * jnode_rip_check() function), and pretend that nothing was found in hash
18903 + * table if bit is set.
18904 + *
18905 + * jput defers actual return of jnode into slab cache to some later time
18906 + * (by call_rcu()), this guarantees that other threads can safely continue
18907 + * working with JNODE_RIP-ped jnode.
18908 + *
18909 + */
18910 +
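+/*
+ * Editorial sketch, not part of the original patch: the pinning protocol
+ * described above, as a caller would use it. jref()/jput() manage the
+ * existence reference; jload()/jrelse() pin the node data in memory:
+ *
+ * jref(node);
+ * ret = jload(node); -- page read in and kmap()ed
+ * if (ret == 0) {
+ * ... access jdata(node) ...
+ * jrelse(node); -- unpin the page
+ * }
+ * jput(node); -- drop the existence reference
+ */
+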
18911 +#include "reiser4.h"
18912 +#include "debug.h"
18913 +#include "dformat.h"
18914 +#include "jnode.h"
18915 +#include "plugin/plugin_header.h"
18916 +#include "plugin/plugin.h"
18917 +#include "txnmgr.h"
18918 +/*#include "jnode.h"*/
18919 +#include "znode.h"
18920 +#include "tree.h"
18921 +#include "tree_walk.h"
18922 +#include "super.h"
18923 +#include "inode.h"
18924 +#include "page_cache.h"
18925 +
18926 +#include <asm/uaccess.h> /* UML needs this for PAGE_OFFSET */
18927 +#include <linux/types.h>
18928 +#include <linux/slab.h>
18929 +#include <linux/pagemap.h>
18930 +#include <linux/swap.h>
18931 +#include <linux/fs.h> /* for struct address_space */
18932 +#include <linux/writeback.h> /* for inode_lock */
18933 +
18934 +static struct kmem_cache *_jnode_slab = NULL;
18935 +
18936 +static void jnode_set_type(jnode * node, jnode_type type);
18937 +static int jdelete(jnode * node);
18938 +static int jnode_try_drop(jnode * node);
18939 +
18940 +#if REISER4_DEBUG
18941 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked);
18942 +#endif
18943 +
18944 +/* true if valid page is attached to jnode */
18945 +static inline int jnode_is_parsed(jnode * node)
18946 +{
18947 + return JF_ISSET(node, JNODE_PARSED);
18948 +}
18949 +
18950 +/* hash table support */
18951 +
18952 +/* compare two jnode keys for equality. Used by hash-table macros */
18953 +static inline int jnode_key_eq(const jnode_key_t * k1, const jnode_key_t * k2)
18954 +{
18955 + assert("nikita-2350", k1 != NULL);
18956 + assert("nikita-2351", k2 != NULL);
18957 +
18958 + return (k1->index == k2->index && k1->objectid == k2->objectid);
18959 +}
18960 +
18961 +/* Hash jnode by its key (inode plus offset). Used by hash-table macros */
18962 +static inline __u32
18963 +jnode_key_hashfn(j_hash_table * table, const jnode_key_t * key)
18964 +{
18965 + assert("nikita-2352", key != NULL);
18966 + assert("nikita-3346", IS_POW(table->_buckets));
18967 +
18968 + /* yes, this is a remarkably simple (if not stupid) hash function. */
18969 + return (key->objectid + key->index) & (table->_buckets - 1);
18970 +}
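+
+/*
+ * Editorial worked example, not part of the original patch: with the
+ * 16384 buckets requested by jnodes_tree_init() below, the mask is
+ * 0x3fff, so a key of {objectid = 0x2a, index = 7} hashes to bucket
+ * (0x2a + 7) & 0x3fff == 0x31.
+ */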
18971 +
18972 +/* The hash table definition */
18973 +#define KMALLOC(size) reiser4_vmalloc(size)
18974 +#define KFREE(ptr, size) vfree(ptr)
18975 +TYPE_SAFE_HASH_DEFINE(j, jnode, jnode_key_t, key.j, link.j, jnode_key_hashfn,
18976 + jnode_key_eq);
18977 +#undef KFREE
18978 +#undef KMALLOC
18979 +
18980 +/* call this to initialise jnode hash table */
18981 +int jnodes_tree_init(reiser4_tree * tree /* tree to initialise jnodes for */ )
18982 +{
18983 + assert("nikita-2359", tree != NULL);
18984 + return j_hash_init(&tree->jhash_table, 16384);
18985 +}
18986 +
18987 +/* call this to destroy jnode hash table. This is called during umount. */
18988 +int jnodes_tree_done(reiser4_tree * tree /* tree to destroy jnodes for */ )
18989 +{
18990 + j_hash_table *jtable;
18991 + jnode *node;
18992 + jnode *next;
18993 +
18994 + assert("nikita-2360", tree != NULL);
18995 +
18996 + /*
18997 + * Scan hash table and free all jnodes.
18998 + */
18999 + jtable = &tree->jhash_table;
19000 + if (jtable->_table) {
19001 + for_all_in_htable(jtable, j, node, next) {
19002 + assert("nikita-2361", !atomic_read(&node->x_count));
19003 + jdrop(node);
19004 + }
19005 +
19006 + j_hash_done(&tree->jhash_table);
19007 + }
19008 + return 0;
19009 +}
19010 +
19011 +/**
19012 + * init_jnodes - create jnode cache
19013 + *
19014 + * Initializes the jnode slab cache. It is part of reiser4 module initialization.
19015 + */
19016 +int init_jnodes(void)
19017 +{
19018 + assert("umka-168", _jnode_slab == NULL);
19019 +
19020 + _jnode_slab = kmem_cache_create("jnode", sizeof(jnode), 0,
19021 + SLAB_HWCACHE_ALIGN |
19022 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
19023 + if (_jnode_slab == NULL)
19024 + return RETERR(-ENOMEM);
19025 +
19026 + return 0;
19027 +}
19028 +
19029 +/**
19030 + * done_jnodes - delete jnode cache
19031 + *
19032 + * This is called on reiser4 module unloading or system shutdown.
19033 + */
19034 +void done_jnodes(void)
19035 +{
19036 + destroy_reiser4_cache(&_jnode_slab);
19037 +}
19038 +
19039 +/* Initialize a jnode. */
19040 +void jnode_init(jnode * node, reiser4_tree * tree, jnode_type type)
19041 +{
19042 + assert("umka-175", node != NULL);
19043 +
19044 + memset(node, 0, sizeof(jnode));
19045 + ON_DEBUG(node->magic = JMAGIC);
19046 + jnode_set_type(node, type);
19047 + atomic_set(&node->d_count, 0);
19048 + atomic_set(&node->x_count, 0);
19049 + spin_lock_init(&node->guard);
19050 + spin_lock_init(&node->load);
19051 + node->atom = NULL;
19052 + node->tree = tree;
19053 + INIT_LIST_HEAD(&node->capture_link);
19054 +
19055 + ASSIGN_NODE_LIST(node, NOT_CAPTURED);
19056 +
19057 + INIT_RCU_HEAD(&node->rcu);
19058 +
19059 +#if REISER4_DEBUG
19060 + {
19061 + reiser4_super_info_data *sbinfo;
19062 +
19063 + sbinfo = get_super_private(tree->super);
19064 + spin_lock_irq(&sbinfo->all_guard);
19065 + list_add(&node->jnodes, &sbinfo->all_jnodes);
19066 + spin_unlock_irq(&sbinfo->all_guard);
19067 + }
19068 +#endif
19069 +}
19070 +
19071 +#if REISER4_DEBUG
19072 +/*
19073 + * Remove jnode from ->all_jnodes list.
19074 + */
19075 +static void jnode_done(jnode * node, reiser4_tree * tree)
19076 +{
19077 + reiser4_super_info_data *sbinfo;
19078 +
19079 + sbinfo = get_super_private(tree->super);
19080 +
19081 + spin_lock_irq(&sbinfo->all_guard);
19082 + assert("nikita-2422", !list_empty(&node->jnodes));
19083 + list_del_init(&node->jnodes);
19084 + spin_unlock_irq(&sbinfo->all_guard);
19085 +}
19086 +#endif
19087 +
19088 +/* return already existing jnode of page */
19089 +jnode *jnode_by_page(struct page *pg)
19090 +{
19091 + assert("nikita-2066", pg != NULL);
19092 + assert("nikita-2400", PageLocked(pg));
19093 + assert("nikita-2068", PagePrivate(pg));
19094 + assert("nikita-2067", jprivate(pg) != NULL);
19095 + return jprivate(pg);
19096 +}
19097 +
19098 +/* exported functions to allocate/free jnode objects outside this file */
19099 +jnode *jalloc(void)
19100 +{
19101 + jnode *jal = kmem_cache_alloc(_jnode_slab, reiser4_ctx_gfp_mask_get());
19102 + return jal;
19103 +}
19104 +
19105 +/* return jnode back to the slab allocator */
19106 +inline void jfree(jnode * node)
19107 +{
19108 + assert("zam-449", node != NULL);
19109 +
19110 + assert("nikita-2663", (list_empty_careful(&node->capture_link) &&
19111 + NODE_LIST(node) == NOT_CAPTURED));
19112 + assert("nikita-3222", list_empty(&node->jnodes));
19113 + assert("nikita-3221", jnode_page(node) == NULL);
19114 +
19115 + /* not yet phash_jnode_destroy(node); */
19116 +
19117 + kmem_cache_free(_jnode_slab, node);
19118 +}
19119 +
19120 +/*
19121 + * This function is supplied as RCU callback. It actually frees jnode when
19122 + * last reference to it is gone.
19123 + */
19124 +static void jnode_free_actor(struct rcu_head *head)
19125 +{
19126 + jnode *node;
19127 + jnode_type jtype;
19128 +
19129 + node = container_of(head, jnode, rcu);
19130 + jtype = jnode_get_type(node);
19131 +
19132 + ON_DEBUG(jnode_done(node, jnode_get_tree(node)));
19133 +
19134 + switch (jtype) {
19135 + case JNODE_IO_HEAD:
19136 + case JNODE_BITMAP:
19137 + case JNODE_UNFORMATTED_BLOCK:
19138 + jfree(node);
19139 + break;
19140 + case JNODE_FORMATTED_BLOCK:
19141 + zfree(JZNODE(node));
19142 + break;
19143 + case JNODE_INODE:
19144 + default:
19145 + wrong_return_value("nikita-3197", "Wrong jnode type");
19146 + }
19147 +}
19148 +
19149 +/*
19150 + * Free a jnode. Post a callback to be executed later through RCU when all
19151 + * references to @node are released.
19152 + */
19153 +static inline void jnode_free(jnode * node, jnode_type jtype)
19154 +{
19155 + if (jtype != JNODE_INODE) {
19156 + /*assert("nikita-3219", list_empty(&node->rcu.list)); */
19157 + call_rcu(&node->rcu, jnode_free_actor);
19158 + } else
19159 + jnode_list_remove(node);
19160 +}
19161 +
19162 +/* allocate new unformatted jnode */
19163 +static jnode *jnew_unformatted(void)
19164 +{
19165 + jnode *jal;
19166 +
19167 + jal = jalloc();
19168 + if (jal == NULL)
19169 + return NULL;
19170 +
19171 + jnode_init(jal, current_tree, JNODE_UNFORMATTED_BLOCK);
19172 + jal->key.j.mapping = NULL;
19173 + jal->key.j.index = (unsigned long)-1;
19174 + jal->key.j.objectid = 0;
19175 + return jal;
19176 +}
19177 +
19178 +/* look for jnode with given mapping and offset within hash table */
19179 +jnode *jlookup(reiser4_tree * tree, oid_t objectid, unsigned long index)
19180 +{
19181 + jnode_key_t jkey;
19182 + jnode *node;
19183 +
19184 + assert("nikita-2353", tree != NULL);
19185 +
19186 + jkey.objectid = objectid;
19187 + jkey.index = index;
19188 +
19189 + /*
19190 + * hash table is _not_ protected by any lock during lookups. All we
19191 + * have to do is to disable preemption to keep RCU happy.
19192 + */
19193 +
19194 + rcu_read_lock();
19195 + node = j_hash_find(&tree->jhash_table, &jkey);
19196 + if (node != NULL) {
19197 + /* protect @node from recycling */
19198 + jref(node);
19199 + assert("nikita-2955", jnode_invariant(node, 0, 0));
19200 + node = jnode_rip_check(tree, node);
19201 + }
19202 + rcu_read_unlock();
19203 + return node;
19204 +}
19205 +
19206 +/* per inode radix tree of jnodes is protected by tree's read write spin lock */
19207 +static jnode *jfind_nolock(struct address_space *mapping, unsigned long index)
19208 +{
19209 + assert("vs-1694", mapping->host != NULL);
19210 +
19211 + return radix_tree_lookup(jnode_tree_by_inode(mapping->host), index);
19212 +}
19213 +
19214 +jnode *jfind(struct address_space * mapping, unsigned long index)
19215 +{
19216 + reiser4_tree *tree;
19217 + jnode *node;
19218 +
19219 + assert("vs-1694", mapping->host != NULL);
19220 + tree = reiser4_tree_by_inode(mapping->host);
19221 +
19222 + read_lock_tree(tree);
19223 + node = jfind_nolock(mapping, index);
19224 + if (node != NULL)
19225 + jref(node);
19226 + read_unlock_tree(tree);
19227 + return node;
19228 +}
19229 +
19230 +static void inode_attach_jnode(jnode * node)
19231 +{
19232 + struct inode *inode;
19233 + reiser4_inode *info;
19234 + struct radix_tree_root *rtree;
19235 +
19236 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19237 + assert("zam-1043", node->key.j.mapping != NULL);
19238 + inode = node->key.j.mapping->host;
19239 + info = reiser4_inode_data(inode);
19240 + rtree = jnode_tree_by_reiser4_inode(info);
19241 + if (rtree->rnode == NULL) {
19242 + /* prevent inode from being pruned when it has jnodes attached
19243 + to it */
19244 + write_lock_irq(&inode->i_data.tree_lock);
19245 + inode->i_data.nrpages++;
19246 + write_unlock_irq(&inode->i_data.tree_lock);
19247 + }
19248 + assert("zam-1049", equi(rtree->rnode != NULL, info->nr_jnodes != 0));
19249 + check_me("zam-1045",
19250 + !radix_tree_insert(rtree, node->key.j.index, node));
19251 + ON_DEBUG(info->nr_jnodes++);
19252 +}
19253 +
19254 +static void inode_detach_jnode(jnode * node)
19255 +{
19256 + struct inode *inode;
19257 + reiser4_inode *info;
19258 + struct radix_tree_root *rtree;
19259 +
19260 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19261 + assert("zam-1044", node->key.j.mapping != NULL);
19262 + inode = node->key.j.mapping->host;
19263 + info = reiser4_inode_data(inode);
19264 + rtree = jnode_tree_by_reiser4_inode(info);
19265 +
19266 + assert("zam-1051", info->nr_jnodes != 0);
19267 + assert("zam-1052", rtree->rnode != NULL);
19268 + ON_DEBUG(info->nr_jnodes--);
19269 +
19270 + /* delete jnode from inode's radix tree of jnodes */
19271 + check_me("zam-1046", radix_tree_delete(rtree, node->key.j.index));
19272 + if (rtree->rnode == NULL) {
19273 + /* inode can be pruned now */
19274 + write_lock_irq(&inode->i_data.tree_lock);
19275 + inode->i_data.nrpages--;
19276 + write_unlock_irq(&inode->i_data.tree_lock);
19277 + }
19278 +}
19279 +
19280 +/* put jnode into the hash table (where it can be found by flush, which does
19281 +   not know the mapping) and into the inode's tree of jnodes (where it can be
19282 +   found, hopefully faster, in places where the mapping is known). Currently it
19283 +   is used by fs/reiser4/plugin/item/extent_file_ops.c:index_extent_jnode when
19284 +   a new jnode is created */
19285 +static void
19286 +hash_unformatted_jnode(jnode * node, struct address_space *mapping,
19287 + unsigned long index)
19288 +{
19289 + j_hash_table *jtable;
19290 +
19291 + assert("vs-1446", jnode_is_unformatted(node));
19292 + assert("vs-1442", node->key.j.mapping == 0);
19293 + assert("vs-1443", node->key.j.objectid == 0);
19294 + assert("vs-1444", node->key.j.index == (unsigned long)-1);
19295 + assert_rw_write_locked(&(jnode_get_tree(node)->tree_lock));
19296 +
19297 + node->key.j.mapping = mapping;
19298 + node->key.j.objectid = get_inode_oid(mapping->host);
19299 + node->key.j.index = index;
19300 +
19301 + jtable = &jnode_get_tree(node)->jhash_table;
19302 +
19303 + /* race with some other thread inserting jnode into the hash table is
19304 + * impossible, because we keep the page lock. */
19305 + /*
19306 + * following assertion no longer holds because of RCU: it is possible
19307 + * jnode is in the hash table, but with JNODE_RIP bit set.
19308 + */
19309 + /* assert("nikita-3211", j_hash_find(jtable, &node->key.j) == NULL); */
19310 + j_hash_insert_rcu(jtable, node);
19311 + inode_attach_jnode(node);
19312 +}
19313 +
19314 +static void unhash_unformatted_node_nolock(jnode * node)
19315 +{
19316 + assert("vs-1683", node->key.j.mapping != NULL);
19317 + assert("vs-1684",
19318 + node->key.j.objectid ==
19319 + get_inode_oid(node->key.j.mapping->host));
19320 +
19321 + /* remove jnode from hash-table */
19322 + j_hash_remove_rcu(&node->tree->jhash_table, node);
19323 + inode_detach_jnode(node);
19324 + node->key.j.mapping = NULL;
19325 + node->key.j.index = (unsigned long)-1;
19326 + node->key.j.objectid = 0;
19327 +
19328 +}
19329 +
19330 +/* remove jnode from hash table and from inode's tree of jnodes. This is used in
19331 + reiser4_invalidatepage and in kill_hook_extent -> truncate_inode_jnodes ->
19332 + reiser4_uncapture_jnode */
19333 +void unhash_unformatted_jnode(jnode * node)
19334 +{
19335 + assert("vs-1445", jnode_is_unformatted(node));
19336 +
19337 + write_lock_tree(node->tree);
19338 + unhash_unformatted_node_nolock(node);
19339 + write_unlock_tree(node->tree);
19340 +}
19341 +
19342 +/*
19343 + * search hash table for a jnode with given oid and index. If not found,
19344 + * allocate new jnode, insert it, and also insert into radix tree for the
19345 + * given inode/mapping.
19346 + */
19347 +static jnode *find_get_jnode(reiser4_tree * tree,
19348 + struct address_space *mapping,
19349 + oid_t oid, unsigned long index)
19350 +{
19351 + jnode *result;
19352 + jnode *shadow;
19353 + int preload;
19354 +
19355 + result = jnew_unformatted();
19356 +
19357 + if (unlikely(result == NULL))
19358 + return ERR_PTR(RETERR(-ENOMEM));
19359 +
19360 +	preload = radix_tree_preload(reiser4_ctx_gfp_mask_get());
19361 +	if (preload != 0) {
+		/* don't leak the jnode allocated above */
+		jnode_free(result, JNODE_UNFORMATTED_BLOCK);
19362 +		return ERR_PTR(preload);
+	}
19363 +
19364 + write_lock_tree(tree);
19365 + shadow = jfind_nolock(mapping, index);
19366 + if (likely(shadow == NULL)) {
19367 + /* add new jnode to hash table and inode's radix tree of jnodes */
19368 + jref(result);
19369 + hash_unformatted_jnode(result, mapping, index);
19370 + } else {
19371 + /* jnode is found in inode's radix tree of jnodes */
19372 + jref(shadow);
19373 + jnode_free(result, JNODE_UNFORMATTED_BLOCK);
19374 + assert("vs-1498", shadow->key.j.mapping == mapping);
19375 + result = shadow;
19376 + }
19377 + write_unlock_tree(tree);
19378 +
19379 + assert("nikita-2955",
19380 + ergo(result != NULL, jnode_invariant(result, 0, 0)));
19381 + radix_tree_preload_end();
19382 + return result;
19383 +}
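+
+/*
+ * find_get_jnode() above follows the standard radix tree insertion pattern:
+ * radix_tree_preload() pre-allocates per-CPU tree nodes with a (possibly
+ * sleeping) allocation before the spinlock is taken, so that
+ * radix_tree_insert() under write_lock_tree() never has to allocate. The
+ * generic shape of the pattern (illustrative sketch only):
+ *
+ *	if (radix_tree_preload(gfp_mask))
+ *		return -ENOMEM;
+ *	spin_lock(&lock);
+ *	error = radix_tree_insert(&root, index, item);
+ *	spin_unlock(&lock);
+ *	radix_tree_preload_end();
+ */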
19384 +
19385 +/* jget() (a la zget() but for unformatted nodes). Returns (and possibly
19386 + creates) jnode corresponding to page @pg. jnode is attached to page and
19387 + inserted into jnode hash-table. */
19388 +static jnode *do_jget(reiser4_tree * tree, struct page *pg)
19389 +{
19390 + /*
19391 + * There are two ways to create jnode: starting with pre-existing page
19392 + * and without page.
19393 + *
19394 + * When page already exists, jnode is created
19395 + * (jnode_of_page()->do_jget()) under page lock. This is done in
19396 + * ->writepage(), or when capturing anonymous page dirtied through
19397 + * mmap.
19398 + *
19399 + * Jnode without page is created by index_extent_jnode().
19400 + *
19401 + */
19402 +
19403 + jnode *result;
19404 + oid_t oid = get_inode_oid(pg->mapping->host);
19405 +
19406 + assert("umka-176", pg != NULL);
19407 + assert("nikita-2394", PageLocked(pg));
19408 +
19409 + result = jprivate(pg);
19410 + if (likely(result != NULL))
19411 + return jref(result);
19412 +
19413 + tree = reiser4_tree_by_page(pg);
19414 +
19415 + /* check hash-table first */
19416 + result = jfind(pg->mapping, pg->index);
19417 + if (unlikely(result != NULL)) {
19418 + spin_lock_jnode(result);
19419 + jnode_attach_page(result, pg);
19420 + spin_unlock_jnode(result);
19421 + result->key.j.mapping = pg->mapping;
19422 + return result;
19423 + }
19424 +
19425 + /* since page is locked, jnode should be allocated with GFP_NOFS flag */
19426 + reiser4_ctx_gfp_mask_force(GFP_NOFS);
19427 + result = find_get_jnode(tree, pg->mapping, oid, pg->index);
19428 + if (unlikely(IS_ERR(result)))
19429 + return result;
19430 + /* attach jnode to page */
19431 + spin_lock_jnode(result);
19432 + jnode_attach_page(result, pg);
19433 + spin_unlock_jnode(result);
19434 + return result;
19435 +}
19436 +
19437 +/*
19438 + * return jnode for @pg, creating it if necessary.
19439 + */
19440 +jnode *jnode_of_page(struct page * pg)
19441 +{
19442 + jnode *result;
19443 +
19444 + assert("umka-176", pg != NULL);
19445 + assert("nikita-2394", PageLocked(pg));
19446 +
19447 + result = do_jget(reiser4_tree_by_page(pg), pg);
19448 +
19449 + if (REISER4_DEBUG && !IS_ERR(result)) {
19450 + assert("nikita-3210", result == jprivate(pg));
19451 + assert("nikita-2046", jnode_page(jprivate(pg)) == pg);
19452 + if (jnode_is_unformatted(jprivate(pg))) {
19453 + assert("nikita-2364",
19454 + jprivate(pg)->key.j.index == pg->index);
19455 + assert("nikita-2367",
19456 + jprivate(pg)->key.j.mapping == pg->mapping);
19457 + assert("nikita-2365",
19458 + jprivate(pg)->key.j.objectid ==
19459 + get_inode_oid(pg->mapping->host));
19460 + assert("vs-1200",
19461 + jprivate(pg)->key.j.objectid ==
19462 + pg->mapping->host->i_ino);
19463 + assert("nikita-2356",
19464 + jnode_is_unformatted(jnode_by_page(pg)));
19465 + }
19466 + assert("nikita-2956", jnode_invariant(jprivate(pg), 0, 0));
19467 + }
19468 + return result;
19469 +}
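+
+/*
+ * Minimal usage sketch (illustrative only): jnode_of_page() requires the
+ * page to be locked, and the returned reference is dropped with jput():
+ *
+ *	lock_page(page);
+ *	node = jnode_of_page(page);
+ *	unlock_page(page);
+ *	if (!IS_ERR(node)) {
+ *		... use @node ...
+ *		jput(node);
+ *	}
+ */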
19470 +
19471 +/* attach page to jnode: set ->pg pointer in jnode, and ->private one in the
19472 + * page.*/
19473 +void jnode_attach_page(jnode * node, struct page *pg)
19474 +{
19475 + assert("nikita-2060", node != NULL);
19476 + assert("nikita-2061", pg != NULL);
19477 +
19478 + assert("nikita-2050", jprivate(pg) == 0ul);
19479 + assert("nikita-2393", !PagePrivate(pg));
19480 + assert("vs-1741", node->pg == NULL);
19481 +
19482 + assert("nikita-2396", PageLocked(pg));
19483 + assert_spin_locked(&(node->guard));
19484 +
19485 + page_cache_get(pg);
19486 + set_page_private(pg, (unsigned long)node);
19487 + node->pg = pg;
19488 + SetPagePrivate(pg);
19489 +}
19490 +
19491 +/* Dual to jnode_attach_page: break a binding between page and jnode */
19492 +void page_clear_jnode(struct page *page, jnode * node)
19493 +{
19494 + assert("nikita-2424", page != NULL);
19495 + assert("nikita-2425", PageLocked(page));
19496 + assert("nikita-2426", node != NULL);
19497 + assert_spin_locked(&(node->guard));
19498 + assert("nikita-2428", PagePrivate(page));
19499 +
19500 + assert("nikita-3551", !PageWriteback(page));
19501 +
19502 + JF_CLR(node, JNODE_PARSED);
19503 + set_page_private(page, 0ul);
19504 + ClearPagePrivate(page);
19505 + node->pg = NULL;
19506 + page_cache_release(page);
19507 +}
19508 +
19509 +#if 0
19510 +/* it is only used in one place to handle error */
19511 +void
19512 +page_detach_jnode(struct page *page, struct address_space *mapping,
19513 + unsigned long index)
19514 +{
19515 + assert("nikita-2395", page != NULL);
19516 +
19517 + lock_page(page);
19518 + if ((page->mapping == mapping) && (page->index == index)
19519 + && PagePrivate(page)) {
19520 + jnode *node;
19521 +
19522 + node = jprivate(page);
19523 + spin_lock_jnode(node);
19524 + page_clear_jnode(page, node);
19525 + spin_unlock_jnode(node);
19526 + }
19527 + unlock_page(page);
19528 +}
19529 +#endif /* 0 */
19530 +
19531 +/* return @node page locked.
19532 +
19533 +   Lock ordering requires that one first takes the page lock and only then the
19534 +   spin lock on the node attached to this page. Sometimes it is necessary to go
19535 +   in the opposite direction. This is done through the standard
19536 +   trylock-and-release loop.
19537 +*/
19538 +static struct page *jnode_lock_page(jnode * node)
19539 +{
19540 + struct page *page;
19541 +
19542 + assert("nikita-2052", node != NULL);
19543 + assert("nikita-2401", LOCK_CNT_NIL(spin_locked_jnode));
19544 +
19545 + while (1) {
19546 +
19547 + spin_lock_jnode(node);
19548 + page = jnode_page(node);
19549 + if (page == NULL) {
19550 + break;
19551 + }
19552 +
19553 + /* no need to page_cache_get( page ) here, because page cannot
19554 + be evicted from memory without detaching it from jnode and
19555 + this requires spin lock on jnode that we already hold.
19556 + */
19557 + if (!TestSetPageLocked(page)) {
19558 + /* We won a lock on jnode page, proceed. */
19559 + break;
19560 + }
19561 +
19562 + /* Page is locked by someone else. */
19563 + page_cache_get(page);
19564 + spin_unlock_jnode(node);
19565 + wait_on_page_locked(page);
19566 + /* it is possible that page was detached from jnode and
19567 + returned to the free pool, or re-assigned while we were
19568 + waiting on locked bit. This will be rechecked on the next
19569 + loop iteration.
19570 + */
19571 + page_cache_release(page);
19572 +
19573 + /* try again */
19574 + }
19575 + return page;
19576 +}
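+
+/*
+ * The loop above is an instance of the generic lock-inversion idiom: the
+ * required order is "page lock, then jnode spin lock", but here the jnode
+ * lock is held first, so the page lock is only try-locked. The generic
+ * shape (names below are generic, not from this file):
+ *
+ *	for (;;) {
+ *		lock(inner);
+ *		if (trylock(outer))
+ *			break;		... got both without deadlock ...
+ *		unlock(inner);
+ *		wait_until_unlocked(outer);
+ *		... re-validate state, then retry ...
+ *	}
+ */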
19577 +
19578 +/*
19579 + * if JNODE_PARSED bit is not set, call ->parse() method of jnode to verify
19580 + * validity of jnode content.
19581 + */
19582 +static inline int jparse(jnode * node)
19583 +{
19584 + int result;
19585 +
19586 + assert("nikita-2466", node != NULL);
19587 +
19588 + spin_lock_jnode(node);
19589 + if (likely(!jnode_is_parsed(node))) {
19590 + result = jnode_ops(node)->parse(node);
19591 + if (likely(result == 0))
19592 + JF_SET(node, JNODE_PARSED);
19593 + } else
19594 + result = 0;
19595 + spin_unlock_jnode(node);
19596 + return result;
19597 +}
19598 +
19599 +/* Lock the page attached to jnode; create and attach a page to the jnode if
19600 + * it had none. */
19601 +static struct page *jnode_get_page_locked(jnode * node, gfp_t gfp_flags)
19602 +{
19603 + struct page *page;
19604 +
19605 + spin_lock_jnode(node);
19606 + page = jnode_page(node);
19607 +
19608 + if (page == NULL) {
19609 + spin_unlock_jnode(node);
19610 + page = find_or_create_page(jnode_get_mapping(node),
19611 + jnode_get_index(node), gfp_flags);
19612 + if (page == NULL)
19613 + return ERR_PTR(RETERR(-ENOMEM));
19614 + } else {
19615 + if (!TestSetPageLocked(page)) {
19616 + spin_unlock_jnode(node);
19617 + return page;
19618 + }
19619 + page_cache_get(page);
19620 + spin_unlock_jnode(node);
19621 + lock_page(page);
19622 + assert("nikita-3134", page->mapping == jnode_get_mapping(node));
19623 + }
19624 +
19625 + spin_lock_jnode(node);
19626 + if (!jnode_page(node))
19627 + jnode_attach_page(node, page);
19628 + spin_unlock_jnode(node);
19629 +
19630 + page_cache_release(page);
19631 + assert("zam-894", jnode_page(node) == page);
19632 + return page;
19633 +}
19634 +
19635 +/* Start read operation for jnode's page if page is not up-to-date. */
19636 +static int jnode_start_read(jnode * node, struct page *page)
19637 +{
19638 + assert("zam-893", PageLocked(page));
19639 +
19640 + if (PageUptodate(page)) {
19641 + unlock_page(page);
19642 + return 0;
19643 + }
19644 + return reiser4_page_io(page, node, READ, reiser4_ctx_gfp_mask_get());
19645 +}
19646 +
19647 +#if REISER4_DEBUG
19648 +static void check_jload(jnode * node, struct page *page)
19649 +{
19650 + if (jnode_is_znode(node)) {
19651 + node40_header *nh;
19652 + znode *z;
19653 +
19654 + z = JZNODE(node);
19655 + if (znode_is_any_locked(z)) {
19656 + nh = (node40_header *) kmap(page);
19657 + /* this only works for node40-only file systems. For
19658 + * debugging. */
19659 + assert("nikita-3253",
19660 + z->nr_items == le16_to_cpu(get_unaligned(&nh->nr_items)));
19661 + kunmap(page);
19662 + }
19663 + assert("nikita-3565", znode_invariant(z));
19664 + }
19665 +}
19666 +#else
19667 +#define check_jload(node, page) noop
19668 +#endif
19669 +
19670 +/* prefetch jnode to speed up next call to jload. Call this when you are going
19671 + * to call jload() shortly. This will bring appropriate portion of jnode into
19672 + * CPU cache. */
19673 +void jload_prefetch(jnode * node)
19674 +{
19675 + prefetchw(&node->x_count);
19676 +}
19677 +
19678 +/* load jnode's data into memory */
19679 +int jload_gfp(jnode * node /* node to load */ ,
19680 + gfp_t gfp_flags /* allocation flags */ ,
19681 + int do_kmap /* true if page should be kmapped */ )
19682 +{
19683 + struct page *page;
19684 + int result = 0;
19685 + int parsed;
19686 +
19687 + assert("nikita-3010", reiser4_schedulable());
19688 +
19689 + prefetchw(&node->pg);
19690 +
19691 + /* taking d-reference implies taking x-reference. */
19692 + jref(node);
19693 +
19694 + /*
19695 + * acquiring d-reference to @jnode and check for JNODE_PARSED bit
19696 + * should be atomic, otherwise there is a race against
19697 + * reiser4_releasepage().
19698 + */
19699 + spin_lock(&(node->load));
19700 + add_d_ref(node);
19701 + parsed = jnode_is_parsed(node);
19702 + spin_unlock(&(node->load));
19703 +
19704 + if (unlikely(!parsed)) {
19705 + page = jnode_get_page_locked(node, gfp_flags);
19706 + if (unlikely(IS_ERR(page))) {
19707 + result = PTR_ERR(page);
19708 + goto failed;
19709 + }
19710 +
19711 + result = jnode_start_read(node, page);
19712 + if (unlikely(result != 0))
19713 + goto failed;
19714 +
19715 + wait_on_page_locked(page);
19716 + if (unlikely(!PageUptodate(page))) {
19717 + result = RETERR(-EIO);
19718 + goto failed;
19719 + }
19720 +
19721 + if (do_kmap)
19722 + node->data = kmap(page);
19723 +
19724 + result = jparse(node);
19725 + if (unlikely(result != 0)) {
19726 + if (do_kmap)
19727 + kunmap(page);
19728 + goto failed;
19729 + }
19730 + check_jload(node, page);
19731 + } else {
19732 + page = jnode_page(node);
19733 + check_jload(node, page);
19734 + if (do_kmap)
19735 + node->data = kmap(page);
19736 + }
19737 +
19738 + if (!is_writeout_mode())
19739 +		/* We do not mark pages active if jload is called as a part of
19740 +		 * jnode_flush() or reiser4_write_logs(). Both jnode_flush()
19741 +		 * and write_logs() add no value to cached data; there is no
19742 +		 * sense in marking pages as active when they go to disk, it
19743 +		 * just confuses vm scanning routines because a clean page
19744 +		 * could be moved out from the inactive list as a result of
19745 +		 * this mark_page_accessed() call. */
19746 + mark_page_accessed(page);
19747 +
19748 + return 0;
19749 +
19750 + failed:
19751 + jrelse_tail(node);
19752 + return result;
19753 +
19754 +}
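+
+/*
+ * Typical call sequence (illustrative sketch; jload() is assumed to be the
+ * inline wrapper around jload_gfp() declared in jnode.h):
+ *
+ *	result = jload(node);
+ *	if (result == 0) {
+ *		... access node->data, pinned and kmapped by jload() ...
+ *		jrelse(node);
+ *	}
+ */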
19755 +
19756 +/* start asynchronous reading for given jnode's page. */
19757 +int jstartio(jnode * node)
19758 +{
19759 + struct page *page;
19760 +
19761 + page = jnode_get_page_locked(node, reiser4_ctx_gfp_mask_get());
19762 + if (IS_ERR(page))
19763 + return PTR_ERR(page);
19764 +
19765 + return jnode_start_read(node, page);
19766 +}
19767 +
19768 +/* Initialize a node by calling appropriate plugin instead of reading
19769 + * node from disk as in jload(). */
19770 +int jinit_new(jnode * node, gfp_t gfp_flags)
19771 +{
19772 + struct page *page;
19773 + int result;
19774 +
19775 + jref(node);
19776 + add_d_ref(node);
19777 +
19778 + page = jnode_get_page_locked(node, gfp_flags);
19779 + if (IS_ERR(page)) {
19780 + result = PTR_ERR(page);
19781 + goto failed;
19782 + }
19783 +
19784 + SetPageUptodate(page);
19785 + unlock_page(page);
19786 +
19787 + node->data = kmap(page);
19788 +
19789 + if (!jnode_is_parsed(node)) {
19790 + jnode_plugin *jplug = jnode_ops(node);
19791 + spin_lock_jnode(node);
19792 + result = jplug->init(node);
19793 + spin_unlock_jnode(node);
19794 + if (result) {
19795 + kunmap(page);
19796 + goto failed;
19797 + }
19798 + JF_SET(node, JNODE_PARSED);
19799 + }
19800 +
19801 + return 0;
19802 +
19803 + failed:
19804 + jrelse(node);
19805 + return result;
19806 +}
19807 +
19808 +/* release a reference to jnode acquired by jload(), decrement ->d_count */
19809 +void jrelse_tail(jnode * node /* jnode to release references to */ )
19810 +{
19811 + assert("nikita-489", atomic_read(&node->d_count) > 0);
19812 + atomic_dec(&node->d_count);
19813 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
19814 + LOCK_CNT_DEC(d_refs);
19815 + /* release reference acquired in jload_gfp() or jinit_new() */
19816 + jput(node);
19817 +}
19818 +
19819 +/* drop reference to node data. When last reference is dropped, data are
19820 + unloaded. */
19821 +void jrelse(jnode * node /* jnode to release references to */ )
19822 +{
19823 + struct page *page;
19824 +
19825 + assert("nikita-487", node != NULL);
19826 + assert_spin_not_locked(&(node->guard));
19827 +
19828 + page = jnode_page(node);
19829 + if (likely(page != NULL)) {
19830 + /*
19831 + * it is safe not to lock jnode here, because at this point
19832 + * @node->d_count is greater than zero (if jrelse() is used
19833 + * correctly, that is). JNODE_PARSED may be not set yet, if,
19834 + * for example, we got here as a result of error handling path
19835 + * in jload(). Anyway, page cannot be detached by
19836 + * reiser4_releasepage(). truncate will invalidate page
19837 + * regardless, but this should not be a problem.
19838 + */
19839 + kunmap(page);
19840 + }
19841 + jrelse_tail(node);
19842 +}
19843 +
19844 +/* called from jput() to wait for io completion */
19845 +static void jnode_finish_io(jnode * node)
19846 +{
19847 + struct page *page;
19848 +
19849 + assert("nikita-2922", node != NULL);
19850 +
19851 + spin_lock_jnode(node);
19852 + page = jnode_page(node);
19853 + if (page != NULL) {
19854 + page_cache_get(page);
19855 + spin_unlock_jnode(node);
19856 + wait_on_page_writeback(page);
19857 + page_cache_release(page);
19858 + } else
19859 + spin_unlock_jnode(node);
19860 +}
19861 +
19862 +/*
19863 + * This is called by jput() when last reference to jnode is released. This is
19864 + * separate function, because we want fast path of jput() to be inline and,
19865 + * therefore, small.
19866 + */
19867 +void jput_final(jnode * node)
19868 +{
19869 + int r_i_p;
19870 +
19871 + /* A fast check for keeping node in cache. We always keep node in cache
19872 + * if its page is present and node was not marked for deletion */
19873 + if (jnode_page(node) != NULL && !JF_ISSET(node, JNODE_HEARD_BANSHEE)) {
19874 + rcu_read_unlock();
19875 + return;
19876 + }
19877 + assert("edward-1432", node->page_count == 0);
19878 +
19879 + r_i_p = !JF_TEST_AND_SET(node, JNODE_RIP);
19880 + /*
19881 + * if r_i_p is true, we were first to set JNODE_RIP on this node. In
19882 + * this case it is safe to access node after unlock.
19883 + */
19884 + rcu_read_unlock();
19885 + if (r_i_p) {
19886 + jnode_finish_io(node);
19887 + if (JF_ISSET(node, JNODE_HEARD_BANSHEE))
19888 + /* node is removed from the tree. */
19889 + jdelete(node);
19890 + else
19891 + jnode_try_drop(node);
19892 + }
19893 + /* if !r_i_p some other thread is already killing it */
19894 +}
19895 +
19896 +int jwait_io(jnode * node, int rw)
19897 +{
19898 + struct page *page;
19899 + int result;
19900 +
19901 + assert("zam-447", node != NULL);
19902 + assert("zam-448", jnode_page(node) != NULL);
19903 +
19904 + page = jnode_page(node);
19905 +
19906 + result = 0;
19907 + if (rw == READ) {
19908 + wait_on_page_locked(page);
19909 + } else {
19910 + assert("nikita-2227", rw == WRITE);
19911 + wait_on_page_writeback(page);
19912 + }
19913 + if (PageError(page))
19914 + result = RETERR(-EIO);
19915 +
19916 + return result;
19917 +}
19918 +
19919 +/*
19920 + * jnode types and plugins.
19921 + *
19922 + * jnode by itself is a "base type". There are several different jnode
19923 + * flavors, called "jnode types" (see jnode_type for a list). Sometimes code
19924 + * has to do different things based on jnode type. In the standard reiser4 way
19925 + * this is done by having jnode plugin (see fs/reiser4/plugin.h:jnode_plugin).
19926 + *
19927 + * Functions below deal with jnode types and define methods of jnode plugin.
19928 + *
19929 + */
19930 +
19931 +/* set jnode type. This is done during jnode initialization. */
19932 +static void jnode_set_type(jnode * node, jnode_type type)
19933 +{
19934 + static unsigned long type_to_mask[] = {
19935 + [JNODE_UNFORMATTED_BLOCK] = 1,
19936 + [JNODE_FORMATTED_BLOCK] = 0,
19937 + [JNODE_BITMAP] = 2,
19938 + [JNODE_IO_HEAD] = 6,
19939 + [JNODE_INODE] = 4
19940 + };
19941 +
19942 + assert("zam-647", type < LAST_JNODE_TYPE);
19943 + assert("nikita-2815", !jnode_is_loaded(node));
19944 + assert("nikita-3386", node->state == 0);
19945 +
19946 + node->state |= (type_to_mask[type] << JNODE_TYPE_1);
19947 +}
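+
+/*
+ * The masks above pack the jnode type into the three state bits
+ * JNODE_TYPE_1..JNODE_TYPE_3. A getter (jnode_get_type(), declared in
+ * jnode.h) is then assumed to extract the 3-bit field and map the mask
+ * value back to the enum:
+ *
+ *	bits = (node->state >> JNODE_TYPE_1) & 7;
+ *	0 -> JNODE_FORMATTED_BLOCK,   1 -> JNODE_UNFORMATTED_BLOCK,
+ *	2 -> JNODE_BITMAP,            4 -> JNODE_INODE,
+ *	6 -> JNODE_IO_HEAD
+ */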
19948 +
19949 +/* ->init() method of jnode plugin for jnodes that don't require plugin
19950 + * specific initialization. */
19951 +static int init_noinit(jnode * node UNUSED_ARG)
19952 +{
19953 + return 0;
19954 +}
19955 +
19956 +/* ->parse() method of jnode plugin for jnodes that don't require plugin
19957 + * specific parsing. */
19958 +static int parse_noparse(jnode * node UNUSED_ARG)
19959 +{
19960 + return 0;
19961 +}
19962 +
19963 +/* ->mapping() method for unformatted jnode */
19964 +struct address_space *mapping_jnode(const jnode * node)
19965 +{
19966 + struct address_space *map;
19967 +
19968 + assert("nikita-2713", node != NULL);
19969 +
19970 + /* mapping is stored in jnode */
19971 +
19972 + map = node->key.j.mapping;
19973 + assert("nikita-2714", map != NULL);
19974 + assert("nikita-2897", is_reiser4_inode(map->host));
19975 + assert("nikita-2715", get_inode_oid(map->host) == node->key.j.objectid);
19976 + return map;
19977 +}
19978 +
19979 +/* ->index() method for unformatted jnodes */
19980 +unsigned long index_jnode(const jnode * node)
19981 +{
19982 + /* index is stored in jnode */
19983 + return node->key.j.index;
19984 +}
19985 +
19986 +/* ->remove() method for unformatted jnodes */
19987 +static inline void remove_jnode(jnode * node, reiser4_tree * tree)
19988 +{
19989 + /* remove jnode from hash table and radix tree */
19990 + if (node->key.j.mapping)
19991 + unhash_unformatted_node_nolock(node);
19992 +}
19993 +
19994 +/* ->mapping() method for znodes */
19995 +static struct address_space *mapping_znode(const jnode * node)
19996 +{
19997 + /* all znodes belong to fake inode */
19998 + return reiser4_get_super_fake(jnode_get_tree(node)->super)->i_mapping;
19999 +}
20000 +
20001 +/* ->index() method for znodes */
20002 +static unsigned long index_znode(const jnode * node)
20003 +{
20004 + unsigned long addr;
20005 + assert("nikita-3317", (1 << znode_shift_order) < sizeof(znode));
20006 +
20007 + /* index of znode is just its address (shifted) */
20008 + addr = (unsigned long)node;
20009 + return (addr - PAGE_OFFSET) >> znode_shift_order;
20010 +}
20011 +
20012 +/* ->mapping() method for bitmap jnode */
20013 +static struct address_space *mapping_bitmap(const jnode * node)
20014 +{
20015 + /* all bitmap blocks belong to special bitmap inode */
20016 + return get_super_private(jnode_get_tree(node)->super)->bitmap->
20017 + i_mapping;
20018 +}
20019 +
20020 +/* ->index() method for jnodes that are indexed by address */
20021 +static unsigned long index_is_address(const jnode * node)
20022 +{
20023 + unsigned long ind;
20024 +
20025 + ind = (unsigned long)node;
20026 + return ind - PAGE_OFFSET;
20027 +}
20028 +
20029 +/* resolve race with jput */
20030 +jnode *jnode_rip_sync(reiser4_tree *tree, jnode *node)
20031 +{
20032 + /*
20033 + * This is used as part of RCU-based jnode handling.
20034 + *
20035 + * jlookup(), zlook(), zget(), and cbk_cache_scan_slots() have to work
20036 + * with unreferenced jnodes (ones with ->x_count == 0). Hash table is
20037 + * not protected during this, so concurrent thread may execute
20038 + * zget-set-HEARD_BANSHEE-zput, or somehow else cause jnode to be
20039 + * freed in jput_final(). To avoid such races, jput_final() sets
20040 + * JNODE_RIP on jnode (under tree lock). All places that work with
20041 + * unreferenced jnodes call this function. It checks for JNODE_RIP bit
20042 +	 * (first without taking tree lock), and if this bit is set, releases
20043 +	 * the reference acquired by the current thread and returns NULL.
20044 + *
20045 + * As a result, if jnode is being concurrently freed, NULL is returned
20046 + * and caller should pretend that jnode wasn't found in the first
20047 + * place.
20048 + *
20049 + * Otherwise it's safe to release "rcu-read-lock" and continue with
20050 + * jnode.
20051 + */
20052 + if (unlikely(JF_ISSET(node, JNODE_RIP))) {
20053 + read_lock_tree(tree);
20054 + if (JF_ISSET(node, JNODE_RIP)) {
20055 + dec_x_ref(node);
20056 + node = NULL;
20057 + }
20058 + read_unlock_tree(tree);
20059 + }
20060 + return node;
20061 +}
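+
+/*
+ * Canonical caller-side pattern (illustrative sketch; jnode_rip_check()
+ * used by jlookup() above is assumed to be a thin wrapper around this
+ * function):
+ *
+ *	rcu_read_lock();
+ *	node = ...look up in hash table or radix tree...;
+ *	if (node != NULL) {
+ *		jref(node);
+ *		node = jnode_rip_sync(tree, node);
+ *	}
+ *	rcu_read_unlock();
+ */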
20062 +
20063 +reiser4_key *jnode_build_key(const jnode * node, reiser4_key * key)
20064 +{
20065 + struct inode *inode;
20066 + item_plugin *iplug;
20067 + loff_t off;
20068 +
20069 + assert("nikita-3092", node != NULL);
20070 + assert("nikita-3093", key != NULL);
20071 + assert("nikita-3094", jnode_is_unformatted(node));
20072 +
20073 + off = ((loff_t) index_jnode(node)) << PAGE_CACHE_SHIFT;
20074 + inode = mapping_jnode(node)->host;
20075 +
20076 + if (node->parent_item_id != 0)
20077 + iplug = item_plugin_by_id(node->parent_item_id);
20078 + else
20079 + iplug = NULL;
20080 +
20081 + if (iplug != NULL && iplug->f.key_by_offset)
20082 + iplug->f.key_by_offset(inode, off, key);
20083 + else {
20084 + file_plugin *fplug;
20085 +
20086 + fplug = inode_file_plugin(inode);
20087 + assert("zam-1007", fplug != NULL);
20088 + assert("zam-1008", fplug->key_by_inode != NULL);
20089 +
20090 + fplug->key_by_inode(inode, off, key);
20091 + }
20092 +
20093 + return key;
20094 +}
20095 +
20096 +/* ->parse() method for formatted nodes */
20097 +static int parse_znode(jnode * node)
20098 +{
20099 + return zparse(JZNODE(node));
20100 +}
20101 +
20102 +/* ->delete() method for formatted nodes */
20103 +static void delete_znode(jnode * node, reiser4_tree * tree)
20104 +{
20105 + znode *z;
20106 +
20107 + assert_rw_write_locked(&(tree->tree_lock));
20108 + assert("vs-898", JF_ISSET(node, JNODE_HEARD_BANSHEE));
20109 +
20110 + z = JZNODE(node);
20111 + assert("vs-899", z->c_count == 0);
20112 +
20113 + /* delete znode from sibling list. */
20114 + sibling_list_remove(z);
20115 +
20116 + znode_remove(z, tree);
20117 +}
20118 +
20119 +/* ->remove() method for formatted nodes */
20120 +static int remove_znode(jnode * node, reiser4_tree * tree)
20121 +{
20122 + znode *z;
20123 +
20124 + assert_rw_write_locked(&(tree->tree_lock));
20125 + z = JZNODE(node);
20126 +
20127 + if (z->c_count == 0) {
20128 + /* detach znode from sibling list. */
20129 + sibling_list_drop(z);
20130 + /* this is called with tree spin-lock held, so call
20131 + znode_remove() directly (rather than znode_lock_remove()). */
20132 + znode_remove(z, tree);
20133 + return 0;
20134 + }
20135 + return RETERR(-EBUSY);
20136 +}
20137 +
20138 +/* ->init() method for formatted nodes */
20139 +static int init_znode(jnode * node)
20140 +{
20141 + znode *z;
20142 +
20143 + z = JZNODE(node);
20144 + /* call node plugin to do actual initialization */
20145 + return z->nplug->init(z);
20146 +}
20147 +
20148 +/* ->clone() method for formatted nodes */
20149 +static jnode *clone_formatted(jnode * node)
20150 +{
20151 + znode *clone;
20152 +
20153 + assert("vs-1430", jnode_is_znode(node));
20154 + clone = zalloc(reiser4_ctx_gfp_mask_get());
20155 + if (clone == NULL)
20156 + return ERR_PTR(RETERR(-ENOMEM));
20157 + zinit(clone, NULL, current_tree);
20158 + jnode_set_block(ZJNODE(clone), jnode_get_block(node));
20159 + /* ZJNODE(clone)->key.z is not initialized */
20160 + clone->level = JZNODE(node)->level;
20161 +
20162 + return ZJNODE(clone);
20163 +}
20164 +
20165 +/* jplug->clone for unformatted nodes */
20166 +static jnode *clone_unformatted(jnode * node)
20167 +{
20168 + jnode *clone;
20169 +
20170 + assert("vs-1431", jnode_is_unformatted(node));
20171 + clone = jalloc();
20172 + if (clone == NULL)
20173 + return ERR_PTR(RETERR(-ENOMEM));
20174 +
20175 + jnode_init(clone, current_tree, JNODE_UNFORMATTED_BLOCK);
20176 + jnode_set_block(clone, jnode_get_block(node));
20177 +
20178 + return clone;
20179 +
20180 +}
20181 +
20182 +/*
20183 + * Setup jnode plugin methods for various jnode types.
20184 + */
20185 +jnode_plugin jnode_plugins[LAST_JNODE_TYPE] = {
20186 + [JNODE_UNFORMATTED_BLOCK] = {
20187 + .h = {
20188 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20189 + .id = JNODE_UNFORMATTED_BLOCK,
20190 + .pops = NULL,
20191 + .label = "unformatted",
20192 + .desc = "unformatted node",
20193 + .linkage = {NULL, NULL}
20194 + },
20195 + .init = init_noinit,
20196 + .parse = parse_noparse,
20197 + .mapping = mapping_jnode,
20198 + .index = index_jnode,
20199 + .clone = clone_unformatted
20200 + },
20201 + [JNODE_FORMATTED_BLOCK] = {
20202 + .h = {
20203 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20204 + .id = JNODE_FORMATTED_BLOCK,
20205 + .pops = NULL,
20206 + .label = "formatted",
20207 + .desc = "formatted tree node",
20208 + .linkage = {NULL, NULL}
20209 + },
20210 + .init = init_znode,
20211 + .parse = parse_znode,
20212 + .mapping = mapping_znode,
20213 + .index = index_znode,
20214 + .clone = clone_formatted
20215 + },
20216 + [JNODE_BITMAP] = {
20217 + .h = {
20218 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20219 + .id = JNODE_BITMAP,
20220 + .pops = NULL,
20221 + .label = "bitmap",
20222 + .desc = "bitmap node",
20223 + .linkage = {NULL, NULL}
20224 + },
20225 + .init = init_noinit,
20226 + .parse = parse_noparse,
20227 + .mapping = mapping_bitmap,
20228 + .index = index_is_address,
20229 + .clone = NULL
20230 + },
20231 + [JNODE_IO_HEAD] = {
20232 + .h = {
20233 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20234 + .id = JNODE_IO_HEAD,
20235 + .pops = NULL,
20236 + .label = "io head",
20237 + .desc = "io head",
20238 + .linkage = {NULL, NULL}
20239 + },
20240 + .init = init_noinit,
20241 + .parse = parse_noparse,
20242 + .mapping = mapping_bitmap,
20243 + .index = index_is_address,
20244 + .clone = NULL
20245 + },
20246 + [JNODE_INODE] = {
20247 + .h = {
20248 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
20249 + .id = JNODE_INODE,
20250 + .pops = NULL,
20251 + .label = "inode",
20252 + .desc = "inode's builtin jnode",
20253 + .linkage = {NULL, NULL}
20254 + },
20255 + .init = NULL,
20256 + .parse = NULL,
20257 + .mapping = NULL,
20258 + .index = NULL,
20259 + .clone = NULL
20260 + }
20261 +};
20262 +
20263 +/*
20264 + * jnode destruction.
20265 + *
20266 + * Thread may use a jnode after it acquired a reference to it. References are
20267 + * counted in ->x_count field. Reference protects jnode from being
20268 + * recycled. This is different from protecting jnode data (that are stored in
20269 + * jnode page) from being evicted from memory. Data are protected by jload()
20270 + * and released by jrelse().
20271 + *
20272 + * If thread already possesses a reference to the jnode it can acquire another
20273 + * one through jref(). Initial reference is obtained (usually) by locating
20274 + * jnode in some indexing structure that depends on jnode type: formatted
20275 + * nodes are kept in global hash table, where they are indexed by block
20276 + * number, and also in the cbk cache. Unformatted jnodes are also kept in hash
20277 + * table, which is indexed by oid and offset within file, and in per-inode
20278 + * radix tree.
20279 + *
20280 + * Reference to jnode is released by jput(). If last reference is released,
20281 + * jput_final() is called. This function determines whether jnode has to be
20282 + * deleted (this happens when corresponding node is removed from the file
20283 + * system, jnode is marked with JNODE_HEARD_BANSHEE bit in this case), or it
20284 + * should be just "removed" (deleted from memory).
20285 + *
20286 + * Jnode destruction is a signally delicate dance because of locking and RCU.
20287 + */
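+
+/*
+ * Summary sketch of the two counters described above (illustrative only):
+ *
+ *	node = jlookup(...);	x-reference: struct jnode cannot be recycled
+ *	jload(node);		d-reference: node data pinned in memory
+ *	... access node->data ...
+ *	jrelse(node);		drop the d-reference
+ *	jput(node);		drop the x-reference; the last jput() ends up
+ *				in jput_final() above
+ */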
20288 +
20289 +/*
20290 + * Returns true if jnode cannot be removed right now. This check is called
20291 + * under tree lock. If it returns false, the jnode is irrevocably committed
20292 + * to be deleted/removed.
20293 + */
20294 +static inline int jnode_is_busy(const jnode * node, jnode_type jtype)
20295 +{
20296 + /* if other thread managed to acquire a reference to this jnode, don't
20297 + * free it. */
20298 + if (atomic_read(&node->x_count) > 0)
20299 + return 1;
20300 + /* also, don't free znode that has children in memory */
20301 + if (jtype == JNODE_FORMATTED_BLOCK && JZNODE(node)->c_count > 0)
20302 + return 1;
20303 + return 0;
20304 +}
20305 +
20306 +/*
20307 + * this is called as part of removing jnode. Based on jnode type, call
20308 + * corresponding function that removes jnode from indices and returns it back
20309 + * to the appropriate slab (through RCU).
20310 + */
20311 +static inline void
20312 +jnode_remove(jnode * node, jnode_type jtype, reiser4_tree * tree)
20313 +{
20314 + switch (jtype) {
20315 + case JNODE_UNFORMATTED_BLOCK:
20316 + remove_jnode(node, tree);
20317 + break;
20318 + case JNODE_IO_HEAD:
20319 + case JNODE_BITMAP:
20320 + break;
20321 + case JNODE_INODE:
20322 + break;
20323 + case JNODE_FORMATTED_BLOCK:
20324 + remove_znode(node, tree);
20325 + break;
20326 + default:
20327 + wrong_return_value("nikita-3196", "Wrong jnode type");
20328 + }
20329 +}
20330 +
20331 +/*
20332 + * this is called as part of deleting jnode. Based on jnode type, call
20333 + * corresponding function that removes jnode from indices and returns it back
20334 + * to the appropriate slab (through RCU).
20335 + *
20336 + * This differs from jnode_remove() only for formatted nodes---for them
20337 + * sibling list handling is different for removal and deletion.
20338 + */
20339 +static inline void
20340 +jnode_delete(jnode * node, jnode_type jtype, reiser4_tree * tree UNUSED_ARG)
20341 +{
20342 + switch (jtype) {
20343 + case JNODE_UNFORMATTED_BLOCK:
20344 + remove_jnode(node, tree);
20345 + break;
20346 + case JNODE_IO_HEAD:
20347 + case JNODE_BITMAP:
20348 + break;
20349 + case JNODE_FORMATTED_BLOCK:
20350 + delete_znode(node, tree);
20351 + break;
20352 + case JNODE_INODE:
20353 + default:
20354 + wrong_return_value("nikita-3195", "Wrong jnode type");
20355 + }
20356 +}
20357 +
20358 +#if REISER4_DEBUG
20359 +/*
20360 + * remove jnode from the debugging list of all jnodes hanging off super-block.
20361 + */
20362 +void jnode_list_remove(jnode * node)
20363 +{
20364 + reiser4_super_info_data *sbinfo;
20365 +
20366 + sbinfo = get_super_private(jnode_get_tree(node)->super);
20367 +
20368 + spin_lock_irq(&sbinfo->all_guard);
20369 + assert("nikita-2422", !list_empty(&node->jnodes));
20370 + list_del_init(&node->jnodes);
20371 + spin_unlock_irq(&sbinfo->all_guard);
20372 +}
20373 +#endif
20374 +
20375 +/*
20376 + * this is called by jput_final() to remove jnode when last reference to it is
20377 + * released.
20378 + */
20379 +static int jnode_try_drop(jnode * node)
20380 +{
20381 + int result;
20382 + reiser4_tree *tree;
20383 + jnode_type jtype;
20384 +
20385 + assert("nikita-2491", node != NULL);
20386 + assert("nikita-2583", JF_ISSET(node, JNODE_RIP));
20387 +
20388 + tree = jnode_get_tree(node);
20389 + jtype = jnode_get_type(node);
20390 +
20391 + spin_lock_jnode(node);
20392 + write_lock_tree(tree);
20393 + /*
20394 + * if jnode has a page---leave it alone. Memory pressure will
20395 + * eventually kill page and jnode.
20396 + */
20397 + if (jnode_page(node) != NULL) {
20398 + write_unlock_tree(tree);
20399 + spin_unlock_jnode(node);
20400 + JF_CLR(node, JNODE_RIP);
20401 + return RETERR(-EBUSY);
20402 + }
20403 +
20404 + /* re-check ->x_count under tree lock. */
20405 + result = jnode_is_busy(node, jtype);
20406 + if (result == 0) {
20407 + assert("nikita-2582", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20408 + assert("jmacd-511/b", atomic_read(&node->d_count) == 0);
20409 +
20410 + spin_unlock_jnode(node);
20411 + /* no page and no references---despatch him. */
20412 + jnode_remove(node, jtype, tree);
20413 + write_unlock_tree(tree);
20414 + jnode_free(node, jtype);
20415 + } else {
20416 + /* busy check failed: reference was acquired by concurrent
20417 + * thread. */
20418 + write_unlock_tree(tree);
20419 + spin_unlock_jnode(node);
20420 + JF_CLR(node, JNODE_RIP);
20421 + }
20422 + return result;
20423 +}
20424 +
20425 +/* jdelete() -- Delete jnode from the tree and file system */
20426 +static int jdelete(jnode * node /* jnode to finish with */ )
20427 +{
20428 + struct page *page;
20429 + int result;
20430 + reiser4_tree *tree;
20431 + jnode_type jtype;
20432 +
20433 + assert("nikita-467", node != NULL);
20434 + assert("nikita-2531", JF_ISSET(node, JNODE_RIP));
20435 +
20436 + jtype = jnode_get_type(node);
20437 +
20438 + page = jnode_lock_page(node);
20439 + assert_spin_locked(&(node->guard));
20440 +
20441 + tree = jnode_get_tree(node);
20442 +
20443 + write_lock_tree(tree);
20444 + /* re-check ->x_count under tree lock. */
20445 + result = jnode_is_busy(node, jtype);
20446 + if (likely(!result)) {
20447 + assert("nikita-2123", JF_ISSET(node, JNODE_HEARD_BANSHEE));
20448 + assert("jmacd-511", atomic_read(&node->d_count) == 0);
20449 +
20450 + /* detach page */
20451 + if (page != NULL) {
20452 + /*
20453 + * FIXME this is racy against jnode_extent_write().
20454 + */
20455 + page_clear_jnode(page, node);
20456 + }
20457 + spin_unlock_jnode(node);
20458 + /* goodbye */
20459 + jnode_delete(node, jtype, tree);
20460 + write_unlock_tree(tree);
20461 + jnode_free(node, jtype);
20462 + /* @node is no longer valid pointer */
20463 + if (page != NULL)
20464 + reiser4_drop_page(page);
20465 + } else {
20466 + /* busy check failed: reference was acquired by concurrent
20467 + * thread. */
20468 + JF_CLR(node, JNODE_RIP);
20469 + write_unlock_tree(tree);
20470 + spin_unlock_jnode(node);
20471 + if (page != NULL)
20472 + unlock_page(page);
20473 + }
20474 + return result;
20475 +}
20476 +
20477 +/* drop jnode on the floor.
20478 +
20479 + Return value:
20480 +
20481 + -EBUSY: failed to drop jnode, because there are still references to it
20482 +
20483 + 0: successfully dropped jnode
20484 +
20485 +*/
20486 +static int jdrop_in_tree(jnode * node, reiser4_tree * tree)
20487 +{
20488 + struct page *page;
20489 + jnode_type jtype;
20490 + int result;
20491 +
20492 + assert("zam-602", node != NULL);
20493 + assert_rw_not_read_locked(&(tree->tree_lock));
20494 + assert_rw_not_write_locked(&(tree->tree_lock));
20495 + assert("nikita-2403", !JF_ISSET(node, JNODE_HEARD_BANSHEE));
20496 +
20497 + jtype = jnode_get_type(node);
20498 +
20499 + page = jnode_lock_page(node);
20500 + assert_spin_locked(&(node->guard));
20501 +
20502 + write_lock_tree(tree);
20503 +
20504 + /* re-check ->x_count under tree lock. */
20505 + result = jnode_is_busy(node, jtype);
20506 + if (!result) {
20507 + assert("nikita-2488", page == jnode_page(node));
20508 + assert("nikita-2533", atomic_read(&node->d_count) == 0);
20509 + if (page != NULL) {
20510 + assert("nikita-2126", !PageDirty(page));
20511 + assert("nikita-2127", PageUptodate(page));
20512 + assert("nikita-2181", PageLocked(page));
20513 + page_clear_jnode(page, node);
20514 + }
20515 + spin_unlock_jnode(node);
20516 + jnode_remove(node, jtype, tree);
20517 + write_unlock_tree(tree);
20518 + jnode_free(node, jtype);
20519 + if (page != NULL) {
20520 + reiser4_drop_page(page);
20521 + }
20522 + } else {
20523 + /* busy check failed: reference was acquired by concurrent
20524 + * thread. */
20525 + JF_CLR(node, JNODE_RIP);
20526 + write_unlock_tree(tree);
20527 + spin_unlock_jnode(node);
20528 + if (page != NULL)
20529 + unlock_page(page);
20530 + }
20531 + return result;
20532 +}
20533 +
20534 +/* This function frees jnode "if possible". In particular, [dcx]_count has to
20535 + be 0 (where applicable). */
20536 +void jdrop(jnode * node)
20537 +{
20538 + jdrop_in_tree(node, jnode_get_tree(node));
20539 +}
20540 +
20541 +/* IO head jnode implementation. The io heads are simple j-nodes with limited
20542 + functionality (these j-nodes are not in any hash table) just for reading
20543 + from and writing to disk. */
20544 +
20545 +jnode *reiser4_alloc_io_head(const reiser4_block_nr * block)
20546 +{
20547 + jnode *jal = jalloc();
20548 +
20549 +	if (jal != NULL) {
20550 +		jnode_init(jal, current_tree, JNODE_IO_HEAD);
20551 +		jnode_set_block(jal, block);
20552 +		/* take the reference only if allocation succeeded */
20553 +		jref(jal);
20554 +	}
20555 +
20556 + return jal;
20557 +}
20558 +
20559 +void reiser4_drop_io_head(jnode * node)
20560 +{
20561 + assert("zam-648", jnode_get_type(node) == JNODE_IO_HEAD);
20562 +
20563 + jput(node);
20564 + jdrop(node);
20565 +}
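+
+/*
+ * Sketch of the intended io-head life cycle (illustrative only; the
+ * wandering log code is the real user):
+ *
+ *	jnode *io = reiser4_alloc_io_head(&block);
+ *	if (io != NULL) {
+ *		... do i/o against the node's page ...
+ *		reiser4_drop_io_head(io);
+ *	}
+ */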
20566 +
20567 +/* protect jnode data from being freed by reiser4_releasepage() */
20568 +void pin_jnode_data(jnode * node)
20569 +{
20570 + assert("zam-671", jnode_page(node) != NULL);
20571 + page_cache_get(jnode_page(node));
20572 +}
20573 +
20574 +/* make jnode data free-able again */
20575 +void unpin_jnode_data(jnode * node)
20576 +{
20577 + assert("zam-672", jnode_page(node) != NULL);
20578 + page_cache_release(jnode_page(node));
20579 +}
20580 +
20581 +struct address_space *jnode_get_mapping(const jnode * node)
20582 +{
20583 + assert("nikita-3162", node != NULL);
20584 + return jnode_ops(node)->mapping(node);
20585 +}
20586 +
20587 +#if REISER4_DEBUG
20588 +/* debugging aid: jnode invariant */
20589 +int jnode_invariant_f(const jnode * node, char const **msg)
20590 +{
20591 +#define _ergo(ant, con) \
20592 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
20593 +#define _check(exp) ((*msg) = #exp, (exp))
20594 +
20595 + return _check(node != NULL) &&
20596 + /* [jnode-queued] */
20597 + /* only relocated node can be queued, except that when znode
20598 + * is being deleted, its JNODE_RELOC bit is cleared */
20599 + _ergo(JF_ISSET(node, JNODE_FLUSH_QUEUED),
20600 + JF_ISSET(node, JNODE_RELOC) ||
20601 + JF_ISSET(node, JNODE_HEARD_BANSHEE)) &&
20602 + _check(node->jnodes.prev != NULL) &&
20603 + _check(node->jnodes.next != NULL) &&
20604 + /* [jnode-dirty] invariant */
20605 +	    /* dirty jnode is part of an atom */
20606 + _ergo(JF_ISSET(node, JNODE_DIRTY), node->atom != NULL) &&
20607 + /* [jnode-oid] invariant */
20608 + /* for unformatted node ->objectid and ->mapping fields are
20609 + * consistent */
20610 + _ergo(jnode_is_unformatted(node) && node->key.j.mapping != NULL,
20611 + node->key.j.objectid ==
20612 + get_inode_oid(node->key.j.mapping->host)) &&
20613 + /* [jnode-atom-valid] invariant */
20614 + /* node atom has valid state */
20615 + _ergo(node->atom != NULL, node->atom->stage != ASTAGE_INVALID) &&
20616 + /* [jnode-page-binding] invariant */
20617 + /* if node points to page, it points back to node */
20618 + _ergo(node->pg != NULL, jprivate(node->pg) == node) &&
20619 + /* [jnode-refs] invariant */
20620 + /* only referenced jnode can be loaded */
20621 + _check(atomic_read(&node->x_count) >= atomic_read(&node->d_count));
20622 +
20623 +}
20624 +
20625 +static const char *jnode_type_name(jnode_type type)
20626 +{
20627 + switch (type) {
20628 + case JNODE_UNFORMATTED_BLOCK:
20629 + return "unformatted";
20630 + case JNODE_FORMATTED_BLOCK:
20631 + return "formatted";
20632 + case JNODE_BITMAP:
20633 + return "bitmap";
20634 + case JNODE_IO_HEAD:
20635 + return "io head";
20636 + case JNODE_INODE:
20637 + return "inode";
20638 + case LAST_JNODE_TYPE:
20639 + return "last";
20640 + default:{
20641 + static char unknown[30];
20642 +
20643 + sprintf(unknown, "unknown %i", type);
20644 + return unknown;
20645 + }
20646 + }
20647 +}
20648 +
20649 +#define jnode_state_name( node, flag ) \
20650 + ( JF_ISSET( ( node ), ( flag ) ) ? ((#flag "|")+6) : "" )
20651 +
20652 +/* debugging aid: output human readable information about @node */
20653 +static void info_jnode(const char *prefix /* prefix to print */ ,
20654 + const jnode * node /* node to print */ )
20655 +{
20656 + assert("umka-068", prefix != NULL);
20657 +
20658 + if (node == NULL) {
20659 + printk("%s: null\n", prefix);
20660 + return;
20661 + }
20662 +
20663 + printk
20664 + ("%s: %p: state: %lx: [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s], level: %i,"
20665 + " block: %s, d_count: %d, x_count: %d, "
20666 + "pg: %p, atom: %p, lock: %i:%i, type: %s, ", prefix, node,
20667 + node->state,
20668 + jnode_state_name(node, JNODE_PARSED),
20669 + jnode_state_name(node, JNODE_HEARD_BANSHEE),
20670 + jnode_state_name(node, JNODE_LEFT_CONNECTED),
20671 + jnode_state_name(node, JNODE_RIGHT_CONNECTED),
20672 + jnode_state_name(node, JNODE_ORPHAN),
20673 + jnode_state_name(node, JNODE_CREATED),
20674 + jnode_state_name(node, JNODE_RELOC),
20675 + jnode_state_name(node, JNODE_OVRWR),
20676 + jnode_state_name(node, JNODE_DIRTY),
20677 + jnode_state_name(node, JNODE_IS_DYING),
20678 + jnode_state_name(node, JNODE_RIP),
20679 + jnode_state_name(node, JNODE_MISSED_IN_CAPTURE),
20680 + jnode_state_name(node, JNODE_WRITEBACK),
20681 + jnode_state_name(node, JNODE_NEW),
20682 + jnode_state_name(node, JNODE_DKSET),
20683 + jnode_state_name(node, JNODE_REPACK),
20684 + jnode_state_name(node, JNODE_CLUSTER_PAGE),
20685 + jnode_get_level(node), sprint_address(jnode_get_block(node)),
20686 + atomic_read(&node->d_count), atomic_read(&node->x_count),
20687 + jnode_page(node), node->atom, 0, 0,
20688 + jnode_type_name(jnode_get_type(node)));
20689 + if (jnode_is_unformatted(node)) {
20690 + printk("inode: %llu, index: %lu, ",
20691 + node->key.j.objectid, node->key.j.index);
20692 + }
20693 +}
20694 +
20695 +/* debugging aid: check jnode invariant and complain if it doesn't hold */
20696 +static int jnode_invariant(const jnode * node, int tlocked, int jlocked)
20697 +{
20698 + char const *failed_msg;
20699 + int result;
20700 + reiser4_tree *tree;
20701 +
20702 + tree = jnode_get_tree(node);
20703 +
20704 + assert("umka-063312", node != NULL);
20705 + assert("umka-064321", tree != NULL);
20706 +
20707 + if (!jlocked && !tlocked)
20708 + spin_lock_jnode((jnode *) node);
20709 + if (!tlocked)
20710 + read_lock_tree(jnode_get_tree(node));
20711 + result = jnode_invariant_f(node, &failed_msg);
20712 + if (!result) {
20713 + info_jnode("corrupted node", node);
20714 + warning("jmacd-555", "Condition %s failed", failed_msg);
20715 + }
20716 + if (!tlocked)
20717 + read_unlock_tree(jnode_get_tree(node));
20718 + if (!jlocked && !tlocked)
20719 + spin_unlock_jnode((jnode *) node);
20720 + return result;
20721 +}
20722 +
20723 +#endif /* REISER4_DEBUG */
20724 +
20725 +/* Make Linus happy.
20726 + Local variables:
20727 + c-indentation-style: "K&R"
20728 + mode-name: "LC"
20729 + c-basic-offset: 8
20730 + tab-width: 8
20731 + fill-column: 80
20732 + End:
20733 +*/
20734 diff --git a/fs/reiser4/jnode.h b/fs/reiser4/jnode.h
20735 new file mode 100644
20736 index 0000000..c05d88e
20737 --- /dev/null
20738 +++ b/fs/reiser4/jnode.h
20739 @@ -0,0 +1,705 @@
20740 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
20741 + * reiser4/README */
20742 +
20743 +/* Declaration of jnode. See jnode.c for details. */
20744 +
20745 +#ifndef __JNODE_H__
20746 +#define __JNODE_H__
20747 +
20748 +#include "forward.h"
20749 +#include "type_safe_hash.h"
20750 +#include "txnmgr.h"
20751 +#include "key.h"
20752 +#include "debug.h"
20753 +#include "dformat.h"
20754 +#include "page_cache.h"
20755 +#include "context.h"
20756 +
20757 +#include "plugin/plugin.h"
20758 +
20759 +#include <linux/fs.h>
20760 +#include <linux/mm.h>
20761 +#include <linux/spinlock.h>
20762 +#include <asm/atomic.h>
20763 +#include <asm/bitops.h>
20764 +#include <linux/list.h>
20765 +#include <linux/rcupdate.h>
20766 +
20767 +/* declare hash table of jnodes (jnodes proper, that is, unformatted
20768 + nodes) */
20769 +TYPE_SAFE_HASH_DECLARE(j, jnode);
20770 +
20771 +/* declare hash table of znodes */
20772 +TYPE_SAFE_HASH_DECLARE(z, znode);
20773 +
20774 +typedef struct {
20775 + __u64 objectid;
20776 + unsigned long index;
20777 + struct address_space *mapping;
20778 +} jnode_key_t;
20779 +
20780 +/*
20781 +   Jnode is the "base class" of other nodes in reiser4. It also happens to
20782 +   be exactly the node we use for unformatted tree nodes.
20783 +
20784 +   Jnode provides the following basic functionality:
20785 +
20786 + . reference counting and indexing.
20787 +
20788 + . integration with page cache. Jnode has ->pg reference to which page can
20789 + be attached.
20790 +
20791 + . interface to transaction manager. It is jnode that is kept in transaction
20792 + manager lists, attached to atoms, etc. (NOTE-NIKITA one may argue that this
20793 +  means there should be a special type of jnode for inode.)
20794 +
20795 + Locking:
20796 +
20797 + Spin lock: the following fields are protected by the per-jnode spin lock:
20798 +
20799 + ->state
20800 + ->atom
20801 + ->capture_link
20802 +
20803 + Following fields are protected by the global tree lock:
20804 +
20805 + ->link
20806 + ->key.z (content of ->key.z is only changed in znode_rehash())
20807 + ->key.j
20808 +
20809 + Atomic counters
20810 +
20811 + ->x_count
20812 + ->d_count
20813 +
20814 + ->pg, and ->data are protected by spin lock for unused jnode and are
20815 + immutable for used jnode (one for which fs/reiser4/vfs_ops.c:releasable()
20816 + is false).
20817 +
20818 + ->tree is immutable after creation
20819 +
20820 + Unclear
20821 +
20822 + ->blocknr: should be under jnode spin-lock, but current interface is based
20823 + on passing of block address.
20824 +
20825 + If you ever need to spin lock two nodes at once, do this in "natural"
20826 + memory order: lock znode with lower address first. (See lock_two_nodes().)
20827 +
20828 + Invariants involving this data-type:
20829 +
20830 + [jnode-dirty]
20831 + [jnode-refs]
20832 + [jnode-oid]
20833 + [jnode-queued]
20834 + [jnode-atom-valid]
20835 + [jnode-page-binding]
20836 +*/
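+
+/*
+ * Illustrative sketch of the address-ordering rule above (the real
+ * lock_two_nodes() lives elsewhere in the sources; this only shows the
+ * idea):
+ *
+ *	if ((unsigned long)a < (unsigned long)b) {
+ *		spin_lock_jnode(a);
+ *		spin_lock_jnode(b);
+ *	} else {
+ *		spin_lock_jnode(b);
+ *		spin_lock_jnode(a);
+ *	}
+ */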
20837 +
20838 +struct jnode {
20839 +#if REISER4_DEBUG
20840 +#define JMAGIC 0x52654973 /* "ReIs" */
20841 + int magic;
20842 +#endif
20843 + /* FIRST CACHE LINE (16 bytes): data used by jload */
20844 +
20845 + /* jnode's state: bitwise flags from the reiser4_jnode_state enum. */
20846 + /* 0 */ unsigned long state;
20847 +
20848 + /* lock, protecting jnode's fields. */
20849 + /* 4 */ spinlock_t load;
20850 +
20851 + /* counter of references to jnode itself. Increased on jref().
20852 + Decreased on jput().
20853 + */
20854 + /* 8 */ atomic_t x_count;
20855 +
20856 + /* counter of references to jnode's data. Pin data page(s) in
20857 + memory while this is greater than 0. Increased on jload().
20858 + Decreased on jrelse().
20859 + */
20860 + /* 12 */ atomic_t d_count;
20861 +
20862 + /* SECOND CACHE LINE: data used by hash table lookups */
20863 +
20864 + /* 16 */ union {
20865 + /* znodes are hashed by block number */
20866 + reiser4_block_nr z;
20867 + /* unformatted nodes are hashed by mapping plus offset */
20868 + jnode_key_t j;
20869 + } key;
20870 +
20871 + /* THIRD CACHE LINE */
20872 +
20873 + /* 32 */ union {
20874 + /* pointers to maintain hash-table */
20875 + z_hash_link z;
20876 + j_hash_link j;
20877 + } link;
20878 +
20879 + /* pointer to jnode page. */
20880 + /* 36 */ struct page *pg;
20881 + /* pointer to node itself. This is page_address(node->pg) when page is
20882 + attached to the jnode
20883 + */
20884 + /* 40 */ void *data;
20885 +
20886 + /* 44 */ reiser4_tree *tree;
20887 +
20888 + /* FOURTH CACHE LINE: atom related fields */
20889 +
20890 + /* 48 */ spinlock_t guard;
20891 +
20892 + /* atom the block is in, if any */
20893 + /* 52 */ txn_atom *atom;
20894 +
20895 + /* capture list */
20896 + /* 56 */ struct list_head capture_link;
20897 +
20898 + /* FIFTH CACHE LINE */
20899 +
20900 + /* 64 */ struct rcu_head rcu;
20901 + /* crosses cache line */
20902 +
20903 + /* SIXTH CACHE LINE */
20904 +
20905 + /* the real blocknr (where io is going to/from) */
20906 + /* 80 */ reiser4_block_nr blocknr;
20907 +	/* Parent item type; unformatted and CRC nodes need it for offset => key conversion. */
20908 + /* NOTE: this parent_item_id looks like jnode type. */
20909 + /* 88 */ reiser4_plugin_id parent_item_id;
20910 + /* 92 */
20911 +#if REISER4_DEBUG
20912 + /* number of pages referenced by the jnode (meaningful while capturing of
20913 + page clusters) */
20914 + int page_count;
20915 + /* list of all jnodes for debugging purposes. */
20916 + struct list_head jnodes;
20917 + /* how many times this jnode was written in one transaction */
20918 + int written;
20919 + /* this indicates which atom's list the jnode is on */
20920 + atom_list list;
20921 +#endif
20922 +} __attribute__ ((aligned(16)));
20923 +
20924 +/*
20925 + * jnode types. Enumeration of existing jnode types.
20926 + */
20927 +typedef enum {
20928 + JNODE_UNFORMATTED_BLOCK, /* unformatted block */
20929 + JNODE_FORMATTED_BLOCK, /* formatted block, znode */
20930 + JNODE_BITMAP, /* bitmap */
20931 + JNODE_IO_HEAD, /* jnode representing a block in the
20932 + * wandering log */
20933 + JNODE_INODE, /* jnode embedded into inode */
20934 + LAST_JNODE_TYPE
20935 +} jnode_type;
20936 +
20937 +/* jnode states */
20938 +typedef enum {
20939 + /* jnode's page is loaded and data checked */
20940 + JNODE_PARSED = 0,
20941 + /* node was deleted, not all locks on it were released. This
20942 + node is empty and is going to be removed from the tree
20943 + shortly. */
20944 + JNODE_HEARD_BANSHEE = 1,
20945 + /* left sibling pointer is valid */
20946 + JNODE_LEFT_CONNECTED = 2,
20947 + /* right sibling pointer is valid */
20948 + JNODE_RIGHT_CONNECTED = 3,
20949 +
20950 + /* znode was just created and doesn't yet have a pointer from
20951 + its parent */
20952 + JNODE_ORPHAN = 4,
20953 +
20954 + /* this node was created by its transaction and has not been assigned
20955 + a block address. */
20956 + JNODE_CREATED = 5,
20957 +
20958 + /* this node is currently relocated */
20959 + JNODE_RELOC = 6,
20960 + /* this node is currently wandered */
20961 + JNODE_OVRWR = 7,
20962 +
20963 + /* this znode has been modified */
20964 + JNODE_DIRTY = 8,
20965 +
20966 + /* znode lock is being invalidated */
20967 + JNODE_IS_DYING = 9,
20968 +
20969 + /* THIS PLACE IS INTENTIONALLY LEFT BLANK */
20970 +
20971 + /* jnode is queued for flushing. */
20972 + JNODE_FLUSH_QUEUED = 12,
20973 +
20974 + /* In the following bits jnode type is encoded. */
20975 + JNODE_TYPE_1 = 13,
20976 + JNODE_TYPE_2 = 14,
20977 + JNODE_TYPE_3 = 15,
20978 +
20979 + /* jnode is being destroyed */
20980 + JNODE_RIP = 16,
20981 +
20982 +	/* znode was not captured during locking (it might be so because
20983 +	   ->level != LEAF_LEVEL and lock_mode == READ_LOCK) */
20984 + JNODE_MISSED_IN_CAPTURE = 17,
20985 +
20986 + /* write is in progress */
20987 + JNODE_WRITEBACK = 18,
20988 +
20989 + /* FIXME: now it is used by crypto-compress plugin only */
20990 + JNODE_NEW = 19,
20991 +
20992 + /* delimiting keys are already set for this znode. */
20993 + JNODE_DKSET = 20,
20994 +
20995 + /* when this bit is set page and jnode can not be disconnected */
20996 + JNODE_WRITE_PREPARED = 21,
20997 +
20998 + JNODE_CLUSTER_PAGE = 22,
20999 +	/* Jnode is marked for repacking, which means the reiser4 flush and the
21000 +	 * block allocator should process this node in a special way */
21001 + JNODE_REPACK = 23,
21002 + /* node should be converted by flush in squalloc phase */
21003 + JNODE_CONVERTIBLE = 24,
21004 + /*
21005 +	 * When a jnode is dirtied for the first time in a given transaction,
21006 +	 * do_jnode_make_dirty() checks whether this jnode can possibly become
21007 +	 * a member of the overwrite set. If so, this bit is set, and one block is
21008 + * reserved in the ->flush_reserved space of atom.
21009 + *
21010 + * This block is "used" (and JNODE_FLUSH_RESERVED bit is cleared) when
21011 + *
21012 + * (1) flush decides that we want this block to go into relocate
21013 + * set after all.
21014 + *
21015 + * (2) wandering log is allocated (by log writer)
21016 + *
21017 + * (3) extent is allocated
21018 + *
21019 + */
21020 + JNODE_FLUSH_RESERVED = 29
21021 +} reiser4_jnode_state;
21022 +
21023 +/* Macros for accessing the jnode state. */
21024 +
21025 +static inline void JF_CLR(jnode * j, int f)
21026 +{
21027 + assert("unknown-1", j->magic == JMAGIC);
21028 + clear_bit(f, &j->state);
21029 +}
21030 +static inline int JF_ISSET(const jnode * j, int f)
21031 +{
21032 + assert("unknown-2", j->magic == JMAGIC);
21033 + return test_bit(f, &((jnode *) j)->state);
21034 +}
21035 +static inline void JF_SET(jnode * j, int f)
21036 +{
21037 + assert("unknown-3", j->magic == JMAGIC);
21038 + set_bit(f, &j->state);
21039 +}
21040 +
21041 +static inline int JF_TEST_AND_SET(jnode * j, int f)
21042 +{
21043 + assert("unknown-4", j->magic == JMAGIC);
21044 + return test_and_set_bit(f, &j->state);
21045 +}
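The state word packs all of the flags above into one unsigned long, manipulated through the kernel's atomic bitops (set_bit/clear_bit/test_bit). A minimal userspace sketch of the same bit-flag technique, non-atomic and for illustration only (names are mine, not from the patch):

#include <assert.h>

/* bit positions copied from the reiser4_jnode_state enum above */
enum { DEMO_RELOC = 6, DEMO_DIRTY = 8 };

static unsigned long state;

static void demo_set(int f)   { state |= 1UL << f; }
static void demo_clear(int f) { state &= ~(1UL << f); }
static int  demo_isset(int f) { return (state >> f) & 1; }

int main(void)
{
	demo_set(DEMO_DIRTY);
	assert(demo_isset(DEMO_DIRTY) && !demo_isset(DEMO_RELOC));
	demo_clear(DEMO_DIRTY);
	assert(state == 0);	/* all flags cleared again */
	return 0;
}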
21046 +
21047 +static inline void spin_lock_jnode(jnode *node)
21048 +{
21049 + /* check that spinlocks of lower priorities are not held */
21050 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
21051 + LOCK_CNT_NIL(spin_locked_txnh) &&
21052 + LOCK_CNT_NIL(spin_locked_zlock) &&
21053 + LOCK_CNT_NIL(rw_locked_dk) &&
21054 + LOCK_CNT_LT(spin_locked_jnode, 2)));
21055 +
21056 + spin_lock(&(node->guard));
21057 +
21058 + LOCK_CNT_INC(spin_locked_jnode);
21059 + LOCK_CNT_INC(spin_locked);
21060 +}
21061 +
21062 +static inline void spin_unlock_jnode(jnode *node)
21063 +{
21064 + assert_spin_locked(&(node->guard));
21065 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_jnode));
21066 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
21067 +
21068 + LOCK_CNT_DEC(spin_locked_jnode);
21069 + LOCK_CNT_DEC(spin_locked);
21070 +
21071 + spin_unlock(&(node->guard));
21072 +}
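The LOCK_CNT_* macros maintain per-context counters that exist only to back these ordering assertions and compile away in non-debug builds. A hedged, single-threaded sketch of the counting idea (the real macros act on per-context state, not globals):

#include <assert.h>

static int spin_locked_jnode;	/* model of one LOCK_CNT counter */
static int spin_locked;		/* model of the total-spinlocks counter */

static void model_lock_jnode(void)
{
	assert(spin_locked_jnode < 2);	/* at most two nested jnode locks */
	/* ... the real spinlock would be acquired here ... */
	spin_locked_jnode++;
	spin_locked++;
}

static void model_unlock_jnode(void)
{
	assert(spin_locked_jnode > 0 && spin_locked > 0);
	spin_locked_jnode--;
	spin_locked--;
	/* ... and released here ... */
}

int main(void)
{
	model_lock_jnode();
	model_unlock_jnode();
	return 0;
}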
21073 +
21074 +static inline int jnode_is_in_deleteset(const jnode * node)
21075 +{
21076 + return JF_ISSET(node, JNODE_RELOC);
21077 +}
21078 +
21079 +extern int init_jnodes(void);
21080 +extern void done_jnodes(void);
21081 +
21082 +/* Jnode routines */
21083 +extern jnode *jalloc(void);
21084 +extern void jfree(jnode * node) NONNULL;
21085 +extern jnode *jclone(jnode *);
21086 +extern jnode *jlookup(reiser4_tree * tree,
21087 + oid_t objectid, unsigned long ind) NONNULL;
21088 +extern jnode *jfind(struct address_space *, unsigned long index) NONNULL;
21089 +extern jnode *jnode_by_page(struct page *pg) NONNULL;
21090 +extern jnode *jnode_of_page(struct page *pg) NONNULL;
21091 +void jnode_attach_page(jnode * node, struct page *pg);
21092 +
21093 +void unhash_unformatted_jnode(jnode *);
21094 +extern jnode *page_next_jnode(jnode * node) NONNULL;
21095 +extern void jnode_init(jnode * node, reiser4_tree * tree, jnode_type) NONNULL;
21096 +extern void jnode_make_dirty(jnode * node) NONNULL;
21097 +extern void jnode_make_clean(jnode * node) NONNULL;
21098 +extern void jnode_make_wander_nolock(jnode * node) NONNULL;
21099 +extern void jnode_make_wander(jnode *) NONNULL;
21100 +extern void znode_make_reloc(znode *, flush_queue_t *) NONNULL;
21101 +extern void unformatted_make_reloc(jnode *, flush_queue_t *) NONNULL;
21102 +extern struct address_space *jnode_get_mapping(const jnode * node) NONNULL;
21103 +
21104 +/**
21105 + * jnode_get_block
21106 + * @node: jnode to query
21107 + *
21108 + */
21109 +static inline const reiser4_block_nr *jnode_get_block(const jnode *node)
21110 +{
21111 + assert("nikita-528", node != NULL);
21112 +
21113 + return &node->blocknr;
21114 +}
21115 +
21116 +/**
21117 + * jnode_set_block
21118 + * @node: jnode to update
21119 + * @blocknr: new block nr
21120 + */
21121 +static inline void jnode_set_block(jnode *node, const reiser4_block_nr *blocknr)
21122 +{
21123 + assert("nikita-2020", node != NULL);
21124 + assert("umka-055", blocknr != NULL);
21125 + node->blocknr = *blocknr;
21126 +}
21127 +
21128 +
21129 +/* block number for IO. Usually this is the same as jnode_get_block(), unless
21130 + * jnode was emergency flushed---then block number chosen by eflush is
21131 + * used. */
21132 +static inline const reiser4_block_nr *jnode_get_io_block(jnode * node)
21133 +{
21134 + assert("nikita-2768", node != NULL);
21135 + assert_spin_locked(&(node->guard));
21136 +
21137 + return jnode_get_block(node);
21138 +}
21139 +
21140 +/* Jnode flush interface. */
21141 +extern reiser4_blocknr_hint *reiser4_pos_hint(flush_pos_t * pos);
21142 +extern flush_queue_t *reiser4_pos_fq(flush_pos_t * pos);
21143 +
21144 +/* FIXME-VS: these are used in plugin/item/extent.c */
21145 +
21146 +/* does extent_get_block have to be called */
21147 +#define jnode_mapped(node) JF_ISSET (node, JNODE_MAPPED)
21148 +#define jnode_set_mapped(node) JF_SET (node, JNODE_MAPPED)
21149 +
21150 +/* the node should be converted during flush squalloc phase */
21151 +#define jnode_convertible(node) JF_ISSET (node, JNODE_CONVERTIBLE)
21152 +#define jnode_set_convertible(node) JF_SET (node, JNODE_CONVERTIBLE)
21153 +
21154 +/* Macros to convert from jnode to znode, znode to jnode. These are macros
21155 + because C doesn't allow overloading of const prototypes. */
21156 +#define ZJNODE(x) (& (x) -> zjnode)
21157 +#define JZNODE(x) \
21158 +({ \
21159 + typeof (x) __tmp_x; \
21160 + \
21161 + __tmp_x = (x); \
21162 + assert ("jmacd-1300", jnode_is_znode (__tmp_x)); \
21163 + (znode*) __tmp_x; \
21164 +})
21165 +
21166 +extern int jnodes_tree_init(reiser4_tree * tree);
21167 +extern int jnodes_tree_done(reiser4_tree * tree);
21168 +
21169 +#if REISER4_DEBUG
21170 +
21171 +extern int znode_is_any_locked(const znode * node);
21172 +extern void jnode_list_remove(jnode * node);
21173 +
21174 +#else
21175 +
21176 +#define jnode_list_remove(node) noop
21177 +
21178 +#endif
21179 +
21180 +int znode_is_root(const znode * node) NONNULL;
21181 +
21182 +/* bump reference counter on @node */
21183 +static inline void add_x_ref(jnode * node /* node to increase x_count of */ )
21184 +{
21185 + assert("nikita-1911", node != NULL);
21186 +
21187 + atomic_inc(&node->x_count);
21188 + LOCK_CNT_INC(x_refs);
21189 +}
21190 +
21191 +static inline void dec_x_ref(jnode * node)
21192 +{
21193 + assert("nikita-3215", node != NULL);
21194 + assert("nikita-3216", atomic_read(&node->x_count) > 0);
21195 +
21196 + atomic_dec(&node->x_count);
21197 + assert("nikita-3217", LOCK_CNT_GTZ(x_refs));
21198 + LOCK_CNT_DEC(x_refs);
21199 +}
21200 +
21201 +/* jref() - increase counter of references to jnode/znode (x_count) */
21202 +static inline jnode *jref(jnode * node)
21203 +{
21204 + assert("jmacd-508", (node != NULL) && !IS_ERR(node));
21205 + add_x_ref(node);
21206 + return node;
21207 +}
21208 +
21209 +/* get the page of jnode */
21210 +static inline struct page *jnode_page(const jnode * node)
21211 +{
21212 + return node->pg;
21213 +}
21214 +
21215 +/* return pointer to jnode data */
21216 +static inline char *jdata(const jnode * node)
21217 +{
21218 + assert("nikita-1415", node != NULL);
21219 + assert("nikita-3198", jnode_page(node) != NULL);
21220 + return node->data;
21221 +}
21222 +
21223 +static inline int jnode_is_loaded(const jnode * node)
21224 +{
21225 + assert("zam-506", node != NULL);
21226 + return atomic_read(&node->d_count) > 0;
21227 +}
21228 +
21229 +extern void page_clear_jnode(struct page *page, jnode * node) NONNULL;
21230 +
21231 +static inline void jnode_set_reloc(jnode * node)
21232 +{
21233 + assert("nikita-2431", node != NULL);
21234 + assert("nikita-2432", !JF_ISSET(node, JNODE_OVRWR));
21235 + JF_SET(node, JNODE_RELOC);
21236 +}
21237 +
21238 +/* jload/jwrite/junload give a bread/bwrite/brelse functionality for jnodes */
21239 +
21240 +extern int jload_gfp(jnode *, gfp_t, int do_kmap) NONNULL;
21241 +
21242 +static inline int jload(jnode *node)
21243 +{
21244 + return jload_gfp(node, reiser4_ctx_gfp_mask_get(), 1);
21245 +}
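A typical consumer mirrors the bread()/brelse() pattern: jload() pins the data and bumps d_count, jdata() gives access while pinned, and jrelse() drops the pin. A sketch using the declarations above; do_something_with() is a hypothetical consumer, and error handling is abbreviated:

/* sketch: access a jnode's data, assuming @node is already x_count-referenced */
static int demo_use_jnode(jnode *node)
{
	int ret;

	ret = jload(node);		/* pins page(s), d_count++ */
	if (ret != 0)
		return ret;
	do_something_with(jdata(node));	/* hypothetical; data stays in memory
					 * while d_count > 0 */
	jrelse(node);			/* d_count--, page may become evictable */
	return 0;
}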
21246 +
21247 +extern int jinit_new(jnode *, gfp_t) NONNULL;
21248 +extern int jstartio(jnode *) NONNULL;
21249 +
21250 +extern void jdrop(jnode *) NONNULL;
21251 +extern int jwait_io(jnode *, int rw) NONNULL;
21252 +
21253 +void jload_prefetch(jnode *);
21254 +
21255 +extern jnode *reiser4_alloc_io_head(const reiser4_block_nr * block) NONNULL;
21256 +extern void reiser4_drop_io_head(jnode * node) NONNULL;
21257 +
21258 +static inline reiser4_tree *jnode_get_tree(const jnode * node)
21259 +{
21260 + assert("nikita-2691", node != NULL);
21261 + return node->tree;
21262 +}
21263 +
21264 +extern void pin_jnode_data(jnode *);
21265 +extern void unpin_jnode_data(jnode *);
21266 +
21267 +static inline jnode_type jnode_get_type(const jnode * node)
21268 +{
21269 + static const unsigned long state_mask =
21270 + (1 << JNODE_TYPE_1) | (1 << JNODE_TYPE_2) | (1 << JNODE_TYPE_3);
21271 +
21272 + static jnode_type mask_to_type[] = {
21273 + /* JNODE_TYPE_3 : JNODE_TYPE_2 : JNODE_TYPE_1 */
21274 +
21275 + /* 000 */
21276 + [0] = JNODE_FORMATTED_BLOCK,
21277 + /* 001 */
21278 + [1] = JNODE_UNFORMATTED_BLOCK,
21279 + /* 010 */
21280 + [2] = JNODE_BITMAP,
21281 + /* 011 */
21282 + [3] = LAST_JNODE_TYPE, /*invalid */
21283 + /* 100 */
21284 + [4] = JNODE_INODE,
21285 + /* 101 */
21286 + [5] = LAST_JNODE_TYPE,
21287 + /* 110 */
21288 + [6] = JNODE_IO_HEAD,
21289 + /* 111 */
21290 + [7] = LAST_JNODE_TYPE, /* invalid */
21291 + };
21292 +
21293 + return mask_to_type[(node->state & state_mask) >> JNODE_TYPE_1];
21294 +}
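The three JNODE_TYPE_* state bits hold a small code; the table above implies the encoding 0 -> formatted, 1 -> unformatted, 2 -> bitmap, 4 -> inode, 6 -> io-head. A standalone model of the round trip, assuming only the shift and mask values shown here (not the kernel code itself):

#include <assert.h>

#define TYPE_SHIFT 13			/* JNODE_TYPE_1 */
#define TYPE_MASK  (7UL << TYPE_SHIFT)	/* JNODE_TYPE_1..3 */

static unsigned long encode_type(unsigned long state, unsigned code)
{
	return (state & ~TYPE_MASK) | ((unsigned long)code << TYPE_SHIFT);
}

static unsigned decode_type(unsigned long state)
{
	return (state & TYPE_MASK) >> TYPE_SHIFT;
}

int main(void)
{
	unsigned long s = encode_type(0, 6);	/* 6 == io-head in the table */
	assert(decode_type(s) == 6);
	return 0;
}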
21295 +
21296 +/* returns true if node is a znode */
21297 +static inline int jnode_is_znode(const jnode * node)
21298 +{
21299 + return jnode_get_type(node) == JNODE_FORMATTED_BLOCK;
21300 +}
21301 +
21302 +static inline int jnode_is_flushprepped(jnode * node)
21303 +{
21304 + assert("jmacd-78212", node != NULL);
21305 + assert_spin_locked(&(node->guard));
21306 + return !JF_ISSET(node, JNODE_DIRTY) || JF_ISSET(node, JNODE_RELOC) ||
21307 + JF_ISSET(node, JNODE_OVRWR);
21308 +}
21309 +
21310 +/* Return true if @node has already been processed by the squeeze and allocate
21311 + process. This implies the block address has been finalized for the
21312 + duration of this atom (or it is clean and will remain in place). If this
21313 + returns true you may use the block number as a hint. */
21314 +static inline int jnode_check_flushprepped(jnode * node)
21315 +{
21316 + int result;
21317 +
21318 + /* It must be clean or relocated or wandered. New allocations are set to relocate. */
21319 + spin_lock_jnode(node);
21320 + result = jnode_is_flushprepped(node);
21321 + spin_unlock_jnode(node);
21322 + return result;
21323 +}
21324 +
21325 +/* returns true if node is unformatted */
21326 +static inline int jnode_is_unformatted(const jnode * node)
21327 +{
21328 + assert("jmacd-0123", node != NULL);
21329 + return jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK;
21330 +}
21331 +
21332 +/* returns true if node represents a cluster cache page */
21333 +static inline int jnode_is_cluster_page(const jnode * node)
21334 +{
21335 + assert("edward-50", node != NULL);
21336 + return (JF_ISSET(node, JNODE_CLUSTER_PAGE));
21337 +}
21338 +
21339 +/* returns true if node is a built-in inode's jnode */
21340 +static inline int jnode_is_inode(const jnode * node)
21341 +{
21342 + assert("vs-1240", node != NULL);
21343 + return jnode_get_type(node) == JNODE_INODE;
21344 +}
21345 +
21346 +static inline jnode_plugin *jnode_ops_of(const jnode_type type)
21347 +{
21348 + assert("nikita-2367", type < LAST_JNODE_TYPE);
21349 + return jnode_plugin_by_id((reiser4_plugin_id) type);
21350 +}
21351 +
21352 +static inline jnode_plugin *jnode_ops(const jnode * node)
21353 +{
21354 + assert("nikita-2366", node != NULL);
21355 +
21356 + return jnode_ops_of(jnode_get_type(node));
21357 +}
21358 +
21359 +/* Get the index of a block. */
21360 +static inline unsigned long jnode_get_index(jnode * node)
21361 +{
21362 + return jnode_ops(node)->index(node);
21363 +}
21364 +
21365 +/* return true if "node" is the root */
21366 +static inline int jnode_is_root(const jnode * node)
21367 +{
21368 + return jnode_is_znode(node) && znode_is_root(JZNODE(node));
21369 +}
21370 +
21371 +extern struct address_space *mapping_jnode(const jnode * node);
21372 +extern unsigned long index_jnode(const jnode * node);
21373 +
21374 +static inline void jput(jnode * node);
21375 +extern void jput_final(jnode * node);
21376 +
21377 +/* bump data counter on @node */
21378 +static inline void add_d_ref(jnode * node /* node to increase d_count of */ )
21379 +{
21380 + assert("nikita-1962", node != NULL);
21381 +
21382 + atomic_inc(&node->d_count);
21383 + if (jnode_is_unformatted(node) || jnode_is_znode(node))
21384 + LOCK_CNT_INC(d_refs);
21385 +}
21386 +
21387 +/* jput() - decrement x_count reference counter on jnode/znode.
21388 +
21389 +   Count may drop to 0; the jnode stays in cache until memory pressure causes the
21390 + eviction of its page. The c_count variable also ensures that children are
21391 + pressured out of memory before the parent. The jnode remains hashed as
21392 + long as the VM allows its page to stay in memory.
21393 +*/
21394 +static inline void jput(jnode * node)
21395 +{
21396 + assert("jmacd-509", node != NULL);
21397 + assert("jmacd-510", atomic_read(&node->x_count) > 0);
21398 + assert("zam-926", reiser4_schedulable());
21399 + LOCK_CNT_DEC(x_refs);
21400 +
21401 + rcu_read_lock();
21402 + /*
21403 + * we don't need any kind of lock here--jput_final() uses RCU.
21404 + */
21405 + if (unlikely(atomic_dec_and_test(&node->x_count))) {
21406 + jput_final(node);
21407 + } else
21408 + rcu_read_unlock();
21409 + assert("nikita-3473", reiser4_schedulable());
21410 +}
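Callers follow the usual get/put discipline: jref() pins the jnode object itself (x_count), independently of the data pin (d_count) taken by jload(). A sketch:

/* sketch: keep @node alive across an operation that may block */
static void demo_pin_jnode(jnode *node)
{
	jnode *n = jref(node);	/* x_count++: the jnode object cannot be freed */
	/* ... work with n; it stays valid even if all other refs drop ... */
	jput(n);		/* x_count--: may call jput_final() under RCU */
}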
21411 +
21412 +extern void jrelse(jnode * node);
21413 +extern void jrelse_tail(jnode * node);
21414 +
21415 +extern jnode *jnode_rip_sync(reiser4_tree * t, jnode * node);
21416 +
21417 +/* resolve race with jput */
21418 +static inline jnode *jnode_rip_check(reiser4_tree * tree, jnode * node)
21419 +{
21420 + if (unlikely(JF_ISSET(node, JNODE_RIP)))
21421 + node = jnode_rip_sync(tree, node);
21422 + return node;
21423 +}
21424 +
21425 +extern reiser4_key *jnode_build_key(const jnode *node, reiser4_key * key);
21426 +
21427 +#if REISER4_DEBUG
21428 +extern int jnode_invariant_f(const jnode *node, char const **msg);
21429 +#endif
21430 +
21431 +extern jnode_plugin jnode_plugins[LAST_JNODE_TYPE];
21432 +
21433 +/* __JNODE_H__ */
21434 +#endif
21435 +
21436 +/* Make Linus happy.
21437 + Local variables:
21438 + c-indentation-style: "K&R"
21439 + mode-name: "LC"
21440 + c-basic-offset: 8
21441 + tab-width: 8
21442 + fill-column: 120
21443 + End:
21444 +*/
21445 diff --git a/fs/reiser4/kassign.c b/fs/reiser4/kassign.c
21446 new file mode 100644
21447 index 0000000..3c8f9f5
21448 --- /dev/null
21449 +++ b/fs/reiser4/kassign.c
21450 @@ -0,0 +1,661 @@
21451 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
21452 + * reiser4/README */
21453 +
21454 +/* Key assignment policy implementation */
21455 +
21456 +/*
21457 + * In reiser4 every piece of file system data and meta-data has a key. Keys
21458 + * are used to store information in and retrieve it from reiser4 internal
21459 + * tree. In addition to this, keys define _ordering_ of all file system
21460 + * information: things having close keys are placed into the same or
21461 + * neighboring (in the tree order) nodes of the tree. As our block allocator
21462 + * tries to respect tree order (see flush.c), keys also define order in which
21463 + * things are laid out on the disk, and hence, affect performance directly.
21464 + *
21465 + * Obviously, assignment of keys to data and meta-data should be consistent
21466 + * across whole file system. Algorithm that calculates a key for a given piece
21467 + * of data or meta-data is referred to as "key assignment".
21468 + *
21469 + * Key assignment is too expensive to be implemented as a plugin (that is,
21470 + * with an ability to support different key assignment schemas in the same
21471 + * compiled kernel image). As a compromise, all key-assignment functions and
21472 + * data-structures are collected in this single file, so that modifications to
21473 + * key assignment algorithm can be localized. Additional changes may be
21474 + * required in key.[ch].
21475 + *
21476 + * Current default reiser4 key assignment algorithm is dubbed "Plan A". As one
21477 + * may guess, there is "Plan B" too.
21478 + *
21479 + */
21480 +
21481 +/*
21482 + * An additional complication in the key assignment implementation is the
21483 + * requirement to support different key lengths.
21484 + */
21485 +
21486 +/*
21487 + * KEY ASSIGNMENT: PLAN A, LONG KEYS.
21488 + *
21489 + * DIRECTORY ITEMS
21490 + *
21491 + * | 60 | 4 | 7 |1| 56 | 64 | 64 |
21492 + * +--------------+---+---+-+-------------+------------------+-----------------+
21493 + * | dirid | 0 | F |H| prefix-1 | prefix-2 | prefix-3/hash |
21494 + * +--------------+---+---+-+-------------+------------------+-----------------+
21495 + * | | | | |
21496 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21497 + *
21498 + * dirid objectid of directory this item is for
21499 + *
21500 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
21501 + *
21502 + * H 1 if last 8 bytes of the key contain hash,
21503 + * 0 if last 8 bytes of the key contain prefix-3
21504 + *
21505 + * prefix-1 first 7 characters of file name.
21506 + * Padded by zeroes if name is not long enough.
21507 + *
21508 + * prefix-2 next 8 characters of the file name.
21509 + *
21510 + * prefix-3 next 8 characters of the file name.
21511 + *
21512 + * hash hash of the rest of file name (i.e., portion of file
21513 + * name not included into prefix-1 and prefix-2).
21514 + *
21515 + * File names of up to 23 (== 7 + 8 + 8) characters are completely encoded
21516 + * in the key. Such file names are called "short". They are distinguished by
21517 + * the H bit being 0 in the key.
21518 + *
21519 + * Other file names are "long". For long names, the H bit is 1, and the first
21520 + * 15 (== 7 + 8) characters are encoded in the prefix-1 and prefix-2 portions
21521 + * of the key. The last 8 bytes of the key are occupied by a hash of the
21522 + * remaining characters of the name.
21523 + *
21524 + * This key assignment reaches following important goals:
21525 + *
21526 + * (1) directory entries are sorted in approximately lexicographical
21527 + * order.
21528 + *
21529 + * (2) collisions (when multiple directory items have the same key), while
21530 + * principally unavoidable in a tree with fixed length keys, are rare.
21531 + *
21532 + * STAT DATA
21533 + *
21534 + * | 60 | 4 | 64 | 4 | 60 | 64 |
21535 + * +--------------+---+-----------------+---+--------------+-----------------+
21536 + * | locality id | 1 | ordering | 0 | objectid | 0 |
21537 + * +--------------+---+-----------------+---+--------------+-----------------+
21538 + * | | | | |
21539 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21540 + *
21541 + * locality id object id of a directory where first name was created for
21542 + * the object
21543 + *
21544 + * ordering copy of second 8-byte portion of the key of directory
21545 + * entry for the first name of this object. Ordering has a form
21546 + * {
21547 + * fibration :7;
21548 + * h :1;
21549 + * prefix1 :56;
21550 + * }
21551 + * see description of key for directory entry above.
21552 + *
21553 + * objectid object id for this object
21554 + *
21555 + * This key assignment policy is designed to keep stat-data in the same order
21556 + * as corresponding directory items, thus speeding up readdir/stat types of
21557 + * workload.
21558 + *
21559 + * FILE BODY
21560 + *
21561 + * | 60 | 4 | 64 | 4 | 60 | 64 |
21562 + * +--------------+---+-----------------+---+--------------+-----------------+
21563 + * | locality id | 4 | ordering | 0 | objectid | offset |
21564 + * +--------------+---+-----------------+---+--------------+-----------------+
21565 + * | | | | |
21566 + * | 8 bytes | 8 bytes | 8 bytes | 8 bytes |
21567 + *
21568 + * locality id object id of a directory where first name was created for
21569 + * the object
21570 + *
21571 + * ordering the same as in the key of stat-data for this object
21572 + *
21573 + * objectid object id for this object
21574 + *
21575 + * offset logical offset from the beginning of this file.
21576 + * Measured in bytes.
21577 + *
21578 + *
21579 + * KEY ASSIGNMENT: PLAN A, SHORT KEYS.
21580 + *
21581 + * DIRECTORY ITEMS
21582 + *
21583 + * | 60 | 4 | 7 |1| 56 | 64 |
21584 + * +--------------+---+---+-+-------------+-----------------+
21585 + * | dirid | 0 | F |H| prefix-1 | prefix-2/hash |
21586 + * +--------------+---+---+-+-------------+-----------------+
21587 + * | | | |
21588 + * | 8 bytes | 8 bytes | 8 bytes |
21589 + *
21590 + * dirid objectid of directory this item is for
21591 + *
21592 + * F fibration, see fs/reiser4/plugin/fibration.[ch]
21593 + *
21594 + * H 1 if last 8 bytes of the key contain hash,
21595 + * 0 if last 8 bytes of the key contain prefix-2
21596 + *
21597 + * prefix-1 first 7 characters of file name.
21598 + * Padded by zeroes if name is not long enough.
21599 + *
21600 + * prefix-2 next 8 characters of the file name.
21601 + *
21602 + * hash hash of the rest of file name (i.e., portion of file
21603 + * name not included into prefix-1).
21604 + *
21605 + * File names of up to 15 (== 7 + 8) characters are completely encoded in
21606 + * the key. Such file names are called "short". They are distinguished by
21607 + * the H bit being 0 in the key.
21608 + *
21609 + * Other file names are "long". For long names, the H bit is 1, and the first 7
21610 + * characters are encoded in the prefix-1 portion of the key. The last 8 bytes
21611 + * of the key are occupied by a hash of the remaining characters of the name.
21612 + *
21613 + * STAT DATA
21614 + *
21615 + * | 60 | 4 | 4 | 60 | 64 |
21616 + * +--------------+---+---+--------------+-----------------+
21617 + * | locality id | 1 | 0 | objectid | 0 |
21618 + * +--------------+---+---+--------------+-----------------+
21619 + * | | | |
21620 + * | 8 bytes | 8 bytes | 8 bytes |
21621 + *
21622 + * locality id object id of a directory where first name was created for
21623 + * the object
21624 + *
21625 + * objectid object id for this object
21626 + *
21627 + * FILE BODY
21628 + *
21629 + * | 60 | 4 | 4 | 60 | 64 |
21630 + * +--------------+---+---+--------------+-----------------+
21631 + * | locality id | 4 | 0 | objectid | offset |
21632 + * +--------------+---+---+--------------+-----------------+
21633 + * | | | |
21634 + * | 8 bytes | 8 bytes | 8 bytes |
21635 + *
21636 + * locality id object id of a directory where first name was created for
21637 + * the object
21638 + *
21639 + * objectid object id for this object
21640 + *
21641 + * offset logical offset from the beginning of this file.
21642 + * Measured in bytes.
21643 + *
21644 + *
21645 + */
21646 +
21647 +#include "debug.h"
21648 +#include "key.h"
21649 +#include "kassign.h"
21650 +#include "vfs_ops.h"
21651 +#include "inode.h"
21652 +#include "super.h"
21653 +#include "dscale.h"
21654 +
21655 +#include <linux/types.h> /* for __u?? */
21656 +#include <linux/fs.h> /* for struct super_block, etc */
21657 +
21658 +/* bitmask for H bit (see comment at the beginning of this file) */
21659 +static const __u64 longname_mark = 0x0100000000000000ull;
21660 +/* bitmask for F and H portions of the key. */
21661 +static const __u64 fibration_mask = 0xff00000000000000ull;
21662 +
21663 +/* return true if name is not completely encoded in @key */
21664 +int is_longname_key(const reiser4_key * key)
21665 +{
21666 + __u64 highpart;
21667 +
21668 + assert("nikita-2863", key != NULL);
21669 + if (get_key_type(key) != KEY_FILE_NAME_MINOR)
21670 + reiser4_print_key("oops", key);
21671 + assert("nikita-2864", get_key_type(key) == KEY_FILE_NAME_MINOR);
21672 +
21673 + if (REISER4_LARGE_KEY)
21674 + highpart = get_key_ordering(key);
21675 + else
21676 + highpart = get_key_objectid(key);
21677 +
21678 + return (highpart & longname_mark) ? 1 : 0;
21679 +}
21680 +
21681 +/* return true if @name is too long to be completely encoded in the key */
21682 +int is_longname(const char *name UNUSED_ARG, int len)
21683 +{
21684 + if (REISER4_LARGE_KEY)
21685 + return len > 23;
21686 + else
21687 + return len > 15;
21688 +}
21689 +
21690 +/* code ascii string into __u64.
21691 +
21692 + Put characters of @name into result (@str) one after another starting
21693 + from @start_idx-th highest (arithmetically) byte. This produces
21694 +   endian-safe encoding. memcpy(3) will not do.
21695 +
21696 +*/
21697 +static __u64 pack_string(const char *name /* string to encode */ ,
21698 + int start_idx /* highest byte in result from
21699 + * which to start encoding */ )
21700 +{
21701 + unsigned i;
21702 + __u64 str;
21703 +
21704 + str = 0;
21705 + for (i = 0; (i < sizeof str - start_idx) && name[i]; ++i) {
21706 + str <<= 8;
21707 + str |= (unsigned char)name[i];
21708 + }
21709 + str <<= (sizeof str - i - start_idx) << 3;
21710 + return str;
21711 +}
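For example, pack_string("abc", 1) yields 0x0061626300000000: the characters land in successively lower bytes with the highest byte left free (for the F/H bits), so unsigned comparison of the packed values matches lexicographic comparison of the names. A standalone replica for experimentation, mirroring the function above:

#include <stdint.h>
#include <stdio.h>

static uint64_t pack_string_demo(const char *name, int start_idx)
{
	unsigned i;
	uint64_t str = 0;

	for (i = 0; i < sizeof str - start_idx && name[i]; ++i) {
		str <<= 8;
		str |= (unsigned char)name[i];
	}
	str <<= (sizeof str - i - start_idx) << 3;
	return str;
}

int main(void)
{
	/* prints 0x61626300000000; "abc" < "abd" holds for packed values too */
	printf("%#llx\n", (unsigned long long)pack_string_demo("abc", 1));
	return 0;
}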
21712 +
21713 +/* opposite to pack_string(). Takes value produced by pack_string(), restores
21714 + * string encoded in it and stores result in @buf */
21715 +char * reiser4_unpack_string(__u64 value, char *buf)
21716 +{
21717 + do {
21718 + *buf = value >> (64 - 8);
21719 + if (*buf)
21720 + ++buf;
21721 + value <<= 8;
21722 + } while (value != 0);
21723 + *buf = 0;
21724 + return buf;
21725 +}
21726 +
21727 +/* obtain name encoded in @key and store it in @buf */
21728 +char *extract_name_from_key(const reiser4_key * key, char *buf)
21729 +{
21730 + char *c;
21731 +
21732 + assert("nikita-2868", !is_longname_key(key));
21733 +
21734 + c = buf;
21735 + if (REISER4_LARGE_KEY) {
21736 + c = reiser4_unpack_string(get_key_ordering(key) &
21737 + ~fibration_mask, c);
21738 + c = reiser4_unpack_string(get_key_fulloid(key), c);
21739 + } else
21740 + c = reiser4_unpack_string(get_key_fulloid(key) &
21741 + ~fibration_mask, c);
21742 + reiser4_unpack_string(get_key_offset(key), c);
21743 + return buf;
21744 +}
21745 +
21746 +/**
21747 + * complete_entry_key - calculate entry key by name
21748 + * @dir: directory where entry is (or will be) in
21749 + * @name: name to calculate key of
21750 + * @len: length of name
21751 + * @result: place to store result in
21752 + *
21753 + * Sets fields of entry key @result which depend on file name.
21754 + * When REISER4_LARGE_KEY is defined three fields of @result are set: ordering,
21755 + * objectid and offset. Otherwise, objectid and offset are set.
21756 + */
21757 +void complete_entry_key(const struct inode *dir, const char *name,
21758 + int len, reiser4_key *result)
21759 +{
21760 +#if REISER4_LARGE_KEY
21761 + __u64 ordering;
21762 + __u64 objectid;
21763 + __u64 offset;
21764 +
21765 + assert("nikita-1139", dir != NULL);
21766 + assert("nikita-1142", result != NULL);
21767 + assert("nikita-2867", strlen(name) == len);
21768 +
21769 + /*
21770 + * key allocation algorithm for directory entries in case of large
21771 + * keys:
21772 + *
21773 +	 * If the name is not longer than 7 + 8 + 8 = 23 characters, put the
21774 +	 * first 7 characters into the ordering field of the key, the next 8
21775 +	 * characters (if any) into the objectid field, and the next 8 (if any)
21776 +	 * into the offset field.
21777 +	 *
21778 +	 * If the file name is longer than 23 characters, put the first 7
21779 +	 * characters into the key's ordering, the next 8 into the objectid, and
21780 +	 * a hash of the remaining characters into the offset field.
21781 +	 *
21782 +	 * To distinguish the above cases, the otherwise unused high bit in the
21783 +	 * ordering field is set in the latter case.
21784 + */
21785 +
21786 + /* [0-6] characters to ordering */
21787 + ordering = pack_string(name, 1);
21788 + if (len > 7) {
21789 + /* [7-14] characters to objectid */
21790 + objectid = pack_string(name + 7, 0);
21791 + if (len > 15) {
21792 + if (len <= 23) {
21793 +				/* [15-22] characters to offset */
21794 + offset = pack_string(name + 15, 0);
21795 + } else {
21796 + /* note in a key the fact that offset contains hash. */
21797 + ordering |= longname_mark;
21798 +
21799 + /* offset is the hash of the file name's tail. */
21800 + offset = inode_hash_plugin(dir)->hash(name + 15,
21801 + len - 15);
21802 + }
21803 + } else {
21804 + offset = 0ull;
21805 + }
21806 + } else {
21807 + objectid = 0ull;
21808 + offset = 0ull;
21809 + }
21810 +
21811 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21812 + ordering |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21813 +
21814 + set_key_ordering(result, ordering);
21815 + set_key_fulloid(result, objectid);
21816 + set_key_offset(result, offset);
21817 + return;
21818 +
21819 +#else
21820 + __u64 objectid;
21821 + __u64 offset;
21822 +
21823 + assert("nikita-1139", dir != NULL);
21824 + assert("nikita-1142", result != NULL);
21825 + assert("nikita-2867", strlen(name) == len);
21826 +
21827 + /*
21828 +	 * key allocation algorithm for directory entries in case of short
21829 +	 * (non-large) keys:
21830 +	 *
21831 +	 * If the name is not longer than 7 + 8 = 15 characters, put the first
21832 +	 * 7 characters into the objectid field of the key and the next 8
21833 +	 * characters (if any) into the offset field.
21834 +	 *
21835 +	 * If the file name is longer than 15 characters, put the first 7
21836 +	 * characters into the key's objectid, and a hash of the remaining
21837 +	 * characters into the offset field.
21838 +	 *
21839 +	 * To distinguish the above cases, the otherwise unused high bit in the
21840 +	 * objectid field is set in the latter case.
21841 + */
21842 +
21843 + /* [0-6] characters to objectid */
21844 + objectid = pack_string(name, 1);
21845 + if (len > 7) {
21846 + if (len <= 15) {
21847 + /* [7-14] characters to offset */
21848 + offset = pack_string(name + 7, 0);
21849 + } else {
21850 + /* note in a key the fact that offset contains hash. */
21851 + objectid |= longname_mark;
21852 +
21853 + /* offset is the hash of the file name. */
21854 + offset = inode_hash_plugin(dir)->hash(name + 7,
21855 + len - 7);
21856 + }
21857 + } else
21858 + offset = 0ull;
21859 +
21860 + assert("nikita-3480", inode_fibration_plugin(dir) != NULL);
21861 + objectid |= inode_fibration_plugin(dir)->fibre(dir, name, len);
21862 +
21863 + set_key_fulloid(result, objectid);
21864 + set_key_offset(result, offset);
21865 + return;
21866 +#endif /* ! REISER4_LARGE_KEY */
21867 +}
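A worked example for the large-key branch: for the 6-character name "README", ordering becomes pack_string("README", 1) == 0x00524541444d4500 (plus the directory's fibration bits in the top byte), while objectid and offset stay 0 and the H bit stays clear, since the whole name fits in the key. For a 30-character name, ordering would instead carry characters 0-6 with longname_mark OR-ed in, objectid characters 7-14, and offset the hash of the remaining 15 characters.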
21868 +
21869 +/* true, if @key is the key of "." */
21870 +int is_dot_key(const reiser4_key * key /* key to check */ )
21871 +{
21872 + assert("nikita-1717", key != NULL);
21873 + assert("nikita-1718", get_key_type(key) == KEY_FILE_NAME_MINOR);
21874 + return
21875 + (get_key_ordering(key) == 0ull) &&
21876 + (get_key_objectid(key) == 0ull) && (get_key_offset(key) == 0ull);
21877 +}
21878 +
21879 +/* build key for stat-data.
21880 +
21881 +   return key of stat-data of this object. This should become an sd plugin
21882 + method in the future. For now, let it be here.
21883 +
21884 +*/
21885 +reiser4_key *build_sd_key(const struct inode * target /* inode of an object */ ,
21886 + reiser4_key * result /* resulting key of @target
21887 + stat-data */ )
21888 +{
21889 + assert("nikita-261", result != NULL);
21890 +
21891 + reiser4_key_init(result);
21892 + set_key_locality(result, reiser4_inode_data(target)->locality_id);
21893 + set_key_ordering(result, get_inode_ordering(target));
21894 + set_key_objectid(result, get_inode_oid(target));
21895 + set_key_type(result, KEY_SD_MINOR);
21896 + set_key_offset(result, (__u64) 0);
21897 + return result;
21898 +}
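A sketch of typical use, e.g. whenever an inode's stat-data must be located in the tree (assumes @inode already carries initialized reiser4-specific state):

static void demo_locate_sd(struct inode *inode)
{
	reiser4_key sdkey;

	build_sd_key(inode, &sdkey);
	/* sdkey now sorts this stat-data near stat-data of siblings from the
	 * same directory: locality, then ordering, then objectid, offset 0 */
}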
21899 +
21900 +/* encode part of key into &obj_key_id
21901 +
21902 + This encodes into @id part of @key sufficient to restore @key later,
21903 +   given that the latter is the key of an object (key of stat-data).
21904 +
21905 + See &obj_key_id
21906 +*/
21907 +int build_obj_key_id(const reiser4_key * key /* key to encode */ ,
21908 + obj_key_id * id /* id where key is encoded in */ )
21909 +{
21910 + assert("nikita-1151", key != NULL);
21911 + assert("nikita-1152", id != NULL);
21912 +
21913 + memcpy(id, key, sizeof *id);
21914 + return 0;
21915 +}
21916 +
21917 +/* encode reference to @obj in @id.
21918 +
21919 + This is like build_obj_key_id() above, but takes inode as parameter. */
21920 +int build_inode_key_id(const struct inode *obj /* object to build key of */ ,
21921 + obj_key_id * id /* result */ )
21922 +{
21923 + reiser4_key sdkey;
21924 +
21925 + assert("nikita-1166", obj != NULL);
21926 + assert("nikita-1167", id != NULL);
21927 +
21928 + build_sd_key(obj, &sdkey);
21929 + build_obj_key_id(&sdkey, id);
21930 + return 0;
21931 +}
21932 +
21933 +/* decode @id back into @key
21934 +
21935 + Restore key of object stat-data from @id. This is dual to
21936 + build_obj_key_id() above.
21937 +*/
21938 +int extract_key_from_id(const obj_key_id * id /* object key id to extract key
21939 + * from */ ,
21940 + reiser4_key * key /* result */ )
21941 +{
21942 + assert("nikita-1153", id != NULL);
21943 + assert("nikita-1154", key != NULL);
21944 +
21945 + reiser4_key_init(key);
21946 + memcpy(key, id, sizeof *id);
21947 + return 0;
21948 +}
21949 +
21950 +/* extract objectid of directory from key of directory entry within said
21951 + directory.
21952 + */
21953 +oid_t extract_dir_id_from_key(const reiser4_key * de_key /* key of
21954 + * directory
21955 + * entry */ )
21956 +{
21957 + assert("nikita-1314", de_key != NULL);
21958 + return get_key_locality(de_key);
21959 +}
21960 +
21961 +/* encode into @id key of directory entry.
21962 +
21963 + Encode into @id information sufficient to later distinguish directory
21964 + entries within the same directory. This is not whole key, because all
21965 + directory entries within directory item share locality which is equal
21966 + to objectid of their directory.
21967 +
21968 +*/
21969 +int build_de_id(const struct inode *dir /* inode of directory */ ,
21970 + const struct qstr *name /* name to be given to @obj by
21971 + * directory entry being
21972 + * constructed */ ,
21973 + de_id * id /* short key of directory entry */ )
21974 +{
21975 + reiser4_key key;
21976 +
21977 + assert("nikita-1290", dir != NULL);
21978 + assert("nikita-1292", id != NULL);
21979 +
21980 + /* NOTE-NIKITA this is suboptimal. */
21981 + inode_dir_plugin(dir)->build_entry_key(dir, name, &key);
21982 + return build_de_id_by_key(&key, id);
21983 +}
21984 +
21985 +/* encode into @id key of directory entry.
21986 +
21987 + Encode into @id information sufficient to later distinguish directory
21988 + entries within the same directory. This is not whole key, because all
21989 + directory entries within directory item share locality which is equal
21990 + to objectid of their directory.
21991 +
21992 +*/
21993 +int build_de_id_by_key(const reiser4_key * entry_key /* full key of directory
21994 + * entry */ ,
21995 + de_id * id /* short key of directory entry */ )
21996 +{
21997 + memcpy(id, ((__u64 *) entry_key) + 1, sizeof *id);
21998 + return 0;
21999 +}
22000 +
22001 +/* restore from @id key of directory entry.
22002 +
22003 + Function dual to build_de_id(): given @id and locality, build full
22004 + key of directory entry within directory item.
22005 +
22006 +*/
22007 +int extract_key_from_de_id(const oid_t locality /* locality of directory
22008 + * entry */ ,
22009 + const de_id * id /* directory entry id */ ,
22010 + reiser4_key * key /* result */ )
22011 +{
22012 + /* no need to initialise key here: all fields are overwritten */
22013 + memcpy(((__u64 *) key) + 1, id, sizeof *id);
22014 + set_key_locality(key, locality);
22015 + set_key_type(key, KEY_FILE_NAME_MINOR);
22016 + return 0;
22017 +}
22018 +
22019 +/* compare two &de_id's */
22020 +cmp_t de_id_cmp(const de_id * id1 /* first &de_id to compare */ ,
22021 + const de_id * id2 /* second &de_id to compare */ )
22022 +{
22023 + /* NOTE-NIKITA ugly implementation */
22024 + reiser4_key k1;
22025 + reiser4_key k2;
22026 +
22027 + extract_key_from_de_id((oid_t) 0, id1, &k1);
22028 + extract_key_from_de_id((oid_t) 0, id2, &k2);
22029 + return keycmp(&k1, &k2);
22030 +}
22031 +
22032 +/* compare &de_id with key */
22033 +cmp_t de_id_key_cmp(const de_id * id /* directory entry id to compare */ ,
22034 + const reiser4_key * key /* key to compare */ )
22035 +{
22036 + cmp_t result;
22037 + reiser4_key *k1;
22038 +
22039 + k1 = (reiser4_key *) (((unsigned long)id) - sizeof key->el[0]);
22040 + result = KEY_DIFF_EL(k1, key, 1);
22041 + if (result == EQUAL_TO) {
22042 + result = KEY_DIFF_EL(k1, key, 2);
22043 + if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22044 + result = KEY_DIFF_EL(k1, key, 3);
22045 + }
22046 + }
22047 + return result;
22048 +}
22049 +
22050 +/*
22051 + * return number of bytes necessary to encode @inode identity.
22052 + */
22053 +int inode_onwire_size(const struct inode *inode)
22054 +{
22055 + int result;
22056 +
22057 + result = dscale_bytes(get_inode_oid(inode));
22058 + result += dscale_bytes(get_inode_locality(inode));
22059 +
22060 + /*
22061 + * ordering is large (it usually has highest bits set), so it makes
22062 + * little sense to dscale it.
22063 + */
22064 + if (REISER4_LARGE_KEY)
22065 + result += sizeof(get_inode_ordering(inode));
22066 + return result;
22067 +}
22068 +
22069 +/*
22070 + * encode @inode identity at @start
22071 + */
22072 +char *build_inode_onwire(const struct inode *inode, char *start)
22073 +{
22074 + start += dscale_write(start, get_inode_locality(inode));
22075 + start += dscale_write(start, get_inode_oid(inode));
22076 +
22077 + if (REISER4_LARGE_KEY) {
22078 + put_unaligned(cpu_to_le64(get_inode_ordering(inode)), (__le64 *)start);
22079 + start += sizeof(get_inode_ordering(inode));
22080 + }
22081 + return start;
22082 +}
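A round-trip sketch, e.g. for embedding an inode identity in an NFS file handle; the on-stack buffer size is an assumption here (it must be at least inode_onwire_size(inode) bytes):

/* sketch: serialize @inode identity, then recover the stat-data key id */
static void demo_onwire_roundtrip(struct inode *inode)
{
	char buf[32];		/* assumption: >= inode_onwire_size(inode) */
	obj_key_id id;

	BUG_ON(inode_onwire_size(inode) > sizeof buf);
	build_inode_onwire(inode, buf);
	extract_obj_key_id_from_onwire(buf, &id);
}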
22083 +
22084 +/*
22085 + * extract key that was previously encoded by build_inode_onwire() at @addr
22086 + */
22087 +char *extract_obj_key_id_from_onwire(char *addr, obj_key_id * key_id)
22088 +{
22089 + __u64 val;
22090 +
22091 + addr += dscale_read(addr, &val);
22092 + val = (val << KEY_LOCALITY_SHIFT) | KEY_SD_MINOR;
22093 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->locality);
22094 + addr += dscale_read(addr, &val);
22095 + put_unaligned(cpu_to_le64(val), (__le64 *)key_id->objectid);
22096 +#if REISER4_LARGE_KEY
22097 + memcpy(&key_id->ordering, addr, sizeof key_id->ordering);
22098 + addr += sizeof key_id->ordering;
22099 +#endif
22100 + return addr;
22101 +}
22102 +
22103 +/* Make Linus happy.
22104 + Local variables:
22105 + c-indentation-style: "K&R"
22106 + mode-name: "LC"
22107 + c-basic-offset: 8
22108 + tab-width: 8
22109 + fill-column: 120
22110 + End:
22111 +*/
22112 diff --git a/fs/reiser4/kassign.h b/fs/reiser4/kassign.h
22113 new file mode 100644
22114 index 0000000..ee818d5
22115 --- /dev/null
22116 +++ b/fs/reiser4/kassign.h
22117 @@ -0,0 +1,110 @@
22118 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
22119 + * reiser4/README */
22120 +
22121 +/* Key assignment policy interface. See kassign.c for details. */
22122 +
22123 +#if !defined( __KASSIGN_H__ )
22124 +#define __KASSIGN_H__
22125 +
22126 +#include "forward.h"
22127 +#include "key.h"
22128 +#include "dformat.h"
22129 +
22130 +#include <linux/types.h> /* for __u?? */
22131 +#include <linux/fs.h> /* for struct super_block, etc */
22132 +#include <linux/dcache.h> /* for struct qstr */
22133 +
22134 +/* key assignment functions */
22135 +
22136 +/* Information from which key of file stat-data can be uniquely
22137 + restored. This depends on key assignment policy for
22138 + stat-data. Currently it's enough to store object id and locality id
22139 + (60+60==120) bits, because minor packing locality and offset of
22140 + stat-data key are always known constants: KEY_SD_MINOR and 0
22141 + respectively. For simplicity 4 bits are wasted in each id, and just
22142 + two 64 bit integers are stored.
22143 +
22144 + This field has to be byte-aligned, because we don't want to waste
22145 +   space in directory entries. There is another side of the coin of
22146 +   course: we waste CPU and bus bandwidth instead, by copying data back
22147 + and forth.
22148 +
22149 + Next optimization: &obj_key_id is mainly used to address stat data from
22150 +   directory entries. Under the assumption that the majority of files have
22151 +   only one name (one hard link) from *the* parent directory, it seems
22152 +   reasonable to store only the objectid of the stat data and take its
22153 +   locality from the key of the directory item.
22154 +
22155 + This requires some flag to be added to the &obj_key_id to distinguish
22156 +   between these two cases. The remaining bits in the flag byte could then
22157 +   be used to store the file type.
22158 +
22159 + This optimization requires changes in directory item handling code.
22160 +
22161 +*/
22162 +typedef struct obj_key_id {
22163 + d8 locality[sizeof(__u64)];
22164 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];
22165 + )
22166 + d8 objectid[sizeof(__u64)];
22167 +}
22168 +obj_key_id;
22169 +
22170 +/* Information sufficient to uniquely identify directory entry within
22171 + compressed directory item.
22172 +
22173 + For alignment issues see &obj_key_id above.
22174 +*/
22175 +typedef struct de_id {
22176 + ON_LARGE_KEY(d8 ordering[sizeof(__u64)];)
22177 + d8 objectid[sizeof(__u64)];
22178 + d8 offset[sizeof(__u64)];
22179 +}
22180 +de_id;
22181 +
22182 +extern int inode_onwire_size(const struct inode *obj);
22183 +extern char *build_inode_onwire(const struct inode *obj, char *area);
22184 +extern char *extract_obj_key_id_from_onwire(char *area, obj_key_id * key_id);
22185 +
22186 +extern int build_inode_key_id(const struct inode *obj, obj_key_id * id);
22187 +extern int extract_key_from_id(const obj_key_id * id, reiser4_key * key);
22188 +extern int build_obj_key_id(const reiser4_key * key, obj_key_id * id);
22189 +extern oid_t extract_dir_id_from_key(const reiser4_key * de_key);
22190 +extern int build_de_id(const struct inode *dir, const struct qstr *name,
22191 + de_id * id);
22192 +extern int build_de_id_by_key(const reiser4_key * entry_key, de_id * id);
22193 +extern int extract_key_from_de_id(const oid_t locality, const de_id * id,
22194 + reiser4_key * key);
22195 +extern cmp_t de_id_cmp(const de_id * id1, const de_id * id2);
22196 +extern cmp_t de_id_key_cmp(const de_id * id, const reiser4_key * key);
22197 +
22198 +extern int build_readdir_key_common(struct file *dir, reiser4_key * result);
22199 +extern void build_entry_key_common(const struct inode *dir,
22200 + const struct qstr *name,
22201 + reiser4_key * result);
22202 +extern void build_entry_key_stable_entry(const struct inode *dir,
22203 + const struct qstr *name,
22204 + reiser4_key * result);
22205 +extern int is_dot_key(const reiser4_key * key);
22206 +extern reiser4_key *build_sd_key(const struct inode *target,
22207 + reiser4_key * result);
22208 +
22209 +extern int is_longname_key(const reiser4_key * key);
22210 +extern int is_longname(const char *name, int len);
22211 +extern char *extract_name_from_key(const reiser4_key * key, char *buf);
22212 +extern char *reiser4_unpack_string(__u64 value, char *buf);
22213 +extern void complete_entry_key(const struct inode *dir, const char *name,
22214 + int len, reiser4_key *result);
22215 +
22216 +/* __KASSIGN_H__ */
22217 +#endif
22218 +
22219 +/* Make Linus happy.
22220 + Local variables:
22221 + c-indentation-style: "K&R"
22222 + mode-name: "LC"
22223 + c-basic-offset: 8
22224 + tab-width: 8
22225 + fill-column: 120
22226 + End:
22227 +*/
22228 diff --git a/fs/reiser4/key.c b/fs/reiser4/key.c
22229 new file mode 100644
22230 index 0000000..384c318
22231 --- /dev/null
22232 +++ b/fs/reiser4/key.c
22233 @@ -0,0 +1,137 @@
22234 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22235 +
22236 +/* Key manipulations. */
22237 +
22238 +#include "debug.h"
22239 +#include "key.h"
22240 +#include "super.h"
22241 +#include "reiser4.h"
22242 +
22243 +#include <linux/types.h> /* for __u?? */
22244 +
22245 +/* Minimal possible key: all components are zero. It is presumed that this is
22246 + independent of key scheme. */
22247 +static const reiser4_key MINIMAL_KEY = {
22248 + .el = {
22249 + 0ull,
22250 + ON_LARGE_KEY(0ull,)
22251 + 0ull,
22252 + 0ull
22253 + }
22254 +};
22255 +
22256 +/* Maximal possible key: all components are ~0. It is presumed that this is
22257 + independent of key scheme. */
22258 +static const reiser4_key MAXIMAL_KEY = {
22259 + .el = {
22260 + __constant_cpu_to_le64(~0ull),
22261 + ON_LARGE_KEY(__constant_cpu_to_le64(~0ull),)
22262 + __constant_cpu_to_le64(~0ull),
22263 + __constant_cpu_to_le64(~0ull)
22264 + }
22265 +};
22266 +
22267 +/* Initialize key. */
22268 +void reiser4_key_init(reiser4_key * key /* key to init */ )
22269 +{
22270 + assert("nikita-1169", key != NULL);
22271 + memset(key, 0, sizeof *key);
22272 +}
22273 +
22274 +/* minimal possible key in the tree. Return pointer to the static storage. */
22275 +const reiser4_key *reiser4_min_key(void)
22276 +{
22277 + return &MINIMAL_KEY;
22278 +}
22279 +
22280 +/* maximum possible key in the tree. Return pointer to the static storage. */
22281 +const reiser4_key *reiser4_max_key(void)
22282 +{
22283 + return &MAXIMAL_KEY;
22284 +}
22285 +
22286 +#if REISER4_DEBUG
22287 +/* debugging aid: print symbolic name of key type */
22288 +static const char *type_name(unsigned int key_type /* key type */ )
22289 +{
22290 + switch (key_type) {
22291 + case KEY_FILE_NAME_MINOR:
22292 + return "file name";
22293 + case KEY_SD_MINOR:
22294 + return "stat data";
22295 + case KEY_ATTR_NAME_MINOR:
22296 + return "attr name";
22297 + case KEY_ATTR_BODY_MINOR:
22298 + return "attr body";
22299 + case KEY_BODY_MINOR:
22300 + return "file body";
22301 + default:
22302 + return "unknown";
22303 + }
22304 +}
22305 +
22306 +/* debugging aid: print human readable information about key */
22307 +void reiser4_print_key(const char *prefix /* prefix to print */ ,
22308 + const reiser4_key * key /* key to print */ )
22309 +{
22310 + /* turn bold on */
22311 + /* printf ("\033[1m"); */
22312 + if (key == NULL)
22313 + printk("%s: null key\n", prefix);
22314 + else {
22315 + if (REISER4_LARGE_KEY)
22316 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx:%Lx)", prefix,
22317 + get_key_locality(key),
22318 + get_key_type(key),
22319 + get_key_ordering(key),
22320 + get_key_band(key),
22321 + get_key_objectid(key), get_key_offset(key));
22322 + else
22323 + printk("%s: (%Lx:%x:%Lx:%Lx:%Lx)", prefix,
22324 + get_key_locality(key),
22325 + get_key_type(key),
22326 + get_key_band(key),
22327 + get_key_objectid(key), get_key_offset(key));
22328 + /*
22329 + * if this is a key of directory entry, try to decode part of
22330 + * a name stored in the key, and output it.
22331 + */
22332 + if (get_key_type(key) == KEY_FILE_NAME_MINOR) {
22333 + char buf[DE_NAME_BUF_LEN];
22334 + char *c;
22335 +
22336 + c = buf;
22337 + c = reiser4_unpack_string(get_key_ordering(key), c);
22338 + reiser4_unpack_string(get_key_fulloid(key), c);
22339 + printk("[%s", buf);
22340 + if (is_longname_key(key))
22341 + /*
22342 + * only part of the name is stored in the key.
22343 + */
22344 + printk("...]\n");
22345 + else {
22346 + /*
22347 + * whole name is stored in the key.
22348 + */
22349 + reiser4_unpack_string(get_key_offset(key), buf);
22350 + printk("%s]\n", buf);
22351 + }
22352 + } else {
22353 + printk("[%s]\n", type_name(get_key_type(key)));
22354 + }
22355 + }
22356 + /* turn bold off */
22357 + /* printf ("\033[m\017"); */
22358 +}
22359 +
22360 +#endif
22361 +
22362 +/* Make Linus happy.
22363 + Local variables:
22364 + c-indentation-style: "K&R"
22365 + mode-name: "LC"
22366 + c-basic-offset: 8
22367 + tab-width: 8
22368 + fill-column: 120
22369 + End:
22370 +*/
22371 diff --git a/fs/reiser4/key.h b/fs/reiser4/key.h
22372 new file mode 100644
22373 index 0000000..3f6b47e
22374 --- /dev/null
22375 +++ b/fs/reiser4/key.h
22376 @@ -0,0 +1,384 @@
22377 +/* Copyright 2000, 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22378 +
22379 +/* Declarations of key-related data-structures and operations on keys. */
22380 +
22381 +#if !defined( __REISER4_KEY_H__ )
22382 +#define __REISER4_KEY_H__
22383 +
22384 +#include "dformat.h"
22385 +#include "forward.h"
22386 +#include "debug.h"
22387 +
22388 +#include <linux/types.h> /* for __u?? */
22389 +
22390 +/* Operations on keys in reiser4 tree */
22391 +
22392 +/* No access to any of these fields shall be done except via a
22393 + wrapping macro/function, and that wrapping macro/function shall
22394 +   convert to little endian order. Key comparisons are done in CPU byte order. */
22395 +
22396 +/* A storage layer implementation difference between a regular unix file body and its attributes is in the typedef below
22397 + which causes all of the attributes of a file to be near in key to all of the other attributes for all of the files
22398 + within that directory, and not near to the file itself. It is interesting to consider whether this is the wrong
22399 + approach, and whether there should be no difference at all. For current usage patterns this choice is probably the
22400 + right one. */
22401 +
22402 +/* possible values for minor packing locality (4 bits required) */
22403 +typedef enum {
22404 + /* file name */
22405 + KEY_FILE_NAME_MINOR = 0,
22406 + /* stat-data */
22407 + KEY_SD_MINOR = 1,
22408 + /* file attribute name */
22409 + KEY_ATTR_NAME_MINOR = 2,
22410 + /* file attribute value */
22411 + KEY_ATTR_BODY_MINOR = 3,
22412 + /* file body (tail or extent) */
22413 + KEY_BODY_MINOR = 4,
22414 +} key_minor_locality;
22415 +
22416 +/* everything stored in the tree has a unique key, which means that the tree is (logically) fully ordered by key.
22417 + Physical order is determined by dynamic heuristics that attempt to reflect key order when allocating available space,
22418 + and by the repacker. It is stylistically better to put aggregation information into the key. Thus, if you want to
22419 + segregate extents from tails, it is better to give them distinct minor packing localities rather than changing
22420 + block_alloc.c to check the node type when deciding where to allocate the node.
22421 +
22422 + The need to randomly displace new directories and large files disturbs this symmetry unfortunately. However, it
22423 + should be noted that this is a need that is not clearly established given the existence of a repacker. Also, in our
22424 + current implementation tails have a different minor packing locality from extents, and no files have both extents and
22425 + tails, so maybe symmetry can be had without performance cost after all. Symmetry is what we ship for now....
22426 +*/
22427 +
22428 +/* Arbitrary major packing localities can be assigned to objects using
22429 + the reiser4(filenameA/..packing<=some_number) system call.
22430 +
22431 + In reiser4, the creat() syscall creates a directory
22432 +
22433 + whose default flow (that which is referred to if the directory is
22434 + read as a file) is the traditional unix file body.
22435 +
22436 + whose directory plugin is the 'filedir'
22437 +
22438 + whose major packing locality is that of the parent of the object created.
22439 +
22440 + The static_stat item is a particular commonly used directory
22441 + compression (the one for normal unix files).
22442 +
22443 + The filedir plugin checks to see if the static_stat item exists.
22444 + There is a unique key for static_stat. If yes, then it uses the
22445 + static_stat item for all of the values that it contains. The
22446 + static_stat item contains a flag for each stat it contains which
22447 + indicates whether one should look outside the static_stat item for its
22448 + contents.
22449 +*/
22450 +
22451 +/* offset of fields in reiser4_key. Value of each element of this enum
22452 + is index within key (thought as array of __u64's) where this field
22453 + is. */
22454 +typedef enum {
22455 + /* major "locale", aka dirid. Sits in 1st element */
22456 + KEY_LOCALITY_INDEX = 0,
22457 + /* minor "locale", aka item type. Sits in 1st element */
22458 + KEY_TYPE_INDEX = 0,
22459 + ON_LARGE_KEY(KEY_ORDERING_INDEX,)
22460 + /* "object band". Sits in 2nd element */
22461 + KEY_BAND_INDEX,
22462 + /* objectid. Sits in 2nd element */
22463 + KEY_OBJECTID_INDEX = KEY_BAND_INDEX,
22464 + /* full objectid. Sits in 2nd element */
22465 + KEY_FULLOID_INDEX = KEY_BAND_INDEX,
22466 + /* Offset. Sits in 3rd element */
22467 + KEY_OFFSET_INDEX,
22468 + /* Name hash. Sits in 3rd element */
22469 + KEY_HASH_INDEX = KEY_OFFSET_INDEX,
22470 + KEY_CACHELINE_END = KEY_OFFSET_INDEX,
22471 + KEY_LAST_INDEX
22472 +} reiser4_key_field_index;
22473 +
22474 +/* key in reiser4 internal "balanced" tree. It is just array of three
22475 + 64bit integers in disk byte order (little-endian by default). This
22476 + array is actually indexed by reiser4_key_field. Each __u64 within
22477 + this array is called "element". Logical key component encoded within
22478 + elements are called "fields".
22479 +
22480 + We declare this as union with second component dummy to suppress
22481 + inconvenient array<->pointer casts implied in C. */
22482 +union reiser4_key {
22483 + __le64 el[KEY_LAST_INDEX];
22484 + int pad;
22485 +};
22486 +
22487 +/* bitmasks showing where within reiser4_key particular key is stored. */
22488 +/* major locality occupies higher 60 bits of the first element */
22489 +#define KEY_LOCALITY_MASK 0xfffffffffffffff0ull
22490 +
22491 +/* minor locality occupies lower 4 bits of the first element */
22492 +#define KEY_TYPE_MASK 0xfull
22493 +
22494 +/* controversial band occupies higher 4 bits of the 2nd element */
22495 +#define KEY_BAND_MASK 0xf000000000000000ull
22496 +
22497 +/* objectid occupies lower 60 bits of the 2nd element */
22498 +#define KEY_OBJECTID_MASK 0x0fffffffffffffffull
22499 +
22500 +/* full 64bit objectid*/
22501 +#define KEY_FULLOID_MASK 0xffffffffffffffffull
22502 +
22503 +/* offset occupies the whole 3rd element */
22504 +#define KEY_OFFSET_MASK 0xffffffffffffffffull
22505 +
22506 +/* ordering is whole second element */
22507 +#define KEY_ORDERING_MASK 0xffffffffffffffffull
22508 +
22509 +/* how far (in bits) each field is shifted within its key element: left on store, right on load */
22510 +typedef enum {
22511 + KEY_LOCALITY_SHIFT = 4,
22512 + KEY_TYPE_SHIFT = 0,
22513 + KEY_BAND_SHIFT = 60,
22514 + KEY_OBJECTID_SHIFT = 0,
22515 + KEY_FULLOID_SHIFT = 0,
22516 + KEY_OFFSET_SHIFT = 0,
22517 + KEY_ORDERING_SHIFT = 0,
22518 +} reiser4_key_field_shift;
22519 +
22520 +static inline __u64
22521 +get_key_el(const reiser4_key * key, reiser4_key_field_index off)
22522 +{
22523 + assert("nikita-753", key != NULL);
22524 + assert("nikita-754", off < KEY_LAST_INDEX);
22525 + return le64_to_cpu(get_unaligned(&key->el[off]));
22526 +}
22527 +
22528 +static inline void
22529 +set_key_el(reiser4_key * key, reiser4_key_field_index off, __u64 value)
22530 +{
22531 + assert("nikita-755", key != NULL);
22532 + assert("nikita-756", off < KEY_LAST_INDEX);
22533 + put_unaligned(cpu_to_le64(value), &key->el[off]);
22534 +}
22535 +
22536 +/* macro to define getter and setter functions for field F with type T */
22537 +#define DEFINE_KEY_FIELD( L, U, T ) \
22538 +static inline T get_key_ ## L ( const reiser4_key *key ) \
22539 +{ \
22540 + assert( "nikita-750", key != NULL ); \
22541 + return ( T ) ( get_key_el( key, KEY_ ## U ## _INDEX ) & \
22542 + KEY_ ## U ## _MASK ) >> KEY_ ## U ## _SHIFT; \
22543 +} \
22544 + \
22545 +static inline void set_key_ ## L ( reiser4_key *key, T loc ) \
22546 +{ \
22547 + __u64 el; \
22548 + \
22549 + assert( "nikita-752", key != NULL ); \
22550 + \
22551 + el = get_key_el( key, KEY_ ## U ## _INDEX ); \
22552 + /* clear field bits in the key */ \
22553 + el &= ~KEY_ ## U ## _MASK; \
22554 + /* actually it should be \
22555 + \
22556 + el |= ( loc << KEY_ ## U ## _SHIFT ) & KEY_ ## U ## _MASK; \
22557 + \
22558 + but we trust user to never pass values that wouldn't fit \
22559 + into field. Clearing extra bits is one operation, but this \
22560 + function is time-critical. \
22561 + But check this in assertion. */ \
22562 + assert( "nikita-759", ( ( loc << KEY_ ## U ## _SHIFT ) & \
22563 + ~KEY_ ## U ## _MASK ) == 0 ); \
22564 + el |= ( loc << KEY_ ## U ## _SHIFT ); \
22565 + set_key_el( key, KEY_ ## U ## _INDEX, el ); \
22566 +}
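For reference, this is (modulo whitespace) what the DEFINE_KEY_FIELD(band, BAND, __u64) invocation below expands to; nothing here is new, it is just the macro above written out for one field:

static inline __u64 get_key_band(const reiser4_key *key)
{
	assert("nikita-750", key != NULL);
	return (__u64)(get_key_el(key, KEY_BAND_INDEX) &
		       KEY_BAND_MASK) >> KEY_BAND_SHIFT;
}

static inline void set_key_band(reiser4_key *key, __u64 loc)
{
	__u64 el;

	assert("nikita-752", key != NULL);
	el = get_key_el(key, KEY_BAND_INDEX);
	/* clear field bits in the key */
	el &= ~KEY_BAND_MASK;
	assert("nikita-759",
	       ((loc << KEY_BAND_SHIFT) & ~KEY_BAND_MASK) == 0);
	el |= (loc << KEY_BAND_SHIFT);
	set_key_el(key, KEY_BAND_INDEX, el);
}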
22567 +
22568 +typedef __u64 oid_t;
22569 +
22570 +/* define get_key_locality(), set_key_locality() */
22571 +DEFINE_KEY_FIELD(locality, LOCALITY, oid_t);
22572 +/* define get_key_type(), set_key_type() */
22573 +DEFINE_KEY_FIELD(type, TYPE, key_minor_locality);
22574 +/* define get_key_band(), set_key_band() */
22575 +DEFINE_KEY_FIELD(band, BAND, __u64);
22576 +/* define get_key_objectid(), set_key_objectid() */
22577 +DEFINE_KEY_FIELD(objectid, OBJECTID, oid_t);
22578 +/* define get_key_fulloid(), set_key_fulloid() */
22579 +DEFINE_KEY_FIELD(fulloid, FULLOID, oid_t);
22580 +/* define get_key_offset(), set_key_offset() */
22581 +DEFINE_KEY_FIELD(offset, OFFSET, __u64);
22582 +#if (REISER4_LARGE_KEY)
22583 +/* define get_key_ordering(), set_key_ordering() */
22584 +DEFINE_KEY_FIELD(ordering, ORDERING, __u64);
22585 +#else
22586 +static inline __u64 get_key_ordering(const reiser4_key * key)
22587 +{
22588 + return 0;
22589 +}
22590 +
22591 +static inline void set_key_ordering(reiser4_key * key, __u64 val)
22592 +{
22593 +}
22594 +#endif
22595 +
22596 +/* key comparison result */
22597 +typedef enum { LESS_THAN = -1, /* if first key is less than second */
22598 + EQUAL_TO = 0, /* if keys are equal */
22599 + GREATER_THAN = +1 /* if first key is greater than second */
22600 +} cmp_t;
22601 +
22602 +void reiser4_key_init(reiser4_key * key);
22603 +
22604 +/* minimal and maximal possible keys in the tree. Return pointers to static storage. */
22605 +extern const reiser4_key *reiser4_min_key(void);
22606 +extern const reiser4_key *reiser4_max_key(void);
22607 +
22608 +/* helper macro for keycmp() */
22609 +#define KEY_DIFF(k1, k2, field) \
22610 +({ \
22611 + typeof (get_key_ ## field (k1)) f1; \
22612 + typeof (get_key_ ## field (k2)) f2; \
22613 + \
22614 + f1 = get_key_ ## field (k1); \
22615 + f2 = get_key_ ## field (k2); \
22616 + \
22617 + (f1 < f2) ? LESS_THAN : ((f1 == f2) ? EQUAL_TO : GREATER_THAN); \
22618 +})
22619 +
22620 +/* helper macro for keycmp() */
22621 +#define KEY_DIFF_EL(k1, k2, off) \
22622 +({ \
22623 + __u64 e1; \
22624 + __u64 e2; \
22625 + \
22626 + e1 = get_key_el(k1, off); \
22627 + e2 = get_key_el(k2, off); \
22628 + \
22629 + (e1 < e2) ? LESS_THAN : ((e1 == e2) ? EQUAL_TO : GREATER_THAN); \
22630 +})
22631 +
22632 +/* compare `k1' and `k2'. This function is the heart of the "key allocation
22633 +   policy". All you need to do to implement a new policy is to add yet
22634 +   another clause here. */
22635 +static inline cmp_t keycmp(const reiser4_key * k1 /* first key to compare */ ,
22636 + const reiser4_key * k2 /* second key to compare */ )
22637 +{
22638 + cmp_t result;
22639 +
22640 + /*
22641 + * This function is the heart of reiser4 tree-routines. Key comparison
22642 + * is among most heavily used operations in the file system.
22643 + */
22644 +
22645 + assert("nikita-439", k1 != NULL);
22646 + assert("nikita-440", k2 != NULL);
22647 +
22648 +	/* there is no actual branch here: the condition is a compile-time
22649 +	 * constant, and constant folding and propagation ensure that only one
22650 +	 * branch is actually compiled in. */
22651 +
22652 + if (REISER4_PLANA_KEY_ALLOCATION) {
22653 + /* if physical order of fields in a key is identical
22654 + with logical order, we can implement key comparison
22655 + as three 64bit comparisons. */
22656 + /* logical order of fields in plan-a:
22657 + locality->type->objectid->offset. */
22658 + /* compare locality and type at once */
22659 + result = KEY_DIFF_EL(k1, k2, 0);
22660 + if (result == EQUAL_TO) {
22661 + /* compare objectid (and band if it's there) */
22662 + result = KEY_DIFF_EL(k1, k2, 1);
22663 + /* compare offset */
22664 + if (result == EQUAL_TO) {
22665 + result = KEY_DIFF_EL(k1, k2, 2);
22666 + if (REISER4_LARGE_KEY && result == EQUAL_TO) {
22667 + result = KEY_DIFF_EL(k1, k2, 3);
22668 + }
22669 + }
22670 + }
22671 + } else if (REISER4_3_5_KEY_ALLOCATION) {
22672 + result = KEY_DIFF(k1, k2, locality);
22673 + if (result == EQUAL_TO) {
22674 + result = KEY_DIFF(k1, k2, objectid);
22675 + if (result == EQUAL_TO) {
22676 + result = KEY_DIFF(k1, k2, type);
22677 + if (result == EQUAL_TO)
22678 + result = KEY_DIFF(k1, k2, offset);
22679 + }
22680 + }
22681 + } else
22682 + impossible("nikita-441", "Unknown key allocation scheme!");
22683 + return result;
22684 +}
22685 +
22686 +/* true if @k1 equals @k2 */
22687 +static inline int keyeq(const reiser4_key * k1 /* first key to compare */ ,
22688 + const reiser4_key * k2 /* second key to compare */ )
22689 +{
22690 + assert("nikita-1879", k1 != NULL);
22691 + assert("nikita-1880", k2 != NULL);
22692 + return !memcmp(k1, k2, sizeof *k1);
22693 +}
22694 +
22695 +/* true if @k1 is less than @k2 */
22696 +static inline int keylt(const reiser4_key * k1 /* first key to compare */ ,
22697 + const reiser4_key * k2 /* second key to compare */ )
22698 +{
22699 + assert("nikita-1952", k1 != NULL);
22700 + assert("nikita-1953", k2 != NULL);
22701 + return keycmp(k1, k2) == LESS_THAN;
22702 +}
22703 +
22704 +/* true if @k1 is less than or equal to @k2 */
22705 +static inline int keyle(const reiser4_key * k1 /* first key to compare */ ,
22706 + const reiser4_key * k2 /* second key to compare */ )
22707 +{
22708 + assert("nikita-1954", k1 != NULL);
22709 + assert("nikita-1955", k2 != NULL);
22710 + return keycmp(k1, k2) != GREATER_THAN;
22711 +}
22712 +
22713 +/* true if @k1 is greater than @k2 */
22714 +static inline int keygt(const reiser4_key * k1 /* first key to compare */ ,
22715 + const reiser4_key * k2 /* second key to compare */ )
22716 +{
22717 + assert("nikita-1959", k1 != NULL);
22718 + assert("nikita-1960", k2 != NULL);
22719 + return keycmp(k1, k2) == GREATER_THAN;
22720 +}
22721 +
22722 +/* true if @k1 is greater than or equal to @k2 */
22723 +static inline int keyge(const reiser4_key * k1 /* first key to compare */ ,
22724 + const reiser4_key * k2 /* second key to compare */ )
22725 +{
22726 + assert("nikita-1956", k1 != NULL);
22727 + assert("nikita-1957", k2 != NULL); /* October 4: sputnik launched
22728 + * November 3: Laika */
22729 + return keycmp(k1, k2) != LESS_THAN;
22730 +}
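A minimal usage sketch for the comparison helpers above (a fragment assumed to live inside a function; the locality values are made up and the assertion labels are hypothetical):

	reiser4_key k1, k2;

	reiser4_key_init(&k1);
	reiser4_key_init(&k2);
	set_key_locality(&k1, (oid_t)42);
	set_key_locality(&k2, (oid_t)43);

	assert("sketch-1", keylt(&k1, &k2));	/* k1 sorts before k2 */
	assert("sketch-2", keyle(&k1, &k2));
	assert("sketch-3", !keyeq(&k1, &k2));	/* not bitwise identical */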
22731 +
22732 +static inline void prefetchkey(reiser4_key * key)
22733 +{
22734 + prefetch(key);
22735 + prefetch(&key->el[KEY_CACHELINE_END]);
22736 +}
22737 +
22738 +/* (%Lx:%x:%Lx:%Lx:%Lx:%Lx) =
22739 + 1 + 16 + 1 + 1 + 1 + 1 + 1 + 16 + 1 + 16 + 1 + 16 + 1 */
22740 +/* size of a buffer suitable to hold human readable key representation */
22741 +#define KEY_BUF_LEN (80)
22742 +
22743 +#if REISER4_DEBUG
22744 +extern void reiser4_print_key(const char *prefix, const reiser4_key * key);
22745 +#else
22746 +#define reiser4_print_key(p,k) noop
22747 +#endif
22748 +
22749 +/* __FS_REISERFS_KEY_H__ */
22750 +#endif
22751 +
22752 +/* Make Linus happy.
22753 + Local variables:
22754 + c-indentation-style: "K&R"
22755 + mode-name: "LC"
22756 + c-basic-offset: 8
22757 + tab-width: 8
22758 + fill-column: 120
22759 + End:
22760 +*/
22761 diff --git a/fs/reiser4/ktxnmgrd.c b/fs/reiser4/ktxnmgrd.c
22762 new file mode 100644
22763 index 0000000..15bb6d6
22764 --- /dev/null
22765 +++ b/fs/reiser4/ktxnmgrd.c
22766 @@ -0,0 +1,215 @@
22767 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
22768 +/* Transaction manager daemon. */
22769 +
22770 +/*
22771 + * ktxnmgrd is a kernel daemon responsible for committing transactions. It is
22772 + * needed/important for the following reasons:
22773 + *
22774 + * 1. in reiser4 an atom is not committed immediately when the last
22775 + * transaction handle closes, unless the atom is either too old or too
22776 + * large (see atom_should_commit()). This is done to avoid committing too
22777 + * frequently, and also because:
22778 + *
22779 + * 2. sometimes we don't want to commit an atom when closing the last
22780 + * transaction handle even if it is old and fat enough. For example,
22781 + * because at this point we are under the directory semaphore, and
22782 + * committing would stall all accesses to this directory.
22783 + *
22784 + * ktxnmgrd spends its time sleeping on a wait queue. When it wakes up,
22785 + * either due to the (tunable) timeout or because it was explicitly woken
22786 + * up by a call to ktxnmgrd_kick(), it scans the list of all atoms and
22787 + * commits the eligible ones.
22788 + *
22789 + */
22790 +
22791 +#include "debug.h"
22792 +#include "txnmgr.h"
22793 +#include "tree.h"
22794 +#include "ktxnmgrd.h"
22795 +#include "super.h"
22796 +#include "reiser4.h"
22797 +
22798 +#include <linux/sched.h> /* for struct task_struct */
22799 +#include <linux/wait.h>
22800 +#include <linux/suspend.h>
22801 +#include <linux/kernel.h>
22802 +#include <linux/writeback.h>
22803 +#include <linux/kthread.h>
22804 +#include <linux/freezer.h>
22805 +
22806 +static int scan_mgr(struct super_block *);
22807 +
22808 +/*
22809 + * change current->comm so that ps, top, and friends will see changed
22810 + * state. This serves no useful purpose whatsoever, but also costs nothing.
22811 + * Maybe it will make a lonely system administrator feel less alone at 3 A.M.
22812 + */
22813 +#define set_comm( state ) \
22814 + snprintf( current -> comm, sizeof( current -> comm ), \
22815 + "%s:%s:%s", __FUNCTION__, (super)->s_id, ( state ) )
22816 +
22817 +/**
22818 + * ktxnmgrd - kernel txnmgr daemon
22819 + * @arg: pointer to super block
22820 + *
22821 + * The background transaction manager daemon, started as a kernel thread during
22822 + * reiser4 initialization.
22823 + */
22824 +static int ktxnmgrd(void *arg)
22825 +{
22826 + struct super_block *super;
22827 + ktxnmgrd_context *ctx;
22828 + txn_mgr *mgr;
22829 + int done = 0;
22830 +
22831 + super = arg;
22832 + mgr = &get_super_private(super)->tmgr;
22833 +
22834 + /*
22835 + * do_fork() just copies task_struct into the new thread. ->fs_context
22836 + * shouldn't be copied of course. This shouldn't be a problem for the
22837 + * rest of the code though.
22838 + */
22839 + current->journal_info = NULL;
22840 + ctx = mgr->daemon;
22841 + while (1) {
22842 + try_to_freeze();
22843 + set_comm("wait");
22844 + {
22845 + DEFINE_WAIT(__wait);
22846 +
22847 + prepare_to_wait(&ctx->wait, &__wait, TASK_INTERRUPTIBLE);
22848 + if (kthread_should_stop()) {
22849 + done = 1;
22850 + } else
22851 + schedule_timeout(ctx->timeout);
22852 + finish_wait(&ctx->wait, &__wait);
22853 + }
22854 + if (done)
22855 + break;
22856 + set_comm("run");
22857 + spin_lock(&ctx->guard);
22858 + /*
22859 + * wait timed out or ktxnmgrd was woken up by explicit request
22860 + * to commit something. Scan list of atoms in txnmgr and look
22861 + * for too old atoms.
22862 + */
22863 + do {
22864 +			ctx->rescan = 0;
22865 +			spin_unlock(&ctx->guard);
22866 +			scan_mgr(super);
22867 +			spin_lock(&ctx->guard);
22868 +			/*
22869 +			 * if ->rescan was set while the guard spinlock was
22870 +			 * released, the atom list could have been modified;
22871 +			 * in that case, repeat scanning from the beginning
22872 +			 */
22873 +		} while (ctx->rescan);
22876 + spin_unlock(&ctx->guard);
22877 + }
22878 + return 0;
22879 +}
22880 +
22881 +#undef set_comm
22882 +
22883 +/**
22884 + * reiser4_init_ktxnmgrd - initialize ktxnmgrd context and start kernel daemon
22885 + * @super: pointer to super block
22886 + *
22887 + * Allocates and initializes ktxnmgrd_context, attaches it to transaction
22888 + * manager. Starts kernel txnmgr daemon. This is called on mount.
22889 + */
22890 +int reiser4_init_ktxnmgrd(struct super_block *super)
22891 +{
22892 + txn_mgr *mgr;
22893 + ktxnmgrd_context *ctx;
22894 +
22895 + mgr = &get_super_private(super)->tmgr;
22896 +
22897 + assert("zam-1014", mgr->daemon == NULL);
22898 +
22899 + ctx = kmalloc(sizeof(ktxnmgrd_context), reiser4_ctx_gfp_mask_get());
22900 + if (ctx == NULL)
22901 + return RETERR(-ENOMEM);
22902 +
22903 + assert("nikita-2442", ctx != NULL);
22904 +
22905 + memset(ctx, 0, sizeof *ctx);
22906 + init_waitqueue_head(&ctx->wait);
22907 +
22908 + /*kcond_init(&ctx->startup);*/
22909 + spin_lock_init(&ctx->guard);
22910 + ctx->timeout = REISER4_TXNMGR_TIMEOUT;
22911 + ctx->rescan = 1;
22912 + mgr->daemon = ctx;
22913 +
22914 + ctx->tsk = kthread_run(ktxnmgrd, super, "ktxnmgrd");
22915 + if (IS_ERR(ctx->tsk)) {
22916 + int ret = PTR_ERR(ctx->tsk);
22917 + mgr->daemon = NULL;
22918 + kfree(ctx);
22919 + return RETERR(ret);
22920 + }
22921 + return 0;
22922 +}
22923 +
22924 +void ktxnmgrd_kick(txn_mgr *mgr)
22925 +{
22926 + assert("nikita-3234", mgr != NULL);
22927 + assert("nikita-3235", mgr->daemon != NULL);
22928 + wake_up(&mgr->daemon->wait);
22929 +}
22930 +
22931 +int is_current_ktxnmgrd(void)
22932 +{
22933 + return (get_current_super_private()->tmgr.daemon->tsk == current);
22934 +}
22935 +
22936 +/**
22937 + * scan_mgr - commit atoms which are to be committed
22938 + * @super: super block to commit atoms of
22939 + *
22940 + * Commits old atoms.
22941 + */
22942 +static int scan_mgr(struct super_block *super)
22943 +{
22944 + int ret;
22945 + reiser4_context ctx;
22946 +
22947 + init_stack_context(&ctx, super);
22948 +
22949 + ret = commit_some_atoms(&get_super_private(super)->tmgr);
22950 +
22951 + reiser4_exit_context(&ctx);
22952 + return ret;
22953 +}
22954 +
22955 +/**
22956 + * reiser4_done_ktxnmgrd - stop kernel thread and free ktxnmgrd context
22957 + * @super: super block whose ktxnmgrd is being stopped
22958 + *
22959 + * This is called on umount. Stops ktxnmgrd and frees its context.
22960 + */
22961 +void reiser4_done_ktxnmgrd(struct super_block *super)
22962 +{
22963 + txn_mgr *mgr;
22964 +
22965 + mgr = &get_super_private(super)->tmgr;
22966 + assert("zam-1012", mgr->daemon != NULL);
22967 +
22968 + kthread_stop(mgr->daemon->tsk);
22969 + kfree(mgr->daemon);
22970 + mgr->daemon = NULL;
22971 +}
22972 +
22973 +/*
22974 + * Local variables:
22975 + * c-indentation-style: "K&R"
22976 + * mode-name: "LC"
22977 + * c-basic-offset: 8
22978 + * tab-width: 8
22979 + * fill-column: 120
22980 + * End:
22981 + */
22982 diff --git a/fs/reiser4/ktxnmgrd.h b/fs/reiser4/ktxnmgrd.h
22983 new file mode 100644
22984 index 0000000..d00f1d9
22985 --- /dev/null
22986 +++ b/fs/reiser4/ktxnmgrd.h
22987 @@ -0,0 +1,52 @@
22988 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
22989 + * reiser4/README */
22990 +
22991 +/* Transaction manager daemon. See ktxnmgrd.c for comments. */
22992 +
22993 +#ifndef __KTXNMGRD_H__
22994 +#define __KTXNMGRD_H__
22995 +
22996 +#include "txnmgr.h"
22997 +
22998 +#include <linux/fs.h>
22999 +#include <linux/wait.h>
23000 +#include <linux/completion.h>
23001 +#include <linux/spinlock.h>
23002 +#include <asm/atomic.h>
23003 +#include <linux/sched.h> /* for struct task_struct */
23004 +
23005 +/* this structure keeps all data necessary to start up, shut down and
23006 + * communicate with ktxnmgrd. */
23007 +struct ktxnmgrd_context {
23008 + /* wait queue head on which ktxnmgrd sleeps */
23009 + wait_queue_head_t wait;
23010 + /* spin lock protecting all fields of this structure */
23011 + spinlock_t guard;
23012 + /* timeout of sleeping on ->wait */
23013 + signed long timeout;
23014 + /* kernel thread running ktxnmgrd */
23015 + struct task_struct *tsk;
23016 + /* list of all file systems served by this ktxnmgrd */
23017 + struct list_head queue;
23018 + /* should ktxnmgrd repeat scanning of atoms? */
23019 + unsigned int rescan:1;
23020 +};
23021 +
23022 +extern int reiser4_init_ktxnmgrd(struct super_block *);
23023 +extern void reiser4_done_ktxnmgrd(struct super_block *);
23024 +
23025 +extern void ktxnmgrd_kick(txn_mgr * mgr);
23026 +extern int is_current_ktxnmgrd(void);
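A condensed lifecycle sketch of how these entry points fit together, based on the mount, commit and umount paths in ktxnmgrd.c (illustrative only; error handling elided):

	/* mount time: allocate the context and start the kthread */
	ret = reiser4_init_ktxnmgrd(super);

	/* when an atom becomes eligible but should not be committed
	   synchronously: wake the daemon up */
	ktxnmgrd_kick(&get_super_private(super)->tmgr);

	/* umount time: stop the kthread and free the context */
	reiser4_done_ktxnmgrd(super);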
23027 +
23028 +/* __KTXNMGRD_H__ */
23029 +#endif
23030 +
23031 +/* Make Linus happy.
23032 + Local variables:
23033 + c-indentation-style: "K&R"
23034 + mode-name: "LC"
23035 + c-basic-offset: 8
23036 + tab-width: 8
23037 + fill-column: 120
23038 + End:
23039 +*/
23040 diff --git a/fs/reiser4/lock.c b/fs/reiser4/lock.c
23041 new file mode 100644
23042 index 0000000..cdca928
23043 --- /dev/null
23044 +++ b/fs/reiser4/lock.c
23045 @@ -0,0 +1,1232 @@
23046 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
23047 + * reiser4/README */
23048 +
23049 +/* Traditional deadlock avoidance is achieved by acquiring all locks in a single
23050 + order. V4 balances the tree from the bottom up, and searches the tree from
23051 + the top down, and that is really the way we want it, so tradition won't work
23052 + for us.
23053 +
23054 + Instead we have two lock orderings, a high priority lock ordering, and a low
23055 + priority lock ordering. Each node in the tree has a lock in its znode.
23056 +
23057 + Suppose we have a set of processes which lock (R/W) tree nodes. Each process
23058 + has a set (maybe empty) of already locked nodes ("process locked set"). Each
23059 + process may have a pending lock request to a node locked by another process.
23060 + Note: we lock and unlock, but do not transfer locks: it is possible
23061 + transferring locks instead would save some bus locking....
23062 +
23063 + Deadlock occurs when we have a loop constructed from process locked sets and
23064 + lock request vectors.
23065 +
23066 + NOTE: The reiser4 "tree" is a tree on disk, but its cached representation in
23067 + memory is extended with "znodes" with which we connect nodes with their left
23068 + and right neighbors using sibling pointers stored in the znodes. When we
23069 + perform balancing operations we often go from left to right and from right to
23070 + left.
23071 +
23072 + +-P1-+ +-P3-+
23073 + |+--+| V1 |+--+|
23074 + ||N1|| -------> ||N3||
23075 + |+--+| |+--+|
23076 + +----+ +----+
23077 + ^ |
23078 + |V2 |V3
23079 + | v
23080 + +---------P2---------+
23081 + |+--+ +--+|
23082 + ||N2| -------- |N4||
23083 + |+--+ +--+|
23084 + +--------------------+
23085 +
23086 + We solve this by ensuring that only low priority processes lock in top to
23087 + bottom order and from right to left, and high priority processes lock from
23088 + bottom to top and left to right.
23089 +
23090 + ZAM-FIXME-HANS: order not just node locks in this way, order atom locks, and
23091 + kill those damn busy loops.
23092 + ANSWER(ZAM): atom locks (which are introduced by ASTAGE_CAPTURE_WAIT atom
23093 + stage) cannot be ordered that way. There are no rules what nodes can belong
23094 + to the atom and what nodes cannot. We cannot define what is right or left
23095 + direction, what is top or bottom. We can take immediate parent or side
23096 + neighbor of one node, but nobody guarantees that, say, left neighbor node is
23097 + not a far right neighbor for other nodes from the same atom. It breaks
23098 + deadlock avoidance rules and hi-low priority locking cannot be applied for
23099 + atom locks.
23100 +
23101 +   How does it help to avoid deadlocks?
23102 +
23103 + Suppose we have a deadlock with n processes. Processes from one priority
23104 + class never deadlock because they take locks in one consistent
23105 + order.
23106 +
23107 + So, any possible deadlock loop must have low priority as well as high
23108 + priority processes. There are no other lock priority levels except low and
23109 + high. We know that any deadlock loop contains at least one node locked by a
23110 + low priority process and requested by a high priority process. If this
23111 + situation is caught and resolved it is sufficient to avoid deadlocks.
23112 +
23113 + V4 DEADLOCK PREVENTION ALGORITHM IMPLEMENTATION.
23114 +
23115 + The deadlock prevention algorithm is based on comparing
23116 + priorities of node owners (processes which keep znode locked) and
23117 + requesters (processes which want to acquire a lock on znode). We
23118 + implement a scheme where low-priority owners yield locks to
23119 + high-priority requesters. We created a signal passing system that
23120 + is used to ask low-priority processes to yield one or more locked
23121 + znodes.
23122 +
23123 + The condition when a znode needs to change its owners is described by the
23124 + following formula:
23125 +
23126 + #############################################
23127 + # #
23128 + # (number of high-priority requesters) > 0 #
23129 + # AND #
23130 +   #  (number of high-priority owners) == 0   #
23131 + # #
23132 + #############################################
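In C this is a one-line predicate; it is implemented verbatim by check_deadlock_condition() later in this file:

	return node->lock.nr_hipri_requests > 0
	       && node->lock.nr_hipri_owners == 0;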
23133 +
23134 + Note that a low-priority process delays node releasing if another
23135 + high-priority process owns this node. So, slightly more strictly speaking,
23136 + to have a deadlock capable cycle you must have a loop in which a high
23137 + priority process is waiting on a low priority process to yield a node, which
23138 + is slightly different from saying a high priority process is waiting on a
23139 + node owned by a low priority process.
23140 +
23141 + It is enough to avoid deadlocks if we prevent any low-priority process from
23142 + falling asleep if its locked set contains a node which satisfies the
23143 + deadlock condition.
23144 +
23145 + That condition is implicitly or explicitly checked in all places where new
23146 + high-priority requests may be added or removed from node request queue or
23147 +   a high-priority process takes or releases a lock on a node. The main
23148 +   goal of these checks is to never miss the moment when a node comes to
23149 +   have "wrong owners", and to send "must-yield-this-lock" signals to its
23150 +   low-pri owners at that time.
23151 +
23152 + The information about received signals is stored in the per-process
23153 + structure (lock stack) and analyzed before a low-priority process goes to
23154 + sleep but after a "fast" attempt to lock a node fails. Any signal wakes
23155 +   sleep, but after a "fast" attempt to lock a node fails. Any signal wakes
23156 +   the sleeping process up and forces it to re-check the lock status and
23157 +   received signal info. If "must-yield-this-lock" signals were received, the locking
23158 +
23159 + V4 LOCKING DRAWBACKS
23160 +
23161 + If we have already balanced on one level, and we are propagating our changes
23162 + upward to a higher level, it could be very messy to surrender all locks on
23163 + the lower level because we put so much computational work into it, and
23164 + reverting them to their state before they were locked might be very complex.
23165 + We also don't want to acquire all locks before performing balancing because
23166 + that would either be almost as much work as the balancing, or it would be
23167 + too conservative and lock too much. We want balancing to be done only at
23168 +   high priority. Yet, we might want to go one node to the left and use some
23169 +   of its empty space... So we make one attempt at getting the node to the left
23170 +   using try_lock, and if that fails we do without it, because we didn't really
23171 +   need it; it was only nice to have.
23172 +
23173 + LOCK STRUCTURES DESCRIPTION
23174 +
23175 + The following data structures are used in the reiser4 locking
23176 + implementation:
23177 +
23178 + All fields related to long-term locking are stored in znode->lock.
23179 +
23180 + The lock stack is a per thread object. It owns all znodes locked by the
23181 + thread. One znode may be locked by several threads in case of read lock or
23182 + one znode may be write locked by one thread several times. The special link
23183 + objects (lock handles) support n<->m relation between znodes and lock
23184 + owners.
23185 +
23186 + <Thread 1> <Thread 2>
23187 +
23188 + +---------+ +---------+
23189 + | LS1 | | LS2 |
23190 + +---------+ +---------+
23191 + ^ ^
23192 + |---------------+ +----------+
23193 + v v v v
23194 + +---------+ +---------+ +---------+ +---------+
23195 + | LH1 | | LH2 | | LH3 | | LH4 |
23196 + +---------+ +---------+ +---------+ +---------+
23197 + ^ ^ ^ ^
23198 + | +------------+ |
23199 + v v v
23200 + +---------+ +---------+ +---------+
23201 + | Z1 | | Z2 | | Z3 |
23202 + +---------+ +---------+ +---------+
23203 +
23204 + Thread 1 locked znodes Z1 and Z2, thread 2 locked znodes Z2 and Z3. The
23205 + picture above shows that lock stack LS1 has a list of 2 lock handles LH1 and
23206 + LH2, lock stack LS2 has a list with lock handles LH3 and LH4 on it. Znode
23207 + Z1 is locked by only one thread, znode has only one lock handle LH1 on its
23208 + list, similar situation is for Z3 which is locked by the thread 2 only. Z2
23209 + is locked (for read) twice by different threads and two lock handles are on
23210 +   its list. Each lock handle represents a single relation: the locking of a
23211 +   znode by a thread. Locking a znode establishes such a relation between the
23212 +   lock stack and the znode by adding a new lock handle to both lists. The
23213 +   lock stack links all lock handles for all znodes locked by that lock
23214 +   stack. The znode list groups all lock handles for all lock stacks which
23215 +   locked the znode.
23216 +
23217 +   Yet another relation may exist between a znode and lock owners. If the
23218 +   lock procedure cannot immediately take a lock on an object, it adds the
23219 +   lock owner to a special `requestors' list belonging to the znode. That
23220 +   list represents a queue of pending lock requests. Because one lock owner
23221 +   may request only one lock object at a time, this is a 1->n relation
23222 +   between lock objects and lock owners, implemented as described above.
23223 +   Full information (priority, pointers to lock and link objects) about each
23224 +   lock request is stored in the lock owner structure, in its `request' field.
23225 +
23226 + SHORT_TERM LOCKING
23227 +
23228 + This is a list of primitive operations over lock stacks / lock handles /
23229 + znodes and locking descriptions for them.
23230 +
23231 +   1. locking / unlocking, which is done by two list insertions/deletions: one
23232 +      to/from the znode's list of lock handles, the other to/from the lock
23233 +      stack's list of lock handles. The first insertion is protected by the
23234 +      znode->lock.guard spinlock. The list owned by the lock stack can be
23235 +      modified only by the thread that owns the lock stack, and nobody else
23236 +      can modify/read it, so there is nothing there to protect with a
23237 +      spinlock or anything else.
23238 +
23239 +   2. adding/removing a lock request to/from the znode's requestors list. The
23240 +      rule is that the znode->lock.guard spinlock must be taken for this.
23241 +
23242 +   3. we can traverse the list of lock handles and use references to the lock
23243 +      stacks that locked a given znode only while the znode->lock.guard
23244 +      spinlock is taken.
23244 +
23245 +   4. If a lock stack is associated with a znode as a lock requestor or lock
23246 +      owner, its existence is guaranteed by the znode->lock.guard spinlock.
23247 +      Some of its (the lock stack's) fields must be protected from being
23248 +      accessed in parallel by two or more threads. Please look at the
23249 +      lock_stack structure definition for info on how those fields are
23250 +      protected. */
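As a reading aid, here is a sketch of the link object described above, with fields inferred from how they are used later in this file (the authoritative definition lives in lock.h):

	struct lock_handle_sketch {
		lock_stack *owner;	/* lock stack of the locking thread */
		znode *node;		/* znode this handle locks */
		int signaled;		/* "must-yield-this-lock" received */
		struct list_head locks_link;	/* on owner->locks */
		struct list_head owners_link;	/* on node->lock.owners */
	};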
23250 +
23251 +/* Znode lock and capturing intertwining. */
23252 +/* In the current implementation we capture formatted nodes before locking
23253 +   them. Take a look at longterm_lock_znode(): the reiser4_try_capture()
23254 +   request precedes the locking request. The longterm_lock_znode function
23255 +   unconditionally captures the znode before even checking the locking
23256 +   conditions.
23256 +
23257 + Another variant is to capture znode after locking it. It was not tested, but
23258 + at least one deadlock condition is supposed to be there. One thread has
23259 + locked a znode (Node-1) and calls reiser4_try_capture() for it.
23260 + reiser4_try_capture() sleeps because znode's atom has CAPTURE_WAIT state.
23261 + Second thread is a flushing thread, its current atom is the atom Node-1
23262 + belongs to. Second thread wants to lock Node-1 and sleeps because Node-1
23263 + is locked by the first thread. The described situation is a deadlock. */
23264 +
23265 +#include "debug.h"
23266 +#include "txnmgr.h"
23267 +#include "znode.h"
23268 +#include "jnode.h"
23269 +#include "tree.h"
23270 +#include "plugin/node/node.h"
23271 +#include "super.h"
23272 +
23273 +#include <linux/spinlock.h>
23274 +
23275 +#if REISER4_DEBUG
23276 +static int request_is_deadlock_safe(znode *, znode_lock_mode,
23277 + znode_lock_request);
23278 +#endif
23279 +
23280 +/* Returns a lock owner associated with current thread */
23281 +lock_stack *get_current_lock_stack(void)
23282 +{
23283 + return &get_current_context()->stack;
23284 +}
23285 +
23286 +/* Wakes up all low priority owners informing them about possible deadlock */
23287 +static void wake_up_all_lopri_owners(znode * node)
23288 +{
23289 + lock_handle *handle;
23290 +
23291 + assert_spin_locked(&(node->lock.guard));
23292 + list_for_each_entry(handle, &node->lock.owners, owners_link) {
23293 + assert("nikita-1832", handle->node == node);
23294 + /* count this signal in owner->nr_signaled */
23295 + if (!handle->signaled) {
23296 + handle->signaled = 1;
23297 + atomic_inc(&handle->owner->nr_signaled);
23298 + /* Wake up a single process */
23299 + reiser4_wake_up(handle->owner);
23300 + }
23301 + }
23302 +}
23303 +
23304 +/* Adds a lock to a lock owner, which means creating a link to the lock and
23305 +   putting the link into the two lists all links are on (the doubly linked
23306 +   list that forms the lock_stack, and the doubly linked list of links
23307 +   attached to a lock).
23308 +*/
23309 +static inline void
23310 +link_object(lock_handle * handle, lock_stack * owner, znode * node)
23311 +{
23312 + assert("jmacd-810", handle->owner == NULL);
23313 + assert_spin_locked(&(node->lock.guard));
23314 +
23315 + handle->owner = owner;
23316 + handle->node = node;
23317 +
23318 + assert("reiser4-4",
23319 + ergo(list_empty_careful(&owner->locks), owner->nr_locks == 0));
23320 +
23321 + /* add lock handle to the end of lock_stack's list of locks */
23322 + list_add_tail(&handle->locks_link, &owner->locks);
23323 + ON_DEBUG(owner->nr_locks++);
23324 + reiser4_ctx_gfp_mask_set();
23325 +
23326 + /* add lock handle to the head of znode's list of owners */
23327 + list_add(&handle->owners_link, &node->lock.owners);
23328 + handle->signaled = 0;
23329 +}
23330 +
23331 +/* Breaks a relation between a lock and its owner */
23332 +static inline void unlink_object(lock_handle * handle)
23333 +{
23334 + assert("zam-354", handle->owner != NULL);
23335 + assert("nikita-1608", handle->node != NULL);
23336 + assert_spin_locked(&(handle->node->lock.guard));
23337 + assert("nikita-1829", handle->owner == get_current_lock_stack());
23338 + assert("reiser4-5", handle->owner->nr_locks > 0);
23339 +
23340 + /* remove lock handle from lock_stack's list of locks */
23341 + list_del(&handle->locks_link);
23342 + ON_DEBUG(handle->owner->nr_locks--);
23343 + reiser4_ctx_gfp_mask_set();
23344 + assert("reiser4-6",
23345 + ergo(list_empty_careful(&handle->owner->locks),
23346 + handle->owner->nr_locks == 0));
23347 + /* remove lock handle from znode's list of owners */
23348 + list_del(&handle->owners_link);
23349 + /* indicates that lock handle is free now */
23350 + handle->node = NULL;
23351 +#if REISER4_DEBUG
23352 + INIT_LIST_HEAD(&handle->locks_link);
23353 + INIT_LIST_HEAD(&handle->owners_link);
23354 + handle->owner = NULL;
23355 +#endif
23356 +}
23357 +
23358 +/* Actually locks an object knowing that we are able to do this */
23359 +static void lock_object(lock_stack * owner)
23360 +{
23361 + lock_request *request;
23362 + znode *node;
23363 +
23364 + request = &owner->request;
23365 + node = request->node;
23366 + assert_spin_locked(&(node->lock.guard));
23367 + if (request->mode == ZNODE_READ_LOCK) {
23368 + node->lock.nr_readers++;
23369 + } else {
23370 +		/* check that we didn't switch from read to write lock */
23371 +		assert("nikita-1840", node->lock.nr_readers <= 0);
23372 +		/* We allow recursive locking; a node can be write-locked
23373 +		   several times by the same process */
23374 + node->lock.nr_readers--;
23375 + }
23376 +
23377 + link_object(request->handle, owner, node);
23378 +
23379 + if (owner->curpri) {
23380 + node->lock.nr_hipri_owners++;
23381 + }
23382 +}
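The sign convention relied upon by lock_object() above deserves spelling out; the following summary is inferred from this file rather than stated in it:

	/*
	 * zlock.nr_readers encodes the whole long-term lock state:
	 *
	 *   nr_readers > 0  -- read-locked; the value is the number of readers
	 *   nr_readers < 0  -- write-locked; -nr_readers is the write
	 *                      recursion depth of the single owning thread
	 *   nr_readers == 0 -- not long-term locked at all
	 */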
23383 +
23384 +/* Check for recursive write locking */
23385 +static int recursive(lock_stack * owner)
23386 +{
23387 + int ret;
23388 + znode *node;
23389 + lock_handle *lh;
23390 +
23391 + node = owner->request.node;
23392 +
23393 + /* Owners list is not empty for a locked node */
23394 + assert("zam-314", !list_empty_careful(&node->lock.owners));
23395 + assert("nikita-1841", owner == get_current_lock_stack());
23396 + assert_spin_locked(&(node->lock.guard));
23397 +
23398 + lh = list_entry(node->lock.owners.next, lock_handle, owners_link);
23399 + ret = (lh->owner == owner);
23400 +
23401 +	/* Recursive read locking should be done the usual way */
23402 + assert("zam-315", !ret || owner->request.mode == ZNODE_WRITE_LOCK);
23403 + /* mixing of read/write locks is not allowed */
23404 + assert("zam-341", !ret || znode_is_wlocked(node));
23405 +
23406 + return ret;
23407 +}
23408 +
23409 +#if REISER4_DEBUG
23410 +/* Returns true if the lock is held by the calling thread. */
23411 +int znode_is_any_locked(const znode * node)
23412 +{
23413 + lock_handle *handle;
23414 + lock_stack *stack;
23415 + int ret;
23416 +
23417 + if (!znode_is_locked(node)) {
23418 + return 0;
23419 + }
23420 +
23421 + stack = get_current_lock_stack();
23422 +
23423 + spin_lock_stack(stack);
23424 +
23425 + ret = 0;
23426 +
23427 + list_for_each_entry(handle, &stack->locks, locks_link) {
23428 + if (handle->node == node) {
23429 + ret = 1;
23430 + break;
23431 + }
23432 + }
23433 +
23434 + spin_unlock_stack(stack);
23435 +
23436 + return ret;
23437 +}
23438 +
23439 +#endif
23440 +
23441 +/* Returns true if a write lock is held by the calling thread. */
23442 +int znode_is_write_locked(const znode * node)
23443 +{
23444 + lock_stack *stack;
23445 + lock_handle *handle;
23446 +
23447 + assert("jmacd-8765", node != NULL);
23448 +
23449 + if (!znode_is_wlocked(node)) {
23450 + return 0;
23451 + }
23452 +
23453 + stack = get_current_lock_stack();
23454 +
23455 + /*
23456 + * When znode is write locked, all owner handles point to the same lock
23457 + * stack. Get pointer to lock stack from the first lock handle from
23458 + * znode's owner list
23459 + */
23460 + handle = list_entry(node->lock.owners.next, lock_handle, owners_link);
23461 +
23462 + return (handle->owner == stack);
23463 +}
23464 +
23465 +/* This "deadlock" condition is the essential part of the reiser4 locking
23466 +   implementation. The condition is checked explicitly by calling
23467 +   check_deadlock_condition() or implicitly in all places where the znode
23468 +   lock state (set of owners and request queue) is changed. The locking code
23469 +   is designed to use this condition to trigger the procedure of passing an
23470 +   object from low priority owner(s) to high priority one(s).
23471 +
23472 +   The procedure consists of posting an event (setting the
23473 +   lock_handle->signaled flag), counting this event in the nr_signaled field
23474 +   of the owner's lock stack, and waking up the owner's process.
23475 +*/
23476 +static inline int check_deadlock_condition(znode * node)
23477 +{
23478 + assert_spin_locked(&(node->lock.guard));
23479 + return node->lock.nr_hipri_requests > 0
23480 + && node->lock.nr_hipri_owners == 0;
23481 +}
23482 +
23483 +static int check_livelock_condition(znode * node, znode_lock_mode mode)
23484 +{
23485 + zlock * lock = &node->lock;
23486 +
23487 + return mode == ZNODE_READ_LOCK &&
23488 +		lock->nr_readers >= 0 && lock->nr_hipri_write_requests > 0;
23489 +}
23490 +
23491 +/* checks lock/request compatibility */
23492 +static int can_lock_object(lock_stack * owner)
23493 +{
23494 + znode *node = owner->request.node;
23495 +
23496 + assert_spin_locked(&(node->lock.guard));
23497 +
23498 + /* See if the node is disconnected. */
23499 + if (unlikely(ZF_ISSET(node, JNODE_IS_DYING)))
23500 + return RETERR(-EINVAL);
23501 +
23502 + /* Do not ever try to take a lock if we are going in low priority
23503 + direction and a node have a high priority request without high
23504 + priority owners. */
23505 + if (unlikely(!owner->curpri && check_deadlock_condition(node)))
23506 + return RETERR(-E_REPEAT);
23507 + if (unlikely(owner->curpri && check_livelock_condition(node, owner->request.mode)))
23508 + return RETERR(-E_REPEAT);
23509 + if (unlikely(!is_lock_compatible(node, owner->request.mode)))
23510 + return RETERR(-E_REPEAT);
23511 + return 0;
23512 +}
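For the call sites below, a summary of can_lock_object() results, taken directly from the branches above:

	/*
	 *   0          -- request is compatible; caller may take the lock now
	 *   -EINVAL    -- the node is dying (JNODE_IS_DYING); give up on it
	 *   -E_REPEAT  -- taking the lock now would violate the deadlock or
	 *                 livelock condition, or the lock modes conflict;
	 *                 the caller must wait or retry
	 */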
23513 +
23514 +/* Sets high priority for the process. It clears the "signaled" flags
23515 +   because a znode locked by a high-priority process can't satisfy our
23516 +   "deadlock condition". */
23517 +static void set_high_priority(lock_stack * owner)
23518 +{
23519 + assert("nikita-1846", owner == get_current_lock_stack());
23520 + /* Do nothing if current priority is already high */
23521 + if (!owner->curpri) {
23522 + /* We don't need locking for owner->locks list, because, this
23523 + * function is only called with the lock stack of the current
23524 + * thread, and no other thread can play with owner->locks list
23525 + * and/or change ->node pointers of lock handles in this list.
23526 + *
23527 + * (Interrupts also are not involved.)
23528 + */
23529 + lock_handle *item = list_entry(owner->locks.next, lock_handle, locks_link);
23530 + while (&owner->locks != &item->locks_link) {
23531 + znode *node = item->node;
23532 +
23533 + spin_lock_zlock(&node->lock);
23534 +
23535 + node->lock.nr_hipri_owners++;
23536 +
23537 +			/* we can safely set signaled to zero, because the
23538 +			   previous statement (nr_hipri_owners++) guarantees
23539 +			   that signaled will never be set again. */
23540 + item->signaled = 0;
23541 + spin_unlock_zlock(&node->lock);
23542 +
23543 + item = list_entry(item->locks_link.next, lock_handle, locks_link);
23544 + }
23545 + owner->curpri = 1;
23546 + atomic_set(&owner->nr_signaled, 0);
23547 + }
23548 +}
23549 +
23550 +/* Sets low priority for the process. */
23551 +static void set_low_priority(lock_stack * owner)
23552 +{
23553 + assert("nikita-3075", owner == get_current_lock_stack());
23554 + /* Do nothing if current priority is already low */
23555 + if (owner->curpri) {
23556 + /* scan all locks (lock handles) held by @owner, which is
23557 + actually current thread, and check whether we are reaching
23558 + deadlock possibility anywhere.
23559 + */
23560 + lock_handle *handle = list_entry(owner->locks.next, lock_handle, locks_link);
23561 + while (&owner->locks != &handle->locks_link) {
23562 + znode *node = handle->node;
23563 + spin_lock_zlock(&node->lock);
23564 + /* this thread just was hipri owner of @node, so
23565 + nr_hipri_owners has to be greater than zero. */
23566 + assert("nikita-1835", node->lock.nr_hipri_owners > 0);
23567 + node->lock.nr_hipri_owners--;
23568 +			/* If we have the deadlock condition, adjust the
23569 +			   nr_signaled field. It is enough to set the
23570 +			   "signaled" flag only for the current process;
23571 +			   other low-pri owners will be signaled and woken up
23572 +			   after the current process unlocks this object and
23573 +			   a high-priority requestor takes control. */
23574 + if (check_deadlock_condition(node)
23575 + && !handle->signaled) {
23576 + handle->signaled = 1;
23577 + atomic_inc(&owner->nr_signaled);
23578 + }
23579 + spin_unlock_zlock(&node->lock);
23580 + handle = list_entry(handle->locks_link.next, lock_handle, locks_link);
23581 + }
23582 + owner->curpri = 0;
23583 + }
23584 +}
23585 +
23586 +static void remove_lock_request(lock_stack * requestor)
23587 +{
23588 + zlock * lock = &requestor->request.node->lock;
23589 +
23590 + if (requestor->curpri) {
23591 + assert("nikita-1838", lock->nr_hipri_requests > 0);
23592 + lock->nr_hipri_requests--;
23593 + if (requestor->request.mode == ZNODE_WRITE_LOCK)
23594 +			lock->nr_hipri_write_requests--;
23595 + }
23596 + list_del(&requestor->requestors_link);
23597 +}
23598 +
23599 +static void invalidate_all_lock_requests(znode * node)
23600 +{
23601 + lock_stack *requestor, *tmp;
23602 +
23603 + assert_spin_locked(&(node->lock.guard));
23604 +
23605 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23606 + remove_lock_request(requestor);
23607 + requestor->request.ret_code = -EINVAL;
23608 + reiser4_wake_up(requestor);
23609 + requestor->request.mode = ZNODE_NO_LOCK;
23610 + }
23611 +}
23612 +
23613 +static void dispatch_lock_requests(znode * node)
23614 +{
23615 + lock_stack *requestor, *tmp;
23616 +
23617 + assert_spin_locked(&(node->lock.guard));
23618 +
23619 + list_for_each_entry_safe(requestor, tmp, &node->lock.requestors, requestors_link) {
23620 + if (znode_is_write_locked(node))
23621 + break;
23622 +		if (can_lock_object(requestor) == 0) { /* 0: can lock now */
23623 + lock_object(requestor);
23624 + remove_lock_request(requestor);
23625 + requestor->request.ret_code = 0;
23626 + reiser4_wake_up(requestor);
23627 + requestor->request.mode = ZNODE_NO_LOCK;
23628 + }
23629 + }
23630 +}
23631 +
23632 +/* release long-term lock, acquired by longterm_lock_znode() */
23633 +void longterm_unlock_znode(lock_handle * handle)
23634 +{
23635 + znode *node = handle->node;
23636 + lock_stack *oldowner = handle->owner;
23637 + int hipri;
23638 + int readers;
23639 + int rdelta;
23640 + int youdie;
23641 +
23642 + /*
23643 + * this is time-critical and highly optimized code. Modify carefully.
23644 + */
23645 +
23646 + assert("jmacd-1021", handle != NULL);
23647 + assert("jmacd-1022", handle->owner != NULL);
23648 + assert("nikita-1392", LOCK_CNT_GTZ(long_term_locked_znode));
23649 +
23650 + assert("zam-130", oldowner == get_current_lock_stack());
23651 +
23652 + LOCK_CNT_DEC(long_term_locked_znode);
23653 +
23654 + /*
23655 + * to minimize amount of operations performed under lock, pre-compute
23656 + * all variables used within critical section. This makes code
23657 + * obscure.
23658 + */
23659 +
23660 + /* was this lock of hi or lo priority */
23661 + hipri = oldowner->curpri ? 1 : 0;
23662 + /* number of readers */
23663 + readers = node->lock.nr_readers;
23664 + /* +1 if write lock, -1 if read lock */
23665 + rdelta = (readers > 0) ? -1 : +1;
23666 + /* true if node is to die and write lock is released */
23667 + youdie = ZF_ISSET(node, JNODE_HEARD_BANSHEE) && (readers < 0);
23668 +
23669 + spin_lock_zlock(&node->lock);
23670 +
23671 + assert("zam-101", znode_is_locked(node));
23672 +
23673 + /* Adjust a number of high priority owners of this lock */
23674 + assert("nikita-1836", node->lock.nr_hipri_owners >= hipri);
23675 + node->lock.nr_hipri_owners -= hipri;
23676 +
23677 + /* Handle znode deallocation on last write-lock release. */
23678 + if (znode_is_wlocked_once(node)) {
23679 + if (youdie) {
23680 + forget_znode(handle);
23681 + assert("nikita-2191", znode_invariant(node));
23682 + zput(node);
23683 + return;
23684 + }
23685 + }
23686 +
23687 + if (handle->signaled)
23688 + atomic_dec(&oldowner->nr_signaled);
23689 +
23690 + /* Unlocking means owner<->object link deletion */
23691 + unlink_object(handle);
23692 +
23693 + /* This is enough to be sure whether an object is completely
23694 + unlocked. */
23695 + node->lock.nr_readers += rdelta;
23696 +
23697 + /* If the node is locked it must have an owners list. Likewise, if
23698 + the node is unlocked it must have an empty owners list. */
23699 + assert("zam-319", equi(znode_is_locked(node),
23700 + !list_empty_careful(&node->lock.owners)));
23701 +
23702 +#if REISER4_DEBUG
23703 + if (!znode_is_locked(node))
23704 + ++node->times_locked;
23705 +#endif
23706 +
23707 + /* If there are pending lock requests we wake up a requestor */
23708 + if (!znode_is_wlocked(node))
23709 + dispatch_lock_requests(node);
23710 + if (check_deadlock_condition(node))
23711 + wake_up_all_lopri_owners(node);
23712 + spin_unlock_zlock(&node->lock);
23713 +
23714 + /* minus one reference from handle->node */
23715 + assert("nikita-2190", znode_invariant(node));
23716 + ON_DEBUG(check_lock_data());
23717 + ON_DEBUG(check_lock_node_data(node));
23718 + zput(node);
23719 +}
23720 +
23721 +/* final portion of longterm-lock */
23722 +static int
23723 +lock_tail(lock_stack * owner, int ok, znode_lock_mode mode)
23724 +{
23725 + znode *node = owner->request.node;
23726 +
23727 + assert_spin_locked(&(node->lock.guard));
23728 +
23729 + /* If we broke with (ok == 0) it means we can_lock, now do it. */
23730 + if (ok == 0) {
23731 + lock_object(owner);
23732 + owner->request.mode = 0;
23733 + /* count a reference from lockhandle->node
23734 +
23735 + znode was already referenced at the entry to this function,
23736 + hence taking spin-lock here is not necessary (see comment
23737 + in the zref()).
23738 + */
23739 + zref(node);
23740 +
23741 + LOCK_CNT_INC(long_term_locked_znode);
23742 + }
23743 + spin_unlock_zlock(&node->lock);
23744 + ON_DEBUG(check_lock_data());
23745 + ON_DEBUG(check_lock_node_data(node));
23746 + return ok;
23747 +}
23748 +
23749 +/*
23750 + * version of longterm_lock_znode() optimized for the most common case: read
23751 + * lock without any special flags. This is the kind of lock that any tree
23752 + * traversal takes on the root node of the tree, which is very frequent.
23753 + */
23754 +static int longterm_lock_tryfast(lock_stack * owner)
23755 +{
23756 + int result;
23757 + znode *node;
23758 + zlock *lock;
23759 +
23760 + node = owner->request.node;
23761 + lock = &node->lock;
23762 +
23763 + assert("nikita-3340", reiser4_schedulable());
23764 + assert("nikita-3341", request_is_deadlock_safe(node,
23765 + ZNODE_READ_LOCK,
23766 + ZNODE_LOCK_LOPRI));
23767 + spin_lock_zlock(lock);
23768 + result = can_lock_object(owner);
23769 + spin_unlock_zlock(lock);
23770 +
23771 + if (likely(result != -EINVAL)) {
23772 + spin_lock_znode(node);
23773 + result = reiser4_try_capture(ZJNODE(node), ZNODE_READ_LOCK, 0);
23774 + spin_unlock_znode(node);
23775 + spin_lock_zlock(lock);
23776 + if (unlikely(result != 0)) {
23777 + owner->request.mode = 0;
23778 + } else {
23779 + result = can_lock_object(owner);
23780 + if (unlikely(result == -E_REPEAT)) {
23781 + /* fall back to longterm_lock_znode() */
23782 + spin_unlock_zlock(lock);
23783 + return 1;
23784 + }
23785 + }
23786 + return lock_tail(owner, result, ZNODE_READ_LOCK);
23787 + } else
23788 + return 1;
23789 +}
23790 +
23791 +/* locks given lock object */
23792 +int longterm_lock_znode(
23793 + /* local link object (allocated by lock owner thread, usually on its own
23794 + * stack) */
23795 + lock_handle * handle,
23796 + /* znode we want to lock. */
23797 + znode * node,
23798 + /* {ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}; */
23799 + znode_lock_mode mode,
23800 +	/* lock request flags {ZNODE_LOCK_HIPRI, ZNODE_LOCK_LOPRI,
23801 +	 * ZNODE_LOCK_NONBLOCK, ZNODE_LOCK_DONT_FUSE}; the function returns
23802 +	 * {0, -EINVAL, -E_DEADLOCK}, see return codes description. */
23801 + znode_lock_request request) {
23802 + int ret;
23803 + int hipri = (request & ZNODE_LOCK_HIPRI) != 0;
23804 + int non_blocking = 0;
23805 + int has_atom;
23806 + txn_capture cap_flags;
23807 + zlock *lock;
23808 + txn_handle *txnh;
23809 + tree_level level;
23810 +
23811 + /* Get current process context */
23812 + lock_stack *owner = get_current_lock_stack();
23813 +
23814 + /* Check that the lock handle is initialized and isn't already being
23815 + * used. */
23816 + assert("jmacd-808", handle->owner == NULL);
23817 + assert("nikita-3026", reiser4_schedulable());
23818 + assert("nikita-3219", request_is_deadlock_safe(node, mode, request));
23819 + assert("zam-1056", atomic_read(&ZJNODE(node)->x_count) > 0);
23820 + /* long term locks are not allowed in the VM contexts (->writepage(),
23821 + * prune_{d,i}cache()).
23822 + *
23823 + * FIXME this doesn't work due to unused-dentry-with-unlinked-inode
23824 + * bug caused by d_splice_alias() only working for directories.
23825 + */
23826 + assert("nikita-3547", 1 || ((current->flags & PF_MEMALLOC) == 0));
23827 + assert ("zam-1055", mode != ZNODE_NO_LOCK);
23828 +
23829 + cap_flags = 0;
23830 + if (request & ZNODE_LOCK_NONBLOCK) {
23831 + cap_flags |= TXN_CAPTURE_NONBLOCKING;
23832 + non_blocking = 1;
23833 + }
23834 +
23835 + if (request & ZNODE_LOCK_DONT_FUSE)
23836 + cap_flags |= TXN_CAPTURE_DONT_FUSE;
23837 +
23838 + /* If we are changing our process priority we must adjust a number
23839 + of high priority owners for each znode that we already lock */
23840 + if (hipri) {
23841 + set_high_priority(owner);
23842 + } else {
23843 + set_low_priority(owner);
23844 + }
23845 +
23846 + level = znode_get_level(node);
23847 +
23848 + /* Fill request structure with our values. */
23849 + owner->request.mode = mode;
23850 + owner->request.handle = handle;
23851 + owner->request.node = node;
23852 +
23853 + txnh = get_current_context()->trans;
23854 + lock = &node->lock;
23855 +
23856 + if (mode == ZNODE_READ_LOCK && request == 0) {
23857 + ret = longterm_lock_tryfast(owner);
23858 + if (ret <= 0)
23859 + return ret;
23860 + }
23861 +
23862 + has_atom = (txnh->atom != NULL);
23863 +
23864 + /* Synchronize on node's zlock guard lock. */
23865 + spin_lock_zlock(lock);
23866 +
23867 + if (znode_is_locked(node) &&
23868 + mode == ZNODE_WRITE_LOCK && recursive(owner))
23869 + return lock_tail(owner, 0, mode);
23870 +
23871 + for (;;) {
23872 +		/* Check the lock's availability: if it is unavailable we get
23873 +		   -E_REPEAT, 0 indicates "can_lock", otherwise the node is
23874 +		   invalid. */
23875 + ret = can_lock_object(owner);
23876 +
23877 + if (unlikely(ret == -EINVAL)) {
23878 + /* @node is dying. Leave it alone. */
23879 + break;
23880 + }
23881 +
23882 + if (unlikely(ret == -E_REPEAT && non_blocking)) {
23883 +			/* either locking of @node by the current thread would
23884 +			 * lead to a deadlock, or the lock modes are
23885 +			 * incompatible. */
23886 + break;
23887 + }
23888 +
23889 + assert("nikita-1844", (ret == 0)
23890 + || ((ret == -E_REPEAT) && !non_blocking));
23891 + /* If we can get the lock... Try to capture first before
23892 + taking the lock. */
23893 +
23894 + /* first handle commonest case where node and txnh are already
23895 + * in the same atom. */
23896 + /* safe to do without taking locks, because:
23897 + *
23898 + * 1. read of aligned word is atomic with respect to writes to
23899 + * this word
23900 + *
23901 + * 2. false negatives are handled in reiser4_try_capture().
23902 + *
23903 + * 3. false positives are impossible.
23904 + *
23905 + * PROOF: left as an exercise to the curious reader.
23906 + *
23907 + * Just kidding. Here is one:
23908 + *
23909 + * At the time T0 txnh->atom is stored in txnh_atom.
23910 + *
23911 + * At the time T1 node->atom is stored in node_atom.
23912 + *
23913 + * At the time T2 we observe that
23914 + *
23915 + * txnh_atom != NULL && node_atom == txnh_atom.
23916 + *
23917 + * Imagine that at this moment we acquire node and txnh spin
23918 + * lock in this order. Suppose that under spin lock we have
23919 + *
23920 + * node->atom != txnh->atom, (S1)
23921 + *
23922 + * at the time T3.
23923 + *
23924 + * txnh->atom != NULL still, because txnh is open by the
23925 + * current thread.
23926 + *
23927 + * Suppose node->atom == NULL, that is, node was un-captured
23928 + * between T1, and T3. But un-capturing of formatted node is
23929 + * always preceded by the call to reiser4_invalidate_lock(),
23930 + * which marks znode as JNODE_IS_DYING under zlock spin
23931 + * lock. Contradiction, because can_lock_object() above checks
23932 + * for JNODE_IS_DYING. Hence, node->atom != NULL at T3.
23933 + *
23934 + * Suppose that node->atom != node_atom, that is, atom, node
23935 + * belongs to was fused into another atom: node_atom was fused
23936 + * into node->atom. Atom of txnh was equal to node_atom at T2,
23937 + * which means that under spin lock, txnh->atom == node->atom,
23938 + * because txnh->atom can only follow fusion
23939 + * chain. Contradicts S1.
23940 + *
23941 + * The same for hypothesis txnh->atom != txnh_atom. Hence,
23942 + * node->atom == node_atom == txnh_atom == txnh->atom. Again
23943 + * contradicts S1. Hence S1 is false. QED.
23944 + *
23945 + */
23946 +
23947 + if (likely(has_atom && ZJNODE(node)->atom == txnh->atom)) {
23948 + ;
23949 + } else {
23950 + /*
23951 + * unlock zlock spin lock here. It is possible for
23952 + * longterm_unlock_znode() to sneak in here, but there
23953 + * is no harm: reiser4_invalidate_lock() will mark znode
23954 + * as JNODE_IS_DYING and this will be noted by
23955 + * can_lock_object() below.
23956 + */
23957 + spin_unlock_zlock(lock);
23958 + spin_lock_znode(node);
23959 + ret = reiser4_try_capture(ZJNODE(node), mode, cap_flags);
23960 + spin_unlock_znode(node);
23961 + spin_lock_zlock(lock);
23962 + if (unlikely(ret != 0)) {
23963 +				/* In the failure case, the txnmgr releases
23964 +				   the znode's lock (or in some cases, it was
23965 +				   released a while ago). There's no need to
23966 +				   reacquire it, so we should return here
23967 +				   rather than release the lock again. */
23968 + owner->request.mode = 0;
23969 + break;
23970 + }
23971 +
23972 + /* Check the lock's availability again -- this is
23973 + because under some circumstances the capture code
23974 + has to release and reacquire the znode spinlock. */
23975 + ret = can_lock_object(owner);
23976 + }
23977 +
23978 + /* This time, a return of (ret == 0) means we can lock, so we
23979 + should break out of the loop. */
23980 + if (likely(ret != -E_REPEAT || non_blocking))
23981 + break;
23982 +
23983 + /* Lock is unavailable, we have to wait. */
23984 + ret = reiser4_prepare_to_sleep(owner);
23985 + if (unlikely(ret != 0))
23986 + break;
23987 +
23988 + assert_spin_locked(&(node->lock.guard));
23989 + if (hipri) {
23990 + /* If we are going in high priority direction then
23991 + increase high priority requests counter for the
23992 + node */
23993 + lock->nr_hipri_requests++;
23994 + if (mode == ZNODE_WRITE_LOCK)
23995 +				lock->nr_hipri_write_requests++;
23996 + /* If there are no high priority owners for a node,
23997 + then immediately wake up low priority owners, so
23998 + they can detect possible deadlock */
23999 + if (lock->nr_hipri_owners == 0)
24000 + wake_up_all_lopri_owners(node);
24001 + }
24002 + list_add_tail(&owner->requestors_link, &lock->requestors);
24003 +
24004 + /* Ok, here we have prepared a lock request, so unlock
24005 + a znode ... */
24006 + spin_unlock_zlock(lock);
24007 + /* ... and sleep */
24008 + reiser4_go_to_sleep(owner);
24009 + if (owner->request.mode == ZNODE_NO_LOCK)
24010 + goto request_is_done;
24011 + spin_lock_zlock(lock);
24012 + if (owner->request.mode == ZNODE_NO_LOCK) {
24013 + spin_unlock_zlock(lock);
24014 + request_is_done:
24015 + if (owner->request.ret_code == 0) {
24016 + LOCK_CNT_INC(long_term_locked_znode);
24017 + zref(node);
24018 + }
24019 + return owner->request.ret_code;
24020 + }
24021 + remove_lock_request(owner);
24022 + }
24023 +
24024 + return lock_tail(owner, ret, mode);
24025 +}
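A hypothetical caller sketch (it assumes the init_lh() handle initializer declared in lock.h; error handling abbreviated):

	static int sketch_read_from_node(znode *node)
	{
		lock_handle lh;
		int ret;

		init_lh(&lh);
		ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK,
					  ZNODE_LOCK_LOPRI);
		if (ret != 0)
			return ret;	/* -EINVAL, -E_DEADLOCK, ... */
		/* ... read from the now-locked node ... */
		longterm_unlock_znode(&lh);
		return 0;
	}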
24026 +
24027 +/* lock object invalidation means changing the lock object state to `INVALID'
24028 +   and waiting for all other processes to cancel their lock requests. */
24029 +void reiser4_invalidate_lock(lock_handle * handle /* path to lock
24030 + * owner and lock
24031 + * object is being
24032 + * invalidated. */ )
24033 +{
24034 + znode *node = handle->node;
24035 + lock_stack *owner = handle->owner;
24036 +
24037 + assert("zam-325", owner == get_current_lock_stack());
24038 + assert("zam-103", znode_is_write_locked(node));
24039 + assert("nikita-1393", !ZF_ISSET(node, JNODE_LEFT_CONNECTED));
24040 + assert("nikita-1793", !ZF_ISSET(node, JNODE_RIGHT_CONNECTED));
24041 + assert("nikita-1394", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
24042 + assert("nikita-3097", znode_is_wlocked_once(node));
24043 + assert_spin_locked(&(node->lock.guard));
24044 +
24045 + if (handle->signaled)
24046 + atomic_dec(&owner->nr_signaled);
24047 +
24048 + ZF_SET(node, JNODE_IS_DYING);
24049 + unlink_object(handle);
24050 + node->lock.nr_readers = 0;
24051 +
24052 + invalidate_all_lock_requests(node);
24053 + spin_unlock_zlock(&node->lock);
24054 +}
24055 +
24056 +/* Initializes lock_stack. */
24057 +void init_lock_stack(lock_stack * owner /* pointer to
24058 + * allocated
24059 + * structure. */ )
24060 +{
24061 + INIT_LIST_HEAD(&owner->locks);
24062 + INIT_LIST_HEAD(&owner->requestors_link);
24063 + spin_lock_init(&owner->sguard);
24064 + owner->curpri = 1;
24065 + init_waitqueue_head(&owner->wait);
24066 +}
24067 +
24068 +/* Initializes lock object. */
24069 +void reiser4_init_lock(zlock * lock /* pointer to an allocated,
24070 + * uninitialized lock object
24071 + * structure. */ )
24072 +{
24073 + memset(lock, 0, sizeof(zlock));
24074 + spin_lock_init(&lock->guard);
24075 + INIT_LIST_HEAD(&lock->requestors);
24076 + INIT_LIST_HEAD(&lock->owners);
24077 +}
24078 +
24079 +/* Transfer a lock handle (presumably so that variables can be moved between stack and
24080 + heap locations). */
24081 +static void
24082 +move_lh_internal(lock_handle * new, lock_handle * old, int unlink_old)
24083 +{
24084 + znode *node = old->node;
24085 + lock_stack *owner = old->owner;
24086 + int signaled;
24087 +
24088 + /* locks_list, modified by link_object() is not protected by
24089 + anything. This is valid because only current thread ever modifies
24090 + locks_list of its lock_stack.
24091 + */
24092 + assert("nikita-1827", owner == get_current_lock_stack());
24093 + assert("nikita-1831", new->owner == NULL);
24094 +
24095 + spin_lock_zlock(&node->lock);
24096 +
24097 + signaled = old->signaled;
24098 + if (unlink_old) {
24099 + unlink_object(old);
24100 + } else {
24101 + if (node->lock.nr_readers > 0) {
24102 + node->lock.nr_readers += 1;
24103 + } else {
24104 + node->lock.nr_readers -= 1;
24105 + }
24106 + if (signaled) {
24107 + atomic_inc(&owner->nr_signaled);
24108 + }
24109 + if (owner->curpri) {
24110 + node->lock.nr_hipri_owners += 1;
24111 + }
24112 + LOCK_CNT_INC(long_term_locked_znode);
24113 +
24114 + zref(node);
24115 + }
24116 + link_object(new, owner, node);
24117 + new->signaled = signaled;
24118 +
24119 + spin_unlock_zlock(&node->lock);
24120 +}
24121 +
24122 +void move_lh(lock_handle * new, lock_handle * old)
24123 +{
24124 + move_lh_internal(new, old, /*unlink_old */ 1);
24125 +}
24126 +
24127 +void copy_lh(lock_handle * new, lock_handle * old)
24128 +{
24129 + move_lh_internal(new, old, /*unlink_old */ 0);
24130 +}
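+
+/* Added illustrative sketch (not part of the original patch): a typical
+   reason move_lh() exists is to hand a long-term lock from an on-stack
+   handle to a longer-lived, heap-allocated one before the stack frame
+   unwinds. The surrounding logic below is hypothetical; init_lh(),
+   move_lh() and done_lh() are the real entry points.
+
+	lock_handle stack_lh;
+	lock_handle *heap_lh;
+
+	init_lh(&stack_lh);
+	... acquire a long-term lock into stack_lh ...
+	heap_lh = kmalloc(sizeof(*heap_lh), GFP_KERNEL);
+	if (heap_lh != NULL) {
+		init_lh(heap_lh);
+		move_lh(heap_lh, &stack_lh);
+		... the stack frame may now unwind; the lock lives on ...
+		done_lh(heap_lh);
+		kfree(heap_lh);
+	}
+*/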
24131 +
24132 +/* after getting -E_DEADLOCK we unlock znodes until this function returns false */
24133 +int reiser4_check_deadlock(void)
24134 +{
24135 + lock_stack *owner = get_current_lock_stack();
24136 + return atomic_read(&owner->nr_signaled) != 0;
24137 +}
24138 +
24139 +/* Before going to sleep we re-check "release lock" requests which might
24140 + come from threads with higher lock priorities. */
24141 +int reiser4_prepare_to_sleep(lock_stack * owner)
24142 +{
24143 + assert("nikita-1847", owner == get_current_lock_stack());
24144 +
24145 + /* We return -E_DEADLOCK if one or more "give me the lock" messages are
24146 + * counted in nr_signaled */
24147 + if (unlikely(atomic_read(&owner->nr_signaled) != 0)) {
24148 + assert("zam-959", !owner->curpri);
24149 + return RETERR(-E_DEADLOCK);
24150 + }
24151 + return 0;
24152 +}
24153 +
24154 +/* Wakes up a single thread */
24155 +void __reiser4_wake_up(lock_stack * owner)
24156 +{
24157 + atomic_set(&owner->wakeup, 1);
24158 + wake_up(&owner->wait);
24159 +}
24160 +
24161 +/* Puts a thread to sleep */
24162 +void reiser4_go_to_sleep(lock_stack * owner)
24163 +{
24164 + /* Well, we might sleep here, so holding of any spinlocks is no-no */
24165 + assert("nikita-3027", reiser4_schedulable());
24166 +
24167 + wait_event(owner->wait, atomic_read(&owner->wakeup));
24168 + atomic_set(&owner->wakeup, 0);
24169 +}
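+
+/* Added illustrative sketch (not part of the original patch): how the
+   two functions above combine with the zlock requestors list into the
+   wait protocol, condensed and simplified from longterm_lock_znode().
+   The name example_wait_for_lock() is hypothetical; every call inside
+   it is real. Parenthesized notes stand in for comments.
+
+	static int example_wait_for_lock(lock_stack *owner, znode *node)
+	{
+		int ret;
+
+		ret = reiser4_prepare_to_sleep(owner);
+		if (ret != 0)
+			return ret;                 (-E_DEADLOCK: yield locks)
+		list_add_tail(&owner->requestors_link,
+			      &node->lock.requestors);
+		spin_unlock_zlock(&node->lock);     (never sleep on a spinlock)
+		reiser4_go_to_sleep(owner);
+		(the real code also re-checks owner->request.mode here)
+		spin_lock_zlock(&node->lock);
+		list_del_init(&owner->requestors_link);
+		return 0;
+	}
+
+   On the waker side, __reiser4_wake_up() is expected to be called under
+   the lock stack's spin lock; the reiser4_wake_up() wrapper in lock.h
+   does both.
+*/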
24170 +
24171 +int lock_stack_isclean(lock_stack * owner)
24172 +{
24173 + if (list_empty_careful(&owner->locks)) {
24174 + assert("zam-353", atomic_read(&owner->nr_signaled) == 0);
24175 + return 1;
24176 + }
24177 +
24178 + return 0;
24179 +}
24180 +
24181 +#if REISER4_DEBUG
24182 +
24183 +/*
24184 + * debugging functions
24185 + */
24186 +
24187 +static void list_check(struct list_head *head)
24188 +{
24189 + struct list_head *pos;
24190 +
24191 + list_for_each(pos, head)
24192 + assert("", (pos->prev != NULL && pos->next != NULL &&
24193 + pos->prev->next == pos && pos->next->prev == pos));
24194 +}
24195 +
24196 +/* check consistency of locking data-structures hanging off the @stack */
24197 +static void check_lock_stack(lock_stack * stack)
24198 +{
24199 + spin_lock_stack(stack);
24200 + /* check that stack->locks is not corrupted */
24201 + list_check(&stack->locks);
24202 + spin_unlock_stack(stack);
24203 +}
24204 +
24205 +/* check consistency of locking data structures */
24206 +void check_lock_data(void)
24207 +{
24208 + check_lock_stack(&get_current_context()->stack);
24209 +}
24210 +
24211 +/* check consistency of locking data structures for @node */
24212 +void check_lock_node_data(znode * node)
24213 +{
24214 + spin_lock_zlock(&node->lock);
24215 + list_check(&node->lock.owners);
24216 + list_check(&node->lock.requestors);
24217 + spin_unlock_zlock(&node->lock);
24218 +}
24219 +
24220 +/* check that the given lock request is deadlock safe. This check is, of
24221 + * course, not exhaustive. */
24222 +static int
24223 +request_is_deadlock_safe(znode * node, znode_lock_mode mode,
24224 + znode_lock_request request)
24225 +{
24226 + lock_stack *owner;
24227 +
24228 + owner = get_current_lock_stack();
24229 + /*
24230 + * check that hipri lock request is not issued when there are locked
24231 + * nodes at the higher levels.
24232 + */
24233 + if (request & ZNODE_LOCK_HIPRI && !(request & ZNODE_LOCK_NONBLOCK) &&
24234 + znode_get_level(node) != 0) {
24235 + lock_handle *item;
24236 +
24237 + list_for_each_entry(item, &owner->locks, locks_link) {
24238 + znode *other;
24239 +
24240 + other = item->node;
24241 +
24242 + if (znode_get_level(other) == 0)
24243 + continue;
24244 + if (znode_get_level(other) > znode_get_level(node))
24245 + return 0;
24246 + }
24247 + }
24248 + return 1;
24249 +}
24250 +
24251 +#endif
24252 +
24253 +/* return pointer to static storage with name of lock_mode. For
24254 + debugging */
24255 +const char *lock_mode_name(znode_lock_mode lock /* lock mode to get name of */ )
24256 +{
24257 + if (lock == ZNODE_READ_LOCK)
24258 + return "read";
24259 + else if (lock == ZNODE_WRITE_LOCK)
24260 + return "write";
24261 + else {
24262 + static char buf[30];
24263 +
24264 + sprintf(buf, "unknown: %i", lock);
24265 + return buf;
24266 + }
24267 +}
24268 +
24269 +/* Make Linus happy.
24270 + Local variables:
24271 + c-indentation-style: "K&R"
24272 + mode-name: "LC"
24273 + c-basic-offset: 8
24274 + tab-width: 8
24275 + fill-column: 79
24276 + End:
24277 +*/
24278 diff --git a/fs/reiser4/lock.h b/fs/reiser4/lock.h
24279 new file mode 100644
24280 index 0000000..e130466
24281 --- /dev/null
24282 +++ b/fs/reiser4/lock.h
24283 @@ -0,0 +1,249 @@
24284 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
24285 +
24286 +/* Long term locking data structures. See lock.c for details. */
24287 +
24288 +#ifndef __LOCK_H__
24289 +#define __LOCK_H__
24290 +
24291 +#include "forward.h"
24292 +#include "debug.h"
24293 +#include "dformat.h"
24294 +#include "key.h"
24295 +#include "coord.h"
24296 +#include "plugin/node/node.h"
24297 +#include "txnmgr.h"
24298 +#include "readahead.h"
24299 +
24300 +#include <linux/types.h>
24301 +#include <linux/spinlock.h>
24302 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
24303 +#include <asm/atomic.h>
24304 +#include <linux/wait.h>
24305 +
24306 +/* Per-znode lock object */
24307 +struct zlock {
24308 + spinlock_t guard;
24309 + /* The number of readers if positive; the number of recursively taken
24310 + write locks if negative. Protected by zlock spin lock. */
24311 + int nr_readers;
24312 + /* A number of processes (lock_stacks) that have this object
24313 + locked with high priority */
24314 + unsigned nr_hipri_owners;
24315 + /* A number of attempts to lock znode in high priority direction */
24316 + unsigned nr_hipri_requests;
24317 + /* A number of write lock requests in high priority direction */
24318 + unsigned nr_hipri_write_requests;
24319 + /* A linked list of lock_handles of all owners of this lock object */
24320 + struct list_head owners;
24321 + /* A linked list of lock_stacks that wait for this lock */
24322 + struct list_head requestors;
24323 +};
24324 +
24325 +static inline void spin_lock_zlock(zlock *lock)
24326 +{
24327 + /* check that zlock is not locked */
24328 + assert("", LOCK_CNT_NIL(spin_locked_zlock));
24329 + /* check that spinlocks of lower priorities are not held */
24330 + assert("", LOCK_CNT_NIL(spin_locked_stack));
24331 +
24332 + spin_lock(&lock->guard);
24333 +
24334 + LOCK_CNT_INC(spin_locked_zlock);
24335 + LOCK_CNT_INC(spin_locked);
24336 +}
24337 +
24338 +static inline void spin_unlock_zlock(zlock *lock)
24339 +{
24340 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_zlock));
24341 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24342 +
24343 + LOCK_CNT_DEC(spin_locked_zlock);
24344 + LOCK_CNT_DEC(spin_locked);
24345 +
24346 + spin_unlock(&lock->guard);
24347 +}
24348 +
24349 +#define lock_is_locked(lock) ((lock)->nr_readers != 0)
24350 +#define lock_is_rlocked(lock) ((lock)->nr_readers > 0)
24351 +#define lock_is_wlocked(lock) ((lock)->nr_readers < 0)
24352 +#define lock_is_wlocked_once(lock) ((lock)->nr_readers == -1)
24353 +#define lock_can_be_rlocked(lock) ((lock)->nr_readers >= 0)
24354 +#define lock_mode_compatible(lock, mode) \
24355 + (((mode) == ZNODE_WRITE_LOCK && !lock_is_locked(lock)) || \
24356 + ((mode) == ZNODE_READ_LOCK && lock_can_be_rlocked(lock)))
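+
+/* Added worked example (not in the original patch): with the sign
+   encoding of ->nr_readers documented in struct zlock above,
+
+	nr_readers ==  2   two read locks held: lock_is_rlocked() is true,
+	                   ZNODE_READ_LOCK is compatible, ZNODE_WRITE_LOCK
+	                   is not;
+	nr_readers ==  0   unlocked: both modes are compatible;
+	nr_readers == -1   one write lock held (lock_is_wlocked_once()):
+	                   neither mode is compatible.
+*/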
24357 +
24358 +/* Since we have R/W znode locks we need additional bidirectional `link'
24359 + objects to implement n<->m relationship between lock owners and lock
24360 + objects. We call them `lock handles'.
24361 +
24362 + Locking: see lock.c/"SHORT-TERM LOCKING"
24363 +*/
24364 +struct lock_handle {
24365 + /* This flag indicates that a signal to yield a lock was passed to
24366 + the lock owner and counted in owner->nr_signaled
24367 +
24368 + Locking: this is accessed under spin lock on ->node.
24369 + */
24370 + int signaled;
24371 + /* A link to owner of a lock */
24372 + lock_stack *owner;
24373 + /* A link to znode locked */
24374 + znode *node;
24375 + /* A list of all locks for a process */
24376 + struct list_head locks_link;
24377 + /* A list of all owners for a znode */
24378 + struct list_head owners_link;
24379 +};
24380 +
24381 +typedef struct lock_request {
24382 + /* A pointer to uninitialized link object */
24383 + lock_handle *handle;
24384 + /* A pointer to the object we want to lock */
24385 + znode *node;
24386 + /* Lock mode (ZNODE_READ_LOCK or ZNODE_WRITE_LOCK) */
24387 + znode_lock_mode mode;
24388 + /* how dispatch_lock_requests() returns lock request result code */
24389 + int ret_code;
24390 +} lock_request;
24391 +
24392 +/* A lock stack structure for accumulating locks owned by a process */
24393 +struct lock_stack {
24394 + /* A guard lock protecting a lock stack */
24395 + spinlock_t sguard;
24396 + /* number of znodes which were requested by high priority processes */
24397 + atomic_t nr_signaled;
24398 + /* Current priority of a process
24399 +
24400 + This is only accessed by the current thread and thus requires no
24401 + locking.
24402 + */
24403 + int curpri;
24404 + /* A list of all locks owned by this process. Elements can be added to
24405 + * this list only by the current thread. ->node pointers in this list
24406 + * can be only changed by the current thread. */
24407 + struct list_head locks;
24408 + /* When lock_stack waits for the lock, it puts itself on double-linked
24409 + requestors list of that lock */
24410 + struct list_head requestors_link;
24411 + /* Current lock request info.
24412 +
24413 + This is only accessed by the current thread and thus requires no
24414 + locking.
24415 + */
24416 + lock_request request;
24417 + /* the following two fields are the lock stack's
24418 + * synchronization object to use with the standard linux/wait.h
24419 + * interface. See reiser4_go_to_sleep and __reiser4_wake_up for
24420 + * usage details. */
24421 + wait_queue_head_t wait;
24422 + atomic_t wakeup;
24423 +#if REISER4_DEBUG
24424 + int nr_locks; /* number of lock handles in the above list */
24425 +#endif
24426 +};
24427 +
24428 +/*
24429 + User-visible znode locking functions
24430 +*/
24431 +
24432 +extern int longterm_lock_znode(lock_handle * handle,
24433 + znode * node,
24434 + znode_lock_mode mode,
24435 + znode_lock_request request);
24436 +
24437 +extern void longterm_unlock_znode(lock_handle * handle);
24438 +
24439 +extern int reiser4_check_deadlock(void);
24440 +
24441 +extern lock_stack *get_current_lock_stack(void);
24442 +
24443 +extern void init_lock_stack(lock_stack * owner);
24444 +extern void reiser4_init_lock(zlock * lock);
24445 +
24446 +static inline void init_lh(lock_handle *lh)
24447 +{
24448 +#if REISER4_DEBUG
24449 + memset(lh, 0, sizeof *lh);
24450 + INIT_LIST_HEAD(&lh->locks_link);
24451 + INIT_LIST_HEAD(&lh->owners_link);
24452 +#else
24453 + lh->node = NULL;
24454 +#endif
24455 +}
24456 +
24457 +static inline void done_lh(lock_handle *lh)
24458 +{
24459 + assert("zam-342", lh != NULL);
24460 + if (lh->node != NULL)
24461 + longterm_unlock_znode(lh);
24462 +}
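+
+/* Added illustrative sketch (not part of the original patch): the
+   canonical pairing of the helpers above with longterm_lock_znode().
+   example_read_node() and its elided body are hypothetical; the lock
+   calls and the ZNODE_* constants are from this interface.
+
+	static int example_read_node(znode *node)
+	{
+		lock_handle lh;
+		int ret;
+
+		init_lh(&lh);
+		ret = longterm_lock_znode(&lh, node, ZNODE_READ_LOCK,
+					  ZNODE_LOCK_LOPRI);
+		if (ret == 0) {
+			... use the node under the long-term lock ...
+			done_lh(&lh);
+		}
+		return ret;
+	}
+*/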
24463 +
24464 +extern void move_lh(lock_handle * new, lock_handle * old);
24465 +extern void copy_lh(lock_handle * new, lock_handle * old);
24466 +
24467 +extern int reiser4_prepare_to_sleep(lock_stack * owner);
24468 +extern void reiser4_go_to_sleep(lock_stack * owner);
24469 +extern void __reiser4_wake_up(lock_stack * owner);
24470 +
24471 +extern int lock_stack_isclean(lock_stack * owner);
24472 +
24473 +/* zlock object state check macros: only used in assertions. Both forms imply that the
24474 + lock is held by the current thread. */
24475 +extern int znode_is_write_locked(const znode *);
24476 +extern void reiser4_invalidate_lock(lock_handle *);
24477 +
24478 +/* lock ordering is: first take zlock spin lock, then lock stack spin lock */
24479 +#define spin_ordering_pred_stack(stack) \
24480 + (LOCK_CNT_NIL(spin_locked_stack) && \
24481 + LOCK_CNT_NIL(spin_locked_txnmgr) && \
24482 + LOCK_CNT_NIL(spin_locked_inode) && \
24483 + LOCK_CNT_NIL(rw_locked_cbk_cache) && \
24484 + LOCK_CNT_NIL(spin_locked_super_eflush) )
24485 +
24486 +static inline void spin_lock_stack(lock_stack *stack)
24487 +{
24488 + assert("", spin_ordering_pred_stack(stack));
24489 + spin_lock(&(stack->sguard));
24490 + LOCK_CNT_INC(spin_locked_stack);
24491 + LOCK_CNT_INC(spin_locked);
24492 +}
24493 +
24494 +static inline void spin_unlock_stack(lock_stack *stack)
24495 +{
24496 + assert_spin_locked(&(stack->sguard));
24497 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_stack));
24498 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
24499 + LOCK_CNT_DEC(spin_locked_stack);
24500 + LOCK_CNT_DEC(spin_locked);
24501 + spin_unlock(&(stack->sguard));
24502 +}
24503 +
24504 +static inline void reiser4_wake_up(lock_stack * owner)
24505 +{
24506 + spin_lock_stack(owner);
24507 + __reiser4_wake_up(owner);
24508 + spin_unlock_stack(owner);
24509 +}
24510 +
24511 +const char *lock_mode_name(znode_lock_mode lock);
24512 +
24513 +#if REISER4_DEBUG
24514 +extern void check_lock_data(void);
24515 +extern void check_lock_node_data(znode * node);
24516 +#else
24517 +#define check_lock_data() noop
24518 +#define check_lock_node_data() noop
24519 +#endif
24520 +
24521 +/* __LOCK_H__ */
24522 +#endif
24523 +
24524 +/* Make Linus happy.
24525 + Local variables:
24526 + c-indentation-style: "K&R"
24527 + mode-name: "LC"
24528 + c-basic-offset: 8
24529 + tab-width: 8
24530 + fill-column: 120
24531 + End:
24532 +*/
24533 diff --git a/fs/reiser4/oid.c b/fs/reiser4/oid.c
24534 new file mode 100644
24535 index 0000000..f311d06
24536 --- /dev/null
24537 +++ b/fs/reiser4/oid.c
24538 @@ -0,0 +1,141 @@
24539 +/* Copyright 2003 by Hans Reiser, licensing governed by reiser4/README */
24540 +
24541 +#include "debug.h"
24542 +#include "super.h"
24543 +#include "txnmgr.h"
24544 +
24545 +/* We used to have an oid allocation plugin. It was removed because it
24546 + was recognized as providing an unneeded level of abstraction. If
24547 + anyone ever finds it useful, look at yet_unneeded_abstractions/oid
24548 +*/
24549 +
24550 +/*
24551 + * initialize in-memory data for oid allocator at @super. @nr_files and @next
24552 + * are provided by disk format plugin that reads them from the disk during
24553 + * mount.
24554 + */
24555 +int oid_init_allocator(struct super_block *super, oid_t nr_files, oid_t next)
24556 +{
24557 + reiser4_super_info_data *sbinfo;
24558 +
24559 + sbinfo = get_super_private(super);
24560 +
24561 + sbinfo->next_to_use = next;
24562 + sbinfo->oids_in_use = nr_files;
24563 + return 0;
24564 +}
24565 +
24566 +/*
24567 + * allocate oid and return it. ABSOLUTE_MAX_OID is returned when allocator
24568 + * runs out of oids.
24569 + */
24570 +oid_t oid_allocate(struct super_block * super)
24571 +{
24572 + reiser4_super_info_data *sbinfo;
24573 + oid_t oid;
24574 +
24575 + sbinfo = get_super_private(super);
24576 +
24577 + spin_lock_reiser4_super(sbinfo);
24578 + if (sbinfo->next_to_use != ABSOLUTE_MAX_OID) {
24579 + oid = sbinfo->next_to_use++;
24580 + sbinfo->oids_in_use++;
24581 + } else
24582 + oid = ABSOLUTE_MAX_OID;
24583 + spin_unlock_reiser4_super(sbinfo);
24584 + return oid;
24585 +}
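+
+/* Added illustrative sketch (not part of the original patch): how a
+   file-creation path is expected to combine the calls in this file.
+   The error value and the elided steps are hypothetical; see
+   oid_count_allocated() below for the commit-side accounting.
+
+	oid_t oid;
+
+	oid = oid_allocate(super);
+	if (oid == ABSOLUTE_MAX_OID)
+		return RETERR(-EOVERFLOW);      (allocator exhausted)
+	... build the object; on failure call oid_release(super, oid) ...
+	oid_count_allocated();                  (creation now irrevocable)
+*/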
24586 +
24587 +/*
24588 + * Tell oid allocator that @oid is now free.
24589 + */
24590 +int oid_release(struct super_block *super, oid_t oid UNUSED_ARG)
24591 +{
24592 + reiser4_super_info_data *sbinfo;
24593 +
24594 + sbinfo = get_super_private(super);
24595 +
24596 + spin_lock_reiser4_super(sbinfo);
24597 + sbinfo->oids_in_use--;
24598 + spin_unlock_reiser4_super(sbinfo);
24599 + return 0;
24600 +}
24601 +
24602 +/*
24603 + * return next @oid that would be allocated (i.e., returned by oid_allocate())
24604 + * without actually allocating it. This is used by disk format plugin to save
24605 + * oid allocator state on the disk.
24606 + */
24607 +oid_t oid_next(const struct super_block * super)
24608 +{
24609 + reiser4_super_info_data *sbinfo;
24610 + oid_t oid;
24611 +
24612 + sbinfo = get_super_private(super);
24613 +
24614 + spin_lock_reiser4_super(sbinfo);
24615 + oid = sbinfo->next_to_use;
24616 + spin_unlock_reiser4_super(sbinfo);
24617 + return oid;
24618 +}
24619 +
24620 +/*
24621 + * returns number of currently used oids. This is used by statfs(2) to report
24622 + * number of "inodes" and by disk format plugin to save oid allocator state on
24623 + * the disk.
24624 + */
24625 +long oids_used(const struct super_block *super)
24626 +{
24627 + reiser4_super_info_data *sbinfo;
24628 + oid_t used;
24629 +
24630 + sbinfo = get_super_private(super);
24631 +
24632 + spin_lock_reiser4_super(sbinfo);
24633 + used = sbinfo->oids_in_use;
24634 + spin_unlock_reiser4_super(sbinfo);
24635 + if (used < (__u64) ((long)~0) >> 1)
24636 + return (long)used;
24637 + else
24638 + return (long)-1;
24639 +}
24640 +
24641 +/*
24642 + * Count oid as allocated in atom. This is done after call to oid_allocate()
24643 + * at the point when we are irrevocably committed to creation of the new file
24644 + * (i.e., when oid allocation cannot be any longer rolled back due to some
24645 + * error).
24646 + */
24647 +void oid_count_allocated(void)
24648 +{
24649 + txn_atom *atom;
24650 +
24651 + atom = get_current_atom_locked();
24652 + atom->nr_objects_created++;
24653 + spin_unlock_atom(atom);
24654 +}
24655 +
24656 +/*
24657 + * Count oid as free in atom. This is done after call to oid_release() at the
24658 + * point when we are irrevocably committed to the deletion of the file (i.e.,
24659 + * when oid release cannot be any longer rolled back due to some error).
24660 + */
24661 +void oid_count_released(void)
24662 +{
24663 + txn_atom *atom;
24664 +
24665 + atom = get_current_atom_locked();
24666 + atom->nr_objects_deleted++;
24667 + spin_unlock_atom(atom);
24668 +}
24669 +
24670 +/*
24671 + Local variables:
24672 + c-indentation-style: "K&R"
24673 + mode-name: "LC"
24674 + c-basic-offset: 8
24675 + tab-width: 8
24676 + fill-column: 120
24677 + scroll-step: 1
24678 + End:
24679 +*/
24680 diff --git a/fs/reiser4/page_cache.c b/fs/reiser4/page_cache.c
24681 new file mode 100644
24682 index 0000000..e1f436d
24683 --- /dev/null
24684 +++ b/fs/reiser4/page_cache.c
24685 @@ -0,0 +1,736 @@
24686 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
24687 + * reiser4/README */
24688 +
24689 +/* Memory pressure hooks. Fake inodes handling. */
24690 +
24691 +/* GLOSSARY
24692 +
24693 + . Formatted and unformatted nodes.
24694 + Elements of reiser4 balanced tree to store data and metadata.
24695 + Unformatted nodes are pointed to by extent pointers. Such nodes
24696 + are used to store data of large objects. Unlike unformatted nodes,
24697 + formatted ones have associated format described by node4X plugin.
24698 +
24699 + . Jnode (or journal node)
24700 + The in-memory header which is used to track formatted and unformatted
24701 + nodes, bitmap nodes, etc. In particular, jnodes are used to track
24702 + transactional information associated with each block(see reiser4/jnode.c
24703 + for details).
24704 +
24705 + . Znode
24706 + The in-memory header which is used to track formatted nodes. Contains
24707 + embedded jnode (see reiser4/znode.c for details).
24708 +*/
24709 +
24710 +/* We store all file system meta data (and data, of course) in the page cache.
24711 +
24712 + What does this mean? Instead of using bread/brelse we create a special
24713 + "fake" inode (one per super block) and store the content of formatted
24714 + nodes in pages bound to this inode in the page cache. In newer kernels
24715 + bread() already uses an inode attached to the block device (bd_inode).
24716 + The advantage of having our own fake inode is that we can install
24717 + appropriate methods in its address_space operations. Such methods are
24718 + called by the VM on memory pressure (or during background page flushing)
24719 + and we can use them to react appropriately.
24720 +
24721 + In initial version we only support one block per page. Support for multiple
24722 + blocks per page is complicated by relocation.
24723 +
24724 + To each page, used by reiser4, jnode is attached. jnode is analogous to
24725 + buffer head. Difference is that jnode is bound to the page permanently:
24726 + jnode cannot be removed from memory until its backing page is.
24727 +
24728 + A jnode contains a pointer to its page (->pg field) and the page points
24729 + back to the jnode via ->private. The jnode-to-page pointer is protected
24730 + by the jnode's spinlock; the page-to-jnode pointer by the page lock
24731 + (PG_locked bit). Lock ordering is: first take the page lock, then the
24732 + jnode spin lock. To go in the reverse direction use jnode_lock_page(),
24733 + which uses the standard try-lock-and-release device, sketched below.
24734 +
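+   An added sketch of that try-lock-and-release device (not in the
+   original patch; simplified, with parenthesized notes in place of
+   comments):
+
+	page = jnode_page(node);            (read ->pg under the jnode lock)
+	page_cache_get(page);               (pin the page so it cannot vanish)
+	spin_unlock_jnode(node);            (drop the wrongly-ordered lock)
+	lock_page(page);                    (take the page lock first ...)
+	spin_lock_jnode(node);              (... then retake the jnode lock)
+	if (jnode_page(node) != page)
+		... unlock, release the page and retry ...
+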
24735 + Properties:
24736 +
24737 + 1. when jnode-to-page mapping is established (by jnode_attach_page()), page
24738 + reference counter is increased.
24739 +
24740 + 2. when jnode-to-page mapping is destroyed (by page_clear_jnode()), page
24741 + reference counter is decreased.
24742 +
24743 + 3. on jload() reference counter on jnode page is increased, page is
24744 + kmapped and `referenced'.
24745 +
24746 + 4. on jrelse() inverse operations are performed.
24747 +
24748 + 5. kmapping/kunmapping of unformatted pages is done by read/write methods.
24749 +
24750 + DEADLOCKS RELATED TO MEMORY PRESSURE. [OUTDATED. Only interesting
24751 + historically.]
24752 +
24753 + [In the following discussion, `lock' invariably means long term lock on
24754 + znode.] (What about page locks?)
24755 +
24756 + There is some special class of deadlock possibilities related to memory
24757 + pressure. Locks acquired by other reiser4 threads are accounted for in
24758 + the deadlock prevention mechanism (lock.c), but when ->vm_writeback() is
24759 + invoked an additional hidden arc is added to the locking graph: the thread
24760 + that tries to allocate memory waits for ->vm_writeback() to finish. If
24761 + this thread holds a lock and ->vm_writeback() tries to acquire that lock,
24762 + deadlock prevention is useless.
24763 +
24764 + Another related problem is the possibility of ->vm_writeback() running out
24765 + of memory itself. This is not a problem for ext2 and friends, because
24766 + their ->vm_writeback() doesn't allocate much memory, but reiser4 flush is
24767 + definitely able to allocate huge amounts of memory.
24768 +
24769 + It seems that there is no reliable way to cope with the problems above.
24770 + Instead it was decided that ->vm_writeback() (as invoked in the kswapd
24771 + context) wouldn't perform any flushing itself, but rather should just wake
24772 + up some auxiliary thread dedicated for this purpose (or, the same thread
24773 + that does periodic commit of old atoms (ktxnmgrd.c)).
24774 +
24775 + Details:
24776 +
24777 + 1. A page is called `reclaimable' against a particular reiser4 mount F if this
24778 + page can be ultimately released by try_to_free_pages() under presumptions
24779 + that:
24780 +
24781 + a. ->vm_writeback() for F is a no-op, and
24782 +
24783 + b. none of the threads accessing F are making any progress, and
24784 +
24785 + c. other reiser4 mounts obey the same memory reservation protocol as F
24786 + (described below).
24787 +
24788 + For example, a clean un-pinned page, or a page occupied by ext2 data, is
24789 + reclaimable against any reiser4 mount.
24790 +
24791 + When there is more than one reiser4 mount in a system, condition (c) makes
24792 + reclaim-ability not easily verifiable beyond trivial cases mentioned above.
24793 +
24794 + THIS COMMENT IS VALID FOR "MANY BLOCKS ON PAGE" CASE
24795 +
24796 + The fake inode is used to bind formatted nodes, and each node is indexed
24797 + within the fake inode by its block number. If the block size is smaller
24798 + than the page size, it may happen that a block mapped to the page with a
24799 + formatted node is occupied by an unformatted node or is unallocated. This
24800 + leads to some complications, because flushing the whole page can
24801 + incorrectly overwrite an unformatted node which, moreover, can be cached
24802 + somewhere else as part of the file body. To avoid this, buffers for
24803 + unformatted nodes are never marked dirty. Also, pages in the fake inode
24804 + are never marked dirty. This rules out usage of ->writepage() as a memory
24805 + pressure hook. Instead ->releasepage() is used.
24806 +
24807 + Josh is concerned that page->buffer is going to die. This should not pose
24808 + a significant problem though, because we need to add some data structures
24809 + to the page anyway (jnode) and all necessary bookkeeping can be put there.
24810 +
24811 +*/
24812 +
24813 +/* Life cycle of pages/nodes.
24814 +
24815 + A jnode contains a reference to its page and the page contains a reference
24816 + back to the jnode. This reference is counted in the page's ->count. Thus,
24817 + a page bound to a jnode cannot be released back into the free pool.
24818 +
24819 + 1. Formatted nodes.
24820 +
24821 + 1. formatted node is represented by znode. When new znode is created its
24822 + ->pg pointer is NULL initially.
24823 +
24824 + 2. when node content is loaded into the znode (by a call to zload()) for
24825 + the first time, the following happens (in a call to ->read_node() or
24826 + ->allocate_node()):
24827 +
24828 + 1. new page is added to the page cache.
24829 +
24830 + 2. this page is attached to znode and its ->count is increased.
24831 +
24832 + 3. page is kmapped.
24833 +
24834 + 3. if more calls to zload() follow (without corresponding zrelses), page
24835 + counter is left intact and in its stead ->d_count is increased in znode.
24836 +
24837 + 4. each call to zrelse decreases ->d_count. When ->d_count drops to zero
24838 + ->release_node() is called and page is kunmapped as result.
24839 +
24840 + 5. at some moment node can be captured by a transaction. Its ->x_count
24841 + is then increased by transaction manager.
24842 +
24843 + 6. if node is removed from the tree (empty node with JNODE_HEARD_BANSHEE
24844 + bit set) the following will happen (also see comment at the top of znode.c):
24845 +
24846 + 1. when the last lock is released, the node will be uncaptured from
24847 + the transaction. This releases the reference that the transaction
24848 + manager acquired at step 5.
24849 +
24850 + 2. when last reference is released, zput() detects that node is
24851 + actually deleted and calls ->delete_node()
24852 + operation. page_cache_delete_node() implementation detaches jnode from
24853 + page and releases page.
24854 +
24855 + 7. otherwise (node wasn't removed from the tree), last reference to
24856 + znode will be released after transaction manager committed transaction
24857 + node was in. This implies squallocing of this node (see
24858 + flush.c). Nothing special happens at this point. Znode is still in the
24859 + hash table and page is still attached to it.
24860 +
24861 + 8. znode is actually removed from the memory because of the memory
24862 + pressure, or during umount (znodes_tree_done()). Anyway, znode is
24863 + removed by the call to zdrop(). At this moment, page is detached from
24864 + znode and removed from the inode address space.
24865 +
24866 +*/
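+
+/* Added illustrative sketch (not part of the original patch): the
+   zload()/zrelse() pairing described in steps 2-4 above, as a caller
+   would write it. zload(), zdata() and zrelse() are real znode helpers;
+   the elided body is hypothetical.
+
+	ret = zload(node);                  (reads the page in and kmaps it)
+	if (ret == 0) {
+		data = zdata(node);         (pointer into the kmapped page)
+		... read or modify the node contents ...
+		zrelse(node);               (drops ->d_count, may kunmap)
+	}
+*/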
24867 +
24868 +#include "debug.h"
24869 +#include "dformat.h"
24870 +#include "key.h"
24871 +#include "txnmgr.h"
24872 +#include "jnode.h"
24873 +#include "znode.h"
24874 +#include "block_alloc.h"
24875 +#include "tree.h"
24876 +#include "vfs_ops.h"
24877 +#include "inode.h"
24878 +#include "super.h"
24879 +#include "entd.h"
24880 +#include "page_cache.h"
24881 +#include "ktxnmgrd.h"
24882 +
24883 +#include <linux/types.h>
24884 +#include <linux/fs.h>
24885 +#include <linux/mm.h> /* for struct page */
24886 +#include <linux/swap.h> /* for struct page */
24887 +#include <linux/pagemap.h>
24888 +#include <linux/bio.h>
24889 +#include <linux/writeback.h>
24890 +#include <linux/blkdev.h>
24891 +
24892 +static struct bio *page_bio(struct page *, jnode *, int rw, gfp_t gfp);
24893 +
24894 +static struct address_space_operations formatted_fake_as_ops;
24895 +
24896 +static const oid_t fake_ino = 0x1;
24897 +static const oid_t bitmap_ino = 0x2;
24898 +static const oid_t cc_ino = 0x3;
24899 +
24900 +static void
24901 +init_fake_inode(struct super_block *super, struct inode *fake,
24902 + struct inode **pfake)
24903 +{
24904 + assert("nikita-2168", fake->i_state & I_NEW);
24905 + fake->i_mapping->a_ops = &formatted_fake_as_ops;
24906 + *pfake = fake;
24907 + /* NOTE-NIKITA something else? */
24908 + unlock_new_inode(fake);
24909 +}
24910 +
24911 +/**
24912 + * reiser4_init_formatted_fake - iget inodes for formatted nodes and bitmaps
24913 + * @super: super block to init fake inode for
24914 + *
24915 + * Initializes fake inode to which formatted nodes are bound in the page cache
24916 + * and inode for bitmaps.
24917 + */
24918 +int reiser4_init_formatted_fake(struct super_block *super)
24919 +{
24920 + struct inode *fake;
24921 + struct inode *bitmap;
24922 + struct inode *cc;
24923 + reiser4_super_info_data *sinfo;
24924 +
24925 + assert("nikita-1703", super != NULL);
24926 +
24927 + sinfo = get_super_private_nocheck(super);
24928 + fake = iget_locked(super, oid_to_ino(fake_ino));
24929 +
24930 + if (fake != NULL) {
24931 + init_fake_inode(super, fake, &sinfo->fake);
24932 +
24933 + bitmap = iget_locked(super, oid_to_ino(bitmap_ino));
24934 + if (bitmap != NULL) {
24935 + init_fake_inode(super, bitmap, &sinfo->bitmap);
24936 +
24937 + cc = iget_locked(super, oid_to_ino(cc_ino));
24938 + if (cc != NULL) {
24939 + init_fake_inode(super, cc, &sinfo->cc);
24940 + return 0;
24941 + } else {
24942 + iput(sinfo->fake);
24943 + iput(sinfo->bitmap);
24944 + sinfo->fake = NULL;
24945 + sinfo->bitmap = NULL;
24946 + }
24947 + } else {
24948 + iput(sinfo->fake);
24949 + sinfo->fake = NULL;
24950 + }
24951 + }
24952 + return RETERR(-ENOMEM);
24953 +}
24954 +
24955 +/**
24956 + * reiser4_done_formatted_fake - release inode used by formatted nodes and bitmaps
24957 + * @super: super block to release fake inodes of
24958 + *
24959 + * Releases inodes which were used as address spaces of bitmap and formatted
24960 + * nodes.
24961 + */
24962 +void reiser4_done_formatted_fake(struct super_block *super)
24963 +{
24964 + reiser4_super_info_data *sinfo;
24965 +
24966 + sinfo = get_super_private_nocheck(super);
24967 +
24968 + if (sinfo->fake != NULL) {
24969 + iput(sinfo->fake);
24970 + sinfo->fake = NULL;
24971 + }
24972 +
24973 + if (sinfo->bitmap != NULL) {
24974 + iput(sinfo->bitmap);
24975 + sinfo->bitmap = NULL;
24976 + }
24977 +
24978 + if (sinfo->cc != NULL) {
24979 + iput(sinfo->cc);
24980 + sinfo->cc = NULL;
24981 + }
24982 + return;
24983 +}
24984 +
24985 +void reiser4_wait_page_writeback(struct page *page)
24986 +{
24987 + assert("zam-783", PageLocked(page));
24988 +
24989 + do {
24990 + unlock_page(page);
24991 + wait_on_page_writeback(page);
24992 + lock_page(page);
24993 + } while (PageWriteback(page));
24994 +}
24995 +
24996 +/* return tree @page is in */
24997 +reiser4_tree *reiser4_tree_by_page(const struct page *page /* page to query */ )
24998 +{
24999 + assert("nikita-2461", page != NULL);
25000 + return &get_super_private(page->mapping->host->i_sb)->tree;
25001 +}
25002 +
25003 +/* completion handler for single page bio-based read.
25004 +
25005 + mpage_end_io_read() would also do. But it's static.
25006 +
25007 +*/
25008 +static int
25009 +end_bio_single_page_read(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
25010 + int err UNUSED_ARG)
25011 +{
25012 + struct page *page;
25013 +
25014 + if (bio->bi_size != 0) {
25015 + warning("nikita-3332", "Truncated single page read: %i",
25016 + bio->bi_size);
25017 + return 1;
25018 + }
25019 +
25020 + page = bio->bi_io_vec[0].bv_page;
25021 +
25022 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
25023 + SetPageUptodate(page);
25024 + } else {
25025 + ClearPageUptodate(page);
25026 + SetPageError(page);
25027 + }
25028 + unlock_page(page);
25029 + bio_put(bio);
25030 + return 0;
25031 +}
25032 +
25033 +/* completion handler for single page bio-based write.
25034 +
25035 + mpage_end_io_write() would also do. But it's static.
25036 +
25037 +*/
25038 +static int
25039 +end_bio_single_page_write(struct bio *bio, unsigned int bytes_done UNUSED_ARG,
25040 + int err UNUSED_ARG)
25041 +{
25042 + struct page *page;
25043 +
25044 + if (bio->bi_size != 0) {
25045 + warning("nikita-3333", "Truncated single page write: %i",
25046 + bio->bi_size);
25047 + return 1;
25048 + }
25049 +
25050 + page = bio->bi_io_vec[0].bv_page;
25051 +
25052 + if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
25053 + SetPageError(page);
25054 + end_page_writeback(page);
25055 + bio_put(bio);
25056 + return 0;
25057 +}
25058 +
25059 +/* ->readpage() method for formatted nodes */
25060 +static int formatted_readpage(struct file *f UNUSED_ARG,
25061 + struct page *page /* page to read */ )
25062 +{
25063 + assert("nikita-2412", PagePrivate(page) && jprivate(page));
25064 + return reiser4_page_io(page, jprivate(page), READ,
25065 + reiser4_ctx_gfp_mask_get());
25066 +}
25067 +
25068 +/**
25069 + * reiser4_page_io - submit single-page bio request
25070 + * @page: page to perform io for
25071 + * @node: jnode of page
25072 + * @rw: read or write
25073 + * @gfp: gfp mask for bio allocation
25074 + *
25075 + * Submits single page read or write.
25076 + */
25077 +int reiser4_page_io(struct page *page, jnode *node, int rw, gfp_t gfp)
25078 +{
25079 + struct bio *bio;
25080 + int result;
25081 +
25082 + assert("nikita-2094", page != NULL);
25083 + assert("nikita-2226", PageLocked(page));
25084 + assert("nikita-2634", node != NULL);
25085 + assert("nikita-2893", rw == READ || rw == WRITE);
25086 +
25087 + if (rw) {
25088 + if (unlikely(page->mapping->host->i_sb->s_flags & MS_RDONLY)) {
25089 + unlock_page(page);
25090 + return 0;
25091 + }
25092 + }
25093 +
25094 + bio = page_bio(page, node, rw, gfp);
25095 + if (!IS_ERR(bio)) {
25096 + if (rw == WRITE) {
25097 + set_page_writeback(page);
25098 + unlock_page(page);
25099 + }
25100 + reiser4_submit_bio(rw, bio);
25101 + result = 0;
25102 + } else {
25103 + unlock_page(page);
25104 + result = PTR_ERR(bio);
25105 + }
25106 +
25107 + return result;
25108 +}
25109 +
25110 +/* helper function to construct bio for page */
25111 +static struct bio *page_bio(struct page *page, jnode * node, int rw, gfp_t gfp)
25112 +{
25113 + struct bio *bio;
25114 + assert("nikita-2092", page != NULL);
25115 + assert("nikita-2633", node != NULL);
25116 +
25117 + /* Simple implementation under the assumption that blocksize == pagesize.
25118 +
25119 + We only have to submit one block, but submit_bh() would allocate a bio
25120 + anyway, so let's use all the bells-and-whistles of the bio code.
25121 + */
25122 +
25123 + bio = bio_alloc(gfp, 1);
25124 + if (bio != NULL) {
25125 + int blksz;
25126 + struct super_block *super;
25127 + reiser4_block_nr blocknr;
25128 +
25129 + super = page->mapping->host->i_sb;
25130 + assert("nikita-2029", super != NULL);
25131 + blksz = super->s_blocksize;
25132 + assert("nikita-2028", blksz == (int)PAGE_CACHE_SIZE);
25133 +
25134 + spin_lock_jnode(node);
25135 + blocknr = *jnode_get_io_block(node);
25136 + spin_unlock_jnode(node);
25137 +
25138 + assert("nikita-2275", blocknr != (reiser4_block_nr) 0);
25139 + assert("nikita-2276", !reiser4_blocknr_is_fake(&blocknr));
25140 +
25141 + bio->bi_bdev = super->s_bdev;
25142 + /* fill bio->bi_sector before calling bio_add_page(), because
25143 + * q->merge_bvec_fn may want to inspect it (see
25144 + * drivers/md/linear.c:linear_mergeable_bvec() for example). */
25145 + bio->bi_sector = blocknr * (blksz >> 9);
25146 +
25147 + if (!bio_add_page(bio, page, blksz, 0)) {
25148 + warning("nikita-3452",
25149 + "Single page bio cannot be constructed");
25150 + return ERR_PTR(RETERR(-EINVAL));
25151 + }
25152 +
25153 + /* bio -> bi_idx is filled by bio_init() */
25154 + bio->bi_end_io = (rw == READ) ?
25155 + end_bio_single_page_read : end_bio_single_page_write;
25156 +
25157 + return bio;
25158 + } else
25159 + return ERR_PTR(RETERR(-ENOMEM));
25160 +}
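+
+/* Added note (worked example, not in the original patch): with a 4096
+   byte block size, blksz >> 9 == 8 sectors per block, so a node at
+   block number 100 is submitted at bio->bi_sector == 100 * 8 == 800. */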
25161 +
25162 +/* this function is internally called by jnode_make_dirty() */
25163 +int reiser4_set_page_dirty_internal(struct page *page)
25164 +{
25165 + struct address_space *mapping;
25166 +
25167 + mapping = page->mapping;
25168 + BUG_ON(mapping == NULL);
25169 +
25170 + if (!TestSetPageDirty(page)) {
25171 + if (mapping_cap_account_dirty(mapping))
25172 + inc_zone_page_state(page, NR_FILE_DIRTY);
25173 +
25174 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
25175 + }
25176 +
25177 + /* for pages of the fake inode the backing znode must already be dirty */
25178 + if (mapping->host == reiser4_get_super_fake(mapping->host->i_sb))
25179 + assert("", JF_ISSET(jprivate(page), JNODE_DIRTY));
25180 + return 0;
25181 +}
25182 +
25183 +#if REISER4_DEBUG
25184 +
25185 +/**
25186 + * can_hit_entd
25187 + *
25188 + * This is used in an assertion in reiser4_writepage() below to check that the current context may safely hand writeout over to entd.
25189 + */
25190 +static int can_hit_entd(reiser4_context *ctx, struct super_block *s)
25191 +{
25192 + if (ctx == NULL || ((unsigned long)ctx->magic) != context_magic)
25193 + return 1;
25194 + if (ctx->super != s)
25195 + return 1;
25196 + if (get_super_private(s)->entd.tsk == current)
25197 + return 0;
25198 + if (!lock_stack_isclean(&ctx->stack))
25199 + return 0;
25200 + if (ctx->trans->atom != NULL)
25201 + return 0;
25202 + return 1;
25203 +}
25204 +
25205 +#endif
25206 +
25207 +/**
25208 + * reiser4_writepage - writepage of struct address_space_operations
25209 + * @page: page to write
25210 + * @wbc: writeback control passed by the VM
25211 + *
25212 + * Common memory pressure notification: hands the page over to the ent
25213 + * daemon (write_page_by_ent()).
25214 + */
25215 +int reiser4_writepage(struct page *page,
25216 + struct writeback_control *wbc)
25217 +{
25218 + struct super_block *s;
25219 + reiser4_context *ctx;
25220 +
25221 + assert("vs-828", PageLocked(page));
25222 +
25223 + s = page->mapping->host->i_sb;
25224 + ctx = get_current_context_check();
25225 +
25226 + assert("", can_hit_entd(ctx, s));
25227 +
25228 + return write_page_by_ent(page, wbc);
25229 +}
25230 +
25231 +/* ->set_page_dirty() method of formatted address_space */
25232 +static int formatted_set_page_dirty(struct page *page)
25233 +{
25234 + assert("nikita-2173", page != NULL);
25235 + BUG();
25236 + return __set_page_dirty_nobuffers(page);
25237 +}
25238 +
25239 +/* The writepages method of reiser4 address space operations is used to pull
25240 + pages dirtied via mmap into transactions. Only regular files can have such
25241 + pages. The fake inode is used to access formatted nodes via the page
25242 + cache. As formatted nodes can never be mmapped, the fake inode's
25243 + writepages has nothing to do. */
25244 +static int
25245 +writepages_fake(struct address_space *mapping, struct writeback_control *wbc)
25246 +{
25247 + return 0;
25248 +}
25249 +
25250 +/* address space operations for the fake inode */
25251 +static struct address_space_operations formatted_fake_as_ops = {
25252 + /* Perform a writeback of a single page as a memory-freeing
25253 + * operation. */
25254 + .writepage = reiser4_writepage,
25255 + /* this is called to read formatted node */
25256 + .readpage = formatted_readpage,
25257 + /* ->sync_page() method of fake inode address space operations. Called
25258 + from wait_on_page() and lock_page().
25259 +
25260 + This is a most annoyingly misnamed method. Actually it is called
25261 + from wait_on_page_bit() and lock_page() and its purpose is to
25262 + start io by jabbing the device drivers.
25263 + */
25264 + .sync_page = block_sync_page,
25265 + /* Write back some dirty pages from this mapping. Called from sync.
25266 + called during sync (pdflush) */
25267 + .writepages = writepages_fake,
25268 + /* Set a page dirty */
25269 + .set_page_dirty = formatted_set_page_dirty,
25270 + /* used for read-ahead. Not applicable */
25271 + .readpages = NULL,
25272 + .prepare_write = NULL,
25273 + .commit_write = NULL,
25274 + .bmap = NULL,
25275 + /* called just before page is being detached from inode mapping and
25276 + removed from memory. Called on truncate, cut/squeeze, and
25277 + umount. */
25278 + .invalidatepage = reiser4_invalidatepage,
25279 + /* this is called by shrink_cache() so that the file system can try to
25280 + release objects (jnodes, buffers, journal heads) attached to the page
25281 + and, maybe, make the page itself free-able.
25282 + */
25283 + .releasepage = reiser4_releasepage,
25284 + .direct_IO = NULL
25285 +};
25286 +
25287 +/* called just before page is released (no longer used by reiser4). Callers:
25288 + jdelete() and extent2tail(). */
25289 +void reiser4_drop_page(struct page *page)
25290 +{
25291 + assert("nikita-2181", PageLocked(page));
25292 + clear_page_dirty_for_io(page);
25293 + ClearPageUptodate(page);
25294 +#if defined(PG_skipped)
25295 + ClearPageSkipped(page);
25296 +#endif
25297 + unlock_page(page);
25298 +}
25299 +
25300 +#define JNODE_GANG_SIZE (16)
25301 +
25302 +/* find all jnodes from range specified and invalidate them */
25303 +static int
25304 +truncate_jnodes_range(struct inode *inode, pgoff_t from, pgoff_t count)
25305 +{
25306 + reiser4_inode *info;
25307 + int truncated_jnodes;
25308 + reiser4_tree *tree;
25309 + unsigned long index;
25310 + unsigned long end;
25311 +
25312 + if (inode_file_plugin(inode) ==
25313 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
25314 + /* No need to get rid of jnodes here: if the single jnode of
25315 + the page cluster did not have a page, then it was found and killed
25316 + before in
25317 + truncate_page_cluster_cryptcompress()->jput()->jput_final(),
25318 + otherwise it will be dropped by reiser4_invalidatepage() */
25319 + return 0;
25320 + truncated_jnodes = 0;
25321 +
25322 + info = reiser4_inode_data(inode);
25323 + tree = reiser4_tree_by_inode(inode);
25324 +
25325 + index = from;
25326 + end = from + count;
25327 +
25328 + while (1) {
25329 + jnode *gang[JNODE_GANG_SIZE];
25330 + int taken;
25331 + int i;
25332 + jnode *node;
25333 +
25334 + assert("nikita-3466", index <= end);
25335 +
25336 + read_lock_tree(tree);
25337 + taken =
25338 + radix_tree_gang_lookup(jnode_tree_by_reiser4_inode(info),
25339 + (void **)gang, index,
25340 + JNODE_GANG_SIZE);
25341 + for (i = 0; i < taken; ++i) {
25342 + node = gang[i];
25343 + if (index_jnode(node) < end)
25344 + jref(node);
25345 + else
25346 + gang[i] = NULL;
25347 + }
25348 + read_unlock_tree(tree);
25349 +
25350 + for (i = 0; i < taken; ++i) {
25351 + node = gang[i];
25352 + if (node != NULL) {
25353 + index = max(index, index_jnode(node));
25354 + spin_lock_jnode(node);
25355 + assert("edward-1457", node->pg == NULL);
25356 + /* this is always called after
25357 + truncate_inode_pages_range(). Therefore, here
25358 + the jnode cannot have a page. New pages cannot
25359 + be created because truncate_jnodes_range runs
25360 + with exclusive access to the file, whereas
25361 + new page creation requires non-exclusive
25362 + access */
25363 + JF_SET(node, JNODE_HEARD_BANSHEE);
25364 + reiser4_uncapture_jnode(node);
25365 + unhash_unformatted_jnode(node);
25366 + truncated_jnodes++;
25367 + jput(node);
25368 + } else
25369 + break;
25370 + }
25371 + if (i != taken || taken == 0)
25372 + break;
25373 + }
25374 + return truncated_jnodes;
25375 +}
25376 +
25377 +/* Truncating files in reiser4: problems and solutions.
25378 +
25379 + VFS calls fs's truncate after it has called truncate_inode_pages()
25380 + to get rid of pages corresponding to part of file being truncated.
25381 + In reiser4 it may cause existence of unallocated extents which do
25382 + not have jnodes. Flush code does not expect that. Solution of this
25383 + problem is straightforward. As vfs's truncate is implemented using
25384 + setattr operation, it seems reasonable to have ->setattr() that
25385 + will cut file body. However, flush code also does not expect dirty
25386 + pages without parent items, so it is impossible to cut all items,
25387 + then truncate all pages in two steps. We resolve this problem by
25388 + cutting items one-by-one. Each such fine-grained step, performed
25389 + under a longterm znode lock, calls at its end the ->kill_hook() method
25390 + of the killed item to remove its bound pages and jnodes.
25391 +
25392 + The following function is a common part of the mentioned kill hooks.
25393 + It is also called before tail-to-extent conversion (to avoid managing
25394 + several copies of the data).
25395 +*/
25396 +void reiser4_invalidate_pages(struct address_space *mapping, pgoff_t from,
25397 + unsigned long count, int even_cows)
25398 +{
25399 + loff_t from_bytes, count_bytes;
25400 +
25401 + if (count == 0)
25402 + return;
25403 + from_bytes = ((loff_t) from) << PAGE_CACHE_SHIFT;
25404 + count_bytes = ((loff_t) count) << PAGE_CACHE_SHIFT;
25405 +
25406 + unmap_mapping_range(mapping, from_bytes, count_bytes, even_cows);
25407 + truncate_inode_pages_range(mapping, from_bytes,
25408 + from_bytes + count_bytes - 1);
25409 + truncate_jnodes_range(mapping->host, from, count);
25410 +}
25411 +
25412 +/*
25413 + * Local variables:
25414 + * c-indentation-style: "K&R"
25415 + * mode-name: "LC"
25416 + * c-basic-offset: 8
25417 + * tab-width: 8
25418 + * fill-column: 120
25419 + * scroll-step: 1
25420 + * End:
25421 + */
25422 diff --git a/fs/reiser4/page_cache.h b/fs/reiser4/page_cache.h
25423 new file mode 100644
25424 index 0000000..ab74f8f
25425 --- /dev/null
25426 +++ b/fs/reiser4/page_cache.h
25427 @@ -0,0 +1,68 @@
25428 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
25429 + * reiser4/README */
25430 +/* Memory pressure hooks. Fake inodes handling. See page_cache.c. */
25431 +
25432 +#if !defined( __REISER4_PAGE_CACHE_H__ )
25433 +#define __REISER4_PAGE_CACHE_H__
25434 +
25435 +#include "forward.h"
25436 +#include "context.h" /* for reiser4_ctx_gfp_mask_get() */
25437 +
25438 +#include <linux/fs.h> /* for struct super_block, address_space */
25439 +#include <linux/mm.h> /* for struct page */
25440 +#include <linux/pagemap.h> /* for lock_page() */
25441 +#include <linux/vmalloc.h> /* for __vmalloc() */
25442 +
25443 +extern int reiser4_init_formatted_fake(struct super_block *);
25444 +extern void reiser4_done_formatted_fake(struct super_block *);
25445 +
25446 +extern reiser4_tree *reiser4_tree_by_page(const struct page *);
25447 +
25448 +extern int reiser4_set_page_dirty_internal(struct page *);
25449 +
25450 +#define reiser4_submit_bio(rw, bio) submit_bio((rw), (bio))
25451 +
25452 +extern void reiser4_wait_page_writeback(struct page *);
25453 +static inline void lock_and_wait_page_writeback(struct page *page)
25454 +{
25455 + lock_page(page);
25456 + if (unlikely(PageWriteback(page)))
25457 + reiser4_wait_page_writeback(page);
25458 +}
25459 +
25460 +#define jprivate(page) ((jnode *)page_private(page))
25461 +
25462 +extern int reiser4_page_io(struct page *, jnode *, int rw, gfp_t);
25463 +extern void reiser4_drop_page(struct page *);
25464 +extern void reiser4_invalidate_pages(struct address_space *, pgoff_t from,
25465 + unsigned long count, int even_cows);
25466 +extern void capture_reiser4_inodes(struct super_block *,
25467 + struct writeback_control *);
25468 +static inline void *reiser4_vmalloc(unsigned long size)
25469 +{
25470 + return __vmalloc(size,
25471 + reiser4_ctx_gfp_mask_get() | __GFP_HIGHMEM,
25472 + PAGE_KERNEL);
25473 +}
25474 +
25475 +#define PAGECACHE_TAG_REISER4_MOVED PAGECACHE_TAG_DIRTY
25476 +
25477 +#if REISER4_DEBUG
25478 +extern void print_page(const char *prefix, struct page *page);
25479 +#else
25480 +#define print_page(prf, p) noop
25481 +#endif
25482 +
25483 +/* __REISER4_PAGE_CACHE_H__ */
25484 +#endif
25485 +
25486 +/* Make Linus happy.
25487 + Local variables:
25488 + c-indentation-style: "K&R"
25489 + mode-name: "LC"
25490 + c-basic-offset: 8
25491 + tab-width: 8
25492 + fill-column: 120
25493 + scroll-step: 1
25494 + End:
25495 +*/
25496 diff --git a/fs/reiser4/plugin/Makefile b/fs/reiser4/plugin/Makefile
25497 new file mode 100644
25498 index 0000000..4b2c9f8
25499 --- /dev/null
25500 +++ b/fs/reiser4/plugin/Makefile
25501 @@ -0,0 +1,26 @@
25502 +obj-$(CONFIG_REISER4_FS) += plugins.o
25503 +
25504 +plugins-objs := \
25505 + plugin.o \
25506 + plugin_set.o \
25507 + object.o \
25508 + inode_ops.o \
25509 + inode_ops_rename.o \
25510 + file_ops.o \
25511 + file_ops_readdir.o \
25512 + file_plugin_common.o \
25513 + dir_plugin_common.o \
25514 + digest.o \
25515 + hash.o \
25516 + fibration.o \
25517 + tail_policy.o \
25518 + regular.o
25519 +
25520 +obj-$(CONFIG_REISER4_FS) += item/
25521 +obj-$(CONFIG_REISER4_FS) += file/
25522 +obj-$(CONFIG_REISER4_FS) += dir/
25523 +obj-$(CONFIG_REISER4_FS) += node/
25524 +obj-$(CONFIG_REISER4_FS) += compress/
25525 +obj-$(CONFIG_REISER4_FS) += space/
25526 +obj-$(CONFIG_REISER4_FS) += disk_format/
25527 +obj-$(CONFIG_REISER4_FS) += security/
25528 diff --git a/fs/reiser4/plugin/cluster.c b/fs/reiser4/plugin/cluster.c
25529 new file mode 100644
25530 index 0000000..b400d5f
25531 --- /dev/null
25532 +++ b/fs/reiser4/plugin/cluster.c
25533 @@ -0,0 +1,71 @@
25534 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25535 +
25536 +/* Contains reiser4 cluster plugins (see
25537 + http://www.namesys.com/cryptcompress_design.html
25538 + "Concepts of clustering" for details). */
25539 +
25540 +#include "plugin_header.h"
25541 +#include "plugin.h"
25542 +#include "../inode.h"
25543 +
25544 +static int change_cluster(struct inode *inode,
25545 + reiser4_plugin * plugin,
25546 + pset_member memb)
25547 +{
25548 + assert("edward-1324", inode != NULL);
25549 + assert("edward-1325", plugin != NULL);
25550 + assert("edward-1326", is_reiser4_inode(inode));
25551 + assert("edward-1327", plugin->h.type_id == REISER4_CLUSTER_PLUGIN_TYPE);
25552 +
25553 + /* Can't change the cluster plugin for already existing regular files. */
25554 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25555 + return RETERR(-EINVAL);
25556 +
25557 + /* If matches, nothing to change. */
25558 + if (inode_cluster_plugin(inode) != NULL &&
25559 + inode_cluster_plugin(inode)->h.id == plugin->h.id)
25560 + return 0;
25561 +
25562 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
25563 + PSET_CLUSTER, plugin);
25564 +}
25565 +
25566 +static reiser4_plugin_ops cluster_plugin_ops = {
25567 + .init = NULL,
25568 + .load = NULL,
25569 + .save_len = NULL,
25570 + .save = NULL,
25571 + .change = &change_cluster
25572 +};
25573 +
25574 +#define SUPPORT_CLUSTER(SHIFT, ID, LABEL, DESC) \
25575 + [CLUSTER_ ## ID ## _ID] = { \
25576 + .h = { \
25577 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE, \
25578 + .id = CLUSTER_ ## ID ## _ID, \
25579 + .pops = &cluster_plugin_ops, \
25580 + .label = LABEL, \
25581 + .desc = DESC, \
25582 + .linkage = {NULL, NULL} \
25583 + }, \
25584 + .shift = SHIFT \
25585 + }
25586 +
25587 +cluster_plugin cluster_plugins[LAST_CLUSTER_ID] = {
25588 + SUPPORT_CLUSTER(16, 64K, "64K", "Large"),
25589 + SUPPORT_CLUSTER(15, 32K, "32K", "Big"),
25590 + SUPPORT_CLUSTER(14, 16K, "16K", "Average"),
25591 + SUPPORT_CLUSTER(13, 8K, "8K", "Small"),
25592 + SUPPORT_CLUSTER(12, 4K, "4K", "Minimal")
25593 +};
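+
+/* Added note (not in the original patch): a cluster plugin's .shift is
+   the log2 of the logical cluster size in bytes, so SUPPORT_CLUSTER(16,
+   64K, ...) gives 1 << 16 == 65536 bytes; with 4K pages such a cluster
+   spans 16 pages (see cluster_nrpages() in cluster.h). */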
25594 +
25595 +/*
25596 + Local variables:
25597 + c-indentation-style: "K&R"
25598 + mode-name: "LC"
25599 + c-basic-offset: 8
25600 + tab-width: 8
25601 + fill-column: 120
25602 + scroll-step: 1
25603 + End:
25604 +*/
25605 diff --git a/fs/reiser4/plugin/cluster.h b/fs/reiser4/plugin/cluster.h
25606 new file mode 100644
25607 index 0000000..019f156
25608 --- /dev/null
25609 +++ b/fs/reiser4/plugin/cluster.h
25610 @@ -0,0 +1,343 @@
25611 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25612 +
25613 +/* This file contains page/cluster index translators and offset modulators.
25614 + See http://www.namesys.com/cryptcompress_design.html for details. */
25615 +
25616 +#if !defined( __FS_REISER4_CLUSTER_H__ )
25617 +#define __FS_REISER4_CLUSTER_H__
25618 +
25619 +#include "../inode.h"
25620 +
25621 +static inline int inode_cluster_shift(struct inode *inode)
25622 +{
25623 + assert("edward-92", inode != NULL);
25624 + assert("edward-93", reiser4_inode_data(inode) != NULL);
25625 +
25626 + return inode_cluster_plugin(inode)->shift;
25627 +}
25628 +
25629 +static inline unsigned cluster_nrpages_shift(struct inode *inode)
25630 +{
25631 + return inode_cluster_shift(inode) - PAGE_CACHE_SHIFT;
25632 +}
25633 +
25634 +/* cluster size in page units */
25635 +static inline unsigned cluster_nrpages(struct inode *inode)
25636 +{
25637 + return 1U << cluster_nrpages_shift(inode);
25638 +}
25639 +
25640 +static inline size_t inode_cluster_size(struct inode *inode)
25641 +{
25642 + assert("edward-96", inode != NULL);
25643 +
25644 + return 1U << inode_cluster_shift(inode);
25645 +}
25646 +
25647 +static inline cloff_t pg_to_clust(pgoff_t idx, struct inode *inode)
25648 +{
25649 + return idx >> cluster_nrpages_shift(inode);
25650 +}
25651 +
25652 +static inline pgoff_t clust_to_pg(cloff_t idx, struct inode *inode)
25653 +{
25654 + return idx << cluster_nrpages_shift(inode);
25655 +}
25656 +
25657 +static inline pgoff_t pg_to_clust_to_pg(pgoff_t idx, struct inode *inode)
25658 +{
25659 + return clust_to_pg(pg_to_clust(idx, inode), inode);
25660 +}
25661 +
25662 +static inline pgoff_t off_to_pg(loff_t off)
25663 +{
25664 + return (off >> PAGE_CACHE_SHIFT);
25665 +}
25666 +
25667 +static inline loff_t pg_to_off(pgoff_t idx)
25668 +{
25669 + return ((loff_t) (idx) << PAGE_CACHE_SHIFT);
25670 +}
25671 +
25672 +static inline cloff_t off_to_clust(loff_t off, struct inode *inode)
25673 +{
25674 + return off >> inode_cluster_shift(inode);
25675 +}
25676 +
25677 +static inline loff_t clust_to_off(cloff_t idx, struct inode *inode)
25678 +{
25679 + return (loff_t) idx << inode_cluster_shift(inode);
25680 +}
25681 +
25682 +static inline unsigned long count_to_nr(loff_t count, unsigned shift)
25683 +{
25684 + return (count + (1UL << shift) - 1) >> shift;
25685 +}
25686 +
25687 +/* number of pages occupied by @count bytes */
25688 +static inline pgoff_t count_to_nrpages(loff_t count)
25689 +{
25690 + return count_to_nr(count, PAGE_CACHE_SHIFT);
25691 +}
25692 +
25693 +/* number of clusters occupied by @count bytes */
25694 +static inline cloff_t count_to_nrclust(loff_t count, struct inode *inode)
25695 +{
25696 + return count_to_nr(count, inode_cluster_shift(inode));
25697 +}
25698 +
25699 +/* number of clusters occupied by @count pages */
25700 +static inline cloff_t pgcount_to_nrclust(pgoff_t count, struct inode *inode)
25701 +{
25702 + return count_to_nr(count, cluster_nrpages_shift(inode));
25703 +}
25704 +
25705 +static inline loff_t off_to_clust_to_off(loff_t off, struct inode *inode)
25706 +{
25707 + return clust_to_off(off_to_clust(off, inode), inode);
25708 +}
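+
+/* e.g. with 64K clusters, off_to_clust_to_off(150000) == 131072: offset
+   150000 lies in cluster 2, which starts at byte 2 << 16; that is, this
+   rounds an offset down to the start of its logical cluster */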
25709 +
25710 +static inline pgoff_t off_to_clust_to_pg(loff_t off, struct inode *inode)
25711 +{
25712 + return clust_to_pg(off_to_clust(off, inode), inode);
25713 +}
25714 +
25715 +static inline unsigned off_to_pgoff(loff_t off)
25716 +{
25717 + return off & (PAGE_CACHE_SIZE - 1);
25718 +}
25719 +
25720 +static inline unsigned off_to_cloff(loff_t off, struct inode *inode)
25721 +{
25722 + return off & ((loff_t) (inode_cluster_size(inode)) - 1);
25723 +}
25724 +
25725 +static inline unsigned
25726 +pg_to_off_to_cloff(unsigned long idx, struct inode *inode)
25727 +{
25728 + return off_to_cloff(pg_to_off(idx), inode);
25729 +}
25730 +
25731 +/* if @size != 0, returns index of the page
25732 + which contains the last byte of the file */
25733 +static inline pgoff_t size_to_pg(loff_t size)
25734 +{
25735 + return (size ? off_to_pg(size - 1) : 0);
25736 +}
25737 +
25738 +/* minimal index of the page which doesn't contain
25739 + file data */
25740 +static inline pgoff_t size_to_next_pg(loff_t size)
25741 +{
25742 + return (size ? off_to_pg(size - 1) + 1 : 0);
25743 +}
25744 +
25745 +/* how many bytes of a file of size @cnt fall within
25746 +   the page of index @idx */
25747 +static inline unsigned cnt_to_pgcnt(loff_t cnt, pgoff_t idx)
25748 +{
25749 + if (idx > off_to_pg(cnt))
25750 + return 0;
25751 + if (idx < off_to_pg(cnt))
25752 + return PAGE_CACHE_SIZE;
25753 + return off_to_pgoff(cnt);
25754 +}
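+
+/* e.g. for cnt == 10000 and 4K pages: pages 0 and 1 are full (4096 bytes
+   each), page 2 holds off_to_pgoff(10000) == 1808 bytes, and pages with
+   index greater than 2 hold nothing */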
25755 +
25756 +/* how many bytes of a file of size @cnt fall within
25757 +   the logical cluster of index @idx */
25758 +static inline unsigned cnt_to_clcnt(loff_t cnt, cloff_t idx,
25759 + struct inode *inode)
25760 +{
25761 + if (idx > off_to_clust(cnt, inode))
25762 + return 0;
25763 + if (idx < off_to_clust(cnt, inode))
25764 + return inode_cluster_size(inode);
25765 + return off_to_cloff(cnt, inode);
25766 +}
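+
+/* Note the boundary case: for cnt == 64K exactly (with 64K clusters),
+   cluster 0 is full, while cluster 1 == off_to_clust(cnt) gets
+   off_to_cloff(cnt) == 0 bytes: a cluster starting right at the end of
+   the file is empty */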
25767 +
25768 +static inline unsigned
25769 +fsize_to_count(reiser4_cluster_t * clust, struct inode *inode)
25770 +{
25771 + assert("edward-288", clust != NULL);
25772 + assert("edward-289", inode != NULL);
25773 +
25774 + return cnt_to_clcnt(inode->i_size, clust->index, inode);
25775 +}
25776 +
25777 +static inline int
25778 +cluster_is_complete(reiser4_cluster_t * clust, struct inode * inode)
25779 +{
25780 + return clust->tc.lsize == inode_cluster_size(inode);
25781 +}
25782 +
25783 +static inline void reiser4_slide_init(reiser4_slide_t * win)
25784 +{
25785 + assert("edward-1084", win != NULL);
25786 + memset(win, 0, sizeof *win);
25787 +}
25788 +
25789 +static inline tfm_action
25790 +cluster_get_tfm_act(tfm_cluster_t * tc)
25791 +{
25792 + assert("edward-1356", tc != NULL);
25793 + return tc->act;
25794 +}
25795 +
25796 +static inline void
25797 +cluster_set_tfm_act(tfm_cluster_t * tc, tfm_action act)
25798 +{
25799 + assert("edward-1356", tc != NULL);
25800 + tc->act = act;
25801 +}
25802 +
25803 +static inline void
25804 +cluster_init_act(reiser4_cluster_t * clust, tfm_action act, reiser4_slide_t * window)
+{
25805 + assert("edward-84", clust != NULL);
25806 + memset(clust, 0, sizeof *clust);
25807 + cluster_set_tfm_act(&clust->tc, act);
25808 + clust->dstat = INVAL_DISK_CLUSTER;
25809 + clust->win = window;
25810 +}
25811 +
25812 +static inline void
25813 +cluster_init_read(reiser4_cluster_t * clust, reiser4_slide_t * window)
25814 +{
25815 +	cluster_init_act(clust, TFMA_READ, window);
25816 +}
25817 +
25818 +static inline void
25819 +cluster_init_write(reiser4_cluster_t * clust, reiser4_slide_t * window)
25820 +{
25821 +	cluster_init_act(clust, TFMA_WRITE, window);
25822 +}
25823 +
25824 +static inline int dclust_get_extension_dsize(hint_t * hint)
25825 +{
25826 + return hint->ext_coord.extension.ctail.dsize;
25827 +}
25828 +
25829 +static inline void dclust_set_extension_dsize(hint_t * hint, int dsize)
25830 +{
25831 + hint->ext_coord.extension.ctail.dsize = dsize;
25832 +}
25833 +
25834 +static inline int dclust_get_extension_shift(hint_t * hint)
25835 +{
25836 + return hint->ext_coord.extension.ctail.shift;
25837 +}
25838 +
25839 +static inline int dclust_get_extension_ncount(hint_t * hint)
25840 +{
25841 + return hint->ext_coord.extension.ctail.ncount;
25842 +}
25843 +
25844 +static inline void dclust_inc_extension_ncount(hint_t * hint)
25845 +{
25846 +	hint->ext_coord.extension.ctail.ncount++;
25847 +}
25848 +
25849 +static inline void dclust_init_extension(hint_t * hint)
25850 +{
25851 + memset(&hint->ext_coord.extension.ctail, 0,
25852 + sizeof(hint->ext_coord.extension.ctail));
25853 +}
25854 +
25855 +static inline int hint_is_unprepped_dclust(hint_t * hint)
25856 +{
25857 + assert("edward-1451", hint_is_valid(hint));
25858 + return dclust_get_extension_shift(hint) == (int)UCTAIL_SHIFT;
25859 +}
25860 +
25861 +static inline void coord_set_between_clusters(coord_t * coord)
25862 +{
25863 +#if REISER4_DEBUG
25864 + int result;
25865 + result = zload(coord->node);
25866 + assert("edward-1296", !result);
25867 +#endif
25868 + if (!coord_is_between_items(coord)) {
25869 + coord->between = AFTER_ITEM;
25870 + coord->unit_pos = 0;
25871 + }
25872 +#if REISER4_DEBUG
25873 + zrelse(coord->node);
25874 +#endif
25875 +}
25876 +
25877 +int reiser4_inflate_cluster(reiser4_cluster_t *, struct inode *);
25878 +int find_disk_cluster(reiser4_cluster_t *, struct inode *, int read,
25879 + znode_lock_mode mode);
25880 +int flush_cluster_pages(reiser4_cluster_t *, jnode *, struct inode *);
25881 +int reiser4_deflate_cluster(reiser4_cluster_t *, struct inode *);
25882 +void truncate_page_cluster_cryptcompress(struct inode *inode, cloff_t start,
25883 + int even_cows);
25884 +void invalidate_hint_cluster(reiser4_cluster_t * clust);
25885 +void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
25886 + znode_lock_mode mode);
25887 +int get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
25888 + znode_lock_mode lock_mode);
25889 +void reset_cluster_params(reiser4_cluster_t * clust);
25890 +int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
25891 + int count);
25892 +int prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
25893 + int capture);
25894 +void reiser4_release_cluster_pages(reiser4_cluster_t *);
25895 +void put_cluster_handle(reiser4_cluster_t * clust);
25896 +int grab_tfm_stream(struct inode *inode, tfm_cluster_t * tc, tfm_stream_id id);
25897 +int tfm_cluster_is_uptodate(tfm_cluster_t * tc);
25898 +void tfm_cluster_set_uptodate(tfm_cluster_t * tc);
25899 +void tfm_cluster_clr_uptodate(tfm_cluster_t * tc);
25900 +
25901 +/* move cluster handle to the target position
25902 + specified by the page of index @pgidx
25903 +*/
25904 +static inline void move_cluster_forward(reiser4_cluster_t * clust,
25905 + struct inode *inode,
25906 + pgoff_t pgidx)
25907 +{
25908 + assert("edward-1297", clust != NULL);
25909 + assert("edward-1298", inode != NULL);
25910 +
25911 + reset_cluster_params(clust);
25912 + if (clust->index_valid &&
25913 +	    /* Hole in the indices. Hint became invalid and cannot be
25914 +	       used by find_cluster_item() even if seal/node versions
25915 +	       coincide */
25916 + pg_to_clust(pgidx, inode) != clust->index + 1) {
25917 + reiser4_unset_hint(clust->hint);
25918 + invalidate_hint_cluster(clust);
25919 + }
25920 + clust->index = pg_to_clust(pgidx, inode);
25921 + clust->index_valid = 1;
25922 +}
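+
+/* An illustrative (hypothetical) caller sketch: a sequential pass over a
+   file could advance one logical cluster at a time with
+
+	for (pgidx = 0; pgidx < nrpages; pgidx += cluster_nrpages(inode))
+		move_cluster_forward(clust, inode, pgidx);
+
+   interleaving I/O on each cluster; as long as the indices stay contiguous
+   the hint survives and find_cluster_item() can reuse it */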
25923 +
25924 +static inline int
25925 +alloc_clust_pages(reiser4_cluster_t * clust, struct inode *inode)
25926 +{
25927 + assert("edward-791", clust != NULL);
25928 + assert("edward-792", inode != NULL);
25929 + clust->pages =
25930 + kmalloc(sizeof(*clust->pages) << inode_cluster_shift(inode),
25931 + reiser4_ctx_gfp_mask_get());
25932 + if (!clust->pages)
25933 + return -ENOMEM;
25934 + return 0;
25935 +}
25936 +
25937 +static inline void free_clust_pages(reiser4_cluster_t * clust)
25938 +{
25939 + kfree(clust->pages);
25940 +}
25941 +
25942 +#endif /* __FS_REISER4_CLUSTER_H__ */
25943 +
25944 +/* Make Linus happy.
25945 + Local variables:
25946 + c-indentation-style: "K&R"
25947 + mode-name: "LC"
25948 + c-basic-offset: 8
25949 + tab-width: 8
25950 + fill-column: 120
25951 + scroll-step: 1
25952 + End:
25953 +*/
25954 diff --git a/fs/reiser4/plugin/compress/Makefile b/fs/reiser4/plugin/compress/Makefile
25955 new file mode 100644
25956 index 0000000..82793a4
25957 --- /dev/null
25958 +++ b/fs/reiser4/plugin/compress/Makefile
25959 @@ -0,0 +1,6 @@
25960 +obj-$(CONFIG_REISER4_FS) += compress_plugins.o
25961 +
25962 +compress_plugins-objs := \
25963 + compress.o \
25964 + minilzo.o \
25965 + compress_mode.o
25966 diff --git a/fs/reiser4/plugin/compress/compress.c b/fs/reiser4/plugin/compress/compress.c
25967 new file mode 100644
25968 index 0000000..7e64d0c
25969 --- /dev/null
25970 +++ b/fs/reiser4/plugin/compress/compress.c
25971 @@ -0,0 +1,381 @@
25972 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
25973 +/* reiser4 compression transform plugins */
25974 +
25975 +#include "../../debug.h"
25976 +#include "../../inode.h"
25977 +#include "../plugin.h"
25978 +#include "minilzo.h"
25979 +
25980 +#include <linux/zlib.h>
25981 +#include <linux/types.h>
25982 +#include <linux/hardirq.h>
25983 +
25984 +static int change_compression(struct inode *inode,
25985 + reiser4_plugin * plugin,
25986 + pset_member memb)
25987 +{
25988 + assert("edward-1316", inode != NULL);
25989 + assert("edward-1317", plugin != NULL);
25990 + assert("edward-1318", is_reiser4_inode(inode));
25991 + assert("edward-1319",
25992 + plugin->h.type_id == REISER4_COMPRESSION_PLUGIN_TYPE);
25993 +
25994 +	/* cannot change the compression plugin of an already existing regular object */
25995 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
25996 + return RETERR(-EINVAL);
25997 +
25998 + /* If matches, nothing to change. */
25999 +	if (inode_compression_plugin(inode) != NULL &&
26000 +	    inode_compression_plugin(inode)->h.id == plugin->h.id)
26001 + return 0;
26002 +
26003 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset,
26004 + PSET_COMPRESSION, plugin);
26005 +}
26006 +
26007 +static reiser4_plugin_ops compression_plugin_ops = {
26008 + .init = NULL,
26009 + .load = NULL,
26010 + .save_len = NULL,
26011 + .save = NULL,
26012 + .change = &change_compression
26013 +};
26014 +
26015 +/******************************************************************************/
26016 +/* gzip1 compression */
26017 +/******************************************************************************/
26018 +
26019 +#define GZIP1_DEF_LEVEL Z_BEST_SPEED
26020 +#define GZIP1_DEF_WINBITS 15
26021 +#define GZIP1_DEF_MEMLEVEL MAX_MEM_LEVEL
26022 +
26023 +static int gzip1_init(void)
26024 +{
26025 + int ret = -EINVAL;
26026 +#if REISER4_ZLIB
26027 + ret = 0;
26028 +#endif
26029 + if (ret == -EINVAL)
26030 + warning("edward-1337", "Zlib not compiled into kernel");
26031 + return ret;
26032 +}
26033 +
26034 +static int gzip1_overrun(unsigned src_len UNUSED_ARG)
26035 +{
26036 + return 0;
26037 +}
26038 +
26039 +static coa_t gzip1_alloc(tfm_action act)
26040 +{
26041 + coa_t coa = NULL;
26042 +#if REISER4_ZLIB
26043 + int ret = 0;
26044 + switch (act) {
26045 + case TFMA_WRITE: /* compress */
26046 + coa = reiser4_vmalloc(zlib_deflate_workspacesize());
26047 + if (!coa) {
26048 + ret = -ENOMEM;
26049 + break;
26050 + }
26051 + memset(coa, 0, zlib_deflate_workspacesize());
26052 + break;
26053 + case TFMA_READ: /* decompress */
26054 + coa = reiser4_vmalloc(zlib_inflate_workspacesize());
26055 + if (!coa) {
26056 + ret = -ENOMEM;
26057 + break;
26058 + }
26059 + memset(coa, 0, zlib_inflate_workspacesize());
26060 + break;
26061 + default:
26062 + impossible("edward-767",
26063 + "trying to alloc workspace for unknown tfm action");
26064 + }
26065 + if (ret) {
26066 + warning("edward-768",
26067 + "alloc workspace for gzip1 (tfm action = %d) failed\n",
26068 + act);
26069 + return ERR_PTR(ret);
26070 + }
26071 +#endif
26072 + return coa;
26073 +}
26074 +
26075 +static void gzip1_free(coa_t coa, tfm_action act)
26076 +{
26077 + assert("edward-769", coa != NULL);
26078 +
26079 + switch (act) {
26080 + case TFMA_WRITE: /* compress */
26081 + vfree(coa);
26082 + break;
26083 + case TFMA_READ: /* decompress */
26084 + vfree(coa);
26085 + break;
26086 + default:
26087 + impossible("edward-770", "unknown tfm action");
26088 + }
26089 + return;
26090 +}
26091 +
26092 +static int gzip1_min_size_deflate(void)
26093 +{
26094 + return 64;
26095 +}
26096 +
26097 +static void
26098 +gzip1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
26099 + __u8 * dst_first, unsigned *dst_len)
26100 +{
26101 +#if REISER4_ZLIB
26102 + int ret = 0;
26103 + struct z_stream_s stream;
26104 +
26105 + memset(&stream, 0, sizeof(stream));
26106 +
26107 + assert("edward-842", coa != NULL);
26108 + assert("edward-875", src_len != 0);
26109 +
26110 + stream.workspace = coa;
26111 + ret = zlib_deflateInit2(&stream, GZIP1_DEF_LEVEL, Z_DEFLATED,
26112 + -GZIP1_DEF_WINBITS, GZIP1_DEF_MEMLEVEL,
26113 + Z_DEFAULT_STRATEGY);
26114 + if (ret != Z_OK) {
26115 + warning("edward-771", "zlib_deflateInit2 returned %d\n", ret);
26116 + goto rollback;
26117 + }
26118 + ret = zlib_deflateReset(&stream);
26119 + if (ret != Z_OK) {
26120 + warning("edward-772", "zlib_deflateReset returned %d\n", ret);
26121 + goto rollback;
26122 + }
26123 + stream.next_in = src_first;
26124 + stream.avail_in = src_len;
26125 + stream.next_out = dst_first;
26126 + stream.avail_out = *dst_len;
26127 +
26128 + ret = zlib_deflate(&stream, Z_FINISH);
26129 + if (ret != Z_STREAM_END) {
26130 + if (ret != Z_OK)
26131 + warning("edward-773",
26132 + "zlib_deflate returned %d\n", ret);
26133 + goto rollback;
26134 + }
26135 + *dst_len = stream.total_out;
26136 + return;
26137 + rollback:
26138 + *dst_len = src_len;
26139 +#endif
26140 + return;
26141 +}
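+
+/* Note: on any failure above, the rollback path leaves *dst_len == src_len,
+   which the caller apparently interprets as "incompressible, keep the data
+   raw" (lzo1_compress below signals failure the same way) */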
26142 +
26143 +static void
26144 +gzip1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
26145 + __u8 * dst_first, unsigned *dst_len)
26146 +{
26147 +#if REISER4_ZLIB
26148 + int ret = 0;
26149 + struct z_stream_s stream;
26150 +
26151 + memset(&stream, 0, sizeof(stream));
26152 +
26153 + assert("edward-843", coa != NULL);
26154 + assert("edward-876", src_len != 0);
26155 +
26156 + stream.workspace = coa;
26157 + ret = zlib_inflateInit2(&stream, -GZIP1_DEF_WINBITS);
26158 + if (ret != Z_OK) {
26159 + warning("edward-774", "zlib_inflateInit2 returned %d\n", ret);
26160 + return;
26161 + }
26162 + ret = zlib_inflateReset(&stream);
26163 + if (ret != Z_OK) {
26164 + warning("edward-775", "zlib_inflateReset returned %d\n", ret);
26165 + return;
26166 + }
26167 +
26168 + stream.next_in = src_first;
26169 + stream.avail_in = src_len;
26170 + stream.next_out = dst_first;
26171 + stream.avail_out = *dst_len;
26172 +
26173 + ret = zlib_inflate(&stream, Z_SYNC_FLUSH);
26174 + /*
26175 + * Work around a bug in zlib, which sometimes wants to taste an extra
26176 + * byte when being used in the (undocumented) raw deflate mode.
26177 + * (From USAGI).
26178 + */
26179 + if (ret == Z_OK && !stream.avail_in && stream.avail_out) {
26180 + u8 zerostuff = 0;
26181 + stream.next_in = &zerostuff;
26182 + stream.avail_in = 1;
26183 + ret = zlib_inflate(&stream, Z_FINISH);
26184 + }
26185 + if (ret != Z_STREAM_END) {
26186 + warning("edward-776", "zlib_inflate returned %d\n", ret);
26187 + return;
26188 + }
26189 + *dst_len = stream.total_out;
26190 +#endif
26191 + return;
26192 +}
26193 +
26194 +/******************************************************************************/
26195 +/* lzo1 compression */
26196 +/******************************************************************************/
26197 +
26198 +static int lzo1_init(void)
26199 +{
26200 + int ret;
26201 + ret = lzo_init();
26202 + if (ret != LZO_E_OK)
26203 + warning("edward-848", "lzo_init() failed with ret = %d\n", ret);
26204 + return ret;
26205 +}
26206 +
26207 +static int lzo1_overrun(unsigned in_len)
26208 +{
26209 + return in_len / 64 + 16 + 3;
26210 +}
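+
+/* e.g. for a 64K input cluster the reserve is 65536/64 + 16 + 3 == 1043
+   bytes, so a destination buffer sized src_len plus this reserve can
+   absorb the worst-case expansion of incompressible data */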
26211 +
26212 +#define LZO_HEAP_SIZE(size) \
26213 + sizeof(lzo_align_t) * (((size) + (sizeof(lzo_align_t) - 1)) / sizeof(lzo_align_t))
26214 +
26215 +static coa_t lzo1_alloc(tfm_action act)
26216 +{
26217 + int ret = 0;
26218 + coa_t coa = NULL;
26219 +
26220 + switch (act) {
26221 + case TFMA_WRITE: /* compress */
26222 + coa = reiser4_vmalloc(LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
26223 + if (!coa) {
26224 + ret = -ENOMEM;
26225 + break;
26226 + }
26227 +		memset(coa, 0, LZO_HEAP_SIZE(LZO1X_1_MEM_COMPRESS));
+		/* fall through */
26228 +	case TFMA_READ:	/* decompress: no workspace needed */
26229 + break;
26230 + default:
26231 + impossible("edward-877",
26232 + "trying to alloc workspace for unknown tfm action");
26233 + }
26234 + if (ret) {
26235 + warning("edward-878",
26236 + "alloc workspace for lzo1 (tfm action = %d) failed\n",
26237 + act);
26238 + return ERR_PTR(ret);
26239 + }
26240 + return coa;
26241 +}
26242 +
26243 +static void lzo1_free(coa_t coa, tfm_action act)
26244 +{
26245 + assert("edward-879", coa != NULL);
26246 +
26247 + switch (act) {
26248 + case TFMA_WRITE: /* compress */
26249 + vfree(coa);
26250 + break;
26251 +	case TFMA_READ:	/* decompress */
26252 +		impossible("edward-1304",
26253 +			   "trying to free non-allocated workspace");
+		break;
26254 +	default:
26255 +		impossible("edward-880", "unknown tfm action");
26256 + }
26257 + return;
26258 +}
26259 +
26260 +static int lzo1_min_size_deflate(void)
26261 +{
26262 + return 256;
26263 +}
26264 +
26265 +static void
26266 +lzo1_compress(coa_t coa, __u8 * src_first, unsigned src_len,
26267 + __u8 * dst_first, unsigned *dst_len)
26268 +{
26269 + int result;
26270 +
26271 + assert("edward-846", coa != NULL);
26272 + assert("edward-847", src_len != 0);
26273 +
26274 + result = lzo1x_1_compress(src_first, src_len, dst_first, dst_len, coa);
26275 + if (result != LZO_E_OK) {
26276 + warning("edward-849", "lzo1x_1_compress failed\n");
26277 + goto out;
26278 + }
26279 + if (*dst_len >= src_len) {
26280 + //warning("edward-850", "lzo1x_1_compress: incompressible data\n");
26281 + goto out;
26282 + }
26283 + return;
26284 + out:
26285 + *dst_len = src_len;
26286 + return;
26287 +}
26288 +
26289 +static void
26290 +lzo1_decompress(coa_t coa, __u8 * src_first, unsigned src_len,
26291 + __u8 * dst_first, unsigned *dst_len)
26292 +{
26293 + int result;
26294 +
26295 + assert("edward-851", coa == NULL);
26296 + assert("edward-852", src_len != 0);
26297 +
26298 + result = lzo1x_decompress(src_first, src_len, dst_first, dst_len, NULL);
26299 + if (result != LZO_E_OK)
26300 + warning("edward-853", "lzo1x_1_decompress failed\n");
26301 + return;
26302 +}
26303 +
26304 +compression_plugin compression_plugins[LAST_COMPRESSION_ID] = {
26305 + [LZO1_COMPRESSION_ID] = {
26306 + .h = {
26307 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26308 + .id = LZO1_COMPRESSION_ID,
26309 + .pops = &compression_plugin_ops,
26310 + .label = "lzo1",
26311 + .desc = "lzo1 compression transform",
26312 + .linkage = {NULL, NULL}
26313 + },
26314 + .init = lzo1_init,
26315 + .overrun = lzo1_overrun,
26316 + .alloc = lzo1_alloc,
26317 + .free = lzo1_free,
26318 + .min_size_deflate = lzo1_min_size_deflate,
26319 + .checksum = reiser4_adler32,
26320 + .compress = lzo1_compress,
26321 + .decompress = lzo1_decompress
26322 + },
26323 + [GZIP1_COMPRESSION_ID] = {
26324 + .h = {
26325 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
26326 + .id = GZIP1_COMPRESSION_ID,
26327 + .pops = &compression_plugin_ops,
26328 + .label = "gzip1",
26329 + .desc = "gzip1 compression transform",
26330 + .linkage = {NULL, NULL}
26331 + },
26332 + .init = gzip1_init,
26333 + .overrun = gzip1_overrun,
26334 + .alloc = gzip1_alloc,
26335 + .free = gzip1_free,
26336 + .min_size_deflate = gzip1_min_size_deflate,
26337 + .checksum = reiser4_adler32,
26338 + .compress = gzip1_compress,
26339 + .decompress = gzip1_decompress
26340 + }
26341 +};
26342 +
26343 +/*
26344 + Local variables:
26345 + c-indentation-style: "K&R"
26346 + mode-name: "LC"
26347 + c-basic-offset: 8
26348 + tab-width: 8
26349 + fill-column: 120
26350 + scroll-step: 1
26351 + End:
26352 +*/
26353 diff --git a/fs/reiser4/plugin/compress/compress.h b/fs/reiser4/plugin/compress/compress.h
26354 new file mode 100644
26355 index 0000000..922ca0b
26356 --- /dev/null
26357 +++ b/fs/reiser4/plugin/compress/compress.h
26358 @@ -0,0 +1,38 @@
26359 +#if !defined( __FS_REISER4_COMPRESS_H__ )
26360 +#define __FS_REISER4_COMPRESS_H__
26361 +
26362 +#include <linux/types.h>
26363 +#include <linux/string.h>
26364 +
26365 +typedef enum {
26366 + TFMA_READ,
26367 + TFMA_WRITE,
26368 + TFMA_LAST
26369 +} tfm_action;
26370 +
26371 +/* builtin compression plugins */
26372 +
26373 +typedef enum {
26374 + LZO1_COMPRESSION_ID,
26375 + GZIP1_COMPRESSION_ID,
26376 + LAST_COMPRESSION_ID,
26377 +} reiser4_compression_id;
26378 +
26379 +typedef unsigned long cloff_t;
26380 +typedef void *coa_t;
26381 +typedef coa_t coa_set[LAST_COMPRESSION_ID][TFMA_LAST];
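+
+/* coa_set is a table of transform workspaces indexed by (compression
+   plugin, transform direction); e.g. coa_set[LZO1_COMPRESSION_ID][TFMA_WRITE]
+   would hold the workspace returned by the lzo1 plugin's ->alloc(TFMA_WRITE) */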
26382 +
26383 +__u32 reiser4_adler32(char *data, __u32 len);
26384 +
26385 +#endif /* __FS_REISER4_COMPRESS_H__ */
26386 +
26387 +/* Make Linus happy.
26388 + Local variables:
26389 + c-indentation-style: "K&R"
26390 + mode-name: "LC"
26391 + c-basic-offset: 8
26392 + tab-width: 8
26393 + fill-column: 120
26394 + scroll-step: 1
26395 + End:
26396 +*/
26397 diff --git a/fs/reiser4/plugin/compress/compress_mode.c b/fs/reiser4/plugin/compress/compress_mode.c
26398 new file mode 100644
26399 index 0000000..2ae7856
26400 --- /dev/null
26401 +++ b/fs/reiser4/plugin/compress/compress_mode.c
26402 @@ -0,0 +1,162 @@
26403 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
26404 +/* This file contains Reiser4 compression mode plugins.
26405 +
26406 +   A compression mode plugin is a set of handlers called by the compressor
26407 +   at flush time; it represents heuristics, including ones intended to
26408 +   avoid compressing incompressible data. See
26409 + http://www.namesys.com/cryptcompress_design.html for more details.
26410 +*/
26411 +#include "../../inode.h"
26412 +#include "../plugin.h"
26413 +
26414 +static int should_deflate_none(struct inode * inode, cloff_t index)
26415 +{
26416 + return 0;
26417 +}
26418 +
26419 +static int should_deflate_common(struct inode * inode, cloff_t index)
26420 +{
26421 + return compression_is_on(cryptcompress_inode_data(inode));
26422 +}
26423 +
26424 +static int discard_hook_ultim(struct inode *inode, cloff_t index)
26425 +{
26426 + turn_off_compression(cryptcompress_inode_data(inode));
26427 + return 0;
26428 +}
26429 +
26430 +static int discard_hook_lattd(struct inode *inode, cloff_t index)
26431 +{
26432 + cryptcompress_info_t * info = cryptcompress_inode_data(inode);
26433 +
26434 + assert("edward-1462",
26435 + get_lattice_factor(info) >= MIN_LATTICE_FACTOR &&
26436 + get_lattice_factor(info) <= MAX_LATTICE_FACTOR);
26437 +
26438 + turn_off_compression(info);
26439 + if (get_lattice_factor(info) < MAX_LATTICE_FACTOR)
26440 + set_lattice_factor(info, get_lattice_factor(info) << 1);
26441 + return 0;
26442 +}
26443 +
26444 +static int accept_hook_lattd(struct inode *inode, cloff_t index)
26445 +{
26446 + turn_on_compression(cryptcompress_inode_data(inode));
26447 + set_lattice_factor(cryptcompress_inode_data(inode), MIN_LATTICE_FACTOR);
26448 + return 0;
26449 +}
26450 +
26451 +/* "Check on dynamic lattice" is the adaptive compression mode which
26452 +   defines the following behavior:
26453 +
26454 +   Compression is on: try to compress everything, and turn
26455 +   it off whenever a cluster turns out to be incompressible.
26456 +
26457 +   Compression is off: try to compress clusters of indexes
26458 +   k * FACTOR (k = 0, 1, 2, ...) and turn it back on if any of
26459 +   them is compressible; if not, increase FACTOR */
26460 +
26461 +/* check if @index belongs to the one-dimensional lattice
26462 +   of sparse factor @factor */
26463 +static int is_on_lattice(cloff_t index, int factor)
26464 +{
26465 +	return (factor ? index % factor == 0 : index == 0);
26466 +}
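+
+/* e.g. with a lattice factor of 32, only clusters 0, 32, 64, ... are
+   trial-compressed while compression is off; every failed trial doubles
+   the factor (see discard_hook_lattd() above) up to MAX_LATTICE_FACTOR,
+   and any success resets it to MIN_LATTICE_FACTOR and turns compression
+   back on */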
26467 +
26468 +static int should_deflate_lattd(struct inode * inode, cloff_t index)
26469 +{
26470 + return should_deflate_common(inode, index) ||
26471 + is_on_lattice(index,
26472 + get_lattice_factor
26473 + (cryptcompress_inode_data(inode)));
26474 +}
26475 +
26476 +/* compression mode_plugins */
26477 +compression_mode_plugin compression_mode_plugins[LAST_COMPRESSION_MODE_ID] = {
26478 + [NONE_COMPRESSION_MODE_ID] = {
26479 + .h = {
26480 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26481 + .id = NONE_COMPRESSION_MODE_ID,
26482 + .pops = NULL,
26483 + .label = "none",
26484 + .desc = "Compress nothing",
26485 + .linkage = {NULL, NULL}
26486 + },
26487 + .should_deflate = should_deflate_none,
26488 + .accept_hook = NULL,
26489 + .discard_hook = NULL
26490 + },
26491 + /* Check-on-dynamic-lattice adaptive compression mode */
26492 + [LATTD_COMPRESSION_MODE_ID] = {
26493 + .h = {
26494 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26495 + .id = LATTD_COMPRESSION_MODE_ID,
26496 + .pops = NULL,
26497 + .label = "lattd",
26498 + .desc = "Check on dynamic lattice",
26499 + .linkage = {NULL, NULL}
26500 + },
26501 + .should_deflate = should_deflate_lattd,
26502 + .accept_hook = accept_hook_lattd,
26503 + .discard_hook = discard_hook_lattd
26504 + },
26505 + /* Check-ultimately compression mode:
26506 + Turn off compression forever as soon as we meet
26507 + incompressible data */
26508 + [ULTIM_COMPRESSION_MODE_ID] = {
26509 + .h = {
26510 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26511 + .id = ULTIM_COMPRESSION_MODE_ID,
26512 + .pops = NULL,
26513 + .label = "ultim",
26514 + .desc = "Check ultimately",
26515 + .linkage = {NULL, NULL}
26516 + },
26517 + .should_deflate = should_deflate_common,
26518 + .accept_hook = NULL,
26519 + .discard_hook = discard_hook_ultim
26520 + },
26521 + /* Force-to-compress-everything compression mode */
26522 + [FORCE_COMPRESSION_MODE_ID] = {
26523 + .h = {
26524 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26525 + .id = FORCE_COMPRESSION_MODE_ID,
26526 + .pops = NULL,
26527 + .label = "force",
26528 + .desc = "Force to compress everything",
26529 + .linkage = {NULL, NULL}
26530 + },
26531 + .should_deflate = NULL,
26532 + .accept_hook = NULL,
26533 + .discard_hook = NULL
26534 + },
26535 + /* Convert-to-extent compression mode.
26536 + In this mode items will be converted to extents and management
26537 + will be passed to (classic) unix file plugin as soon as ->write()
26538 + detects that the first complete logical cluster (of index #0) is
26539 + incompressible. */
26540 + [CONVX_COMPRESSION_MODE_ID] = {
26541 + .h = {
26542 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
26543 + .id = CONVX_COMPRESSION_MODE_ID,
26544 + .pops = NULL,
26545 + .label = "conv",
26546 + .desc = "Convert to extent",
26547 + .linkage = {NULL, NULL}
26548 + },
26549 + .should_deflate = should_deflate_common,
26550 + .accept_hook = NULL,
26551 + .discard_hook = NULL
26552 + }
26553 +};
26554 +
26555 +/*
26556 + Local variables:
26557 + c-indentation-style: "K&R"
26558 + mode-name: "LC"
26559 + c-basic-offset: 8
26560 + tab-width: 8
26561 + fill-column: 120
26562 + scroll-step: 1
26563 + End:
26564 +*/
26565 diff --git a/fs/reiser4/plugin/compress/lzoconf.h b/fs/reiser4/plugin/compress/lzoconf.h
26566 new file mode 100644
26567 index 0000000..cc0fa4d
26568 --- /dev/null
26569 +++ b/fs/reiser4/plugin/compress/lzoconf.h
26570 @@ -0,0 +1,216 @@
26571 +/* lzoconf.h -- configuration for the LZO real-time data compression library
26572 + adopted for reiser4 compression transform plugin.
26573 +
26574 + This file is part of the LZO real-time data compression library
26575 + and not included in any proprietary licenses of reiser4.
26576 +
26577 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26578 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26579 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26580 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26581 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26582 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26583 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26584 + All Rights Reserved.
26585 +
26586 + The LZO library is free software; you can redistribute it and/or
26587 + modify it under the terms of the GNU General Public License as
26588 + published by the Free Software Foundation; either version 2 of
26589 + the License, or (at your option) any later version.
26590 +
26591 + The LZO library is distributed in the hope that it will be useful,
26592 + but WITHOUT ANY WARRANTY; without even the implied warranty of
26593 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26594 + GNU General Public License for more details.
26595 +
26596 + You should have received a copy of the GNU General Public License
26597 + along with the LZO library; see the file COPYING.
26598 + If not, write to the Free Software Foundation, Inc.,
26599 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26600 +
26601 + Markus F.X.J. Oberhumer
26602 + <markus@oberhumer.com>
26603 + http://www.oberhumer.com/opensource/lzo/
26604 + */
26605 +
26606 +#include <linux/kernel.h> /* for UINT_MAX, ULONG_MAX - edward */
26607 +
26608 +#ifndef __LZOCONF_H
26609 +#define __LZOCONF_H
26610 +
26611 +#define LZO_VERSION 0x1080
26612 +#define LZO_VERSION_STRING "1.08"
26613 +#define LZO_VERSION_DATE "Jul 12 2002"
26614 +
26615 +/* internal Autoconf configuration file - only used when building LZO */
26616 +
26617 +/***********************************************************************
26618 +// LZO requires a conforming <limits.h>
26619 +************************************************************************/
26620 +
26621 +#define CHAR_BIT 8
26622 +#define USHRT_MAX 0xffff
26623 +
26624 +/* workaround a cpp bug under hpux 10.20 */
26625 +#define LZO_0xffffffffL 4294967295ul
26626 +
26627 +/***********************************************************************
26628 +// architecture defines
26629 +************************************************************************/
26630 +
26631 +#if !defined(__LZO_i386)
26632 +# if defined(__i386__) || defined(__386__) || defined(_M_IX86)
26633 +# define __LZO_i386
26634 +# endif
26635 +#endif
26636 +
26637 +/* memory checkers */
26638 +#if !defined(__LZO_CHECKER)
26639 +# if defined(__BOUNDS_CHECKING_ON)
26640 +# define __LZO_CHECKER
26641 +# elif defined(__CHECKER__)
26642 +# define __LZO_CHECKER
26643 +# elif defined(__INSURE__)
26644 +# define __LZO_CHECKER
26645 +# elif defined(__PURIFY__)
26646 +# define __LZO_CHECKER
26647 +# endif
26648 +#endif
26649 +
26650 +/***********************************************************************
26651 +// integral and pointer types
26652 +************************************************************************/
26653 +
26654 +/* Integral types with 32 bits or more */
26655 +#if !defined(LZO_UINT32_MAX)
26656 +# if (UINT_MAX >= LZO_0xffffffffL)
26657 + typedef unsigned int lzo_uint32;
26658 + typedef int lzo_int32;
26659 +# define LZO_UINT32_MAX UINT_MAX
26660 +# define LZO_INT32_MAX INT_MAX
26661 +# define LZO_INT32_MIN INT_MIN
26662 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26663 + typedef unsigned long lzo_uint32;
26664 + typedef long lzo_int32;
26665 +# define LZO_UINT32_MAX ULONG_MAX
26666 +# define LZO_INT32_MAX LONG_MAX
26667 +# define LZO_INT32_MIN LONG_MIN
26668 +# else
26669 +# error "lzo_uint32"
26670 +# endif
26671 +#endif
26672 +
26673 +/* lzo_uint is used like size_t */
26674 +#if !defined(LZO_UINT_MAX)
26675 +# if (UINT_MAX >= LZO_0xffffffffL)
26676 + typedef unsigned int lzo_uint;
26677 + typedef int lzo_int;
26678 +# define LZO_UINT_MAX UINT_MAX
26679 +# define LZO_INT_MAX INT_MAX
26680 +# define LZO_INT_MIN INT_MIN
26681 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26682 + typedef unsigned long lzo_uint;
26683 + typedef long lzo_int;
26684 +# define LZO_UINT_MAX ULONG_MAX
26685 +# define LZO_INT_MAX LONG_MAX
26686 +# define LZO_INT_MIN LONG_MIN
26687 +# else
26688 +# error "lzo_uint"
26689 +# endif
26690 +#endif
26691 +
26692 + typedef int lzo_bool;
26693 +
26694 +/***********************************************************************
26695 +// memory models
26696 +************************************************************************/
26697 +
26698 +/* Memory model that allows accessing memory at offsets of lzo_uint. */
26699 +#if !defined(__LZO_MMODEL)
26700 +# if (LZO_UINT_MAX <= UINT_MAX)
26701 +# define __LZO_MMODEL
26702 +# else
26703 +# error "__LZO_MMODEL"
26704 +# endif
26705 +#endif
26706 +
26707 +/* no typedef here because of const-pointer issues */
26708 +#define lzo_byte unsigned char __LZO_MMODEL
26709 +#define lzo_bytep unsigned char __LZO_MMODEL *
26710 +#define lzo_charp char __LZO_MMODEL *
26711 +#define lzo_voidp void __LZO_MMODEL *
26712 +#define lzo_shortp short __LZO_MMODEL *
26713 +#define lzo_ushortp unsigned short __LZO_MMODEL *
26714 +#define lzo_uint32p lzo_uint32 __LZO_MMODEL *
26715 +#define lzo_int32p lzo_int32 __LZO_MMODEL *
26716 +#define lzo_uintp lzo_uint __LZO_MMODEL *
26717 +#define lzo_intp lzo_int __LZO_MMODEL *
26718 +#define lzo_voidpp lzo_voidp __LZO_MMODEL *
26719 +#define lzo_bytepp lzo_bytep __LZO_MMODEL *
26720 +
26721 +#ifndef lzo_sizeof_dict_t
26722 +# define lzo_sizeof_dict_t sizeof(lzo_bytep)
26723 +#endif
26724 +
26725 +typedef int (*lzo_compress_t) (const lzo_byte * src, lzo_uint src_len,
26726 + lzo_byte * dst, lzo_uintp dst_len,
26727 + lzo_voidp wrkmem);
26728 +
26729 +
26730 +/***********************************************************************
26731 +// error codes and prototypes
26732 +************************************************************************/
26733 +
26734 +/* Error codes for the compression/decompression functions. Negative
26735 + * values are errors, positive values will be used for special but
26736 + * normal events.
26737 + */
26738 +#define LZO_E_OK 0
26739 +#define LZO_E_ERROR (-1)
26740 +#define LZO_E_OUT_OF_MEMORY (-2) /* not used right now */
26741 +#define LZO_E_NOT_COMPRESSIBLE (-3) /* not used right now */
26742 +#define LZO_E_INPUT_OVERRUN (-4)
26743 +#define LZO_E_OUTPUT_OVERRUN (-5)
26744 +#define LZO_E_LOOKBEHIND_OVERRUN (-6)
26745 +#define LZO_E_EOF_NOT_FOUND (-7)
26746 +#define LZO_E_INPUT_NOT_CONSUMED (-8)
26747 +
26748 +/* lzo_init() should be the first function you call.
26749 + * Check the return code!
26750 + *
26751 + * lzo_init() is a macro to allow checking that the library and the
26752 + * compiler's view of various types are consistent.
26753 + */
26754 +#define lzo_init() __lzo_init2(LZO_VERSION,(int)sizeof(short),(int)sizeof(int),\
26755 + (int)sizeof(long),(int)sizeof(lzo_uint32),(int)sizeof(lzo_uint),\
26756 + (int)lzo_sizeof_dict_t,(int)sizeof(char *),(int)sizeof(lzo_voidp),\
26757 + (int)sizeof(lzo_compress_t))
26758 + extern int __lzo_init2(unsigned, int, int, int, int, int, int,
26759 + int, int, int);
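+
+/* usage, as in lzo1_init() in compress.c above:
+
+	if (lzo_init() != LZO_E_OK)
+		the library and the compiler disagree on type sizes; bail out
+ */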
26760 +
26761 +/* checksum functions */
26762 +extern lzo_uint32 lzo_crc32(lzo_uint32 _c, const lzo_byte * _buf,
26763 + lzo_uint _len);
26764 +/* misc. */
26765 + typedef union {
26766 + lzo_bytep p;
26767 + lzo_uint u;
26768 + } __lzo_pu_u;
26769 + typedef union {
26770 + lzo_bytep p;
26771 + lzo_uint32 u32;
26772 + } __lzo_pu32_u;
26773 + typedef union {
26774 + void *vp;
26775 + lzo_bytep bp;
26776 + lzo_uint32 u32;
26777 + long l;
26778 + } lzo_align_t;
26779 +
26780 +#define LZO_PTR_ALIGN_UP(_ptr,_size) \
26781 + ((_ptr) + (lzo_uint) __lzo_align_gap((const lzo_voidp)(_ptr),(lzo_uint)(_size)))
26782 +
26783 +/* deprecated - only for backward compatibility */
26784 +#define LZO_ALIGN(_ptr,_size) LZO_PTR_ALIGN_UP(_ptr,_size)
26785 +
26786 +#endif /* already included */
26787 diff --git a/fs/reiser4/plugin/compress/minilzo.c b/fs/reiser4/plugin/compress/minilzo.c
26788 new file mode 100644
26789 index 0000000..2dba187
26790 --- /dev/null
26791 +++ b/fs/reiser4/plugin/compress/minilzo.c
26792 @@ -0,0 +1,1967 @@
26793 +/* minilzo.c -- mini subset of the LZO real-time data compression library
26794 + adopted for reiser4 compression transform plugin.
26795 +
26796 + This file is part of the LZO real-time data compression library
26797 + and not included in any proprietary licenses of reiser4.
26798 +
26799 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
26800 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
26801 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
26802 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
26803 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
26804 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
26805 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
26806 + All Rights Reserved.
26807 +
26808 + The LZO library is free software; you can redistribute it and/or
26809 + modify it under the terms of the GNU General Public License as
26810 + published by the Free Software Foundation; either version 2 of
26811 + the License, or (at your option) any later version.
26812 +
26813 + The LZO library is distributed in the hope that it will be useful,
26814 + but WITHOUT ANY WARRANTY; without even the implied warranty of
26815 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26816 + GNU General Public License for more details.
26817 +
26818 + You should have received a copy of the GNU General Public License
26819 + along with the LZO library; see the file COPYING.
26820 + If not, write to the Free Software Foundation, Inc.,
26821 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26822 +
26823 + Markus F.X.J. Oberhumer
26824 + <markus@oberhumer.com>
26825 + http://www.oberhumer.com/opensource/lzo/
26826 + */
26827 +
26828 +/*
26829 + * NOTE:
26830 + * the full LZO package can be found at
26831 + * http://www.oberhumer.com/opensource/lzo/
26832 + */
26833 +
26834 +#include "../../debug.h" /* for reiser4 assert macro -edward */
26835 +
26836 +#define __LZO_IN_MINILZO
26837 +#define LZO_BUILD
26838 +
26839 +#include "minilzo.h"
26840 +
26841 +#if !defined(MINILZO_VERSION) || (MINILZO_VERSION != 0x1080)
26842 +# error "version mismatch in miniLZO source files"
26843 +#endif
26844 +
26845 +#ifndef __LZO_CONF_H
26846 +#define __LZO_CONF_H
26847 +
26848 +# define BOUNDS_CHECKING_OFF_DURING(stmt) stmt
26849 +# define BOUNDS_CHECKING_OFF_IN_EXPR(expr) (expr)
26850 +
26851 +# define HAVE_MEMCMP
26852 +# define HAVE_MEMCPY
26853 +# define HAVE_MEMMOVE
26854 +# define HAVE_MEMSET
26855 +
26856 +#undef NDEBUG
26857 +#if !defined(LZO_DEBUG)
26858 +# define NDEBUG
26859 +#endif
26860 +#if defined(LZO_DEBUG) || !defined(NDEBUG)
26861 +# if !defined(NO_STDIO_H)
26862 +# include <stdio.h>
26863 +# endif
26864 +#endif
26865 +
26866 +#if !defined(LZO_COMPILE_TIME_ASSERT)
26867 +# define LZO_COMPILE_TIME_ASSERT(expr) \
26868 + { typedef int __lzo_compile_time_assert_fail[1 - 2 * !(expr)]; }
26869 +#endif
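+
+/* the classic negative-array-size trick: if expr is false, the typedef
+   declares an array of size -1, which any C compiler must reject, so the
+   assertion fails at compile time with no runtime cost */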
26870 +
26871 +#if !defined(LZO_UNUSED)
26872 +# if 1
26873 +# define LZO_UNUSED(var) ((void)&var)
26874 +# elif 0
26875 +# define LZO_UNUSED(var) { typedef int __lzo_unused[sizeof(var) ? 2 : 1]; }
26876 +# else
26877 +# define LZO_UNUSED(parm) (parm = parm)
26878 +# endif
26879 +#endif
26880 +
26881 +#if defined(NO_MEMCMP)
26882 +# undef HAVE_MEMCMP
26883 +#endif
26884 +
26885 +#if !defined(HAVE_MEMSET)
26886 +# undef memset
26887 +# define memset lzo_memset
26888 +#endif
26889 +
26890 +# define LZO_BYTE(x) ((unsigned char) ((x) & 0xff))
26891 +
26892 +#define LZO_MAX(a,b) ((a) >= (b) ? (a) : (b))
26893 +#define LZO_MIN(a,b) ((a) <= (b) ? (a) : (b))
26894 +#define LZO_MAX3(a,b,c) ((a) >= (b) ? LZO_MAX(a,c) : LZO_MAX(b,c))
26895 +#define LZO_MIN3(a,b,c) ((a) <= (b) ? LZO_MIN(a,c) : LZO_MIN(b,c))
26896 +
26897 +#define lzo_sizeof(type) ((lzo_uint) (sizeof(type)))
26898 +
26899 +#define LZO_HIGH(array) ((lzo_uint) (sizeof(array)/sizeof(*(array))))
26900 +
26901 +#define LZO_SIZE(bits) (1u << (bits))
26902 +#define LZO_MASK(bits) (LZO_SIZE(bits) - 1)
26903 +
26904 +#define LZO_LSIZE(bits) (1ul << (bits))
26905 +#define LZO_LMASK(bits) (LZO_LSIZE(bits) - 1)
26906 +
26907 +#define LZO_USIZE(bits) ((lzo_uint) 1 << (bits))
26908 +#define LZO_UMASK(bits) (LZO_USIZE(bits) - 1)
26909 +
26910 +#define LZO_STYPE_MAX(b) (((1l << (8*(b)-2)) - 1l) + (1l << (8*(b)-2)))
26911 +#define LZO_UTYPE_MAX(b) (((1ul << (8*(b)-1)) - 1ul) + (1ul << (8*(b)-1)))
26912 +
26913 +#if !defined(SIZEOF_UNSIGNED)
26914 +# if (UINT_MAX == 0xffff)
26915 +# define SIZEOF_UNSIGNED 2
26916 +# elif (UINT_MAX == LZO_0xffffffffL)
26917 +# define SIZEOF_UNSIGNED 4
26918 +# elif (UINT_MAX >= LZO_0xffffffffL)
26919 +# define SIZEOF_UNSIGNED 8
26920 +# else
26921 +# error "SIZEOF_UNSIGNED"
26922 +# endif
26923 +#endif
26924 +
26925 +#if !defined(SIZEOF_UNSIGNED_LONG)
26926 +# if (ULONG_MAX == LZO_0xffffffffL)
26927 +# define SIZEOF_UNSIGNED_LONG 4
26928 +# elif (ULONG_MAX >= LZO_0xffffffffL)
26929 +# define SIZEOF_UNSIGNED_LONG 8
26930 +# else
26931 +# error "SIZEOF_UNSIGNED_LONG"
26932 +# endif
26933 +#endif
26934 +
26935 +#if !defined(SIZEOF_SIZE_T)
26936 +# define SIZEOF_SIZE_T SIZEOF_UNSIGNED
26937 +#endif
26938 +#if !defined(SIZE_T_MAX)
26939 +# define SIZE_T_MAX LZO_UTYPE_MAX(SIZEOF_SIZE_T)
26940 +#endif
26941 +
26942 +#if 1 && defined(__LZO_i386) && (UINT_MAX == LZO_0xffffffffL)
26943 +# if !defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX == 0xffff)
26944 +# define LZO_UNALIGNED_OK_2
26945 +# endif
26946 +# if !defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX == LZO_0xffffffffL)
26947 +# define LZO_UNALIGNED_OK_4
26948 +# endif
26949 +#endif
26950 +
26951 +#if defined(LZO_UNALIGNED_OK_2) || defined(LZO_UNALIGNED_OK_4)
26952 +# if !defined(LZO_UNALIGNED_OK)
26953 +# define LZO_UNALIGNED_OK
26954 +# endif
26955 +#endif
26956 +
26957 +#if defined(__LZO_NO_UNALIGNED)
26958 +# undef LZO_UNALIGNED_OK
26959 +# undef LZO_UNALIGNED_OK_2
26960 +# undef LZO_UNALIGNED_OK_4
26961 +#endif
26962 +
26963 +#if defined(LZO_UNALIGNED_OK_2) && (USHRT_MAX != 0xffff)
26964 +# error "LZO_UNALIGNED_OK_2 must not be defined on this system"
26965 +#endif
26966 +#if defined(LZO_UNALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26967 +# error "LZO_UNALIGNED_OK_4 must not be defined on this system"
26968 +#endif
26969 +
26970 +#if defined(__LZO_NO_ALIGNED)
26971 +# undef LZO_ALIGNED_OK_4
26972 +#endif
26973 +
26974 +#if defined(LZO_ALIGNED_OK_4) && (LZO_UINT32_MAX != LZO_0xffffffffL)
26975 +# error "LZO_ALIGNED_OK_4 must not be defined on this system"
26976 +#endif
26977 +
26978 +#define LZO_LITTLE_ENDIAN 1234
26979 +#define LZO_BIG_ENDIAN 4321
26980 +#define LZO_PDP_ENDIAN 3412
26981 +
26982 +#if !defined(LZO_BYTE_ORDER)
26983 +# if defined(MFX_BYTE_ORDER)
26984 +# define LZO_BYTE_ORDER MFX_BYTE_ORDER
26985 +# elif defined(__LZO_i386)
26986 +# define LZO_BYTE_ORDER LZO_LITTLE_ENDIAN
26987 +# elif defined(BYTE_ORDER)
26988 +# define LZO_BYTE_ORDER BYTE_ORDER
26989 +# elif defined(__BYTE_ORDER)
26990 +# define LZO_BYTE_ORDER __BYTE_ORDER
26991 +# endif
26992 +#endif
26993 +
26994 +#if defined(LZO_BYTE_ORDER)
26995 +# if (LZO_BYTE_ORDER != LZO_LITTLE_ENDIAN) && \
26996 + (LZO_BYTE_ORDER != LZO_BIG_ENDIAN)
26997 +# error "invalid LZO_BYTE_ORDER"
26998 +# endif
26999 +#endif
27000 +
27001 +#if defined(LZO_UNALIGNED_OK) && !defined(LZO_BYTE_ORDER)
27002 +# error "LZO_BYTE_ORDER is not defined"
27003 +#endif
27004 +
27005 +#define LZO_OPTIMIZE_GNUC_i386_IS_BUGGY
27006 +
27007 +#if defined(NDEBUG) && !defined(LZO_DEBUG) && !defined(__LZO_CHECKER)
27008 +# if defined(__GNUC__) && defined(__i386__)
27009 +# if !defined(LZO_OPTIMIZE_GNUC_i386_IS_BUGGY)
27010 +# define LZO_OPTIMIZE_GNUC_i386
27011 +# endif
27012 +# endif
27013 +#endif
27014 +
27015 +extern const lzo_uint32 _lzo_crc32_table[256];
27016 +
27017 +#define _LZO_STRINGIZE(x) #x
27018 +#define _LZO_MEXPAND(x) _LZO_STRINGIZE(x)
27019 +
27020 +#define _LZO_CONCAT2(a,b) a ## b
27021 +#define _LZO_CONCAT3(a,b,c) a ## b ## c
27022 +#define _LZO_CONCAT4(a,b,c,d) a ## b ## c ## d
27023 +#define _LZO_CONCAT5(a,b,c,d,e) a ## b ## c ## d ## e
27024 +
27025 +#define _LZO_ECONCAT2(a,b) _LZO_CONCAT2(a,b)
27026 +#define _LZO_ECONCAT3(a,b,c) _LZO_CONCAT3(a,b,c)
27027 +#define _LZO_ECONCAT4(a,b,c,d) _LZO_CONCAT4(a,b,c,d)
27028 +#define _LZO_ECONCAT5(a,b,c,d,e) _LZO_CONCAT5(a,b,c,d,e)
27029 +
27030 +#ifndef __LZO_PTR_H
27031 +#define __LZO_PTR_H
27032 +
27033 +#if !defined(lzo_ptrdiff_t)
27034 +# if (UINT_MAX >= LZO_0xffffffffL)
27035 +typedef ptrdiff_t lzo_ptrdiff_t;
27036 +# else
27037 +typedef long lzo_ptrdiff_t;
27038 +# endif
27039 +#endif
27040 +
27041 +#if !defined(__LZO_HAVE_PTR_T)
27042 +# if defined(lzo_ptr_t)
27043 +# define __LZO_HAVE_PTR_T
27044 +# endif
27045 +#endif
27046 +#if !defined(__LZO_HAVE_PTR_T)
27047 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_LONG)
27048 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_LONG)
27049 +typedef unsigned long lzo_ptr_t;
27050 +typedef long lzo_sptr_t;
27051 +# define __LZO_HAVE_PTR_T
27052 +# endif
27053 +# endif
27054 +#endif
27055 +#if !defined(__LZO_HAVE_PTR_T)
27056 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED)
27057 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED)
27058 +typedef unsigned int lzo_ptr_t;
27059 +typedef int lzo_sptr_t;
27060 +# define __LZO_HAVE_PTR_T
27061 +# endif
27062 +# endif
27063 +#endif
27064 +#if !defined(__LZO_HAVE_PTR_T)
27065 +# if defined(SIZEOF_CHAR_P) && defined(SIZEOF_UNSIGNED_SHORT)
27066 +# if (SIZEOF_CHAR_P == SIZEOF_UNSIGNED_SHORT)
27067 +typedef unsigned short lzo_ptr_t;
27068 +typedef short lzo_sptr_t;
27069 +# define __LZO_HAVE_PTR_T
27070 +# endif
27071 +# endif
27072 +#endif
27073 +#if !defined(__LZO_HAVE_PTR_T)
27074 +# if defined(LZO_HAVE_CONFIG_H) || defined(SIZEOF_CHAR_P)
27075 +# error "no suitable type for lzo_ptr_t"
27076 +# else
27077 +typedef unsigned long lzo_ptr_t;
27078 +typedef long lzo_sptr_t;
27079 +# define __LZO_HAVE_PTR_T
27080 +# endif
27081 +#endif
27082 +
27083 +#define PTR(a) ((lzo_ptr_t) (a))
27084 +#define PTR_LINEAR(a) PTR(a)
27085 +#define PTR_ALIGNED_4(a) ((PTR_LINEAR(a) & 3) == 0)
27086 +#define PTR_ALIGNED_8(a) ((PTR_LINEAR(a) & 7) == 0)
27087 +#define PTR_ALIGNED2_4(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 3) == 0)
27088 +#define PTR_ALIGNED2_8(a,b) (((PTR_LINEAR(a) | PTR_LINEAR(b)) & 7) == 0)
27089 +
27090 +#define PTR_LT(a,b) (PTR(a) < PTR(b))
27091 +#define PTR_GE(a,b) (PTR(a) >= PTR(b))
27092 +#define PTR_DIFF(a,b) ((lzo_ptrdiff_t) (PTR(a) - PTR(b)))
27093 +#define pd(a,b) ((lzo_uint) ((a)-(b)))
27094 +
27095 +typedef union {
27096 + char a_char;
27097 + unsigned char a_uchar;
27098 + short a_short;
27099 + unsigned short a_ushort;
27100 + int a_int;
27101 + unsigned int a_uint;
27102 + long a_long;
27103 + unsigned long a_ulong;
27104 + lzo_int a_lzo_int;
27105 + lzo_uint a_lzo_uint;
27106 + lzo_int32 a_lzo_int32;
27107 + lzo_uint32 a_lzo_uint32;
27108 + ptrdiff_t a_ptrdiff_t;
27109 + lzo_ptrdiff_t a_lzo_ptrdiff_t;
27110 + lzo_ptr_t a_lzo_ptr_t;
27111 + lzo_voidp a_lzo_voidp;
27112 + void *a_void_p;
27113 + lzo_bytep a_lzo_bytep;
27114 + lzo_bytepp a_lzo_bytepp;
27115 + lzo_uintp a_lzo_uintp;
27116 + lzo_uint *a_lzo_uint_p;
27117 + lzo_uint32p a_lzo_uint32p;
27118 + lzo_uint32 *a_lzo_uint32_p;
27119 + unsigned char *a_uchar_p;
27120 + char *a_char_p;
27121 +} lzo_full_align_t;
27122 +
27123 +#endif
27124 +#define LZO_DETERMINISTIC
27125 +#define LZO_DICT_USE_PTR
27126 +# define lzo_dict_t const lzo_bytep
27127 +# define lzo_dict_p lzo_dict_t __LZO_MMODEL *
27128 +#if !defined(lzo_moff_t)
27129 +#define lzo_moff_t lzo_uint
27130 +#endif
27131 +#endif
27132 +static lzo_ptr_t __lzo_ptr_linear(const lzo_voidp ptr)
27133 +{
27134 + return PTR_LINEAR(ptr);
27135 +}
27136 +
27137 +static unsigned __lzo_align_gap(const lzo_voidp ptr, lzo_uint size)
27138 +{
27139 + lzo_ptr_t p, s, n;
27140 +
27141 + assert("lzo-01", size > 0);
27142 +
27143 + p = __lzo_ptr_linear(ptr);
27144 + s = (lzo_ptr_t) (size - 1);
27145 + n = (((p + s) / size) * size) - p;
27146 +
27147 + assert("lzo-02", (long)n >= 0);
27148 + assert("lzo-03", n <= s);
27149 +
27150 + return (unsigned)n;
27151 +}
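+
+/* e.g. __lzo_align_gap((const lzo_voidp) 0x1003, 4) == 1: one pad byte is
+   needed to reach the next 4-byte boundary at 0x1004 (this is what
+   LZO_PTR_ALIGN_UP in lzoconf.h relies on) */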
27152 +
27153 +#ifndef __LZO_UTIL_H
27154 +#define __LZO_UTIL_H
27155 +
27156 +#ifndef __LZO_CONF_H
27157 +#endif
27158 +
27159 +#if 1 && defined(HAVE_MEMCPY)
27160 +#define MEMCPY8_DS(dest,src,len) \
27161 + memcpy(dest,src,len); \
27162 + dest += len; \
27163 + src += len
27164 +#endif
27165 +
27166 +#if !defined(MEMCPY8_DS)
27167 +
27168 +#define MEMCPY8_DS(dest,src,len) \
27169 + { register lzo_uint __l = (len) / 8; \
27170 + do { \
27171 + *dest++ = *src++; \
27172 + *dest++ = *src++; \
27173 + *dest++ = *src++; \
27174 + *dest++ = *src++; \
27175 + *dest++ = *src++; \
27176 + *dest++ = *src++; \
27177 + *dest++ = *src++; \
27178 + *dest++ = *src++; \
27179 + } while (--__l > 0); }
27180 +
27181 +#endif
27182 +
27183 +#define MEMCPY_DS(dest,src,len) \
27184 + do *dest++ = *src++; \
27185 + while (--len > 0)
27186 +
27187 +#define MEMMOVE_DS(dest,src,len) \
27188 + do *dest++ = *src++; \
27189 + while (--len > 0)
27190 +
27191 +#if (LZO_UINT_MAX <= SIZE_T_MAX) && defined(HAVE_MEMSET)
27192 +
27193 +#define BZERO8_PTR(s,l,n) memset((s),0,(lzo_uint)(l)*(n))
27194 +
27195 +#else
27196 +
27197 +#define BZERO8_PTR(s,l,n) \
27198 + lzo_memset((lzo_voidp)(s),0,(lzo_uint)(l)*(n))
27199 +
27200 +#endif
27201 +#endif
27202 +
27203 +/* If you use the LZO library in a product, you *must* keep this
27204 + * copyright string in the executable of your product.
27205 + */
27206 +
27207 +static const lzo_byte __lzo_copyright[] =
27208 +#if !defined(__LZO_IN_MINILZO)
27209 + LZO_VERSION_STRING;
27210 +#else
27211 + "\n\n\n"
27212 + "LZO real-time data compression library.\n"
27213 + "Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer\n"
27214 + "<markus.oberhumer@jk.uni-linz.ac.at>\n"
27215 + "http://www.oberhumer.com/opensource/lzo/\n"
27216 + "\n"
27217 + "LZO version: v" LZO_VERSION_STRING ", " LZO_VERSION_DATE "\n"
27218 + "LZO build date: " __DATE__ " " __TIME__ "\n\n"
27219 + "LZO special compilation options:\n"
27220 +#ifdef __cplusplus
27221 + " __cplusplus\n"
27222 +#endif
27223 +#if defined(__PIC__)
27224 + " __PIC__\n"
27225 +#elif defined(__pic__)
27226 + " __pic__\n"
27227 +#endif
27228 +#if (UINT_MAX < LZO_0xffffffffL)
27229 + " 16BIT\n"
27230 +#endif
27231 +#if defined(__LZO_STRICT_16BIT)
27232 + " __LZO_STRICT_16BIT\n"
27233 +#endif
27234 +#if (UINT_MAX > LZO_0xffffffffL)
27235 + " UINT_MAX=" _LZO_MEXPAND(UINT_MAX) "\n"
27236 +#endif
27237 +#if (ULONG_MAX > LZO_0xffffffffL)
27238 + " ULONG_MAX=" _LZO_MEXPAND(ULONG_MAX) "\n"
27239 +#endif
27240 +#if defined(LZO_BYTE_ORDER)
27241 + " LZO_BYTE_ORDER=" _LZO_MEXPAND(LZO_BYTE_ORDER) "\n"
27242 +#endif
27243 +#if defined(LZO_UNALIGNED_OK_2)
27244 + " LZO_UNALIGNED_OK_2\n"
27245 +#endif
27246 +#if defined(LZO_UNALIGNED_OK_4)
27247 + " LZO_UNALIGNED_OK_4\n"
27248 +#endif
27249 +#if defined(LZO_ALIGNED_OK_4)
27250 + " LZO_ALIGNED_OK_4\n"
27251 +#endif
27252 +#if defined(LZO_DICT_USE_PTR)
27253 + " LZO_DICT_USE_PTR\n"
27254 +#endif
27255 +#if defined(__LZO_QUERY_COMPRESS)
27256 + " __LZO_QUERY_COMPRESS\n"
27257 +#endif
27258 +#if defined(__LZO_QUERY_DECOMPRESS)
27259 + " __LZO_QUERY_DECOMPRESS\n"
27260 +#endif
27261 +#if defined(__LZO_IN_MINILZO)
27262 + " __LZO_IN_MINILZO\n"
27263 +#endif
27264 + "\n\n" "$Id: LZO " LZO_VERSION_STRING " built " __DATE__ " " __TIME__
27265 +#if defined(__GNUC__) && defined(__VERSION__)
27266 + " by gcc " __VERSION__
27267 +#elif defined(__BORLANDC__)
27268 + " by Borland C " _LZO_MEXPAND(__BORLANDC__)
27269 +#elif defined(_MSC_VER)
27270 + " by Microsoft C " _LZO_MEXPAND(_MSC_VER)
27271 +#elif defined(__PUREC__)
27272 + " by Pure C " _LZO_MEXPAND(__PUREC__)
27273 +#elif defined(__SC__)
27274 + " by Symantec C " _LZO_MEXPAND(__SC__)
27275 +#elif defined(__TURBOC__)
27276 + " by Turbo C " _LZO_MEXPAND(__TURBOC__)
27277 +#elif defined(__WATCOMC__)
27278 + " by Watcom C " _LZO_MEXPAND(__WATCOMC__)
27279 +#endif
27280 + " $\n"
27281 + "$Copyright: LZO (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002 Markus Franz Xaver Johannes Oberhumer $\n";
27282 +#endif
27283 +
27284 +#define LZO_BASE 65521u
27285 +#define LZO_NMAX 5552
27286 +
27287 +#define LZO_DO1(buf,i) {s1 += buf[i]; s2 += s1;}
27288 +#define LZO_DO2(buf,i) LZO_DO1(buf,i); LZO_DO1(buf,i+1);
27289 +#define LZO_DO4(buf,i) LZO_DO2(buf,i); LZO_DO2(buf,i+2);
27290 +#define LZO_DO8(buf,i) LZO_DO4(buf,i); LZO_DO4(buf,i+4);
27291 +#define LZO_DO16(buf,i) LZO_DO8(buf,i); LZO_DO8(buf,i+8);
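+
+/* These unrolled steps compute the two Adler-32 running sums: s1 is the
+   byte sum and s2 the sum of the successive s1 values, both kept modulo
+   LZO_BASE (65521, the largest prime below 2^16); LZO_NMAX == 5552 is the
+   largest block length for which s2 cannot overflow 32 bits before the
+   modulo is applied */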
27292 +
27293 +# define IS_SIGNED(type) (((type) (-1)) < ((type) 0))
27294 +# define IS_UNSIGNED(type) (((type) (-1)) > ((type) 0))
27295 +
27296 +#define IS_POWER_OF_2(x) (((x) & ((x) - 1)) == 0)
27297 +
27298 +static lzo_bool schedule_insns_bug(void);
27299 +static lzo_bool strength_reduce_bug(int *);
27300 +
27301 +# define __lzo_assert(x) ((x) ? 1 : 0)
27302 +
27303 +#undef COMPILE_TIME_ASSERT
27304 +
27305 +# define COMPILE_TIME_ASSERT(expr) LZO_COMPILE_TIME_ASSERT(expr)
27306 +
27307 +static lzo_bool basic_integral_check(void)
27308 +{
27309 + lzo_bool r = 1;
27310 +
27311 + COMPILE_TIME_ASSERT(CHAR_BIT == 8);
27312 + COMPILE_TIME_ASSERT(sizeof(char) == 1);
27313 + COMPILE_TIME_ASSERT(sizeof(short) >= 2);
27314 + COMPILE_TIME_ASSERT(sizeof(long) >= 4);
27315 + COMPILE_TIME_ASSERT(sizeof(int) >= sizeof(short));
27316 + COMPILE_TIME_ASSERT(sizeof(long) >= sizeof(int));
27317 +
27318 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) == sizeof(lzo_int));
27319 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == sizeof(lzo_int32));
27320 +
27321 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= 4);
27322 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) >= sizeof(unsigned));
27323 +#if defined(__LZO_STRICT_16BIT)
27324 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) == 2);
27325 +#else
27326 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= 4);
27327 + COMPILE_TIME_ASSERT(sizeof(lzo_uint) >= sizeof(unsigned));
27328 +#endif
27329 +
27330 +#if (USHRT_MAX == 65535u)
27331 + COMPILE_TIME_ASSERT(sizeof(short) == 2);
27332 +#elif (USHRT_MAX == LZO_0xffffffffL)
27333 + COMPILE_TIME_ASSERT(sizeof(short) == 4);
27334 +#elif (USHRT_MAX >= LZO_0xffffffffL)
27335 + COMPILE_TIME_ASSERT(sizeof(short) > 4);
27336 +#endif
27337 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned char));
27338 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned short));
27339 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned));
27340 + COMPILE_TIME_ASSERT(IS_UNSIGNED(unsigned long));
27341 + COMPILE_TIME_ASSERT(IS_SIGNED(short));
27342 + COMPILE_TIME_ASSERT(IS_SIGNED(int));
27343 + COMPILE_TIME_ASSERT(IS_SIGNED(long));
27344 +
27345 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint32));
27346 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_uint));
27347 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int32));
27348 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_int));
27349 +
27350 + COMPILE_TIME_ASSERT(INT_MAX == LZO_STYPE_MAX(sizeof(int)));
27351 + COMPILE_TIME_ASSERT(UINT_MAX == LZO_UTYPE_MAX(sizeof(unsigned)));
27352 + COMPILE_TIME_ASSERT(LONG_MAX == LZO_STYPE_MAX(sizeof(long)));
27353 + COMPILE_TIME_ASSERT(ULONG_MAX == LZO_UTYPE_MAX(sizeof(unsigned long)));
27354 + COMPILE_TIME_ASSERT(USHRT_MAX == LZO_UTYPE_MAX(sizeof(unsigned short)));
27355 + COMPILE_TIME_ASSERT(LZO_UINT32_MAX ==
27356 + LZO_UTYPE_MAX(sizeof(lzo_uint32)));
27357 + COMPILE_TIME_ASSERT(LZO_UINT_MAX == LZO_UTYPE_MAX(sizeof(lzo_uint)));
27358 +
27359 + r &= __lzo_assert(LZO_BYTE(257) == 1);
27360 +
27361 + return r;
27362 +}
27363 +
27364 +static lzo_bool basic_ptr_check(void)
27365 +{
27366 + lzo_bool r = 1;
27367 +
27368 + COMPILE_TIME_ASSERT(sizeof(char *) >= sizeof(int));
27369 + COMPILE_TIME_ASSERT(sizeof(lzo_byte *) >= sizeof(char *));
27370 +
27371 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_byte *));
27372 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_voidpp));
27373 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) == sizeof(lzo_bytepp));
27374 + COMPILE_TIME_ASSERT(sizeof(lzo_voidp) >= sizeof(lzo_uint));
27375 +
27376 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_voidp));
27377 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) == sizeof(lzo_sptr_t));
27378 + COMPILE_TIME_ASSERT(sizeof(lzo_ptr_t) >= sizeof(lzo_uint));
27379 +
27380 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= 4);
27381 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(ptrdiff_t));
27382 +
27383 + COMPILE_TIME_ASSERT(sizeof(ptrdiff_t) >= sizeof(size_t));
27384 + COMPILE_TIME_ASSERT(sizeof(lzo_ptrdiff_t) >= sizeof(lzo_uint));
27385 +
27386 +#if defined(SIZEOF_CHAR_P)
27387 + COMPILE_TIME_ASSERT(SIZEOF_CHAR_P == sizeof(char *));
27388 +#endif
27389 +#if defined(SIZEOF_PTRDIFF_T)
27390 + COMPILE_TIME_ASSERT(SIZEOF_PTRDIFF_T == sizeof(ptrdiff_t));
27391 +#endif
27392 +
27393 + COMPILE_TIME_ASSERT(IS_SIGNED(ptrdiff_t));
27394 + COMPILE_TIME_ASSERT(IS_UNSIGNED(size_t));
27395 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_ptrdiff_t));
27396 + COMPILE_TIME_ASSERT(IS_SIGNED(lzo_sptr_t));
27397 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_ptr_t));
27398 + COMPILE_TIME_ASSERT(IS_UNSIGNED(lzo_moff_t));
27399 +
27400 + return r;
27401 +}
27402 +
27403 +static lzo_bool ptr_check(void)
27404 +{
27405 + lzo_bool r = 1;
27406 + int i;
27407 + char _wrkmem[10 * sizeof(lzo_byte *) + sizeof(lzo_full_align_t)];
27408 + lzo_bytep wrkmem;
27409 + lzo_bytepp dict;
27410 + unsigned char x[4 * sizeof(lzo_full_align_t)];
27411 + long d;
27412 + lzo_full_align_t a;
27413 + lzo_full_align_t u;
27414 +
27415 + for (i = 0; i < (int)sizeof(x); i++)
27416 + x[i] = LZO_BYTE(i);
27417 +
27418 + wrkmem =
27419 + LZO_PTR_ALIGN_UP((lzo_byte *) _wrkmem, sizeof(lzo_full_align_t));
27420 +
27421 + u.a_lzo_bytep = wrkmem;
27422 + dict = u.a_lzo_bytepp;
27423 +
27424 + d = (long)((const lzo_bytep)dict - (const lzo_bytep)_wrkmem);
27425 + r &= __lzo_assert(d >= 0);
27426 + r &= __lzo_assert(d < (long)sizeof(lzo_full_align_t));
27427 +
27428 + memset(&a, 0, sizeof(a));
27429 + r &= __lzo_assert(a.a_lzo_voidp == NULL);
27430 +
27431 + memset(&a, 0xff, sizeof(a));
27432 + r &= __lzo_assert(a.a_ushort == USHRT_MAX);
27433 + r &= __lzo_assert(a.a_uint == UINT_MAX);
27434 + r &= __lzo_assert(a.a_ulong == ULONG_MAX);
27435 + r &= __lzo_assert(a.a_lzo_uint == LZO_UINT_MAX);
27436 + r &= __lzo_assert(a.a_lzo_uint32 == LZO_UINT32_MAX);
27437 +
27438 + if (r == 1) {
27439 + for (i = 0; i < 8; i++)
27440 + r &= __lzo_assert((const lzo_voidp)(&dict[i]) ==
27441 + (const lzo_voidp)
27442 + (&wrkmem[i * sizeof(lzo_byte *)]));
27445 + }
27446 +
27447 + memset(&a, 0, sizeof(a));
27448 + r &= __lzo_assert(a.a_char_p == NULL);
27449 + r &= __lzo_assert(a.a_lzo_bytep == NULL);
27450 + r &= __lzo_assert(NULL == (void *)0);
27451 + if (r == 1) {
27452 + for (i = 0; i < 10; i++)
27453 + dict[i] = wrkmem;
27454 + BZERO8_PTR(dict + 1, sizeof(dict[0]), 8);
27455 + r &= __lzo_assert(dict[0] == wrkmem);
27456 + for (i = 1; i < 9; i++)
27457 + r &= __lzo_assert(dict[i] == NULL);
27458 + r &= __lzo_assert(dict[9] == wrkmem);
27459 + }
27460 +
27461 + if (r == 1) {
27462 + unsigned k = 1;
27463 + const unsigned n = (unsigned)sizeof(lzo_uint32);
27464 + lzo_byte *p0;
27465 + lzo_byte *p1;
27466 +
27467 + k += __lzo_align_gap(&x[k], n);
27468 + p0 = (lzo_bytep) & x[k];
27469 +#if defined(PTR_LINEAR)
27470 + r &= __lzo_assert((PTR_LINEAR(p0) & (n - 1)) == 0);
27471 +#else
27472 + r &= __lzo_assert(n == 4);
27473 + r &= __lzo_assert(PTR_ALIGNED_4(p0));
27474 +#endif
27475 +
27476 + r &= __lzo_assert(k >= 1);
27477 + p1 = (lzo_bytep) & x[1];
27478 + r &= __lzo_assert(PTR_GE(p0, p1));
27479 +
27480 + r &= __lzo_assert(k < 1 + n);
27481 + p1 = (lzo_bytep) & x[1 + n];
27482 + r &= __lzo_assert(PTR_LT(p0, p1));
27483 +
27484 + if (r == 1) {
27485 + lzo_uint32 v0, v1;
27486 +
27487 + u.a_uchar_p = &x[k];
27488 + v0 = *u.a_lzo_uint32_p;
27489 + u.a_uchar_p = &x[k + n];
27490 + v1 = *u.a_lzo_uint32_p;
27491 +
27492 + r &= __lzo_assert(v0 > 0);
27493 + r &= __lzo_assert(v1 > 0);
27494 + }
27495 + }
27496 +
27497 + return r;
27498 +}
27499 +
27500 +static int _lzo_config_check(void)
27501 +{
27502 + lzo_bool r = 1;
27503 + int i;
27504 + union {
27505 + lzo_uint32 a;
27506 + unsigned short b;
27507 + lzo_uint32 aa[4];
27508 + unsigned char x[4 * sizeof(lzo_full_align_t)];
27509 + } u;
27510 +
27511 + COMPILE_TIME_ASSERT((int)((unsigned char)((signed char)-1)) == 255);
27512 + COMPILE_TIME_ASSERT((((unsigned char)128) << (int)(8 * sizeof(int) - 8))
27513 + < 0);
27514 +
27515 + r &= basic_integral_check();
27516 + r &= basic_ptr_check();
27517 + if (r != 1)
27518 + return LZO_E_ERROR;
27519 +
27520 + u.a = 0;
27521 + u.b = 0;
27522 + for (i = 0; i < (int)sizeof(u.x); i++)
27523 + u.x[i] = LZO_BYTE(i);
27524 +
27525 +#if defined(LZO_BYTE_ORDER)
27526 + if (r == 1) {
27527 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27528 + lzo_uint32 a = (lzo_uint32) (u.a & LZO_0xffffffffL);
27529 + unsigned short b = (unsigned short)(u.b & 0xffff);
27530 + r &= __lzo_assert(a == 0x03020100L);
27531 + r &= __lzo_assert(b == 0x0100);
27532 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27533 + lzo_uint32 a = u.a >> (8 * sizeof(u.a) - 32);
27534 + unsigned short b = u.b >> (8 * sizeof(u.b) - 16);
27535 + r &= __lzo_assert(a == 0x00010203L);
27536 + r &= __lzo_assert(b == 0x0001);
27537 +# else
27538 +# error "invalid LZO_BYTE_ORDER"
27539 +# endif
27540 + }
27541 +#endif
27542 +
27543 +#if defined(LZO_UNALIGNED_OK_2)
27544 + COMPILE_TIME_ASSERT(sizeof(short) == 2);
27545 + if (r == 1) {
27546 + unsigned short b[4];
27547 +
27548 + for (i = 0; i < 4; i++)
27549 + b[i] = *(const unsigned short *)&u.x[i];
27550 +
27551 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27552 + r &= __lzo_assert(b[0] == 0x0100);
27553 + r &= __lzo_assert(b[1] == 0x0201);
27554 + r &= __lzo_assert(b[2] == 0x0302);
27555 + r &= __lzo_assert(b[3] == 0x0403);
27556 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27557 + r &= __lzo_assert(b[0] == 0x0001);
27558 + r &= __lzo_assert(b[1] == 0x0102);
27559 + r &= __lzo_assert(b[2] == 0x0203);
27560 + r &= __lzo_assert(b[3] == 0x0304);
27561 +# endif
27562 + }
27563 +#endif
27564 +
27565 +#if defined(LZO_UNALIGNED_OK_4)
27566 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27567 + if (r == 1) {
27568 + lzo_uint32 a[4];
27569 +
27570 + for (i = 0; i < 4; i++)
27571 + a[i] = *(const lzo_uint32 *)&u.x[i];
27572 +
27573 +# if (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
27574 + r &= __lzo_assert(a[0] == 0x03020100L);
27575 + r &= __lzo_assert(a[1] == 0x04030201L);
27576 + r &= __lzo_assert(a[2] == 0x05040302L);
27577 + r &= __lzo_assert(a[3] == 0x06050403L);
27578 +# elif (LZO_BYTE_ORDER == LZO_BIG_ENDIAN)
27579 + r &= __lzo_assert(a[0] == 0x00010203L);
27580 + r &= __lzo_assert(a[1] == 0x01020304L);
27581 + r &= __lzo_assert(a[2] == 0x02030405L);
27582 + r &= __lzo_assert(a[3] == 0x03040506L);
27583 +# endif
27584 + }
27585 +#endif
27586 +
27587 +#if defined(LZO_ALIGNED_OK_4)
27588 + COMPILE_TIME_ASSERT(sizeof(lzo_uint32) == 4);
27589 +#endif
27590 +
27591 + COMPILE_TIME_ASSERT(lzo_sizeof_dict_t == sizeof(lzo_dict_t));
27592 +
27593 + if (r == 1) {
27594 + r &= __lzo_assert(!schedule_insns_bug());
27595 + }
27596 +
27597 + if (r == 1) {
27598 + static int x[3];
27599 + static unsigned xn = 3;
27600 + register unsigned j;
27601 +
27602 + for (j = 0; j < xn; j++)
27603 + x[j] = (int)j - 3;
27604 + r &= __lzo_assert(!strength_reduce_bug(x));
27605 + }
27606 +
27607 + if (r == 1) {
27608 + r &= ptr_check();
27609 + }
27610 +
27611 + return r == 1 ? LZO_E_OK : LZO_E_ERROR;
27612 +}
27613 +
27614 +static lzo_bool schedule_insns_bug(void)
27615 +{
27616 +#if defined(__LZO_CHECKER)
27617 + return 0;
27618 +#else
27619 + const int clone[] = { 1, 2, 0 };
27620 + const int *q;
27621 + q = clone;
27622 + return (*q) ? 0 : 1;
27623 +#endif
27624 +}
27625 +
27626 +static lzo_bool strength_reduce_bug(int *x)
27627 +{
27628 + return x[0] != -3 || x[1] != -2 || x[2] != -1;
27629 +}
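/* Both helpers above probe for historical compiler bugs rather than
 * checking this code itself: schedule_insns_bug() fails if instruction
 * scheduling reorders the load through q ahead of the initialization
 * of clone[], and strength_reduce_bug() fails if loop strength
 * reduction miscomputed the values its caller stored into x[]. On a
 * correctly working compiler both return 0.
 */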
27630 +
27631 +#undef COMPILE_TIME_ASSERT
27632 +
27633 +int __lzo_init2(unsigned v, int s1, int s2, int s3, int s4, int s5,
27634 + int s6, int s7, int s8, int s9)
27635 +{
27636 + int r;
27637 +
27638 + if (v == 0)
27639 + return LZO_E_ERROR;
27640 +
27641 + r = (s1 == -1 || s1 == (int)sizeof(short)) &&
27642 + (s2 == -1 || s2 == (int)sizeof(int)) &&
27643 + (s3 == -1 || s3 == (int)sizeof(long)) &&
27644 + (s4 == -1 || s4 == (int)sizeof(lzo_uint32)) &&
27645 + (s5 == -1 || s5 == (int)sizeof(lzo_uint)) &&
27646 + (s6 == -1 || s6 == (int)lzo_sizeof_dict_t) &&
27647 + (s7 == -1 || s7 == (int)sizeof(char *)) &&
27648 + (s8 == -1 || s8 == (int)sizeof(lzo_voidp)) &&
27649 + (s9 == -1 || s9 == (int)sizeof(lzo_compress_t));
27650 + if (!r)
27651 + return LZO_E_ERROR;
27652 +
27653 + r = _lzo_config_check();
27654 + if (r != LZO_E_OK)
27655 + return r;
27656 +
27657 + return r;
27658 +}
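/* Callers are not expected to invoke __lzo_init2() directly. The full
 * LZO library wraps it in an lzo_init() macro that passes the caller's
 * compile-time type sizes, so header/library mismatches are caught at
 * startup. A usage sketch, assuming the conventional lzo_init()
 * wrapper from lzoconf.h:
 *
 *	if (lzo_init() != LZO_E_OK)
 *		return -EINVAL;
 */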
27659 +
27660 +#define do_compress _lzo1x_1_do_compress
27661 +
27662 +#define LZO_NEED_DICT_H
27663 +#define D_BITS 14
27664 +#define D_INDEX1(d,p) d = DM((0x21*DX3(p,5,5,6)) >> 5)
27665 +#define D_INDEX2(d,p) d = (d & (D_MASK & 0x7ff)) ^ (D_HIGH | 0x1f)
27666 +
27667 +#ifndef __LZO_CONFIG1X_H
27668 +#define __LZO_CONFIG1X_H
27669 +
27670 +#if !defined(LZO1X) && !defined(LZO1Y) && !defined(LZO1Z)
27671 +# define LZO1X
27672 +#endif
27673 +
27674 +#define LZO_EOF_CODE
27675 +#undef LZO_DETERMINISTIC
27676 +
27677 +#define M1_MAX_OFFSET 0x0400
27678 +#ifndef M2_MAX_OFFSET
27679 +#define M2_MAX_OFFSET 0x0800
27680 +#endif
27681 +#define M3_MAX_OFFSET 0x4000
27682 +#define M4_MAX_OFFSET 0xbfff
27683 +
27684 +#define MX_MAX_OFFSET (M1_MAX_OFFSET + M2_MAX_OFFSET)
27685 +
27686 +#define M1_MIN_LEN 2
27687 +#define M1_MAX_LEN 2
27688 +#define M2_MIN_LEN 3
27689 +#ifndef M2_MAX_LEN
27690 +#define M2_MAX_LEN 8
27691 +#endif
27692 +#define M3_MIN_LEN 3
27693 +#define M3_MAX_LEN 33
27694 +#define M4_MIN_LEN 3
27695 +#define M4_MAX_LEN 9
27696 +
27697 +#define M1_MARKER 0
27698 +#define M2_MARKER 64
27699 +#define M3_MARKER 32
27700 +#define M4_MARKER 16
27701 +
27702 +#ifndef MIN_LOOKAHEAD
27703 +#define MIN_LOOKAHEAD (M2_MAX_LEN + 1)
27704 +#endif
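/* For orientation: the markers above select the LZO1X match opcodes.
 * The base layouts below are inferred from the encoder and decoder in
 * this file (L = length bits, D/H = offset bits, SS = count of
 * trailing literals, 0..3; a zero length field switches M3/M4 to an
 * extended multi-byte length encoding):
 *
 *	M2 (len 3..8,  off <= M2_MAX_OFFSET): LLLDDDSS + 1 offset byte
 *	M3 (len 3..33, off <= M3_MAX_OFFSET): 001LLLLL + 2 offset bytes
 *	M4 (len 3..9,  off >  M3_MAX_OFFSET): 0001HLLL + 2 offset bytes
 */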
27705 +
27706 +#if defined(LZO_NEED_DICT_H)
27707 +
27708 +#ifndef LZO_HASH
27709 +#define LZO_HASH LZO_HASH_LZO_INCREMENTAL_B
27710 +#endif
27711 +#define DL_MIN_LEN M2_MIN_LEN
27712 +
27713 +#ifndef __LZO_DICT_H
27714 +#define __LZO_DICT_H
27715 +
27716 +#if !defined(D_BITS) && defined(DBITS)
27717 +# define D_BITS DBITS
27718 +#endif
27719 +#if !defined(D_BITS)
27720 +# error "D_BITS is not defined"
27721 +#endif
27722 +#if (D_BITS < 16)
27723 +# define D_SIZE LZO_SIZE(D_BITS)
27724 +# define D_MASK LZO_MASK(D_BITS)
27725 +#else
27726 +# define D_SIZE LZO_USIZE(D_BITS)
27727 +# define D_MASK LZO_UMASK(D_BITS)
27728 +#endif
27729 +#define D_HIGH ((D_MASK >> 1) + 1)
27730 +
27731 +#if !defined(DD_BITS)
27732 +# define DD_BITS 0
27733 +#endif
27734 +#define DD_SIZE LZO_SIZE(DD_BITS)
27735 +#define DD_MASK LZO_MASK(DD_BITS)
27736 +
27737 +#if !defined(DL_BITS)
27738 +# define DL_BITS (D_BITS - DD_BITS)
27739 +#endif
27740 +#if (DL_BITS < 16)
27741 +# define DL_SIZE LZO_SIZE(DL_BITS)
27742 +# define DL_MASK LZO_MASK(DL_BITS)
27743 +#else
27744 +# define DL_SIZE LZO_USIZE(DL_BITS)
27745 +# define DL_MASK LZO_UMASK(DL_BITS)
27746 +#endif
27747 +
27748 +#if (D_BITS != DL_BITS + DD_BITS)
27749 +# error "D_BITS does not match"
27750 +#endif
27751 +#if (D_BITS < 8 || D_BITS > 18)
27752 +# error "invalid D_BITS"
27753 +#endif
27754 +#if (DL_BITS < 8 || DL_BITS > 20)
27755 +# error "invalid DL_BITS"
27756 +#endif
27757 +#if (DD_BITS < 0 || DD_BITS > 6)
27758 +# error "invalid DD_BITS"
27759 +#endif
27760 +
27761 +#if !defined(DL_MIN_LEN)
27762 +# define DL_MIN_LEN 3
27763 +#endif
27764 +#if !defined(DL_SHIFT)
27765 +# define DL_SHIFT ((DL_BITS + (DL_MIN_LEN - 1)) / DL_MIN_LEN)
27766 +#endif
27767 +
27768 +#define LZO_HASH_GZIP 1
27769 +#define LZO_HASH_GZIP_INCREMENTAL 2
27770 +#define LZO_HASH_LZO_INCREMENTAL_A 3
27771 +#define LZO_HASH_LZO_INCREMENTAL_B 4
27772 +
27773 +#if !defined(LZO_HASH)
27774 +# error "choose a hashing strategy"
27775 +#endif
27776 +
27777 +#if (DL_MIN_LEN == 3)
27778 +# define _DV2_A(p,shift1,shift2) \
27779 + (((( (lzo_uint32)((p)[0]) << shift1) ^ (p)[1]) << shift2) ^ (p)[2])
27780 +# define _DV2_B(p,shift1,shift2) \
27781 + (((( (lzo_uint32)((p)[2]) << shift1) ^ (p)[1]) << shift2) ^ (p)[0])
27782 +# define _DV3_B(p,shift1,shift2,shift3) \
27783 + ((_DV2_B((p)+1,shift1,shift2) << (shift3)) ^ (p)[0])
27784 +#elif (DL_MIN_LEN == 2)
27785 +# define _DV2_A(p,shift1,shift2) \
27786 + (( (lzo_uint32)(p[0]) << shift1) ^ p[1])
27787 +# define _DV2_B(p,shift1,shift2) \
27788 + (( (lzo_uint32)(p[1]) << shift1) ^ p[2])
27789 +#else
27790 +# error "invalid DL_MIN_LEN"
27791 +#endif
27792 +#define _DV_A(p,shift) _DV2_A(p,shift,shift)
27793 +#define _DV_B(p,shift) _DV2_B(p,shift,shift)
27794 +#define DA2(p,s1,s2) \
27795 + (((((lzo_uint32)((p)[2]) << (s2)) + (p)[1]) << (s1)) + (p)[0])
27796 +#define DS2(p,s1,s2) \
27797 + (((((lzo_uint32)((p)[2]) << (s2)) - (p)[1]) << (s1)) - (p)[0])
27798 +#define DX2(p,s1,s2) \
27799 + (((((lzo_uint32)((p)[2]) << (s2)) ^ (p)[1]) << (s1)) ^ (p)[0])
27800 +#define DA3(p,s1,s2,s3) ((DA2((p)+1,s2,s3) << (s1)) + (p)[0])
27801 +#define DS3(p,s1,s2,s3) ((DS2((p)+1,s2,s3) << (s1)) - (p)[0])
27802 +#define DX3(p,s1,s2,s3) ((DX2((p)+1,s2,s3) << (s1)) ^ (p)[0])
27803 +#define DMS(v,s) ((lzo_uint) (((v) & (D_MASK >> (s))) << (s)))
27804 +#define DM(v) DMS(v,0)
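/* The macros above hash input bytes into dictionary slots. D_INDEX1
 * mixes ip[0..3] via DX3, multiplies by 0x21 and masks the result with
 * D_MASK, so the index always fits the D_SIZE-entry table; D_INDEX2
 * derives a second probe slot from the first. An illustrative sketch
 * of their use, mirroring the compressor below:
 *
 *	lzo_uint dindex;
 *	DINDEX1(dindex, ip);
 *	m_pos = dict[dindex];
 */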
27805 +
27806 +#if (LZO_HASH == LZO_HASH_GZIP)
27807 +# define _DINDEX(dv,p) (_DV_A((p),DL_SHIFT))
27808 +
27809 +#elif (LZO_HASH == LZO_HASH_GZIP_INCREMENTAL)
27810 +# define __LZO_HASH_INCREMENTAL
27811 +# define DVAL_FIRST(dv,p) dv = _DV_A((p),DL_SHIFT)
27812 +# define DVAL_NEXT(dv,p) dv = (((dv) << DL_SHIFT) ^ p[2])
27813 +# define _DINDEX(dv,p) (dv)
27814 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27815 +
27816 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_A)
27817 +# define __LZO_HASH_INCREMENTAL
27818 +# define DVAL_FIRST(dv,p) dv = _DV_A((p),5)
27819 +# define DVAL_NEXT(dv,p) \
27820 + dv ^= (lzo_uint32)(p[-1]) << (2*5); dv = (((dv) << 5) ^ p[2])
27821 +# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27822 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27823 +
27824 +#elif (LZO_HASH == LZO_HASH_LZO_INCREMENTAL_B)
27825 +# define __LZO_HASH_INCREMENTAL
27826 +# define DVAL_FIRST(dv,p) dv = _DV_B((p),5)
27827 +# define DVAL_NEXT(dv,p) \
27828 + dv ^= p[-1]; dv = (((dv) >> 5) ^ ((lzo_uint32)(p[2]) << (2*5)))
27829 +# define _DINDEX(dv,p) ((0x9f5f * (dv)) >> 5)
27830 +# define DVAL_LOOKAHEAD DL_MIN_LEN
27831 +
27832 +#else
27833 +# error "choose a hashing strategy"
27834 +#endif
27835 +
27836 +#ifndef DINDEX
27837 +#define DINDEX(dv,p) ((lzo_uint)((_DINDEX(dv,p)) & DL_MASK) << DD_BITS)
27838 +#endif
27839 +#if !defined(DINDEX1) && defined(D_INDEX1)
27840 +#define DINDEX1 D_INDEX1
27841 +#endif
27842 +#if !defined(DINDEX2) && defined(D_INDEX2)
27843 +#define DINDEX2 D_INDEX2
27844 +#endif
27845 +
27846 +#if !defined(__LZO_HASH_INCREMENTAL)
27847 +# define DVAL_FIRST(dv,p) ((void) 0)
27848 +# define DVAL_NEXT(dv,p) ((void) 0)
27849 +# define DVAL_LOOKAHEAD 0
27850 +#endif
27851 +
27852 +#if !defined(DVAL_ASSERT)
27853 +#if defined(__LZO_HASH_INCREMENTAL) && !defined(NDEBUG)
27854 +static void DVAL_ASSERT(lzo_uint32 dv, const lzo_byte * p)
27855 +{
27856 + lzo_uint32 df;
27857 + DVAL_FIRST(df, (p));
27858 + assert(DINDEX(dv, p) == DINDEX(df, p));
27859 +}
27860 +#else
27861 +# define DVAL_ASSERT(dv,p) ((void) 0)
27862 +#endif
27863 +#endif
27864 +
27865 +# define DENTRY(p,in) (p)
27866 +# define GINDEX(m_pos,m_off,dict,dindex,in) m_pos = dict[dindex]
27867 +
27868 +#if (DD_BITS == 0)
27869 +
27870 +# define UPDATE_D(dict,drun,dv,p,in) dict[ DINDEX(dv,p) ] = DENTRY(p,in)
27871 +# define UPDATE_I(dict,drun,index,p,in) dict[index] = DENTRY(p,in)
27872 +# define UPDATE_P(ptr,drun,p,in) (ptr)[0] = DENTRY(p,in)
27873 +
27874 +#else
27875 +
27876 +# define UPDATE_D(dict,drun,dv,p,in) \
27877 + dict[ DINDEX(dv,p) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27878 +# define UPDATE_I(dict,drun,index,p,in) \
27879 + dict[ (index) + drun++ ] = DENTRY(p,in); drun &= DD_MASK
27880 +# define UPDATE_P(ptr,drun,p,in) \
27881 + (ptr) [ drun++ ] = DENTRY(p,in); drun &= DD_MASK
27882 +
27883 +#endif
27884 +
27885 +#define LZO_CHECK_MPOS_DET(m_pos,m_off,in,ip,max_offset) \
27886 + (m_pos == NULL || (m_off = (lzo_moff_t) (ip - m_pos)) > max_offset)
27887 +
27888 +#define LZO_CHECK_MPOS_NON_DET(m_pos,m_off,in,ip,max_offset) \
27889 + (BOUNDS_CHECKING_OFF_IN_EXPR( \
27890 + (PTR_LT(m_pos,in) || \
27891 + (m_off = (lzo_moff_t) PTR_DIFF(ip,m_pos)) <= 0 || \
27892 + m_off > max_offset) ))
27893 +
27894 +#if defined(LZO_DETERMINISTIC)
27895 +# define LZO_CHECK_MPOS LZO_CHECK_MPOS_DET
27896 +#else
27897 +# define LZO_CHECK_MPOS LZO_CHECK_MPOS_NON_DET
27898 +#endif
27899 +#endif
27900 +#endif
27901 +#endif
27902 +#define DO_COMPRESS lzo1x_1_compress
27903 +static
27904 +lzo_uint do_compress(const lzo_byte * in, lzo_uint in_len,
27905 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
27906 +{
27907 + register const lzo_byte *ip;
27908 + lzo_byte *op;
27909 + const lzo_byte *const in_end = in + in_len;
27910 + const lzo_byte *const ip_end = in + in_len - M2_MAX_LEN - 5;
27911 + const lzo_byte *ii;
27912 + lzo_dict_p const dict = (lzo_dict_p) wrkmem;
27913 +
27914 + op = out;
27915 + ip = in;
27916 + ii = ip;
27917 +
27918 + ip += 4;
27919 + for (;;) {
27920 + register const lzo_byte *m_pos;
27921 +
27922 + lzo_moff_t m_off;
27923 + lzo_uint m_len;
27924 + lzo_uint dindex;
27925 +
27926 + DINDEX1(dindex, ip);
27927 + GINDEX(m_pos, m_off, dict, dindex, in);
27928 + if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27929 + goto literal;
27930 +#if 1
27931 + if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27932 + goto try_match;
27933 + DINDEX2(dindex, ip);
27934 +#endif
27935 + GINDEX(m_pos, m_off, dict, dindex, in);
27936 + if (LZO_CHECK_MPOS_NON_DET(m_pos, m_off, in, ip, M4_MAX_OFFSET))
27937 + goto literal;
27938 + if (m_off <= M2_MAX_OFFSET || m_pos[3] == ip[3])
27939 + goto try_match;
27940 + goto literal;
27941 +
27942 + try_match:
27943 +#if 1 && defined(LZO_UNALIGNED_OK_2)
27944 + if (*(const lzo_ushortp)m_pos != *(const lzo_ushortp)ip) {
27945 +#else
27946 + if (m_pos[0] != ip[0] || m_pos[1] != ip[1]) {
27947 +#endif
27948 + ;
27949 + } else {
27950 + if (m_pos[2] == ip[2]) {
27951 + goto match;
27952 + } else {
27953 + ;
27954 + }
27955 + }
27956 +
27957 + literal:
27958 + UPDATE_I(dict, 0, dindex, ip, in);
27959 + ++ip;
27960 + if (ip >= ip_end)
27961 + break;
27962 + continue;
27963 +
27964 + match:
27965 + UPDATE_I(dict, 0, dindex, ip, in);
27966 + if (pd(ip, ii) > 0) {
27967 + register lzo_uint t = pd(ip, ii);
27968 +
27969 + if (t <= 3) {
27970 + assert("lzo-04", op - 2 > out);
27971 + op[-2] |= LZO_BYTE(t);
27972 + } else if (t <= 18)
27973 + *op++ = LZO_BYTE(t - 3);
27974 + else {
27975 + register lzo_uint tt = t - 18;
27976 +
27977 + *op++ = 0;
27978 + while (tt > 255) {
27979 + tt -= 255;
27980 + *op++ = 0;
27981 + }
27982 + assert("lzo-05", tt > 0);
27983 + *op++ = LZO_BYTE(tt);
27984 + }
27985 + do
27986 + *op++ = *ii++;
27987 + while (--t > 0);
27988 + }
27989 +
27990 + assert("lzo-06", ii == ip);
27991 + ip += 3;
27992 + if (m_pos[3] != *ip++ || m_pos[4] != *ip++ || m_pos[5] != *ip++
27993 + || m_pos[6] != *ip++ || m_pos[7] != *ip++
27994 + || m_pos[8] != *ip++
27995 +#ifdef LZO1Y
27996 + || m_pos[9] != *ip++ || m_pos[10] != *ip++
27997 + || m_pos[11] != *ip++ || m_pos[12] != *ip++
27998 + || m_pos[13] != *ip++ || m_pos[14] != *ip++
27999 +#endif
28000 + ) {
28001 + --ip;
28002 + m_len = ip - ii;
28003 + assert("lzo-07", m_len >= 3);
28004 + assert("lzo-08", m_len <= M2_MAX_LEN);
28005 +
28006 + if (m_off <= M2_MAX_OFFSET) {
28007 + m_off -= 1;
28008 +#if defined(LZO1X)
28009 + *op++ =
28010 + LZO_BYTE(((m_len - 1) << 5) |
28011 + ((m_off & 7) << 2));
28012 + *op++ = LZO_BYTE(m_off >> 3);
28013 +#elif defined(LZO1Y)
28014 + *op++ =
28015 + LZO_BYTE(((m_len + 1) << 4) |
28016 + ((m_off & 3) << 2));
28017 + *op++ = LZO_BYTE(m_off >> 2);
28018 +#endif
28019 + } else if (m_off <= M3_MAX_OFFSET) {
28020 + m_off -= 1;
28021 + *op++ = LZO_BYTE(M3_MARKER | (m_len - 2));
28022 + goto m3_m4_offset;
28023 + } else
28024 +#if defined(LZO1X)
28025 + {
28026 + m_off -= 0x4000;
28027 + assert("lzo-09", m_off > 0);
28028 + assert("lzo-10", m_off <= 0x7fff);
28029 + *op++ = LZO_BYTE(M4_MARKER |
28030 + ((m_off & 0x4000) >> 11) |
28031 + (m_len - 2));
28032 + goto m3_m4_offset;
28033 + }
28034 +#elif defined(LZO1Y)
28035 + goto m4_match;
28036 +#endif
28037 + } else {
28038 + {
28039 + const lzo_byte *end = in_end;
28040 + const lzo_byte *m = m_pos + M2_MAX_LEN + 1;
28041 + while (ip < end && *m == *ip)
28042 + m++, ip++;
28043 + m_len = (ip - ii);
28044 + }
28045 + assert("lzo-11", m_len > M2_MAX_LEN);
28046 +
28047 + if (m_off <= M3_MAX_OFFSET) {
28048 + m_off -= 1;
28049 + if (m_len <= 33)
28050 + *op++ =
28051 + LZO_BYTE(M3_MARKER | (m_len - 2));
28052 + else {
28053 + m_len -= 33;
28054 + *op++ = M3_MARKER | 0;
28055 + goto m3_m4_len;
28056 + }
28057 + } else {
28058 +#if defined(LZO1Y)
28059 + m4_match:
28060 +#endif
28061 + m_off -= 0x4000;
28062 + assert("lzo-12", m_off > 0);
28063 + assert("lzo-13", m_off <= 0x7fff);
28064 + if (m_len <= M4_MAX_LEN)
28065 + *op++ = LZO_BYTE(M4_MARKER |
28066 + ((m_off & 0x4000) >> 11) |
28067 + (m_len - 2));
28068 + else {
28069 + m_len -= M4_MAX_LEN;
28070 + *op++ =
28071 + LZO_BYTE(M4_MARKER |
28072 + ((m_off & 0x4000) >> 11));
28073 + m3_m4_len:
28074 + while (m_len > 255) {
28075 + m_len -= 255;
28076 + *op++ = 0;
28077 + }
28078 + assert("lzo-14", m_len > 0);
28079 + *op++ = LZO_BYTE(m_len);
28080 + }
28081 + }
28082 +
28083 + m3_m4_offset:
28084 + *op++ = LZO_BYTE((m_off & 63) << 2);
28085 + *op++ = LZO_BYTE(m_off >> 6);
28086 + }
28087 +
28088 + ii = ip;
28089 + if (ip >= ip_end)
28090 + break;
28091 + }
28092 +
28093 + *out_len = op - out;
28094 + return pd(in_end, ii);
28095 +}
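/* Contract note: do_compress() returns the number of trailing input
 * bytes it left unencoded, pd(in_end, ii); the DO_COMPRESS wrapper
 * below emits those bytes as a final literal run and then appends the
 * three-byte M4 end-of-stream marker.
 */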
28096 +
28097 +int DO_COMPRESS(const lzo_byte * in, lzo_uint in_len,
28098 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28099 +{
28100 + lzo_byte *op = out;
28101 + lzo_uint t;
28102 +
28103 +#if defined(__LZO_QUERY_COMPRESS)
28104 + if (__LZO_IS_COMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
28105 + return __LZO_QUERY_COMPRESS(in, in_len, out, out_len, wrkmem,
28106 + D_SIZE, lzo_sizeof(lzo_dict_t));
28107 +#endif
28108 +
28109 + if (in_len <= M2_MAX_LEN + 5)
28110 + t = in_len;
28111 + else {
28112 + t = do_compress(in, in_len, op, out_len, wrkmem);
28113 + op += *out_len;
28114 + }
28115 +
28116 + if (t > 0) {
28117 + const lzo_byte *ii = in + in_len - t;
28118 +
28119 + if (op == out && t <= 238)
28120 + *op++ = LZO_BYTE(17 + t);
28121 + else if (t <= 3)
28122 + op[-2] |= LZO_BYTE(t);
28123 + else if (t <= 18)
28124 + *op++ = LZO_BYTE(t - 3);
28125 + else {
28126 + lzo_uint tt = t - 18;
28127 +
28128 + *op++ = 0;
28129 + while (tt > 255) {
28130 + tt -= 255;
28131 + *op++ = 0;
28132 + }
28133 + assert("lzo-15", tt > 0);
28134 + *op++ = LZO_BYTE(tt);
28135 + }
28136 + do
28137 + *op++ = *ii++;
28138 + while (--t > 0);
28139 + }
28140 +
28141 + *op++ = M4_MARKER | 1;
28142 + *op++ = 0;
28143 + *op++ = 0;
28144 +
28145 + *out_len = op - out;
28146 + return LZO_E_OK;
28147 +}
28148 +
28149 +#undef do_compress
28150 +#undef DO_COMPRESS
28151 +#undef LZO_HASH
28152 +
28153 +#undef LZO_TEST_DECOMPRESS_OVERRUN
28154 +#undef LZO_TEST_DECOMPRESS_OVERRUN_INPUT
28155 +#undef LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT
28156 +#undef LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28157 +#undef DO_DECOMPRESS
28158 +#define DO_DECOMPRESS lzo1x_decompress
28159 +
28160 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28161 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28162 +# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28163 +# endif
28164 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28165 +# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28166 +# endif
28167 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28168 +# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28169 +# endif
28170 +#endif
28171 +
28172 +#undef TEST_IP
28173 +#undef TEST_OP
28174 +#undef TEST_LOOKBEHIND
28175 +#undef NEED_IP
28176 +#undef NEED_OP
28177 +#undef HAVE_TEST_IP
28178 +#undef HAVE_TEST_OP
28179 +#undef HAVE_NEED_IP
28180 +#undef HAVE_NEED_OP
28181 +#undef HAVE_ANY_IP
28182 +#undef HAVE_ANY_OP
28183 +
28184 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28185 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28186 +# define TEST_IP (ip < ip_end)
28187 +# endif
28188 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28189 +# define NEED_IP(x) \
28190 + if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28191 +# endif
28192 +#endif
28193 +
28194 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28195 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28196 +# define TEST_OP (op <= op_end)
28197 +# endif
28198 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28199 +# undef TEST_OP
28200 +# define NEED_OP(x) \
28201 + if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28202 +# endif
28203 +#endif
28204 +
28205 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28206 +# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28207 +#else
28208 +# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28209 +#endif
28210 +
28211 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28212 +# define TEST_IP (ip < ip_end)
28213 +#endif
28214 +
28215 +#if defined(TEST_IP)
28216 +# define HAVE_TEST_IP
28217 +#else
28218 +# define TEST_IP 1
28219 +#endif
28220 +#if defined(TEST_OP)
28221 +# define HAVE_TEST_OP
28222 +#else
28223 +# define TEST_OP 1
28224 +#endif
28225 +
28226 +#if defined(NEED_IP)
28227 +# define HAVE_NEED_IP
28228 +#else
28229 +# define NEED_IP(x) ((void) 0)
28230 +#endif
28231 +#if defined(NEED_OP)
28232 +# define HAVE_NEED_OP
28233 +#else
28234 +# define NEED_OP(x) ((void) 0)
28235 +#endif
28236 +
28237 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28238 +# define HAVE_ANY_IP
28239 +#endif
28240 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28241 +# define HAVE_ANY_OP
28242 +#endif
28243 +
28244 +#undef __COPY4
28245 +#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28246 +
28247 +#undef COPY4
28248 +#if defined(LZO_UNALIGNED_OK_4)
28249 +# define COPY4(dst,src) __COPY4(dst,src)
28250 +#elif defined(LZO_ALIGNED_OK_4)
28251 +# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28252 +#endif
28253 +
28254 +#if defined(DO_DECOMPRESS)
28255 +int DO_DECOMPRESS(const lzo_byte * in, lzo_uint in_len,
28256 + lzo_byte * out, lzo_uintp out_len, lzo_voidp wrkmem)
28257 +#endif
28258 +{
28259 + register lzo_byte *op;
28260 + register const lzo_byte *ip;
28261 + register lzo_uint t;
28262 +#if defined(COPY_DICT)
28263 + lzo_uint m_off;
28264 + const lzo_byte *dict_end;
28265 +#else
28266 + register const lzo_byte *m_pos;
28267 +#endif
28268 +
28269 + const lzo_byte *const ip_end = in + in_len;
28270 +#if defined(HAVE_ANY_OP)
28271 + lzo_byte *const op_end = out + *out_len;
28272 +#endif
28273 +#if defined(LZO1Z)
28274 + lzo_uint last_m_off = 0;
28275 +#endif
28276 +
28277 + LZO_UNUSED(wrkmem);
28278 +
28279 +#if defined(__LZO_QUERY_DECOMPRESS)
28280 + if (__LZO_IS_DECOMPRESS_QUERY(in, in_len, out, out_len, wrkmem))
28281 + return __LZO_QUERY_DECOMPRESS(in, in_len, out, out_len, wrkmem,
28282 + 0, 0);
28283 +#endif
28284 +
28285 +#if defined(COPY_DICT)
28286 + if (dict) {
28287 + if (dict_len > M4_MAX_OFFSET) {
28288 + dict += dict_len - M4_MAX_OFFSET;
28289 + dict_len = M4_MAX_OFFSET;
28290 + }
28291 + dict_end = dict + dict_len;
28292 + } else {
28293 + dict_len = 0;
28294 + dict_end = NULL;
28295 + }
28296 +#endif
28297 +
28298 + *out_len = 0;
28299 +
28300 + op = out;
28301 + ip = in;
28302 +
28303 + if (*ip > 17) {
28304 + t = *ip++ - 17;
28305 + if (t < 4)
28306 + goto match_next;
28307 + assert("lzo-16", t > 0);
28308 + NEED_OP(t);
28309 + NEED_IP(t + 1);
28310 + do
28311 + *op++ = *ip++;
28312 + while (--t > 0);
28313 + goto first_literal_run;
28314 + }
28315 +
28316 + while (TEST_IP && TEST_OP) {
28317 + t = *ip++;
28318 + if (t >= 16)
28319 + goto match;
28320 + if (t == 0) {
28321 + NEED_IP(1);
28322 + while (*ip == 0) {
28323 + t += 255;
28324 + ip++;
28325 + NEED_IP(1);
28326 + }
28327 + t += 15 + *ip++;
28328 + }
28329 + assert("lzo-17", t > 0);
28330 + NEED_OP(t + 3);
28331 + NEED_IP(t + 4);
28332 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28333 +#if !defined(LZO_UNALIGNED_OK_4)
28334 + if (PTR_ALIGNED2_4(op, ip)) {
28335 +#endif
28336 + COPY4(op, ip);
28337 + op += 4;
28338 + ip += 4;
28339 + if (--t > 0) {
28340 + if (t >= 4) {
28341 + do {
28342 + COPY4(op, ip);
28343 + op += 4;
28344 + ip += 4;
28345 + t -= 4;
28346 + } while (t >= 4);
28347 + if (t > 0)
28348 + do
28349 + *op++ = *ip++;
28350 + while (--t > 0);
28351 + } else
28352 + do
28353 + *op++ = *ip++;
28354 + while (--t > 0);
28355 + }
28356 +#if !defined(LZO_UNALIGNED_OK_4)
28357 + } else
28358 +#endif
28359 +#endif
28360 +#if !defined(LZO_UNALIGNED_OK_4)
28361 + {
28362 + *op++ = *ip++;
28363 + *op++ = *ip++;
28364 + *op++ = *ip++;
28365 + do
28366 + *op++ = *ip++;
28367 + while (--t > 0);
28368 + }
28369 +#endif
28370 +
28371 + first_literal_run:
28372 +
28373 + t = *ip++;
28374 + if (t >= 16)
28375 + goto match;
28376 +#if defined(COPY_DICT)
28377 +#if defined(LZO1Z)
28378 + m_off = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28379 + last_m_off = m_off;
28380 +#else
28381 + m_off = (1 + M2_MAX_OFFSET) + (t >> 2) + (*ip++ << 2);
28382 +#endif
28383 + NEED_OP(3);
28384 + t = 3;
28385 + COPY_DICT(t, m_off)
28386 +#else
28387 +#if defined(LZO1Z)
28388 + t = (1 + M2_MAX_OFFSET) + (t << 6) + (*ip++ >> 2);
28389 + m_pos = op - t;
28390 + last_m_off = t;
28391 +#else
28392 + m_pos = op - (1 + M2_MAX_OFFSET);
28393 + m_pos -= t >> 2;
28394 + m_pos -= *ip++ << 2;
28395 +#endif
28396 + TEST_LOOKBEHIND(m_pos, out);
28397 + NEED_OP(3);
28398 + *op++ = *m_pos++;
28399 + *op++ = *m_pos++;
28400 + *op++ = *m_pos;
28401 +#endif
28402 + goto match_done;
28403 +
28404 + while (TEST_IP && TEST_OP) {
28405 + match:
28406 + if (t >= 64) {
28407 +#if defined(COPY_DICT)
28408 +#if defined(LZO1X)
28409 + m_off = 1 + ((t >> 2) & 7) + (*ip++ << 3);
28410 + t = (t >> 5) - 1;
28411 +#elif defined(LZO1Y)
28412 + m_off = 1 + ((t >> 2) & 3) + (*ip++ << 2);
28413 + t = (t >> 4) - 3;
28414 +#elif defined(LZO1Z)
28415 + m_off = t & 0x1f;
28416 + if (m_off >= 0x1c)
28417 + m_off = last_m_off;
28418 + else {
28419 + m_off = 1 + (m_off << 6) + (*ip++ >> 2);
28420 + last_m_off = m_off;
28421 + }
28422 + t = (t >> 5) - 1;
28423 +#endif
28424 +#else
28425 +#if defined(LZO1X)
28426 + m_pos = op - 1;
28427 + m_pos -= (t >> 2) & 7;
28428 + m_pos -= *ip++ << 3;
28429 + t = (t >> 5) - 1;
28430 +#elif defined(LZO1Y)
28431 + m_pos = op - 1;
28432 + m_pos -= (t >> 2) & 3;
28433 + m_pos -= *ip++ << 2;
28434 + t = (t >> 4) - 3;
28435 +#elif defined(LZO1Z)
28436 + {
28437 + lzo_uint off = t & 0x1f;
28438 + m_pos = op;
28439 + if (off >= 0x1c) {
28440 + assert(last_m_off > 0);
28441 + m_pos -= last_m_off;
28442 + } else {
28443 + off = 1 + (off << 6) +
28444 + (*ip++ >> 2);
28446 + m_pos -= off;
28447 + last_m_off = off;
28448 + }
28449 + }
28450 + t = (t >> 5) - 1;
28451 +#endif
28452 + TEST_LOOKBEHIND(m_pos, out);
28453 + assert("lzo-18", t > 0);
28454 + NEED_OP(t + 3 - 1);
28455 + goto copy_match;
28456 +#endif
28457 + } else if (t >= 32) {
28458 + t &= 31;
28459 + if (t == 0) {
28460 + NEED_IP(1);
28461 + while (*ip == 0) {
28462 + t += 255;
28463 + ip++;
28464 + NEED_IP(1);
28465 + }
28466 + t += 31 + *ip++;
28467 + }
28468 +#if defined(COPY_DICT)
28469 +#if defined(LZO1Z)
28470 + m_off = 1 + (ip[0] << 6) + (ip[1] >> 2);
28471 + last_m_off = m_off;
28472 +#else
28473 + m_off = 1 + (ip[0] >> 2) + (ip[1] << 6);
28474 +#endif
28475 +#else
28476 +#if defined(LZO1Z)
28477 + {
28478 + lzo_uint off =
28479 + 1 + (ip[0] << 6) + (ip[1] >> 2);
28480 + m_pos = op - off;
28481 + last_m_off = off;
28482 + }
28483 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28484 + m_pos = op - 1;
28485 + m_pos -= (*(const lzo_ushortp)ip) >> 2;
28486 +#else
28487 + m_pos = op - 1;
28488 + m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28489 +#endif
28490 +#endif
28491 + ip += 2;
28492 + } else if (t >= 16) {
28493 +#if defined(COPY_DICT)
28494 + m_off = (t & 8) << 11;
28495 +#else
28496 + m_pos = op;
28497 + m_pos -= (t & 8) << 11;
28498 +#endif
28499 + t &= 7;
28500 + if (t == 0) {
28501 + NEED_IP(1);
28502 + while (*ip == 0) {
28503 + t += 255;
28504 + ip++;
28505 + NEED_IP(1);
28506 + }
28507 + t += 7 + *ip++;
28508 + }
28509 +#if defined(COPY_DICT)
28510 +#if defined(LZO1Z)
28511 + m_off += (ip[0] << 6) + (ip[1] >> 2);
28512 +#else
28513 + m_off += (ip[0] >> 2) + (ip[1] << 6);
28514 +#endif
28515 + ip += 2;
28516 + if (m_off == 0)
28517 + goto eof_found;
28518 + m_off += 0x4000;
28519 +#if defined(LZO1Z)
28520 + last_m_off = m_off;
28521 +#endif
28522 +#else
28523 +#if defined(LZO1Z)
28524 + m_pos -= (ip[0] << 6) + (ip[1] >> 2);
28525 +#elif defined(LZO_UNALIGNED_OK_2) && (LZO_BYTE_ORDER == LZO_LITTLE_ENDIAN)
28526 + m_pos -= (*(const lzo_ushortp)ip) >> 2;
28527 +#else
28528 + m_pos -= (ip[0] >> 2) + (ip[1] << 6);
28529 +#endif
28530 + ip += 2;
28531 + if (m_pos == op)
28532 + goto eof_found;
28533 + m_pos -= 0x4000;
28534 +#if defined(LZO1Z)
28535 + last_m_off = op - m_pos;
28536 +#endif
28537 +#endif
28538 + } else {
28539 +#if defined(COPY_DICT)
28540 +#if defined(LZO1Z)
28541 + m_off = 1 + (t << 6) + (*ip++ >> 2);
28542 + last_m_off = m_off;
28543 +#else
28544 + m_off = 1 + (t >> 2) + (*ip++ << 2);
28545 +#endif
28546 + NEED_OP(2);
28547 + t = 2;
28548 + COPY_DICT(t, m_off)
28549 +#else
28550 +#if defined(LZO1Z)
28551 + t = 1 + (t << 6) + (*ip++ >> 2);
28552 + m_pos = op - t;
28553 + last_m_off = t;
28554 +#else
28555 + m_pos = op - 1;
28556 + m_pos -= t >> 2;
28557 + m_pos -= *ip++ << 2;
28558 +#endif
28559 + TEST_LOOKBEHIND(m_pos, out);
28560 + NEED_OP(2);
28561 + *op++ = *m_pos++;
28562 + *op++ = *m_pos;
28563 +#endif
28564 + goto match_done;
28565 + }
28566 +
28567 +#if defined(COPY_DICT)
28568 +
28569 + NEED_OP(t + 3 - 1);
28570 + t += 3 - 1;
28571 + COPY_DICT(t, m_off)
28572 +#else
28573 +
28574 + TEST_LOOKBEHIND(m_pos, out);
28575 + assert("lzo-19", t > 0);
28576 + NEED_OP(t + 3 - 1);
28577 +#if defined(LZO_UNALIGNED_OK_4) || defined(LZO_ALIGNED_OK_4)
28578 +#if !defined(LZO_UNALIGNED_OK_4)
28579 + if (t >= 2 * 4 - (3 - 1) && PTR_ALIGNED2_4(op, m_pos)) {
28580 + assert((op - m_pos) >= 4);
28581 +#else
28582 + if (t >= 2 * 4 - (3 - 1) && (op - m_pos) >= 4) {
28583 +#endif
28584 + COPY4(op, m_pos);
28585 + op += 4;
28586 + m_pos += 4;
28587 + t -= 4 - (3 - 1);
28588 + do {
28589 + COPY4(op, m_pos);
28590 + op += 4;
28591 + m_pos += 4;
28592 + t -= 4;
28593 + } while (t >= 4);
28594 + if (t > 0)
28595 + do
28596 + *op++ = *m_pos++;
28597 + while (--t > 0);
28598 + } else
28599 +#endif
28600 + {
28601 + copy_match:
28602 + *op++ = *m_pos++;
28603 + *op++ = *m_pos++;
28604 + do
28605 + *op++ = *m_pos++;
28606 + while (--t > 0);
28607 + }
28608 +
28609 +#endif
28610 +
28611 + match_done:
28612 +#if defined(LZO1Z)
28613 + t = ip[-1] & 3;
28614 +#else
28615 + t = ip[-2] & 3;
28616 +#endif
28617 + if (t == 0)
28618 + break;
28619 +
28620 + match_next:
28621 + assert("lzo-20", t > 0);
28622 + NEED_OP(t);
28623 + NEED_IP(t + 1);
28624 + do
28625 + *op++ = *ip++;
28626 + while (--t > 0);
28627 + t = *ip++;
28628 + }
28629 + }
28630 +
28631 +#if defined(HAVE_TEST_IP) || defined(HAVE_TEST_OP)
28632 + *out_len = op - out;
28633 + return LZO_E_EOF_NOT_FOUND;
28634 +#endif
28635 +
28636 + eof_found:
28637 + assert("lzo-21", t == 1);
28638 + *out_len = op - out;
28639 + return (ip == ip_end ? LZO_E_OK :
28640 + (ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN));
28641 +
28642 +#if defined(HAVE_NEED_IP)
28643 + input_overrun:
28644 + *out_len = op - out;
28645 + return LZO_E_INPUT_OVERRUN;
28646 +#endif
28647 +
28648 +#if defined(HAVE_NEED_OP)
28649 + output_overrun:
28650 + *out_len = op - out;
28651 + return LZO_E_OUTPUT_OVERRUN;
28652 +#endif
28653 +
28654 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28655 + lookbehind_overrun:
28656 + *out_len = op - out;
28657 + return LZO_E_LOOKBEHIND_OVERRUN;
28658 +#endif
28659 +}
28660 +
28661 +#define LZO_TEST_DECOMPRESS_OVERRUN
28662 +#undef DO_DECOMPRESS
28663 +#define DO_DECOMPRESS lzo1x_decompress_safe
28664 +
28665 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN)
28666 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28667 +# define LZO_TEST_DECOMPRESS_OVERRUN_INPUT 2
28668 +# endif
28669 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28670 +# define LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT 2
28671 +# endif
28672 +# if !defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28673 +# define LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND
28674 +# endif
28675 +#endif
28676 +
28677 +#undef TEST_IP
28678 +#undef TEST_OP
28679 +#undef TEST_LOOKBEHIND
28680 +#undef NEED_IP
28681 +#undef NEED_OP
28682 +#undef HAVE_TEST_IP
28683 +#undef HAVE_TEST_OP
28684 +#undef HAVE_NEED_IP
28685 +#undef HAVE_NEED_OP
28686 +#undef HAVE_ANY_IP
28687 +#undef HAVE_ANY_OP
28688 +
28689 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_INPUT)
28690 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 1)
28691 +# define TEST_IP (ip < ip_end)
28692 +# endif
28693 +# if (LZO_TEST_DECOMPRESS_OVERRUN_INPUT >= 2)
28694 +# define NEED_IP(x) \
28695 + if ((lzo_uint)(ip_end - ip) < (lzo_uint)(x)) goto input_overrun
28696 +# endif
28697 +#endif
28698 +
28699 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT)
28700 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 1)
28701 +# define TEST_OP (op <= op_end)
28702 +# endif
28703 +# if (LZO_TEST_DECOMPRESS_OVERRUN_OUTPUT >= 2)
28704 +# undef TEST_OP
28705 +# define NEED_OP(x) \
28706 + if ((lzo_uint)(op_end - op) < (lzo_uint)(x)) goto output_overrun
28707 +# endif
28708 +#endif
28709 +
28710 +#if defined(LZO_TEST_DECOMPRESS_OVERRUN_LOOKBEHIND)
28711 +# define TEST_LOOKBEHIND(m_pos,out) if (m_pos < out) goto lookbehind_overrun
28712 +#else
28713 +# define TEST_LOOKBEHIND(m_pos,op) ((void) 0)
28714 +#endif
28715 +
28716 +#if !defined(LZO_EOF_CODE) && !defined(TEST_IP)
28717 +# define TEST_IP (ip < ip_end)
28718 +#endif
28719 +
28720 +#if defined(TEST_IP)
28721 +# define HAVE_TEST_IP
28722 +#else
28723 +# define TEST_IP 1
28724 +#endif
28725 +#if defined(TEST_OP)
28726 +# define HAVE_TEST_OP
28727 +#else
28728 +# define TEST_OP 1
28729 +#endif
28730 +
28731 +#if defined(NEED_IP)
28732 +# define HAVE_NEED_IP
28733 +#else
28734 +# define NEED_IP(x) ((void) 0)
28735 +#endif
28736 +#if defined(NEED_OP)
28737 +# define HAVE_NEED_OP
28738 +#else
28739 +# define NEED_OP(x) ((void) 0)
28740 +#endif
28741 +
28742 +#if defined(HAVE_TEST_IP) || defined(HAVE_NEED_IP)
28743 +# define HAVE_ANY_IP
28744 +#endif
28745 +#if defined(HAVE_TEST_OP) || defined(HAVE_NEED_OP)
28746 +# define HAVE_ANY_OP
28747 +#endif
28748 +
28749 +#undef __COPY4
28750 +#define __COPY4(dst,src) * (lzo_uint32p)(dst) = * (const lzo_uint32p)(src)
28751 +
28752 +#undef COPY4
28753 +#if defined(LZO_UNALIGNED_OK_4)
28754 +# define COPY4(dst,src) __COPY4(dst,src)
28755 +#elif defined(LZO_ALIGNED_OK_4)
28756 +# define COPY4(dst,src) __COPY4((lzo_ptr_t)(dst),(lzo_ptr_t)(src))
28757 +#endif
28758 +
28759 +/***** End of minilzo.c *****/
28760 diff --git a/fs/reiser4/plugin/compress/minilzo.h b/fs/reiser4/plugin/compress/minilzo.h
28761 new file mode 100644
28762 index 0000000..6a47001
28763 --- /dev/null
28764 +++ b/fs/reiser4/plugin/compress/minilzo.h
28765 @@ -0,0 +1,70 @@
28766 +/* minilzo.h -- mini subset of the LZO real-time data compression library
28767 + adapted for the reiser4 compression transform plugin.
28768 +
28769 + This file is part of the LZO real-time data compression library
28770 + and not included in any proprietary licenses of reiser4.
28771 +
28772 + Copyright (C) 2002 Markus Franz Xaver Johannes Oberhumer
28773 + Copyright (C) 2001 Markus Franz Xaver Johannes Oberhumer
28774 + Copyright (C) 2000 Markus Franz Xaver Johannes Oberhumer
28775 + Copyright (C) 1999 Markus Franz Xaver Johannes Oberhumer
28776 + Copyright (C) 1998 Markus Franz Xaver Johannes Oberhumer
28777 + Copyright (C) 1997 Markus Franz Xaver Johannes Oberhumer
28778 + Copyright (C) 1996 Markus Franz Xaver Johannes Oberhumer
28779 + All Rights Reserved.
28780 +
28781 + The LZO library is free software; you can redistribute it and/or
28782 + modify it under the terms of the GNU General Public License as
28783 + published by the Free Software Foundation; either version 2 of
28784 + the License, or (at your option) any later version.
28785 +
28786 + The LZO library is distributed in the hope that it will be useful,
28787 + but WITHOUT ANY WARRANTY; without even the implied warranty of
28788 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28789 + GNU General Public License for more details.
28790 +
28791 + You should have received a copy of the GNU General Public License
28792 + along with the LZO library; see the file COPYING.
28793 + If not, write to the Free Software Foundation, Inc.,
28794 + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
28795 +
28796 + Markus F.X.J. Oberhumer
28797 + <markus@oberhumer.com>
28798 + http://www.oberhumer.com/opensource/lzo/
28799 + */
28800 +
28801 +/*
28802 + * NOTE:
28803 + * the full LZO package can be found at
28804 + * http://www.oberhumer.com/opensource/lzo/
28805 + */
28806 +
28807 +#ifndef __MINILZO_H
28808 +#define __MINILZO_H
28809 +
28810 +#define MINILZO_VERSION 0x1080
28811 +
28812 +#include "lzoconf.h"
28813 +
28814 +/* Memory required for the wrkmem parameter.
28815 + * When the required size is 0, you can also pass a NULL pointer.
28816 + */
28817 +
28818 +#define LZO1X_MEM_COMPRESS LZO1X_1_MEM_COMPRESS
28819 +#define LZO1X_1_MEM_COMPRESS ((lzo_uint32) (16384L * lzo_sizeof_dict_t))
28820 +#define LZO1X_MEM_DECOMPRESS (0)
28821 +
28822 +/* compression */
28823 +extern int lzo1x_1_compress(const lzo_byte * src, lzo_uint src_len,
28824 + lzo_byte * dst, lzo_uintp dst_len,
28825 + lzo_voidp wrkmem);
28826 +/* decompression */
28827 +extern int lzo1x_decompress(const lzo_byte * src, lzo_uint src_len,
28828 + lzo_byte * dst, lzo_uintp dst_len,
28829 + lzo_voidp wrkmem /* NOT USED */);
28830 +/* safe decompression with overrun testing */
28831 +extern int lzo1x_decompress_safe(const lzo_byte * src, lzo_uint src_len,
28832 + lzo_byte * dst, lzo_uintp dst_len,
28833 + lzo_voidp wrkmem /* NOT USED */ );
28834 +
28835 +#endif /* already included */
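/* A minimal round-trip sketch for these entry points (not part of the
 * patch; src, out and SRC_LEN are hypothetical, and the dst sizing
 * follows the usual LZO worst-case rule of thumb):
 *
 *	lzo_byte *wrkmem = vmalloc(LZO1X_1_MEM_COMPRESS);
 *	lzo_byte dst[SRC_LEN + SRC_LEN / 16 + 64 + 3];
 *	lzo_uint dst_len, out_len = SRC_LEN;
 *
 *	if (lzo1x_1_compress(src, SRC_LEN, dst, &dst_len, wrkmem) == LZO_E_OK &&
 *	    lzo1x_decompress_safe(dst, dst_len, out, &out_len,
 *				  NULL) == LZO_E_OK)
 *		;	out_len now equals SRC_LEN, out holds the data
 *
 * Note that out_len must be initialized to the output buffer size,
 * since lzo1x_decompress_safe() uses it for its overrun tests; plain
 * lzo1x_decompress() skips those tests and must only be used on
 * trusted input.
 */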
28836 diff --git a/fs/reiser4/plugin/crypto/cipher.c b/fs/reiser4/plugin/crypto/cipher.c
28837 new file mode 100644
28838 index 0000000..e918154
28839 --- /dev/null
28840 +++ b/fs/reiser4/plugin/crypto/cipher.c
28841 @@ -0,0 +1,37 @@
28842 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
28843 + licensing governed by reiser4/README */
28844 +/* Reiser4 cipher transform plugins */
28845 +
28846 +#include "../../debug.h"
28847 +#include "../plugin.h"
28848 +
28849 +cipher_plugin cipher_plugins[LAST_CIPHER_ID] = {
28850 + [NONE_CIPHER_ID] = {
28851 + .h = {
28852 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
28853 + .id = NONE_CIPHER_ID,
28854 + .pops = NULL,
28855 + .label = "none",
28856 + .desc = "no cipher transform",
28857 + .linkage = {NULL, NULL}
28858 + },
28859 + .alloc = NULL,
28860 + .free = NULL,
28861 + .scale = NULL,
28862 + .align_stream = NULL,
28863 + .setkey = NULL,
28864 + .encrypt = NULL,
28865 + .decrypt = NULL
28866 + }
28867 +};
28868 +
28869 +/* Make Linus happy.
28870 + Local variables:
28871 + c-indentation-style: "K&R"
28872 + mode-name: "LC"
28873 + c-basic-offset: 8
28874 + tab-width: 8
28875 + fill-column: 120
28876 + scroll-step: 1
28877 + End:
28878 +*/
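/* Only the no-op NONE_CIPHER_ID entry is provided. For reference, a
 * hypothetical AES entry against the 2.6.20 crypto API -- a sketch,
 * not something this patch supplies -- would fill the alloc/free
 * hooks roughly as follows:
 *
 *	static struct crypto_blkcipher *alloc_aes(void)
 *	{
 *		return crypto_alloc_blkcipher("cbc(aes)", 0,
 *					      CRYPTO_ALG_ASYNC);
 *	}
 *
 *	static void free_aes(struct crypto_blkcipher *tfm)
 *	{
 *		crypto_free_blkcipher(tfm);
 *	}
 */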
28879 diff --git a/fs/reiser4/plugin/crypto/cipher.h b/fs/reiser4/plugin/crypto/cipher.h
28880 new file mode 100644
28881 index 0000000..e896c67
28882 --- /dev/null
28883 +++ b/fs/reiser4/plugin/crypto/cipher.h
28884 @@ -0,0 +1,55 @@
28885 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28886 +/* This file contains definitions for the objects operated on
28887 + by the reiser4 key manager, which is something like a keyring
28888 + wrapped by an appropriate reiser4 plugin */
28889 +
28890 +#if !defined( __FS_REISER4_CRYPT_H__ )
28891 +#define __FS_REISER4_CRYPT_H__
28892 +
28893 +#include <linux/crypto.h>
28894 +
28895 +/* key info imported from user space */
28896 +typedef struct crypto_data {
28897 + int keysize; /* uninstantiated key size */
28898 + __u8 * key; /* uninstantiated key */
28899 + int keyid_size; /* size of passphrase */
28900 + __u8 * keyid; /* passphrase */
28901 +} crypto_data_t;
28902 +
28903 +/* This object contains all the infrastructure needed to implement a
28904 + cipher transform. It is managed (allocated, inherited, validated,
28905 + bound to a host inode, etc.) by the reiser4 key manager.
28906 +
28907 + This info can be allocated in two cases:
28908 + 1. importing a key from user space.
28909 + 2. reading inode from disk */
28910 +typedef struct crypto_stat {
28911 + struct inode * host;
28912 + struct crypto_hash * digest;
28913 + struct crypto_blkcipher * cipher;
28914 +#if 0
28915 + cipher_key_plugin * kplug; /* key manager */
28916 +#endif
28917 + __u8 * keyid; /* key fingerprint, created by digest plugin,
28918 + using uninstantiated key and passphrase.
28919 + supposed to be stored in disk stat-data */
28920 + int inst; /* this indicates if the cipher key is
28921 + instantiated (case 1 above) */
28922 + int keysize; /* uninstantiated key size (bytes), supposed
28923 + to be stored in disk stat-data */
28924 + int keyload_count; /* number of objects which have this
28925 + crypto-stat attached */
28926 +} crypto_stat_t;
28927 +
28928 +#endif /* __FS_REISER4_CRYPT_H__ */
28929 +
28930 +/*
28931 + Local variables:
28932 + c-indentation-style: "K&R"
28933 + mode-name: "LC"
28934 + c-basic-offset: 8
28935 + tab-width: 8
28936 + fill-column: 120
28937 + scroll-step: 1
28938 + End:
28939 +*/
28940 diff --git a/fs/reiser4/plugin/crypto/digest.c b/fs/reiser4/plugin/crypto/digest.c
28941 new file mode 100644
28942 index 0000000..7508917
28943 --- /dev/null
28944 +++ b/fs/reiser4/plugin/crypto/digest.c
28945 @@ -0,0 +1,58 @@
28946 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
28947 +
28948 +/* reiser4 digest transform plugin (is used by cryptcompress object plugin) */
28949 +/* EDWARD-FIXME-HANS: and it does what? a digest is a what? */
28950 +#include "../../debug.h"
28951 +#include "../plugin_header.h"
28952 +#include "../plugin.h"
28953 +#include "../file/cryptcompress.h"
28954 +
28955 +#include <linux/types.h>
28956 +
28957 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
28958 +
28959 +static struct crypto_hash * alloc_sha256 (void)
28960 +{
28961 +#if REISER4_SHA256
28962 + return crypto_alloc_hash ("sha256", 0, CRYPTO_ALG_ASYNC);
28963 +#else
28964 + warning("edward-1418", "sha256 unsupported");
28965 + return ERR_PTR(-EINVAL);
28966 +#endif
28967 +}
28968 +
28969 +static void free_sha256 (struct crypto_hash * tfm)
28970 +{
28971 +#if REISER4_SHA256
28972 + crypto_free_hash(tfm);
28973 +#endif
28974 + return;
28975 +}
28976 +
28977 +/* digest plugins */
28978 +digest_plugin digest_plugins[LAST_DIGEST_ID] = {
28979 + [SHA256_32_DIGEST_ID] = {
28980 + .h = {
28981 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
28982 + .id = SHA256_32_DIGEST_ID,
28983 + .pops = NULL,
28984 + .label = "sha256_32",
28985 + .desc = "sha256_32 digest transform",
28986 + .linkage = {NULL, NULL}
28987 + },
28988 + .fipsize = sizeof(__u32),
28989 + .alloc = alloc_sha256,
28990 + .free = free_sha256
28991 + }
28992 +};
28993 +
28994 +/*
28995 + Local variables:
28996 + c-indentation-style: "K&R"
28997 + mode-name: "LC"
28998 + c-basic-offset: 8
28999 + tab-width: 8
29000 + fill-column: 120
29001 + scroll-step: 1
29002 + End:
29003 +*/
29004 diff --git a/fs/reiser4/plugin/dir/Makefile b/fs/reiser4/plugin/dir/Makefile
29005 new file mode 100644
29006 index 0000000..ed370b1
29007 --- /dev/null
29008 +++ b/fs/reiser4/plugin/dir/Makefile
29009 @@ -0,0 +1,5 @@
29010 +obj-$(CONFIG_REISER4_FS) += dir_plugins.o
29011 +
29012 +dir_plugins-objs := \
29013 + hashed_dir.o \
29014 + seekable_dir.o
29015 diff --git a/fs/reiser4/plugin/dir/dir.h b/fs/reiser4/plugin/dir/dir.h
29016 new file mode 100644
29017 index 0000000..4a91ebe
29018 --- /dev/null
29019 +++ b/fs/reiser4/plugin/dir/dir.h
29020 @@ -0,0 +1,36 @@
29021 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
29022 + * reiser4/README */
29023 +
29024 +/* this file contains declarations of methods implementing directory plugins */
29025 +
29026 +#if !defined( __REISER4_DIR_H__ )
29027 +#define __REISER4_DIR_H__
29028 +
29029 +/*#include "../../key.h"
29030 +
29031 +#include <linux/fs.h>*/
29032 +
29033 +/* declarations of functions implementing HASHED_DIR_PLUGIN_ID dir plugin */
29034 +
29035 +/* "hashed" directory methods of dir plugin */
29036 +void build_entry_key_hashed(const struct inode *, const struct qstr *,
29037 + reiser4_key *);
29038 +
29039 +/* declarations of functions implementing SEEKABLE_HASHED_DIR_PLUGIN_ID dir plugin */
29040 +
29041 +/* "seekable" directory methods of dir plugin */
29042 +void build_entry_key_seekable(const struct inode *, const struct qstr *,
29043 + reiser4_key *);
29044 +
29045 +/* __REISER4_DIR_H__ */
29046 +#endif
29047 +
29048 +/*
29049 + Local variables:
29050 + c-indentation-style: "K&R"
29051 + mode-name: "LC"
29052 + c-basic-offset: 8
29053 + tab-width: 8
29054 + fill-column: 120
29055 + End:
29056 +*/
29057 diff --git a/fs/reiser4/plugin/dir/hashed_dir.c b/fs/reiser4/plugin/dir/hashed_dir.c
29058 new file mode 100644
29059 index 0000000..0f34824
29060 --- /dev/null
29061 +++ b/fs/reiser4/plugin/dir/hashed_dir.c
29062 @@ -0,0 +1,81 @@
29063 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
29064 + * reiser4/README */
29065 +
29066 +/* Directory plugin using hashes (see fs/reiser4/plugin/hash.c) to map file
29067 + names to the files. */
29068 +
29069 +/*
29070 + * A hashed directory logically consists of persistent directory
29071 + * entries. A directory entry is a pair of a file name and the stat-data
29072 + * key of the file that has this name in the given directory.
29073 + *
29074 + * Directory entries are stored in the tree in the form of directory
29075 + * items. A directory item should implement the dir_entry_ops portion of the
29076 + * item plugin interface (see plugin/item/item.h). A hashed directory interacts
29077 + * with the directory item plugin exclusively through dir_entry_ops operations.
29078 + *
29079 + * Currently there are two implementations of directory items: "simple
29080 + * directory item" (plugin/item/sde.[ch]), and "compound directory item"
29081 + * (plugin/item/cde.[ch]) with the latter being the default.
29082 + *
29083 + * There is, however, one delicate way in which directory code interacts
29084 + * with the item plugin: key assignment policy. A key for a directory item is
29085 + * chosen by directory code, and as described in kassign.c, this key contains
29086 + * a portion of file name. Directory item uses this knowledge to avoid storing
29087 + * this portion of file name twice: in the key and in the directory item body.
29088 + *
29089 + */
29090 +
29091 +#include "../../inode.h"
29092 +
29093 +void complete_entry_key(const struct inode *, const char *name,
29094 + int len, reiser4_key * result);
29095 +
29096 +/* this is implementation of build_entry_key method of dir
29097 + plugin for HASHED_DIR_PLUGIN_ID
29098 + */
29099 +void build_entry_key_hashed(const struct inode *dir, /* directory the entry
29100 + * is (or will be) in */
29101 + const struct qstr *qname, /* name of file referenced
29102 + * by this entry */
29103 + reiser4_key * result /* resulting key of directory
29104 + * entry */ )
29105 +{
29106 + const char *name;
29107 + int len;
29108 +
29109 + assert("nikita-1139", dir != NULL);
29110 + assert("nikita-1140", qname != NULL);
29111 + assert("nikita-1141", qname->name != NULL);
29112 + assert("nikita-1142", result != NULL);
29113 +
29114 + name = qname->name;
29115 + len = qname->len;
29116 +
29117 + assert("nikita-2867", strlen(name) == len);
29118 +
29119 + reiser4_key_init(result);
29120 + /* locality of directory entry's key is objectid of parent
29121 + directory */
29122 + set_key_locality(result, get_inode_oid(dir));
29123 + /* minor packing locality is constant */
29124 + set_key_type(result, KEY_FILE_NAME_MINOR);
29125 + /* dot is a special case---we always want it to be the first entry
29126 + in a directory. Actually, we just want it to be the smallest
29127 + directory entry.
29128 + */
29129 + if (len == 1 && name[0] == '.')
29130 + return;
29131 +
29132 + /* initialize part of entry key which depends on file name */
29133 + complete_entry_key(dir, name, len, result);
29134 +}
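/* The resulting key layout (see kassign.c for the exact packing done
 * by complete_entry_key()):
 *
 *	locality	objectid of the parent directory
 *	type		KEY_FILE_NAME_MINOR
 *	remainder	hash of the name plus as many leading name
 *			characters as fit, so that short names need not
 *			be stored again in the item body
 */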
29135 +
29136 +/* Local variables:
29137 + c-indentation-style: "K&R"
29138 + mode-name: "LC"
29139 + c-basic-offset: 8
29140 + tab-width: 8
29141 + fill-column: 120
29142 + End:
29143 +*/
29144 diff --git a/fs/reiser4/plugin/dir/seekable_dir.c b/fs/reiser4/plugin/dir/seekable_dir.c
29145 new file mode 100644
29146 index 0000000..c1c6c4c
29147 --- /dev/null
29148 +++ b/fs/reiser4/plugin/dir/seekable_dir.c
29149 @@ -0,0 +1,46 @@
29150 +/* Copyright 2005 by Hans Reiser, licensing governed by
29151 + * reiser4/README */
29152 +
29153 +#include "../../inode.h"
29154 +
29155 +/* this is implementation of build_entry_key method of dir
29156 + plugin for SEEKABLE_HASHED_DIR_PLUGIN_ID
29157 + This is for directories where we want repeatable and restartable readdir()
29158 + even in the case of a 32-bit user-level struct dirent (readdir(3)).
29159 +*/
29160 +void
29161 +build_entry_key_seekable(const struct inode *dir, const struct qstr *name,
29162 + reiser4_key * result)
29163 +{
29164 + oid_t objectid;
29165 +
29166 + assert("nikita-2283", dir != NULL);
29167 + assert("nikita-2284", name != NULL);
29168 + assert("nikita-2285", name->name != NULL);
29169 + assert("nikita-2286", result != NULL);
29170 +
29171 + reiser4_key_init(result);
29172 + /* locality of directory entry's key is objectid of parent
29173 + directory */
29174 + set_key_locality(result, get_inode_oid(dir));
29175 + /* minor packing locality is constant */
29176 + set_key_type(result, KEY_FILE_NAME_MINOR);
29177 + /* dot is a special case---we always want it to be the first entry
29178 + in a directory. Actually, we just want it to be the smallest
29179 + directory entry.
29180 + */
29181 + if ((name->len == 1) && (name->name[0] == '.'))
29182 + return;
29183 +
29184 + /* objectid of key is 31 lowest bits of hash. */
29185 + objectid =
29186 + inode_hash_plugin(dir)->hash(name->name,
29187 + (int)name->len) & 0x7fffffff;
29188 +
29189 + assert("nikita-2303", !(objectid & ~KEY_OBJECTID_MASK));
29190 + set_key_objectid(result, objectid);
29191 +
29192 + /* offset is always 0. */
29193 + set_key_offset(result, (__u64) 0);
29194 + return;
29195 +}
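/* Because the objectid is a pure 31-bit hash of the name and the
 * offset is always 0, the same name always maps to the same key, and
 * the key fits into the 32-bit directory offsets visible to legacy
 * 32-bit struct dirent users -- which is what makes readdir() over
 * such directories repeatable and restartable for those clients.
 */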
29196 diff --git a/fs/reiser4/plugin/dir_plugin_common.c b/fs/reiser4/plugin/dir_plugin_common.c
29197 new file mode 100644
29198 index 0000000..f5e1028
29199 --- /dev/null
29200 +++ b/fs/reiser4/plugin/dir_plugin_common.c
29201 @@ -0,0 +1,872 @@
29202 +/* Copyright 2005 by Hans Reiser, licensing governed by
29203 + reiser4/README */
29204 +
29205 +/* this file contains typical implementations for most of the methods
29206 + of the directory plugin
29207 +*/
29208 +
29209 +#include "../inode.h"
29210 +
29211 +int reiser4_find_entry(struct inode *dir, struct dentry *name,
29212 + lock_handle *, znode_lock_mode, reiser4_dir_entry_desc *);
29213 +int reiser4_lookup_name(struct inode *parent, struct dentry *dentry, reiser4_key * key);
29214 +void check_light_weight(struct inode *inode, struct inode *parent);
29215 +
29216 +/* this is the common implementation of the get_parent method of the dir
29217 + plugin; it is used by the NFS kernel server to "climb" up the directory
29218 + tree to check permissions
29219 + */
29220 +struct dentry *get_parent_common(struct inode *child)
29221 +{
29222 + struct super_block *s;
29223 + struct inode *parent;
29224 + struct dentry dotdot;
29225 + struct dentry *dentry;
29226 + reiser4_key key;
29227 + int result;
29228 +
29229 + /*
29230 + * lookup dotdot entry.
29231 + */
29232 +
29233 + s = child->i_sb;
29234 + memset(&dotdot, 0, sizeof(dotdot));
29235 + dotdot.d_name.name = "..";
29236 + dotdot.d_name.len = 2;
29237 + dotdot.d_op = &get_super_private(s)->ops.dentry;
29238 +
29239 + result = reiser4_lookup_name(child, &dotdot, &key);
29240 + if (result != 0)
29241 + return ERR_PTR(result);
29242 +
29243 + parent = reiser4_iget(s, &key, 1);
29244 + if (!IS_ERR(parent)) {
29245 + /*
29246 + * FIXME-NIKITA dubious: attributes are inherited from @child
29247 + * to @parent. But:
29248 + *
29249 + * (*) this is the only thing we can do
29250 + *
29251 + * (*) attributes of light-weight object are inherited
29252 + * from a parent through which object was looked up first,
29253 + * so it is ambiguous anyway.
29254 + *
29255 + */
29256 + check_light_weight(parent, child);
29257 + reiser4_iget_complete(parent);
29258 + dentry = d_alloc_anon(parent);
29259 + if (dentry == NULL) {
29260 + iput(parent);
29261 + dentry = ERR_PTR(RETERR(-ENOMEM));
29262 + } else
29263 + dentry->d_op = &get_super_private(s)->ops.dentry;
29264 + } else if (PTR_ERR(parent) == -ENOENT)
29265 + dentry = ERR_PTR(RETERR(-ESTALE));
29266 + else
29267 + dentry = (void *)parent;
29268 + return dentry;
29269 +}
29270 +
29271 +/* this is common implementation of is_name_acceptable method of dir
29272 + plugin
29273 + */
29274 +int is_name_acceptable_common(const struct inode *inode, /* directory to check */
29275 + const char *name UNUSED_ARG, /* name to check */
29276 + int len /* @name's length */ )
29277 +{
29278 + assert("nikita-733", inode != NULL);
29279 + assert("nikita-734", name != NULL);
29280 + assert("nikita-735", len > 0);
29281 +
29282 + return len <= reiser4_max_filename_len(inode);
29283 +}
29284 +
29285 +/* there is no common implementation of build_entry_key method of dir
29286 + plugin. See plugin/dir/hashed_dir.c:build_entry_key_hashed() or
29287 + plugin/dir/seekable_dir.c:build_entry_key_seekable() for an example
29288 +*/
29289 +
29290 +/* this is common implementation of build_readdir_key method of dir
29291 + plugin
29292 + see reiser4_readdir_common for more details
29293 +*/
29294 +int build_readdir_key_common(struct file *dir /* directory being read */ ,
29295 + reiser4_key * result /* where to store key */ )
29296 +{
29297 + reiser4_file_fsdata *fdata;
29298 + struct inode *inode;
29299 +
29300 + assert("nikita-1361", dir != NULL);
29301 + assert("nikita-1362", result != NULL);
29302 + assert("nikita-1363", dir->f_dentry != NULL);
29303 + inode = dir->f_dentry->d_inode;
29304 + assert("nikita-1373", inode != NULL);
29305 +
29306 + fdata = reiser4_get_file_fsdata(dir);
29307 + if (IS_ERR(fdata))
29308 + return PTR_ERR(fdata);
29309 + assert("nikita-1364", fdata != NULL);
29310 + return extract_key_from_de_id(get_inode_oid(inode),
29311 + &fdata->dir.readdir.position.
29312 + dir_entry_key, result);
29313 +
29314 +}
29315 +
29316 +void reiser4_adjust_dir_file(struct inode *, const struct dentry *, int offset,
29317 + int adj);
29318 +
29319 +/* this is common implementation of add_entry method of dir plugin
29320 +*/
29321 +int reiser4_add_entry_common(struct inode *object, /* directory to add new name
29322 + * in */
29323 + struct dentry *where, /* new name */
29324 + reiser4_object_create_data * data, /* parameters of
29325 + * new object */
29326 + reiser4_dir_entry_desc * entry /* parameters of
29327 + * new directory
29328 + * entry */)
29329 +{
29330 + int result;
29331 + coord_t *coord;
29332 + lock_handle lh;
29333 + reiser4_dentry_fsdata *fsdata;
29334 + reiser4_block_nr reserve;
29335 +
29336 + assert("nikita-1114", object != NULL);
29337 + assert("nikita-1250", where != NULL);
29338 +
29339 + fsdata = reiser4_get_dentry_fsdata(where);
29340 + if (unlikely(IS_ERR(fsdata)))
29341 + return PTR_ERR(fsdata);
29342 +
29343 + reserve = inode_dir_plugin(object)->estimate.add_entry(object);
29344 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29345 + return RETERR(-ENOSPC);
29346 +
29347 + init_lh(&lh);
29348 + coord = &fsdata->dec.entry_coord;
29349 + coord_clear_iplug(coord);
29350 +
29351 + /* check for this entry in a directory. This is plugin method. */
29352 + result = reiser4_find_entry(object, where, &lh, ZNODE_WRITE_LOCK,
29353 + entry);
29354 + if (likely(result == -ENOENT)) {
29355 + /* add new entry. Just pass control to the directory
29356 + item plugin. */
29357 + assert("nikita-1709", inode_dir_item_plugin(object));
29358 + assert("nikita-2230", coord->node == lh.node);
29359 + reiser4_seal_done(&fsdata->dec.entry_seal);
29360 + result =
29361 + inode_dir_item_plugin(object)->s.dir.add_entry(object,
29362 + coord, &lh,
29363 + where,
29364 + entry);
29365 + if (result == 0) {
29366 + reiser4_adjust_dir_file(object, where,
29367 + fsdata->dec.pos + 1, +1);
29368 + INODE_INC_FIELD(object, i_size);
29369 + }
29370 + } else if (result == 0) {
29371 + assert("nikita-2232", coord->node == lh.node);
29372 + result = RETERR(-EEXIST);
29373 + }
29374 + done_lh(&lh);
29375 +
29376 + return result;
29377 +}
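reiser4_add_entry_common() follows a reserve-then-modify discipline: ask the dir plugin's estimate.add_entry() for a worst-case block count, grab that much space up front (failing with -ENOSPC before anything is touched), and only then search and insert. A loose userspace analogy of the pattern, with made-up counters standing in for the real grabbed-space machinery:

#include <stdio.h>

static long free_blocks = 100;	/* made-up counters, not reiser4 state */
static long grabbed;

static int grab_space(long want)
{
	if (free_blocks - grabbed < want)
		return -1;	/* the kernel code returns -ENOSPC here */
	grabbed += want;
	return 0;
}

static void use_blocks(long n)
{
	/* consume part of the reservation; anything left over is
	 * released when the transaction ends */
	free_blocks -= n;
	grabbed -= n;
}

int main(void)
{
	long estimate = 3;	/* worst case for one insertion */

	if (grab_space(estimate))
		return 1;	/* nothing was modified yet */
	use_blocks(1);		/* actual cost may be below the estimate */
	printf("free=%ld grabbed=%ld\n", free_blocks, grabbed);
	return 0;
}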
29378 +
29379 +/**
29380 + * rem_entry - remove entry from directory item
29381 + * @dir:
29382 + * @dentry:
29383 + * @entry:
29384 + * @coord:
29385 + * @lh:
29386 + *
29387 + * Checks that coordinate @coord is set properly and calls item plugin
29388 + * method to cut entry.
29389 + */
29390 +static int
29391 +rem_entry(struct inode *dir, struct dentry *dentry,
29392 + reiser4_dir_entry_desc * entry, coord_t * coord, lock_handle * lh)
29393 +{
29394 + item_plugin *iplug;
29395 + struct inode *child;
29396 +
29397 + iplug = inode_dir_item_plugin(dir);
29398 + child = dentry->d_inode;
29399 + assert("nikita-3399", child != NULL);
29400 +
29401 + /* check that we are really destroying an entry for @child */
29402 + if (REISER4_DEBUG) {
29403 + int result;
29404 + reiser4_key key;
29405 +
29406 + result = iplug->s.dir.extract_key(coord, &key);
29407 + if (result != 0)
29408 + return result;
29409 + if (get_key_objectid(&key) != get_inode_oid(child)) {
29410 + warning("nikita-3397",
29411 + "rem_entry: %#llx != %#llx\n",
29412 + (unsigned long long)get_key_objectid(&key),
29413 + (unsigned long long)get_inode_oid(child));
29414 + return RETERR(-EIO);
29415 + }
29416 + }
29417 + return iplug->s.dir.rem_entry(dir, &dentry->d_name, coord, lh, entry);
29418 +}
29419 +
29420 +/**
29421 + * reiser4_rem_entry_common - remove entry from a directory
29422 + * @dir: directory to remove entry from
29423 + * @where: name that is being removed
29424 + * @entry: description of entry being removed
29425 + *
29426 + * This is common implementation of rem_entry method of dir plugin.
29427 + */
29428 +int reiser4_rem_entry_common(struct inode *dir,
29429 + struct dentry *dentry,
29430 + reiser4_dir_entry_desc *entry)
29431 +{
29432 + int result;
29433 + coord_t *coord;
29434 + lock_handle lh;
29435 + reiser4_dentry_fsdata *fsdata;
29436 + __u64 tograb;
29437 +
29438 + assert("nikita-1124", dir != NULL);
29439 + assert("nikita-1125", dentry != NULL);
29440 +
29441 + tograb = inode_dir_plugin(dir)->estimate.rem_entry(dir);
29442 + result = reiser4_grab_space(tograb, BA_CAN_COMMIT | BA_RESERVED);
29443 + if (result != 0)
29444 + return RETERR(-ENOSPC);
29445 +
29446 + init_lh(&lh);
29447 +
29448 + /* check for this entry in a directory. This is plugin method. */
29449 + result = reiser4_find_entry(dir, dentry, &lh, ZNODE_WRITE_LOCK, entry);
29450 + fsdata = reiser4_get_dentry_fsdata(dentry);
29451 + if (IS_ERR(fsdata)) {
29452 + done_lh(&lh);
29453 + return PTR_ERR(fsdata);
29454 + }
29455 +
29456 + coord = &fsdata->dec.entry_coord;
29457 +
29458 + assert("nikita-3404",
29459 + get_inode_oid(dentry->d_inode) != get_inode_oid(dir) ||
29460 + dir->i_size <= 1);
29461 +
29462 + coord_clear_iplug(coord);
29463 + if (result == 0) {
29464 + /* remove entry. Just pass control to the directory item
29465 + plugin. */
29466 + assert("vs-542", inode_dir_item_plugin(dir));
29467 + reiser4_seal_done(&fsdata->dec.entry_seal);
29468 + reiser4_adjust_dir_file(dir, dentry, fsdata->dec.pos, -1);
29469 + result =
29470 + WITH_COORD(coord,
29471 + rem_entry(dir, dentry, entry, coord, &lh));
29472 + if (result == 0) {
29473 + if (dir->i_size >= 1)
29474 + INODE_DEC_FIELD(dir, i_size);
29475 + else {
29476 + warning("nikita-2509", "Dir %llu is runt",
29477 + (unsigned long long)
29478 + get_inode_oid(dir));
29479 + result = RETERR(-EIO);
29480 + }
29481 +
29482 + assert("nikita-3405", dentry->d_inode->i_nlink != 1 ||
29483 + dentry->d_inode->i_size != 2 ||
29484 + inode_dir_plugin(dentry->d_inode) == NULL);
29485 + }
29486 + }
29487 + done_lh(&lh);
29488 +
29489 + return result;
29490 +}
29491 +
29492 +static reiser4_block_nr estimate_init(struct inode *parent,
29493 + struct inode *object);
29494 +static int create_dot_dotdot(struct inode *object, struct inode *parent);
29495 +
29496 +/* this is common implementation of init method of dir plugin
29497 + create "." and ".." entries
29498 +*/
29499 +int reiser4_dir_init_common(struct inode *object, /* new directory */
29500 + struct inode *parent, /* parent directory */
29501 + reiser4_object_create_data * data /* info passed
29502 + * to us, this
29503 + * is filled by
29504 + * reiser4()
29505 + * syscall in
29506 + * particular */)
29507 +{
29508 + reiser4_block_nr reserve;
29509 +
29510 + assert("nikita-680", object != NULL);
29511 + assert("nikita-681", S_ISDIR(object->i_mode));
29512 + assert("nikita-682", parent != NULL);
29513 + assert("nikita-684", data != NULL);
29514 + assert("nikita-686", data->id == DIRECTORY_FILE_PLUGIN_ID);
29515 + assert("nikita-687", object->i_mode & S_IFDIR);
29516 +
29517 + reserve = estimate_init(parent, object);
29518 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
29519 + return RETERR(-ENOSPC);
29520 +
29521 + return create_dot_dotdot(object, parent);
29522 +}
29523 +
29524 +/* this is common implementation of done method of dir plugin
29525 + remove "." entry
29526 +*/
29527 +int reiser4_dir_done_common(struct inode *object /* object being deleted */ )
29528 +{
29529 + int result;
29530 + reiser4_block_nr reserve;
29531 + struct dentry goodby_dots;
29532 + reiser4_dir_entry_desc entry;
29533 +
29534 + assert("nikita-1449", object != NULL);
29535 +
29536 + if (reiser4_inode_get_flag(object, REISER4_NO_SD))
29537 + return 0;
29538 +
29539 + /* of course, this can be rewritten to sweep everything in one
29540 + reiser4_cut_tree(). */
29541 + memset(&entry, 0, sizeof entry);
29542 +
29543 + /* FIXME: this done method is called from reiser4_delete_dir_common which
29544 + * reserved space already */
29545 + reserve = inode_dir_plugin(object)->estimate.rem_entry(object);
29546 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT | BA_RESERVED))
29547 + return RETERR(-ENOSPC);
29548 +
29549 + memset(&goodby_dots, 0, sizeof goodby_dots);
29550 + entry.obj = goodby_dots.d_inode = object;
29551 + goodby_dots.d_name.name = ".";
29552 + goodby_dots.d_name.len = 1;
29553 + result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29554 + reiser4_free_dentry_fsdata(&goodby_dots);
29555 + if (unlikely(result != 0 && result != -ENOMEM && result != -ENOENT))
29556 + /* only worth a warning
29557 +
29558 + "values of \ eB\ f will give rise to dom!\n"
29559 + -- v6src/s2/mv.c:89
29560 + */
29561 + warning("nikita-2252", "Cannot remove dot of %lli: %i",
29562 + (unsigned long long)get_inode_oid(object), result);
29563 + return 0;
29564 +}
29565 +
29566 +/* this is common implementation of attach method of dir plugin
29567 +*/
29568 +int reiser4_attach_common(struct inode *child UNUSED_ARG,
29569 + struct inode *parent UNUSED_ARG)
29570 +{
29571 + assert("nikita-2647", child != NULL);
29572 + assert("nikita-2648", parent != NULL);
29573 +
29574 + return 0;
29575 +}
29576 +
29577 +/* this is common implementation of detach method of dir plugin
29578 + remove "..", decrease nlink on parent
29579 +*/
29580 +int reiser4_detach_common(struct inode *object, struct inode *parent)
29581 +{
29582 + int result;
29583 + struct dentry goodby_dots;
29584 + reiser4_dir_entry_desc entry;
29585 +
29586 + assert("nikita-2885", object != NULL);
29587 + assert("nikita-2886", !reiser4_inode_get_flag(object, REISER4_NO_SD));
29588 +
29589 + memset(&entry, 0, sizeof entry);
29590 +
29591 + /* NOTE-NIKITA this only works if @parent is -the- parent of
29592 + @object, viz. object whose key is stored in dotdot
29593 + entry. Wouldn't work with hard-links on directories. */
29594 + memset(&goodby_dots, 0, sizeof goodby_dots);
29595 + entry.obj = goodby_dots.d_inode = parent;
29596 + goodby_dots.d_name.name = "..";
29597 + goodby_dots.d_name.len = 2;
29598 + result = reiser4_rem_entry_common(object, &goodby_dots, &entry);
29599 + reiser4_free_dentry_fsdata(&goodby_dots);
29600 + if (result == 0) {
29601 + /* the dot should be the only entry remaining at this time... */
29602 + assert("nikita-3400",
29603 + object->i_size == 1 && object->i_nlink <= 2);
29604 +#if 0
29605 + /* and, together with the only name a directory can have, they
29606 + * provide the last 2 remaining references. If we get
29607 + * here as part of error handling during mkdir, @object
29608 + * possibly has no name yet, so its nlink == 1. If we get here
29609 + * from rename (targeting an empty directory), it has no name
29610 + * any more, so its nlink == 1. */
29611 + assert("nikita-3401",
29612 + object->i_nlink == 2 || object->i_nlink == 1);
29613 +#endif
29614 +
29615 + /* decrement nlink of directory removed ".." pointed
29616 + to */
29617 + reiser4_del_nlink(parent, NULL, 0);
29618 + }
29619 + return result;
29620 +}
29621 +
29622 +/* this is common implementation of estimate.add_entry method of
29623 + dir plugin
29624 + estimation of adding entry which supposes that entry is inserting a
29625 + unit into item
29626 +*/
29627 +reiser4_block_nr estimate_add_entry_common(const struct inode * inode)
29628 +{
29629 + return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
29630 +}
29631 +
29632 +/* this is common implementation of estimate.rem_entry method of dir
29633 + plugin
29634 +*/
29635 +reiser4_block_nr estimate_rem_entry_common(const struct inode * inode)
29636 +{
29637 + return estimate_one_item_removal(reiser4_tree_by_inode(inode));
29638 +}
29639 +
29640 +/* this is common implementation of estimate.unlink method of dir
29641 + plugin
29642 +*/
29643 +reiser4_block_nr
29644 +dir_estimate_unlink_common(const struct inode * parent,
29645 + const struct inode * object)
29646 +{
29647 + reiser4_block_nr res;
29648 +
29649 + /* hashed_rem_entry(object) */
29650 + res = inode_dir_plugin(object)->estimate.rem_entry(object);
29651 + /* del_nlink(parent) */
29652 + res += 2 * inode_file_plugin(parent)->estimate.update(parent);
29653 +
29654 + return res;
29655 +}
29656 +
29657 +/*
29658 + * helper for inode_ops ->lookup() and dir plugin's ->get_parent()
29659 + * methods: if @inode is a light-weight file, setup its credentials
29660 + * that are not stored in the stat-data in this case
29661 + */
29662 +void check_light_weight(struct inode *inode, struct inode *parent)
29663 +{
29664 + if (reiser4_inode_get_flag(inode, REISER4_LIGHT_WEIGHT)) {
29665 + inode->i_uid = parent->i_uid;
29666 + inode->i_gid = parent->i_gid;
29667 + /* clear light-weight flag. If inode would be read by any
29668 + other name, [ug]id wouldn't change. */
29669 + reiser4_inode_clr_flag(inode, REISER4_LIGHT_WEIGHT);
29670 + }
29671 +}
29672 +
29673 +/* looks for the name specified in @dentry in directory @parent; if the name
29674 + is found, the key of the object the entry points to is stored in @key */
29675 +int reiser4_lookup_name(struct inode *parent, /* inode of directory to lookup for
29676 + * name in */
29677 + struct dentry *dentry, /* name to look for */
29678 + reiser4_key * key /* place to store key */ )
29679 +{
29680 + int result;
29681 + coord_t *coord;
29682 + lock_handle lh;
29683 + const char *name;
29684 + int len;
29685 + reiser4_dir_entry_desc entry;
29686 + reiser4_dentry_fsdata *fsdata;
29687 +
29688 + assert("nikita-1247", parent != NULL);
29689 + assert("nikita-1248", dentry != NULL);
29690 + assert("nikita-1123", dentry->d_name.name != NULL);
29691 + assert("vs-1486",
29692 + dentry->d_op == &get_super_private(parent->i_sb)->ops.dentry);
29693 +
29694 + name = dentry->d_name.name;
29695 + len = dentry->d_name.len;
29696 +
29697 + if (!inode_dir_plugin(parent)->is_name_acceptable(parent, name, len))
29698 + /* some arbitrary error code to return */
29699 + return RETERR(-ENAMETOOLONG);
29700 +
29701 + fsdata = reiser4_get_dentry_fsdata(dentry);
29702 + if (IS_ERR(fsdata))
29703 + return PTR_ERR(fsdata);
29704 +
29705 + coord = &fsdata->dec.entry_coord;
29706 + coord_clear_iplug(coord);
29707 + init_lh(&lh);
29708 +
29709 + /* find entry in a directory. This is plugin method. */
29710 + result = reiser4_find_entry(parent, dentry, &lh, ZNODE_READ_LOCK,
29711 + &entry);
29712 + if (result == 0) {
29713 + /* entry was found, extract object key from it. */
29714 + result =
29715 + WITH_COORD(coord,
29716 + item_plugin_by_coord(coord)->s.dir.
29717 + extract_key(coord, key));
29718 + }
29719 + done_lh(&lh);
29720 + return result;
29721 +
29722 +}
29723 +
29724 +/* helper for reiser4_dir_init_common(): estimate number of blocks to reserve */
29725 +static reiser4_block_nr
29726 +estimate_init(struct inode *parent, struct inode *object)
29727 +{
29728 + reiser4_block_nr res = 0;
29729 +
29730 + assert("vpf-321", parent != NULL);
29731 + assert("vpf-322", object != NULL);
29732 +
29733 + /* hashed_add_entry(object) */
29734 + res += inode_dir_plugin(object)->estimate.add_entry(object);
29735 + /* reiser4_add_nlink(object) */
29736 + res += inode_file_plugin(object)->estimate.update(object);
29737 + /* hashed_add_entry(object) */
29738 + res += inode_dir_plugin(object)->estimate.add_entry(object);
29739 + /* reiser4_add_nlink(parent) */
29740 + res += inode_file_plugin(parent)->estimate.update(parent);
29741 +
29742 + return res;
29743 +}
29744 +
29745 +/* helper function for reiser4_dir_init_common(). Create "." and ".." */
29746 +static int create_dot_dotdot(struct inode *object /* object to create dot and
29747 + * dotdot for */ ,
29748 + struct inode *parent /* parent of @object */)
29749 +{
29750 + int result;
29751 + struct dentry dots_entry;
29752 + reiser4_dir_entry_desc entry;
29753 +
29754 + assert("nikita-688", object != NULL);
29755 + assert("nikita-689", S_ISDIR(object->i_mode));
29756 + assert("nikita-691", parent != NULL);
29757 +
29758 + /* We store dot and dotdot as normal directory entries. This is
29759 + not necessary, because almost all information stored in them
29760 + is already in the stat-data of the directory; the only thing
29761 + missing is the objectid of the grand-parent directory, which can
29762 + easily be added there as an extension.
29763 +
29764 + But it is done this way because not storing dot
29765 + and dotdot would lead to the following complications:
29766 +
29767 + . special case handling in ->lookup().
29768 + . addition of another extension to the sd.
29769 + . dependency on key allocation policy for stat data.
29770 +
29771 + */
29772 +
29773 + memset(&entry, 0, sizeof entry);
29774 + memset(&dots_entry, 0, sizeof dots_entry);
29775 + entry.obj = dots_entry.d_inode = object;
29776 + dots_entry.d_name.name = ".";
29777 + dots_entry.d_name.len = 1;
29778 + result = reiser4_add_entry_common(object, &dots_entry, NULL, &entry);
29779 + reiser4_free_dentry_fsdata(&dots_entry);
29780 +
29781 + if (result == 0) {
29782 + result = reiser4_add_nlink(object, object, 0);
29783 + if (result == 0) {
29784 + entry.obj = dots_entry.d_inode = parent;
29785 + dots_entry.d_name.name = "..";
29786 + dots_entry.d_name.len = 2;
29787 + result = reiser4_add_entry_common(object,
29788 + &dots_entry, NULL, &entry);
29789 + reiser4_free_dentry_fsdata(&dots_entry);
29790 + /* if creation of ".." failed, iput() will delete
29791 + object with ".". */
29792 + if (result == 0) {
29793 + result = reiser4_add_nlink(parent, object, 0);
29794 + if (result != 0)
29795 + /*
29796 + * if we failed to bump i_nlink, try
29797 + * to remove ".."
29798 + */
29799 + reiser4_detach_common(object, parent);
29800 + }
29801 + }
29802 + }
29803 +
29804 + if (result != 0) {
29805 + /*
29806 + * in the case of an error, at least update the stat-data so that
29807 + * ->i_nlink updates are not left lingering.
29808 + */
29809 + reiser4_update_sd(object);
29810 + reiser4_update_sd(parent);
29811 + }
29812 +
29813 + return result;
29814 +}
29815 +
29816 +/*
29817 + * return 0 iff @coord contains a directory entry for the file with the name
29818 + * @name.
29819 + */
29820 +static int
29821 +check_item(const struct inode *dir, const coord_t * coord, const char *name)
29822 +{
29823 + item_plugin *iplug;
29824 + char buf[DE_NAME_BUF_LEN];
29825 +
29826 + iplug = item_plugin_by_coord(coord);
29827 + if (iplug == NULL) {
29828 + warning("nikita-1135", "Cannot get item plugin");
29829 + print_coord("coord", coord, 1);
29830 + return RETERR(-EIO);
29831 + } else if (item_id_by_coord(coord) !=
29832 + item_id_by_plugin(inode_dir_item_plugin(dir))) {
29833 + /* item id of the current item does not match the id of items
29834 + the directory is built of */
29835 + warning("nikita-1136", "Wrong item plugin");
29836 + print_coord("coord", coord, 1);
29837 + return RETERR(-EIO);
29838 + }
29839 + assert("nikita-1137", iplug->s.dir.extract_name);
29840 +
29841 + /* Compare name stored in this entry with name we are looking for.
29842 +
29843 + NOTE-NIKITA Here should go code for support of something like
29844 + unicode, code tables, etc.
29845 + */
29846 + return !!strcmp(name, iplug->s.dir.extract_name(coord, buf));
29847 +}
29848 +
29849 +static int
29850 +check_entry(const struct inode *dir, coord_t * coord, const struct qstr *name)
29851 +{
29852 + return WITH_COORD(coord, check_item(dir, coord, name->name));
29853 +}
29854 +
29855 +/*
29856 + * argument package used by entry_actor to scan entries with identical keys.
29857 + */
29858 +typedef struct entry_actor_args {
29859 + /* name we are looking for */
29860 + const char *name;
29861 + /* key of directory entry. entry_actor() scans through sequence of
29862 + * items/units having the same key */
29863 + reiser4_key *key;
29864 + /* how many entries with duplicate keys have been scanned so far. */
29865 + int non_uniq;
29866 +#if REISER4_USE_COLLISION_LIMIT
29867 + /* scan limit */
29868 + int max_non_uniq;
29869 +#endif
29870 + /* return parameter: set to true, if ->name wasn't found */
29871 + int not_found;
29872 + /* what type of lock to take when moving to the next node during
29873 + * scan */
29874 + znode_lock_mode mode;
29875 +
29876 + /* last coord that was visited during scan */
29877 + coord_t last_coord;
29878 + /* last node locked during scan */
29879 + lock_handle last_lh;
29880 + /* inode of directory */
29881 + const struct inode *inode;
29882 +} entry_actor_args;
29883 +
29884 +/* Function called by reiser4_find_entry() to look for given name
29885 + in the directory. */
29886 +static int entry_actor(reiser4_tree * tree UNUSED_ARG /* tree being scanned */ ,
29887 + coord_t * coord /* current coord */ ,
29888 + lock_handle * lh /* current lock handle */ ,
29889 + void *entry_actor_arg /* argument to scan */ )
29890 +{
29891 + reiser4_key unit_key;
29892 + entry_actor_args *args;
29893 +
29894 + assert("nikita-1131", tree != NULL);
29895 + assert("nikita-1132", coord != NULL);
29896 + assert("nikita-1133", entry_actor_arg != NULL);
29897 +
29898 + args = entry_actor_arg;
29899 + ++args->non_uniq;
29900 +#if REISER4_USE_COLLISION_LIMIT
29901 + if (args->non_uniq > args->max_non_uniq) {
29902 + args->not_found = 1;
29903 + /* hash collision overflow. */
29904 + return RETERR(-EBUSY);
29905 + }
29906 +#endif
29907 +
29908 + /*
29909 + * did we just reach the end of the sequence of items/units with
29910 + * identical keys?
29911 + */
29912 + if (!keyeq(args->key, unit_key_by_coord(coord, &unit_key))) {
29913 + assert("nikita-1791",
29914 + keylt(args->key, unit_key_by_coord(coord, &unit_key)));
29915 + args->not_found = 1;
29916 + args->last_coord.between = AFTER_UNIT;
29917 + return 0;
29918 + }
29919 +
29920 + coord_dup(&args->last_coord, coord);
29921 + /*
29922 + * did the scan just move to the next node?
29923 + */
29924 + if (args->last_lh.node != lh->node) {
29925 + int lock_result;
29926 +
29927 + /*
29928 + * if so, lock new node with the mode requested by the caller
29929 + */
29930 + done_lh(&args->last_lh);
29931 + assert("nikita-1896", znode_is_any_locked(lh->node));
29932 + lock_result = longterm_lock_znode(&args->last_lh, lh->node,
29933 + args->mode, ZNODE_LOCK_HIPRI);
29934 + if (lock_result != 0)
29935 + return lock_result;
29936 + }
29937 + return check_item(args->inode, coord, args->name);
29938 +}
29939 +
29940 +/* Look for given @name within directory @dir.
29941 +
29942 + This is called during lookup, creation and removal of directory
29943 + entries and from reiser4_rename_common().
29944 +
29945 + First calculate the key that the directory entry for @name would have.
29946 + Search for this key in the tree. If such a key is found, scan all items
29947 + with the same key, checking the name in each directory entry along the way.
29948 +*/
29949 +int reiser4_find_entry(struct inode *dir, /* directory to scan */
29950 + struct dentry *de, /* name to search for */
29951 + lock_handle * lh, /* resulting lock handle */
29952 + znode_lock_mode mode, /* required lock mode */
29953 + reiser4_dir_entry_desc * entry /* parameters of found
29954 + directory entry */)
29955 +{
29956 + const struct qstr *name;
29957 + seal_t *seal;
29958 + coord_t *coord;
29959 + int result;
29960 + __u32 flags;
29961 + de_location *dec;
29962 + reiser4_dentry_fsdata *fsdata;
29963 +
29964 + assert("nikita-1130", lh != NULL);
29965 + assert("nikita-1128", dir != NULL);
29966 +
29967 + name = &de->d_name;
29968 + assert("nikita-1129", name != NULL);
29969 +
29970 + /* dentry private data doesn't require a lock, because dentry
29971 + manipulations are protected by i_mutex on the parent.
29972 +
29973 + This is not so for inodes, because an inode has no single
29974 + parent.
29975 + */
29976 + fsdata = reiser4_get_dentry_fsdata(de);
29977 + if (IS_ERR(fsdata))
29978 + return PTR_ERR(fsdata);
29979 + dec = &fsdata->dec;
29980 +
29981 + coord = &dec->entry_coord;
29982 + coord_clear_iplug(coord);
29983 + seal = &dec->entry_seal;
29984 + /* compose key of directory entry for @name */
29985 + inode_dir_plugin(dir)->build_entry_key(dir, name, &entry->key);
29986 +
29987 + if (reiser4_seal_is_set(seal)) {
29988 + /* check seal */
29989 + result = reiser4_seal_validate(seal, coord, &entry->key,
29990 + lh, mode, ZNODE_LOCK_LOPRI);
29991 + if (result == 0) {
29992 + /* key was found. Check that it is really item we are
29993 + looking for. */
29994 + result = check_entry(dir, coord, name);
29995 + if (result == 0)
29996 + return 0;
29997 + }
29998 + }
29999 + flags = (mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
30000 + /*
30001 + * find place in the tree where directory item should be located.
30002 + */
30003 + result = reiser4_object_lookup(dir, &entry->key, coord, lh, mode,
30004 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
30005 + flags, NULL /*ra_info */ );
30006 + if (result == CBK_COORD_FOUND) {
30007 + entry_actor_args arg;
30008 +
30009 + /* fast path: no hash collisions */
30010 + result = check_entry(dir, coord, name);
30011 + if (result == 0) {
30012 + reiser4_seal_init(seal, coord, &entry->key);
30013 + dec->pos = 0;
30014 + } else if (result > 0) {
30015 + /* Iterate through all units with the same keys. */
30016 + arg.name = name->name;
30017 + arg.key = &entry->key;
30018 + arg.not_found = 0;
30019 + arg.non_uniq = 0;
30020 +#if REISER4_USE_COLLISION_LIMIT
30021 + arg.max_non_uniq = max_hash_collisions(dir);
30022 + assert("nikita-2851", arg.max_non_uniq > 1);
30023 +#endif
30024 + arg.mode = mode;
30025 + arg.inode = dir;
30026 + coord_init_zero(&arg.last_coord);
30027 + init_lh(&arg.last_lh);
30028 +
30029 + result = reiser4_iterate_tree
30030 + (reiser4_tree_by_inode(dir),
30031 + coord, lh,
30032 + entry_actor, &arg, mode, 1);
30033 + /* the end of the tree or of an extent was reached
30034 + during scanning */
30035 + if (arg.not_found || (result == -E_NO_NEIGHBOR)) {
30036 + /* step back */
30037 + done_lh(lh);
30038 +
30039 + result = zload(arg.last_coord.node);
30040 + if (result == 0) {
30041 + coord_clear_iplug(&arg.last_coord);
30042 + coord_dup(coord, &arg.last_coord);
30043 + move_lh(lh, &arg.last_lh);
30044 + result = RETERR(-ENOENT);
30045 + zrelse(arg.last_coord.node);
30046 + --arg.non_uniq;
30047 + }
30048 + }
30049 +
30050 + done_lh(&arg.last_lh);
30051 + if (result == 0)
30052 + reiser4_seal_init(seal, coord, &entry->key);
30053 +
30054 + if (result == 0 || result == -ENOENT) {
30055 + assert("nikita-2580", arg.non_uniq > 0);
30056 + dec->pos = arg.non_uniq - 1;
30057 + }
30058 + }
30059 + } else
30060 + dec->pos = -1;
30061 + return result;
30062 +}
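The seal logic above gives reiser4_find_entry() its fast path: a seal remembers where the entry was found last time, and when reiser4_seal_validate() confirms that nothing has moved, the full tree descent via reiser4_object_lookup() is skipped. A much-simplified userspace sketch of the idea, under the assumption that a single version counter is enough to detect change (the real seal also re-checks the key and takes the node lock):

#include <stdio.h>

struct node { int version; };

struct seal {
	struct node *node;
	int version;		/* node->version when the seal was made */
	int set;
};

static void seal_init(struct seal *s, struct node *n)
{
	s->node = n;
	s->version = n->version;
	s->set = 1;
}

static int seal_validate(const struct seal *s)
{
	/* valid iff the sealed node was not modified since */
	return s->set && s->node->version == s->version;
}

int main(void)
{
	struct node n = { 7 };
	struct seal s;

	seal_init(&s, &n);
	printf("fast path usable: %d\n", seal_validate(&s));	/* 1 */
	n.version++;			/* somebody changed the node */
	printf("fast path usable: %d\n", seal_validate(&s));	/* 0 */
	return 0;
}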
30063 +
30064 +/*
30065 + Local variables:
30066 + c-indentation-style: "K&R"
30067 + mode-name: "LC"
30068 + c-basic-offset: 8
30069 + tab-width: 8
30070 + fill-column: 120
30071 + scroll-step: 1
30072 + End:
30073 +*/
30074 diff --git a/fs/reiser4/plugin/disk_format/Makefile b/fs/reiser4/plugin/disk_format/Makefile
30075 new file mode 100644
30076 index 0000000..e4e9e54
30077 --- /dev/null
30078 +++ b/fs/reiser4/plugin/disk_format/Makefile
30079 @@ -0,0 +1,5 @@
30080 +obj-$(CONFIG_REISER4_FS) += df_plugins.o
30081 +
30082 +df_plugins-objs := \
30083 + disk_format40.o \
30084 + disk_format.o
30085 diff --git a/fs/reiser4/plugin/disk_format/disk_format.c b/fs/reiser4/plugin/disk_format/disk_format.c
30086 new file mode 100644
30087 index 0000000..d785106
30088 --- /dev/null
30089 +++ b/fs/reiser4/plugin/disk_format/disk_format.c
30090 @@ -0,0 +1,38 @@
30091 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30092 +
30093 +#include "../../debug.h"
30094 +#include "../plugin_header.h"
30095 +#include "disk_format40.h"
30096 +#include "disk_format.h"
30097 +#include "../plugin.h"
30098 +
30099 +/* initialization of disk layout plugins */
30100 +disk_format_plugin format_plugins[LAST_FORMAT_ID] = {
30101 + [FORMAT40_ID] = {
30102 + .h = {
30103 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
30104 + .id = FORMAT40_ID,
30105 + .pops = NULL,
30106 + .label = "reiser40",
30107 + .desc = "standard disk layout for reiser40",
30108 + .linkage = {NULL, NULL}
30109 + },
30110 + .init_format = init_format_format40,
30111 + .root_dir_key = root_dir_key_format40,
30112 + .release = release_format40,
30113 + .log_super = log_super_format40,
30114 + .check_open = check_open_format40,
30115 + .version_update = version_update_format40
30116 + }
30117 +};
30118 +
30119 +/* Make Linus happy.
30120 + Local variables:
30121 + c-indentation-style: "K&R"
30122 + mode-name: "LC"
30123 + c-basic-offset: 8
30124 + tab-width: 8
30125 + fill-column: 120
30126 + scroll-step: 1
30127 + End:
30128 +*/
30129 diff --git a/fs/reiser4/plugin/disk_format/disk_format.h b/fs/reiser4/plugin/disk_format/disk_format.h
30130 new file mode 100644
30131 index 0000000..b9c53ac
30132 --- /dev/null
30133 +++ b/fs/reiser4/plugin/disk_format/disk_format.h
30134 @@ -0,0 +1,27 @@
30135 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30136 +
30137 +/* identifiers for disk layouts; they are also used as indexes into the array
30138 + of disk format plugins */
30139 +
30140 +#if !defined( __REISER4_DISK_FORMAT_H__ )
30141 +#define __REISER4_DISK_FORMAT_H__
30142 +
30143 +typedef enum {
30144 + /* standard reiser4 disk layout plugin id */
30145 + FORMAT40_ID,
30146 + LAST_FORMAT_ID
30147 +} disk_format_id;
30148 +
30149 +/* __REISER4_DISK_FORMAT_H__ */
30150 +#endif
30151 +
30152 +/* Make Linus happy.
30153 + Local variables:
30154 + c-indentation-style: "K&R"
30155 + mode-name: "LC"
30156 + c-basic-offset: 8
30157 + tab-width: 8
30158 + fill-column: 120
30159 + scroll-step: 1
30160 + End:
30161 +*/
30162 diff --git a/fs/reiser4/plugin/disk_format/disk_format40.c b/fs/reiser4/plugin/disk_format/disk_format40.c
30163 new file mode 100644
30164 index 0000000..17718f0
30165 --- /dev/null
30166 +++ b/fs/reiser4/plugin/disk_format/disk_format40.c
30167 @@ -0,0 +1,655 @@
30168 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30169 +
30170 +#include "../../debug.h"
30171 +#include "../../dformat.h"
30172 +#include "../../key.h"
30173 +#include "../node/node.h"
30174 +#include "../space/space_allocator.h"
30175 +#include "disk_format40.h"
30176 +#include "../plugin.h"
30177 +#include "../../txnmgr.h"
30178 +#include "../../jnode.h"
30179 +#include "../../tree.h"
30180 +#include "../../super.h"
30181 +#include "../../wander.h"
30182 +#include "../../inode.h"
30183 +#include "../../ktxnmgrd.h"
30184 +#include "../../status_flags.h"
30185 +
30186 +#include <linux/types.h> /* for __u?? */
30187 +#include <linux/fs.h> /* for struct super_block */
30188 +#include <linux/buffer_head.h>
30189 +
30190 +/* reiser 4.0 default disk layout */
30191 +
30192 +/* Amount of free blocks needed to perform release_format40 when fs gets
30193 + mounted RW: 1 for SB, 1 for non-leaves in overwrite set, 2 for tx header
30194 + & tx record. */
30195 +#define RELEASE_RESERVED 4
30196 +
30197 +/* The greatest supported format40 version number */
30198 +#define FORMAT40_VERSION PLUGIN_LIBRARY_VERSION
30199 +
30200 +/* This flag indicates that backup should be updated
30201 + (the update is performed by fsck) */
30202 +#define FORMAT40_UPDATE_BACKUP (1 << 31)
30203 +
30204 +/* functions to access fields of format40_disk_super_block */
30205 +static __u64 get_format40_block_count(const format40_disk_super_block * sb)
30206 +{
30207 + return le64_to_cpu(get_unaligned(&sb->block_count));
30208 +}
30209 +
30210 +static __u64 get_format40_free_blocks(const format40_disk_super_block * sb)
30211 +{
30212 + return le64_to_cpu(get_unaligned(&sb->free_blocks));
30213 +}
30214 +
30215 +static __u64 get_format40_root_block(const format40_disk_super_block * sb)
30216 +{
30217 + return le64_to_cpu(get_unaligned(&sb->root_block));
30218 +}
30219 +
30220 +static __u16 get_format40_tree_height(const format40_disk_super_block * sb)
30221 +{
30222 + return le16_to_cpu(get_unaligned(&sb->tree_height));
30223 +}
30224 +
30225 +static __u64 get_format40_file_count(const format40_disk_super_block * sb)
30226 +{
30227 + return le64_to_cpu(get_unaligned(&sb->file_count));
30228 +}
30229 +
30230 +static __u64 get_format40_oid(const format40_disk_super_block * sb)
30231 +{
30232 + return le64_to_cpu(get_unaligned(&sb->oid));
30233 +}
30234 +
30235 +static __u32 get_format40_mkfs_id(const format40_disk_super_block * sb)
30236 +{
30237 + return le32_to_cpu(get_unaligned(&sb->mkfs_id));
30238 +}
30239 +
30240 +static __u64 get_format40_flags(const format40_disk_super_block * sb)
30241 +{
30242 + return le64_to_cpu(get_unaligned(&sb->flags));
30243 +}
30244 +
30245 +static __u32 get_format40_version(const format40_disk_super_block * sb)
30246 +{
30247 + return le32_to_cpu(get_unaligned(&sb->version)) &
30248 + ~FORMAT40_UPDATE_BACKUP;
30249 +}
30250 +
30251 +static int update_backup_version(const format40_disk_super_block * sb)
30252 +{
30253 + return (le32_to_cpu(get_unaligned(&sb->version)) &
30254 + FORMAT40_UPDATE_BACKUP);
30255 +}
30256 +
30257 +static int update_disk_version(const format40_disk_super_block * sb)
30258 +{
30259 + return (get_format40_version(sb) < FORMAT40_VERSION);
30260 +}
30261 +
30262 +static int incomplete_compatibility(const format40_disk_super_block * sb)
30263 +{
30264 + return (get_format40_version(sb) > FORMAT40_VERSION);
30265 +}
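The three predicates above all decode the same packed field: the top bit of the on-disk version word is the FORMAT40_UPDATE_BACKUP flag, and the remaining bits are the version proper, which get_format40_version() extracts by masking the flag off. A runnable sketch of that packing, with TOY_UPDATE_BACKUP as a stand-in name:

#include <stdio.h>
#include <stdint.h>

#define TOY_UPDATE_BACKUP (1u << 31)	/* stand-in for FORMAT40_UPDATE_BACKUP */

int main(void)
{
	uint32_t raw = 5u | TOY_UPDATE_BACKUP;	/* version 5, backup stale */

	printf("version: %u\n", raw & ~TOY_UPDATE_BACKUP);	/* 5 */
	printf("backup needs update: %d\n",
	       !!(raw & TOY_UPDATE_BACKUP));			/* 1 */
	return 0;
}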
30266 +
30267 +static format40_super_info *get_sb_info(struct super_block *super)
30268 +{
30269 + return &get_super_private(super)->u.format40;
30270 +}
30271 +
30272 +static int consult_diskmap(struct super_block *s)
30273 +{
30274 + format40_super_info *info;
30275 + journal_location *jloc;
30276 +
30277 + info = get_sb_info(s);
30278 + jloc = &get_super_private(s)->jloc;
30279 + /* Default format-specific locations, if there is nothing in
30280 + * diskmap */
30281 + jloc->footer = FORMAT40_JOURNAL_FOOTER_BLOCKNR;
30282 + jloc->header = FORMAT40_JOURNAL_HEADER_BLOCKNR;
30283 + info->loc.super = FORMAT40_OFFSET / s->s_blocksize;
30284 +#ifdef CONFIG_REISER4_BADBLOCKS
30285 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JF,
30286 + &jloc->footer);
30287 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_JH,
30288 + &jloc->header);
30289 + reiser4_get_diskmap_value(FORMAT40_PLUGIN_DISKMAP_ID, FORMAT40_SUPER,
30290 + &info->loc.super);
30291 +#endif
30292 + return 0;
30293 +}
30294 +
30295 +/* find any valid super block of disk_format40 (even if the first super block
30296 + is destroyed); block numbers of the actual journal header/footer (jh/jf)
30297 + are adjusted if needed */
30298 +static struct buffer_head *find_a_disk_format40_super_block(struct super_block
30299 + *s)
30300 +{
30301 + struct buffer_head *super_bh;
30302 + format40_disk_super_block *disk_sb;
30303 + format40_super_info *info;
30304 +
30305 + assert("umka-487", s != NULL);
30306 +
30307 + info = get_sb_info(s);
30308 +
30309 + super_bh = sb_bread(s, info->loc.super);
30310 + if (super_bh == NULL)
30311 + return ERR_PTR(RETERR(-EIO));
30312 +
30313 + disk_sb = (format40_disk_super_block *) super_bh->b_data;
30314 + if (strncmp(disk_sb->magic, FORMAT40_MAGIC, sizeof(FORMAT40_MAGIC))) {
30315 + brelse(super_bh);
30316 + return ERR_PTR(RETERR(-EINVAL));
30317 + }
30318 +
30319 + reiser4_set_block_count(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)));
30320 + reiser4_set_data_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->block_count)) -
30321 + le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
30322 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&disk_sb->free_blocks)));
30323 +
30324 + return super_bh;
30325 +}
30326 +
30327 +/* find the most recent version of the super block. This is called after the
30328 + journal is replayed */
30329 +static struct buffer_head *read_super_block(struct super_block *s UNUSED_ARG)
30330 +{
30331 + /* Here the most recent superblock copy has to be read. However, as
30332 + journal replay isn't complete, we use the
30333 + find_a_disk_format40_super_block() function. */
30334 + return find_a_disk_format40_super_block(s);
30335 +}
30336 +
30337 +static int get_super_jnode(struct super_block *s)
30338 +{
30339 + reiser4_super_info_data *sbinfo = get_super_private(s);
30340 + jnode *sb_jnode;
30341 + int ret;
30342 +
30343 + sb_jnode = reiser4_alloc_io_head(&get_sb_info(s)->loc.super);
30344 +
30345 + ret = jload(sb_jnode);
30346 +
30347 + if (ret) {
30348 + reiser4_drop_io_head(sb_jnode);
30349 + return ret;
30350 + }
30351 +
30352 + pin_jnode_data(sb_jnode);
30353 + jrelse(sb_jnode);
30354 +
30355 + sbinfo->u.format40.sb_jnode = sb_jnode;
30356 +
30357 + return 0;
30358 +}
30359 +
30360 +static void done_super_jnode(struct super_block *s)
30361 +{
30362 + jnode *sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30363 +
30364 + if (sb_jnode) {
30365 + unpin_jnode_data(sb_jnode);
30366 + reiser4_drop_io_head(sb_jnode);
30367 + }
30368 +}
30369 +
30370 +typedef enum format40_init_stage {
30371 + NONE_DONE = 0,
30372 + CONSULT_DISKMAP,
30373 + FIND_A_SUPER,
30374 + INIT_JOURNAL_INFO,
30375 + INIT_STATUS,
30376 + JOURNAL_REPLAY,
30377 + READ_SUPER,
30378 + KEY_CHECK,
30379 + INIT_OID,
30380 + INIT_TREE,
30381 + JOURNAL_RECOVER,
30382 + INIT_SA,
30383 + INIT_JNODE,
30384 + ALL_DONE
30385 +} format40_init_stage;
30386 +
30387 +static format40_disk_super_block *copy_sb(const struct buffer_head *super_bh)
30388 +{
30389 + format40_disk_super_block *sb_copy;
30390 +
30391 + sb_copy = kmalloc(sizeof(format40_disk_super_block),
30392 + reiser4_ctx_gfp_mask_get());
30393 + if (sb_copy == NULL)
30394 + return ERR_PTR(RETERR(-ENOMEM));
30395 + memcpy(sb_copy, ((format40_disk_super_block *) super_bh->b_data),
30396 + sizeof(format40_disk_super_block));
30397 + return sb_copy;
30398 +}
30399 +
30400 +static int check_key_format(const format40_disk_super_block *sb_copy)
30401 +{
30402 + if (!equi(REISER4_LARGE_KEY,
30403 + get_format40_flags(sb_copy) & (1 << FORMAT40_LARGE_KEYS))) {
30404 + warning("nikita-3228", "Key format mismatch. "
30405 + "Only %s keys are supported.",
30406 + REISER4_LARGE_KEY ? "large" : "small");
30407 + return RETERR(-EINVAL);
30408 + }
30409 + return 0;
30410 +}
30411 +
30412 +/**
30413 + * try_init_format40
30414 + * @super:
30415 + * @stage:
30416 + *
30417 + */
30418 +static int try_init_format40(struct super_block *super,
30419 + format40_init_stage *stage)
30420 +{
30421 + int result;
30422 + struct buffer_head *super_bh;
30423 + reiser4_super_info_data *sbinfo;
30424 + format40_disk_super_block *sb_copy;
30425 + tree_level height;
30426 + reiser4_block_nr root_block;
30427 + node_plugin *nplug;
30428 +
30429 + assert("vs-475", super != NULL);
30430 + assert("vs-474", get_super_private(super));
30431 +
30432 + *stage = NONE_DONE;
30433 +
30434 + result = consult_diskmap(super);
30435 + if (result)
30436 + return result;
30437 + *stage = CONSULT_DISKMAP;
30438 +
30439 + super_bh = find_a_disk_format40_super_block(super);
30440 + if (IS_ERR(super_bh))
30441 + return PTR_ERR(super_bh);
30442 + brelse(super_bh);
30443 + *stage = FIND_A_SUPER;
30444 +
30445 + /* ok, we are sure that filesystem format is a format40 format */
30446 +
30447 + /* map jnodes for journal control blocks (header, footer) to disk */
30448 + result = reiser4_init_journal_info(super);
30449 + if (result)
30450 + return result;
30451 + *stage = INIT_JOURNAL_INFO;
30452 +
30453 + /* Now check its state */
30455 + result = reiser4_status_init(FORMAT40_STATUS_BLOCKNR);
30456 + if (result != 0 && result != -EINVAL)
30457 + /* -EINVAL means there is no magic, so probably just old
30458 + * fs. */
30459 + return result;
30460 + *stage = INIT_STATUS;
30461 +
30462 + result = reiser4_status_query(NULL, NULL);
30463 + if (result == REISER4_STATUS_MOUNT_WARN)
30464 + notice("vpf-1363", "Warning: mounting %s with errors.",
30465 + super->s_id);
30466 + if (result == REISER4_STATUS_MOUNT_RO)
30467 + notice("vpf-1364", "Warning: mounting %s with fatal errors,"
30468 + " forcing read-only mount.", super->s_id);
30469 + result = reiser4_journal_replay(super);
30470 + if (result)
30471 + return result;
30472 + *stage = JOURNAL_REPLAY;
30473 +
30474 + super_bh = read_super_block(super);
30475 + if (IS_ERR(super_bh))
30476 + return PTR_ERR(super_bh);
30477 + *stage = READ_SUPER;
30478 +
30479 + /* allocate and make a copy of format40_disk_super_block */
30480 + sb_copy = copy_sb(super_bh);
30481 + brelse(super_bh);
30482 +
30483 + if (IS_ERR(sb_copy))
30484 + return PTR_ERR(sb_copy);
30485 + printk("reiser4: %s: found disk format 4.0.%u.\n",
30486 + super->s_id,
30487 + get_format40_version(sb_copy));
30488 + if (incomplete_compatibility(sb_copy))
30489 + printk("reiser4: Warning: The last completely supported "
30490 + "version of disk format40 is %u. Some objects of "
30491 + "the semantic tree can be unaccessible.\n",
30492 + FORMAT40_VERSION);
30493 + /* make sure that key format of kernel and filesystem match */
30494 + result = check_key_format(sb_copy);
30495 + if (result) {
30496 + kfree(sb_copy);
30497 + return result;
30498 + }
30499 + *stage = KEY_CHECK;
30500 +
30501 + result = oid_init_allocator(super, get_format40_file_count(sb_copy),
30502 + get_format40_oid(sb_copy));
30503 + if (result) {
30504 + kfree(sb_copy);
30505 + return result;
30506 + }
30507 + *stage = INIT_OID;
30508 +
30509 + /* get things necessary to init reiser4_tree */
30510 + root_block = get_format40_root_block(sb_copy);
30511 + height = get_format40_tree_height(sb_copy);
30512 + nplug = node_plugin_by_id(NODE40_ID);
30513 +
30514 + /* initialize reiser4_super_info_data */
30515 + sbinfo = get_super_private(super);
30516 + assert("", sbinfo->tree.super == super);
30517 + /* init reiser4_tree for the filesystem */
30518 + result = reiser4_init_tree(&sbinfo->tree, &root_block, height, nplug);
30519 + if (result) {
30520 + kfree(sb_copy);
30521 + return result;
30522 + }
30523 + *stage = INIT_TREE;
30524 +
30525 + /*
30526 + * initialize reiser4_super_info_data with data from format40 super
30527 + * block
30528 + */
30529 + sbinfo->default_uid = 0;
30530 + sbinfo->default_gid = 0;
30531 + sbinfo->mkfs_id = get_format40_mkfs_id(sb_copy);
30532 + /* number of blocks in filesystem and reserved space */
30533 + reiser4_set_block_count(super, get_format40_block_count(sb_copy));
30534 + sbinfo->blocks_free = get_format40_free_blocks(sb_copy);
30535 + sbinfo->version = get_format40_version(sb_copy);
30536 +
30537 + if (update_backup_version(sb_copy))
30538 + printk("reiser4: Warning: metadata backup is not updated. "
30539 + "Please run 'fsck.reiser4 --fix' on %s.\n",
30540 + super->s_id);
30541 + kfree(sb_copy);
30542 +
30543 + sbinfo->fsuid = 0;
30544 + sbinfo->fs_flags |= (1 << REISER4_ADG); /* hard links for directories
30545 + * are not supported */
30546 + sbinfo->fs_flags |= (1 << REISER4_ONE_NODE_PLUGIN); /* all nodes in
30547 + * layout 40 are
30548 + * of one
30549 + * plugin */
30550 + /* sbinfo->tmgr is initialized already */
30551 +
30552 + /* recover sb data which were logged separately from sb block */
30553 +
30554 + /* NOTE-NIKITA: reiser4_journal_recover_sb_data() calls
30555 + * oid_init_allocator() and reiser4_set_free_blocks() with new
30556 + * data. What's the reason to call them above? */
30557 + result = reiser4_journal_recover_sb_data(super);
30558 + if (result != 0)
30559 + return result;
30560 + *stage = JOURNAL_RECOVER;
30561 +
30562 + /*
30563 + * Set the number of used blocks. The number of used blocks is stored
30564 + * neither in the on-disk super block nor in the journal footer blocks. At
30565 + * this moment actual values of total blocks and free block counters
30566 + * are set in the reiser4 super block (in-memory structure) and we can
30567 + * calculate number of used blocks from them.
30568 + */
30569 + reiser4_set_data_blocks(super,
30570 + reiser4_block_count(super) -
30571 + reiser4_free_blocks(super));
30572 +
30573 +#if REISER4_DEBUG
30574 + sbinfo->min_blocks_used = 16 /* reserved area */ +
30575 + 2 /* super blocks */ +
30576 + 2 /* journal footer and header */ ;
30577 +#endif
30578 +
30579 + /* init disk space allocator */
30580 + result = sa_init_allocator(reiser4_get_space_allocator(super),
30581 + super, NULL);
30582 + if (result)
30583 + return result;
30584 + *stage = INIT_SA;
30585 +
30586 + result = get_super_jnode(super);
30587 + if (result == 0)
30588 + *stage = ALL_DONE;
30589 + return result;
30590 +}
30591 +
30592 +/* plugin->u.format.get_ready */
30593 +int init_format_format40(struct super_block *s, void *data UNUSED_ARG)
30594 +{
30595 + int result;
30596 + format40_init_stage stage;
30597 +
30598 + result = try_init_format40(s, &stage);
30599 + switch (stage) {
30600 + case ALL_DONE:
30601 + assert("nikita-3458", result == 0);
30602 + break;
30603 + case INIT_JNODE:
30604 + done_super_jnode(s);
30605 + case INIT_SA:
30606 + sa_destroy_allocator(reiser4_get_space_allocator(s), s);
30607 + case JOURNAL_RECOVER:
30608 + case INIT_TREE:
30609 + reiser4_done_tree(&get_super_private(s)->tree);
30610 + case INIT_OID:
30611 + case KEY_CHECK:
30612 + case READ_SUPER:
30613 + case JOURNAL_REPLAY:
30614 + case INIT_STATUS:
30615 + reiser4_status_finish();
30616 + case INIT_JOURNAL_INFO:
30617 + reiser4_done_journal_info(s);
30618 + case FIND_A_SUPER:
30619 + case CONSULT_DISKMAP:
30620 + case NONE_DONE:
30621 + break;
30622 + default:
30623 + impossible("nikita-3457", "init stage: %i", stage);
30624 + }
30625 +
30626 + if (!rofs_super(s) && reiser4_free_blocks(s) < RELEASE_RESERVED)
30627 + return RETERR(-ENOSPC);
30628 +
30629 + return result;
30630 +}
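Note the deliberate fall-through in the switch above: try_init_format40() records the last completed stage in *stage, and each case undoes one stage before falling into the next, so exactly the completed stages are torn down in reverse order. A generic, runnable sketch of the pattern with hypothetical stage names:

#include <stdio.h>

enum stage { NONE, STAGE_A, STAGE_B, ALL };	/* hypothetical stages */

static int try_init(enum stage *st, int fail_at)
{
	*st = NONE;
	if (fail_at == 1)
		return -1;
	*st = STAGE_A;
	if (fail_at == 2)
		return -1;
	*st = STAGE_B;
	*st = ALL;
	return 0;
}

static int do_init(int fail_at)
{
	enum stage st;
	int ret = try_init(&st, fail_at);

	switch (st) {		/* deliberate fall-through, as above */
	case ALL:
		break;
	case STAGE_B:
		printf("undo B\n");
	case STAGE_A:
		printf("undo A\n");
	case NONE:
		break;
	}
	return ret;
}

int main(void)
{
	do_init(2);	/* fails after STAGE_A: prints "undo A" only */
	return 0;
}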
30631 +
30632 +static void pack_format40_super(const struct super_block *s, char *data)
30633 +{
30634 + format40_disk_super_block *super_data =
30635 + (format40_disk_super_block *) data;
30636 +
30637 + reiser4_super_info_data *sbinfo = get_super_private(s);
30638 +
30639 + assert("zam-591", data != NULL);
30640 +
30641 + put_unaligned(cpu_to_le64(reiser4_free_committed_blocks(s)),
30642 + &super_data->free_blocks);
30643 +
30644 + put_unaligned(cpu_to_le64(sbinfo->tree.root_block),
30645 + &super_data->root_block);
30646 +
30647 + put_unaligned(cpu_to_le64(oid_next(s)),
30648 + &super_data->oid);
30649 +
30650 + put_unaligned(cpu_to_le64(oids_used(s)),
30651 + &super_data->file_count);
30652 +
30653 + put_unaligned(cpu_to_le16(sbinfo->tree.height),
30654 + &super_data->tree_height);
30655 +
30656 + if (update_disk_version(super_data)) {
30657 + __u32 version = FORMAT40_VERSION | FORMAT40_UPDATE_BACKUP;
30658 +
30659 + put_unaligned(cpu_to_le32(version), &super_data->version);
30660 + }
30661 +}
30662 +
30663 +/* plugin->u.format.log_super
30664 + return a jnode which should be added to transaction when the super block
30665 + gets logged */
30666 +jnode *log_super_format40(struct super_block *s)
30667 +{
30668 + jnode *sb_jnode;
30669 +
30670 + sb_jnode = get_super_private(s)->u.format40.sb_jnode;
30671 +
30672 + jload(sb_jnode);
30673 +
30674 + pack_format40_super(s, jdata(sb_jnode));
30675 +
30676 + jrelse(sb_jnode);
30677 +
30678 + return sb_jnode;
30679 +}
30680 +
30681 +/* plugin->u.format.release */
30682 +int release_format40(struct super_block *s)
30683 +{
30684 + int ret;
30685 + reiser4_super_info_data *sbinfo;
30686 +
30687 + sbinfo = get_super_private(s);
30688 + assert("zam-579", sbinfo != NULL);
30689 +
30690 + if (!rofs_super(s)) {
30691 + ret = reiser4_capture_super_block(s);
30692 + if (ret != 0)
30693 + warning("vs-898",
30694 + "reiser4_capture_super_block failed: %d",
30695 + ret);
30696 +
30697 + ret = txnmgr_force_commit_all(s, 1);
30698 + if (ret != 0)
30699 + warning("jmacd-74438", "txn_force failed: %d", ret);
30700 +
30701 + all_grabbed2free();
30702 + }
30703 +
30704 + sa_destroy_allocator(&sbinfo->space_allocator, s);
30705 + reiser4_done_journal_info(s);
30706 + done_super_jnode(s);
30707 +
30708 + rcu_barrier();
30709 + reiser4_done_tree(&sbinfo->tree);
30710 + /* call rcu_barrier() again, because some znodes were "released" in
30711 + * reiser4_done_tree(). */
30712 + rcu_barrier();
30713 +
30714 + return 0;
30715 +}
30716 +
30717 +#define FORMAT40_ROOT_LOCALITY 41
30718 +#define FORMAT40_ROOT_OBJECTID 42
30719 +
30720 +/* plugin->u.format.root_dir_key */
30721 +const reiser4_key *root_dir_key_format40(const struct super_block *super
30722 + UNUSED_ARG)
30723 +{
30724 + static const reiser4_key FORMAT40_ROOT_DIR_KEY = {
30725 + .el = {
30726 + __constant_cpu_to_le64((FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR),
30727 +#if REISER4_LARGE_KEY
30728 + ON_LARGE_KEY(0ull,)
30729 +#endif
30730 + __constant_cpu_to_le64(FORMAT40_ROOT_OBJECTID),
30731 + 0ull
30732 + }
30733 + };
30734 +
30735 + return &FORMAT40_ROOT_DIR_KEY;
30736 +}
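The root key above is built statically: its first element packs the locality id into the high bits and the minor type into the low 4 bits, i.e. (FORMAT40_ROOT_LOCALITY << 4) | KEY_SD_MINOR. A tiny sketch of that arithmetic; KEY_SD_MINOR's numeric value does not appear in this hunk, so 0 is assumed here purely for illustration:

#include <stdio.h>

#define TOY_KEY_SD_MINOR 0ull	/* assumed value, see note above */

int main(void)
{
	unsigned long long locality = 41;	/* FORMAT40_ROOT_LOCALITY */
	unsigned long long el0 = (locality << 4) | TOY_KEY_SD_MINOR;

	/* low 4 bits carry the minor type, the rest the locality id */
	printf("el[0] = %#llx (locality %llu, minor %llu)\n",
	       el0, el0 >> 4, el0 & 0xf);
	return 0;
}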
30737 +
30738 +/* plugin->u.format.check_open.
30739 + Check the opened object for validity. For now it checks only that the oid
30740 + and locality are valid; this can be improved later, and its behavior may
30741 + depend on the mount options. */
30742 +int check_open_format40(const struct inode *object)
30743 +{
30744 + oid_t max, oid;
30745 +
30746 + max = oid_next(object->i_sb) - 1;
30747 +
30748 + /* Check the oid. */
30749 + oid = get_inode_oid(object);
30750 + if (oid > max) {
30751 + warning("vpf-1360", "The object with the oid %llu "
30752 + "greater then the max used oid %llu found.",
30753 + (unsigned long long)oid, (unsigned long long)max);
30754 +
30755 + return RETERR(-EIO);
30756 + }
30757 +
30758 + /* Check the locality. */
30759 + oid = reiser4_inode_data(object)->locality_id;
30760 + if (oid > max) {
30761 + warning("vpf-1361", "The object with the locality %llu "
30762 + "greater then the max used oid %llu found.",
30763 + (unsigned long long)oid, (unsigned long long)max);
30764 +
30765 + return RETERR(-EIO);
30766 + }
30767 +
30768 + return 0;
30769 +}
30770 +
30771 +/* plugin->u.format.version_update.
30772 + Perform all version update operations to bring the on-disk
30773 + format40_disk_super_block.version up to FORMAT40_VERSION.
30774 + */
30775 +int version_update_format40(struct super_block *super) {
30776 + txn_handle * trans;
30777 + lock_handle lh;
30778 + txn_atom *atom;
30779 + int ret;
30780 +
30781 + /* Nothing to do on an RO mount, or if the on-disk version is not older. */
30782 + if (super->s_flags & MS_RDONLY)
30783 + return 0;
30784 +
30785 + if (get_super_private(super)->version >= FORMAT40_VERSION)
30786 + return 0;
30787 +
30788 + printk("reiser4: Updating disk format to 4.0.%u. The reiser4 metadata "
30789 + "backup is left unchanged. Please run 'fsck.reiser4 --fix' "
30790 + "on %s to update it too.\n", FORMAT40_VERSION, super->s_id);
30791 +
30792 + /* Mark the uber znode dirty to call log_super on write_logs. */
30793 + init_lh(&lh);
30794 + ret = get_uber_znode(reiser4_get_tree(super), ZNODE_WRITE_LOCK,
30795 + ZNODE_LOCK_HIPRI, &lh);
30796 + if (ret != 0)
30797 + return ret;
30798 +
30799 + znode_make_dirty(lh.node);
30800 + done_lh(&lh);
30801 +
30802 + /* Update the backup blocks. */
30803 +
30804 + /* Force write_logs immediately. */
30805 + trans = get_current_context()->trans;
30806 + atom = get_current_atom_locked();
30807 + assert("vpf-1906", atom != NULL);
30808 +
30809 + spin_lock_txnh(trans);
30810 + return force_commit_atom(trans);
30811 +}
30812 +
30813 +/* Make Linus happy.
30814 + Local variables:
30815 + c-indentation-style: "K&R"
30816 + mode-name: "LC"
30817 + c-basic-offset: 8
30818 + tab-width: 8
30819 + fill-column: 120
30820 + scroll-step: 1
30821 + End:
30822 +*/
30823 diff --git a/fs/reiser4/plugin/disk_format/disk_format40.h b/fs/reiser4/plugin/disk_format/disk_format40.h
30824 new file mode 100644
30825 index 0000000..7fc1772
30826 --- /dev/null
30827 +++ b/fs/reiser4/plugin/disk_format/disk_format40.h
30828 @@ -0,0 +1,109 @@
30829 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
30830 +
30831 +/* this file contains:
30832 + - definition of the on-disk super block of the standard disk layout for
30833 + reiser 4.0 (layout 40)
30834 + - definition of layout 40 specific portion of in-core super block
30835 + - declarations of functions implementing methods of layout plugin
30836 + for layout 40
30837 + - declarations of functions used to get/set fields in layout 40 super block
30838 +*/
30839 +
30840 +#ifndef __DISK_FORMAT40_H__
30841 +#define __DISK_FORMAT40_H__
30842 +
30843 +/* magic for default reiser4 layout */
30844 +#define FORMAT40_MAGIC "ReIsEr40FoRmAt"
30845 +#define FORMAT40_OFFSET (REISER4_MASTER_OFFSET + PAGE_CACHE_SIZE)
30846 +
30847 +#include "../../dformat.h"
30848 +
30849 +#include <linux/fs.h> /* for struct super_block */
30850 +
30851 +typedef enum {
30852 + FORMAT40_LARGE_KEYS
30853 +} format40_flags;
30854 +
30855 +/* ondisk super block for format 40. It is 512 bytes long */
30856 +typedef struct format40_disk_super_block {
30857 + /* 0 */ d64 block_count;
30858 + /* number of blocks in the filesystem */
30859 + /* 8 */ d64 free_blocks;
30860 + /* number of free blocks */
30861 + /* 16 */ d64 root_block;
30862 + /* filesystem tree root block */
30863 + /* 24 */ d64 oid;
30864 + /* smallest free objectid */
30865 + /* 32 */ d64 file_count;
30866 + /* number of files in a filesystem */
30867 + /* 40 */ d64 flushes;
30868 + /* number of times super block was
30869 + flushed. Needed if format 40
30870 + ever has several super blocks */
30871 + /* 48 */ d32 mkfs_id;
30872 + /* unique identifier of fs */
30873 + /* 52 */ char magic[16];
30874 + /* magic string ReIsEr40FoRmAt */
30875 + /* 68 */ d16 tree_height;
30876 + /* height of filesystem tree */
30877 + /* 70 */ d16 formatting_policy;
30878 + /* not used anymore */
30879 + /* 72 */ d64 flags;
30880 + /* 80 */ d32 version;
30881 + /* on-disk format version number
30882 + initially assigned by mkfs as the greatest format40
30883 + version number supported by reiser4progs and updated
30884 + at mount time in accordance with the greatest format40
30885 + version number supported by the kernel.
30886 + It is used by fsck to catch possible corruption and
30887 + for various compatibility issues */
30888 + /* 84 */ char not_used[428];
30889 +} format40_disk_super_block;
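The offset comments above pin the structure to exactly 512 bytes (84 bytes of fields plus 428 bytes of padding). A sketch of how that invariant could be verified at compile time, using plain stdint types and a GCC-style packed attribute as stand-ins for the kernel's d16/d32/d64:

#include <stdint.h>

struct toy_sb {
	uint64_t block_count;		/* 0 */
	uint64_t free_blocks;		/* 8 */
	uint64_t root_block;		/* 16 */
	uint64_t oid;			/* 24 */
	uint64_t file_count;		/* 32 */
	uint64_t flushes;		/* 40 */
	uint32_t mkfs_id;		/* 48 */
	char magic[16];			/* 52 */
	uint16_t tree_height;		/* 68 */
	uint16_t formatting_policy;	/* 70 */
	uint64_t flags;			/* 72 */
	uint32_t version;		/* 80 */
	char not_used[428];		/* 84 */
} __attribute__((packed));

/* compilation fails if the layout drifts from 512 bytes */
typedef char toy_sb_size_check[sizeof(struct toy_sb) == 512 ? 1 : -1];

int main(void)
{
	return 0;
}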
30890 +
30891 +/* format 40 specific part of reiser4_super_info_data */
30892 +typedef struct format40_super_info {
30893 +/* format40_disk_super_block actual_sb; */
30894 + jnode *sb_jnode;
30895 + struct {
30896 + reiser4_block_nr super;
30897 + } loc;
30898 +} format40_super_info;
30899 +
30900 +/* Defines for journal header and footer respectively. */
30901 +#define FORMAT40_JOURNAL_HEADER_BLOCKNR \
30902 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 3)
30903 +
30904 +#define FORMAT40_JOURNAL_FOOTER_BLOCKNR \
30905 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 4)
30906 +
30907 +#define FORMAT40_STATUS_BLOCKNR \
30908 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 5)
30909 +
30910 +/* Diskmap declarations */
30911 +#define FORMAT40_PLUGIN_DISKMAP_ID ((REISER4_FORMAT_PLUGIN_TYPE<<16) | (FORMAT40_ID))
30912 +#define FORMAT40_SUPER 1
30913 +#define FORMAT40_JH 2
30914 +#define FORMAT40_JF 3
30915 +
30916 +/* declarations of functions implementing methods of layout plugin for
30917 + format 40. The functions themselves are in disk_format40.c */
30918 +extern int init_format_format40(struct super_block *, void *data);
30919 +extern const reiser4_key *root_dir_key_format40(const struct super_block *);
30920 +extern int release_format40(struct super_block *s);
30921 +extern jnode *log_super_format40(struct super_block *s);
30922 +extern int check_open_format40(const struct inode *object);
30923 +extern int version_update_format40(struct super_block *super);
30924 +
30925 +/* __DISK_FORMAT40_H__ */
30926 +#endif
30927 +
30928 +/* Make Linus happy.
30929 + Local variables:
30930 + c-indentation-style: "K&R"
30931 + mode-name: "LC"
30932 + c-basic-offset: 8
30933 + tab-width: 8
30934 + fill-column: 120
30935 + scroll-step: 1
30936 + End:
30937 +*/
30938 diff --git a/fs/reiser4/plugin/fibration.c b/fs/reiser4/plugin/fibration.c
30939 new file mode 100644
30940 index 0000000..690dac4
30941 --- /dev/null
30942 +++ b/fs/reiser4/plugin/fibration.c
30943 @@ -0,0 +1,175 @@
30944 +/* Copyright 2004 by Hans Reiser, licensing governed by
30945 + * reiser4/README */
30946 +
30947 +/* Directory fibrations */
30948 +
30949 +/*
30950 + * Suppose we have a directory tree with sources of some project. During
30951 + * compilation .o files are created within this tree. This makes access
30952 + * to the original source files less efficient, because source files are
30953 + * now "diluted" by object files: default directory plugin uses prefix
30954 + * of a file name as a part of the key for directory entry (and this
30955 + * part is also inherited by the key of file body). This means that
30956 + * foo.o will be located close to foo.c and foo.h in the tree.
30957 + *
30958 + * To avoid this effect the directory plugin fills the highest 7
30959 + * (originally unused) bits of the second component of the directory
30960 + * entry key with a bit-pattern that depends on the file name (see
30961 + * fs/reiser4/kassign.c:build_entry_key_common()). These bits are called
30962 + * the "fibre". The fibre of the file name key is inherited by the key of
30963 + * the stat data and the keys of the file body (for REISER4_LARGE_KEY).
30964 + *
30965 + * The fibre for a given file is chosen by a per-directory fibration
30966 + * plugin. Names within a given fibre are ordered lexicographically.
30967 + */
30968 +
30969 +#include "../debug.h"
30970 +#include "plugin_header.h"
30971 +#include "plugin.h"
30972 +#include "../super.h"
30973 +#include "../inode.h"
30974 +
30975 +#include <linux/types.h>
30976 +
30977 +static const int fibre_shift = 57;
30978 +
30979 +#define FIBRE_NO(n) (((__u64)(n)) << fibre_shift)
30980 +
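+/* With fibre_shift == 57 a fibre occupies the top 7 bits of the 64-bit
+   key component: e.g. FIBRE_NO(1) == 0x0200000000000000ull, so every
+   name in fibre 1 sorts strictly after every name in fibre 0. */
+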
30981 +/*
30982 + * Trivial fibration: all files of directory are just ordered
30983 + * lexicographically.
30984 + */
30985 +static __u64 fibre_trivial(const struct inode *dir, const char *name, int len)
30986 +{
30987 + return FIBRE_NO(0);
30988 +}
30989 +
30990 +/*
30991 + * dot-o fibration: place .o files after all others.
30992 + */
30993 +static __u64 fibre_dot_o(const struct inode *dir, const char *name, int len)
30994 +{
30995 + /* special treatment for .*\.o */
30996 + if (len > 2 && name[len - 1] == 'o' && name[len - 2] == '.')
30997 + return FIBRE_NO(1);
30998 + else
30999 + return FIBRE_NO(0);
31000 +}
31001 +
31002 +/*
31003 + * ext.1 fibration: subdivides the directory into 128 fibres, one for
31004 + * each 7-bit extension character (file "foo.h" goes into fibre "h"),
31005 + * plus a default fibre for the rest.
31006 + */
31007 +static __u64 fibre_ext_1(const struct inode *dir, const char *name, int len)
31008 +{
31009 + if (len > 2 && name[len - 2] == '.')
31010 + return FIBRE_NO(name[len - 1]);
31011 + else
31012 + return FIBRE_NO(0);
31013 +}
31014 +
31015 +/*
31016 + * ext.3 fibration: try to separate files with different 3-character
31017 + * extensions from each other.
31018 + */
31019 +static __u64 fibre_ext_3(const struct inode *dir, const char *name, int len)
31020 +{
31021 + if (len > 4 && name[len - 4] == '.')
31022 + return FIBRE_NO(name[len - 3] + name[len - 2] + name[len - 1]);
31023 + else
31024 + return FIBRE_NO(0);
31025 +}
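+
+/* Worked example for the ext.3 fibration: for "main.txt" the fibre is
+   FIBRE_NO('t' + 'x' + 't') = FIBRE_NO(352). The sum of three characters
+   can exceed 7 bits, and bits shifted above bit 63 are simply lost, so
+   the effective fibre is the sum modulo 128 (here 96). Distinct
+   extensions may thus share a fibre, which is harmless: fibration only
+   needs to roughly group related file types together. */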
31026 +
31027 +static int change_fibration(struct inode *inode,
31028 + reiser4_plugin * plugin,
31029 + pset_member memb)
31030 +{
31031 + int result;
31032 +
31033 + assert("nikita-3503", inode != NULL);
31034 + assert("nikita-3504", plugin != NULL);
31035 +
31036 + assert("nikita-3505", is_reiser4_inode(inode));
31037 + assert("nikita-3506", inode_dir_plugin(inode) != NULL);
31038 + assert("nikita-3507",
31039 + plugin->h.type_id == REISER4_FIBRATION_PLUGIN_TYPE);
31040 +
31041 + result = 0;
31042 + if (inode_fibration_plugin(inode) == NULL ||
31043 + inode_fibration_plugin(inode)->h.id != plugin->h.id) {
31044 + if (is_dir_empty(inode) == 0)
31045 + result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
31046 + PSET_FIBRATION, plugin);
31047 + else
31048 + result = RETERR(-ENOTEMPTY);
31049 +
31050 + }
31051 + return result;
31052 +}
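+
+/* Note: existing entries are keyed under the old fibration, so the
+   plugin may only be switched while the directory is empty; otherwise
+   lookups computed with the new fibre bits would miss them, hence the
+   -ENOTEMPTY above. */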
31053 +
31054 +static reiser4_plugin_ops fibration_plugin_ops = {
31055 + .init = NULL,
31056 + .load = NULL,
31057 + .save_len = NULL,
31058 + .save = NULL,
31059 + .change = change_fibration
31060 +};
31061 +
31062 +/* fibration plugins */
31063 +fibration_plugin fibration_plugins[LAST_FIBRATION_ID] = {
31064 + [FIBRATION_LEXICOGRAPHIC] = {
31065 + .h = {
31066 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31067 + .id = FIBRATION_LEXICOGRAPHIC,
31068 + .pops = &fibration_plugin_ops,
31069 + .label = "lexicographic",
31070 + .desc = "no fibration",
31071 + .linkage = {NULL, NULL}
31072 + },
31073 + .fibre = fibre_trivial
31074 + },
31075 + [FIBRATION_DOT_O] = {
31076 + .h = {
31077 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31078 + .id = FIBRATION_DOT_O,
31079 + .pops = &fibration_plugin_ops,
31080 + .label = "dot-o",
31081 + .desc = "fibrate .o files separately",
31082 + .linkage = {NULL, NULL}
31083 + },
31084 + .fibre = fibre_dot_o
31085 + },
31086 + [FIBRATION_EXT_1] = {
31087 + .h = {
31088 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31089 + .id = FIBRATION_EXT_1,
31090 + .pops = &fibration_plugin_ops,
31091 + .label = "ext-1",
31092 + .desc = "fibrate file by single character extension",
31093 + .linkage = {NULL, NULL}
31094 + },
31095 + .fibre = fibre_ext_1
31096 + },
31097 + [FIBRATION_EXT_3] = {
31098 + .h = {
31099 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
31100 + .id = FIBRATION_EXT_3,
31101 + .pops = &fibration_plugin_ops,
31102 + .label = "ext-3",
31103 + .desc = "fibrate file by three character extension",
31104 + .linkage = {NULL, NULL}
31105 + },
31106 + .fibre = fibre_ext_3
31107 + }
31108 +};
31109 +
31110 +/*
31111 + * Local variables:
31112 + * c-indentation-style: "K&R"
31113 + * mode-name: "LC"
31114 + * c-basic-offset: 8
31115 + * tab-width: 8
31116 + * fill-column: 79
31117 + * End:
31118 + */
31119 diff --git a/fs/reiser4/plugin/fibration.h b/fs/reiser4/plugin/fibration.h
31120 new file mode 100644
31121 index 0000000..0723cad
31122 --- /dev/null
31123 +++ b/fs/reiser4/plugin/fibration.h
31124 @@ -0,0 +1,37 @@
31125 +/* Copyright 2004 by Hans Reiser, licensing governed by reiser4/README */
31126 +
31127 +/* Fibration plugin used by hashed directory plugin to segment content
31128 + * of directory. See fs/reiser4/plugin/fibration.c for more on this. */
31129 +
31130 +#if !defined( __FS_REISER4_PLUGIN_FIBRATION_H__ )
31131 +#define __FS_REISER4_PLUGIN_FIBRATION_H__
31132 +
31133 +#include "plugin_header.h"
31134 +
31135 +typedef struct fibration_plugin {
31136 + /* generic fields */
31137 + plugin_header h;
31138 +
31139 + __u64(*fibre) (const struct inode * dir, const char *name, int len);
31140 +} fibration_plugin;
31141 +
31142 +typedef enum {
31143 + FIBRATION_LEXICOGRAPHIC,
31144 + FIBRATION_DOT_O,
31145 + FIBRATION_EXT_1,
31146 + FIBRATION_EXT_3,
31147 + LAST_FIBRATION_ID
31148 +} reiser4_fibration_id;
31149 +
31150 +/* __FS_REISER4_PLUGIN_FIBRATION_H__ */
31151 +#endif
31152 +
31153 +/* Make Linus happy.
31154 + Local variables:
31155 + c-indentation-style: "K&R"
31156 + mode-name: "LC"
31157 + c-basic-offset: 8
31158 + tab-width: 8
31159 + fill-column: 120
31160 + End:
31161 +*/
31162 diff --git a/fs/reiser4/plugin/file/Makefile b/fs/reiser4/plugin/file/Makefile
31163 new file mode 100644
31164 index 0000000..134fa7a
31165 --- /dev/null
31166 +++ b/fs/reiser4/plugin/file/Makefile
31167 @@ -0,0 +1,7 @@
31168 +obj-$(CONFIG_REISER4_FS) += file_plugins.o
31169 +
31170 +file_plugins-objs := \
31171 + file.o \
31172 + tail_conversion.o \
31173 + symlink.o \
31174 + cryptcompress.o
31175 diff --git a/fs/reiser4/plugin/file/cryptcompress.c b/fs/reiser4/plugin/file/cryptcompress.c
31176 new file mode 100644
31177 index 0000000..2876e31
31178 --- /dev/null
31179 +++ b/fs/reiser4/plugin/file/cryptcompress.c
31180 @@ -0,0 +1,3760 @@
31181 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
31182 + reiser4/README */
31183 +
31184 +/* This file contains implementations of inode/file/address_space/file plugin
31185 + * operations specific for cryptcompress file plugin which manages files with
31186 + * compressed and encrypted bodies. "Cryptcompress file" is built of items of
31187 + * CTAIL_ID (see http://www.namesys.com/cryptcompress_design.html for details).
31188 + */
31189 +
31190 +#include "../../inode.h"
31191 +#include "../cluster.h"
31192 +#include "../object.h"
31193 +#include "../../tree_walk.h"
31194 +#include "cryptcompress.h"
31195 +
31196 +#include <asm/scatterlist.h>
31197 +#include <linux/pagevec.h>
31198 +#include <asm/uaccess.h>
31199 +#include <linux/swap.h>
31200 +#include <linux/writeback.h>
31201 +#include <linux/random.h>
31202 +
31203 +/* get cryptcompress specific portion of inode */
31204 +cryptcompress_info_t *cryptcompress_inode_data(const struct inode *inode)
31205 +{
31206 + return &reiser4_inode_data(inode)->file_plugin_data.cryptcompress_info;
31207 +}
31208 +
31209 +/* plugin->u.file.init_inode_data */
31210 +void init_inode_data_cryptcompress(struct inode *inode,
31211 + reiser4_object_create_data * crd,
31212 + int create)
31213 +{
31214 + cryptcompress_info_t *data;
31215 +
31216 + data = cryptcompress_inode_data(inode);
31217 + assert("edward-685", data != NULL);
31218 +
31219 + memset(data, 0, sizeof(*data));
31220 +
31221 + turn_on_compression(data);
31222 + set_lattice_factor(data, MIN_LATTICE_FACTOR);
31223 + init_inode_ordering(inode, crd, create);
31224 +}
31225 +
31226 +#if REISER4_DEBUG
31227 +int cryptcompress_inode_ok(struct inode *inode)
31228 +{
31229 + if (!(reiser4_inode_data(inode)->plugin_mask & (1 << PSET_FILE)))
31230 + return 0;
31231 + if (!cluster_shift_ok(inode_cluster_shift(inode)))
31232 + return 0;
31233 + return 1;
31234 +}
31235 +#endif
31236 +
31237 +/* The following is a part of reiser4 cipher key manager
31238 + which is called when opening/creating a cryptcompress file */
31239 +
31240 +/* get/set cipher key info */
31241 +crypto_stat_t * inode_crypto_stat (struct inode * inode)
31242 +{
31243 + assert("edward-90", inode != NULL);
31244 + assert("edward-91", reiser4_inode_data(inode) != NULL);
31245 + return cryptcompress_inode_data(inode)->crypt;
31246 +}
31247 +
31248 +static void set_inode_crypto_stat (struct inode * inode, crypto_stat_t * stat)
31249 +{
31250 + cryptcompress_inode_data(inode)->crypt = stat;
31251 +}
31252 +
31253 +/* allocate a cipher key info */
31254 +crypto_stat_t * reiser4_alloc_crypto_stat (struct inode * inode)
31255 +{
31256 + crypto_stat_t * info;
31257 + int fipsize;
31258 +
31259 + info = kmalloc(sizeof(*info), reiser4_ctx_gfp_mask_get());
31260 + if (!info)
31261 + return ERR_PTR(-ENOMEM);
31262 + memset(info, 0, sizeof (*info));
31263 + fipsize = inode_digest_plugin(inode)->fipsize;
31264 + info->keyid = kmalloc(fipsize, reiser4_ctx_gfp_mask_get());
31265 + if (!info->keyid) {
31266 + kfree(info);
31267 + return ERR_PTR(-ENOMEM);
31268 + }
31269 + info->host = inode;
31270 + return info;
31271 +}
31272 +
31273 +#if 0
31274 +/* allocate/free low-level info for cipher and digest
31275 + transforms */
31276 +static int alloc_crypto_tfms(crypto_stat_t * info)
31277 +{
31278 + struct crypto_blkcipher * ctfm = NULL;
31279 + struct crypto_hash * dtfm = NULL;
31280 + cipher_plugin * cplug = inode_cipher_plugin(info->host);
31281 + digest_plugin * dplug = inode_digest_plugin(info->host);
31282 +
31283 + if (cplug->alloc) {
31284 + ctfm = cplug->alloc();
31285 + if (IS_ERR(ctfm)) {
31286 + warning("edward-1364",
31287 + "Can not allocate info for %s\n",
31288 + cplug->h.desc);
31289 + return RETERR(PTR_ERR(ctfm));
31290 + }
31291 + }
31292 + info_set_cipher(info, ctfm);
31293 + if (dplug->alloc) {
31294 + dtfm = dplug->alloc();
31295 + if (IS_ERR(dtfm)) {
31296 + warning("edward-1365",
31297 + "Can not allocate info for %s\n",
31298 + dplug->h.desc);
31299 + goto unhappy_with_digest;
31300 + }
31301 + }
31302 + info_set_digest(info, dtfm);
31303 + return 0;
31304 + unhappy_with_digest:
31305 + if (cplug->free) {
31306 + cplug->free(ctfm);
31307 + info_set_cipher(info, NULL);
31308 + }
31309 + return RETERR(PTR_ERR(dtfm));
31310 +}
31311 +#endif
31312 +
31313 +static void
31314 +free_crypto_tfms(crypto_stat_t * info)
31315 +{
31316 + assert("edward-1366", info != NULL);
31317 + if (!info_get_cipher(info)) {
31318 + assert("edward-1601", !info_get_digest(info));
31319 + return;
31320 + }
31321 + inode_cipher_plugin(info->host)->free(info_get_cipher(info));
31322 + info_set_cipher(info, NULL);
31323 + inode_digest_plugin(info->host)->free(info_get_digest(info));
31324 + info_set_digest(info, NULL);
31325 + return;
31326 +}
31327 +
31328 +#if 0
31329 +/* create a key fingerprint for disk stat-data */
31330 +static int create_keyid (crypto_stat_t * info, crypto_data_t * data)
31331 +{
31332 + int ret = -ENOMEM;
31333 + size_t blk, pad;
31334 + __u8 * dmem;
31335 + __u8 * cmem;
31336 + struct hash_desc ddesc;
31337 + struct blkcipher_desc cdesc;
31338 + struct scatterlist sg;
31339 +
31340 + assert("edward-1367", info != NULL);
31341 + assert("edward-1368", info->keyid != NULL);
31342 +
31343 + ddesc.tfm = info_get_digest(info);
31344 + ddesc.flags = 0;
31345 + cdesc.tfm = info_get_cipher(info);
31346 + cdesc.flags = 0;
31347 +
31348 + dmem = kmalloc((size_t)crypto_hash_digestsize(ddesc.tfm),
31349 + reiser4_ctx_gfp_mask_get());
31350 + if (!dmem)
31351 + goto exit1;
31352 +
31353 + blk = crypto_blkcipher_blocksize(cdesc.tfm);
31354 +
31355 + pad = data->keyid_size % blk;
31356 + pad = (pad ? blk - pad : 0);
31357 +
31358 + cmem = kmalloc((size_t)data->keyid_size + pad,
31359 + reiser4_ctx_gfp_mask_get());
31360 + if (!cmem)
31361 + goto exit2;
31362 + memcpy(cmem, data->keyid, data->keyid_size);
31363 + memset(cmem + data->keyid_size, 0, pad);
31364 +
31365 + sg.page = virt_to_page(cmem);
31366 + sg.offset = offset_in_page(cmem);
31367 + sg.length = data->keyid_size + pad;
31368 +
31369 + ret = crypto_blkcipher_encrypt(&cdesc, &sg, &sg,
31370 + data->keyid_size + pad);
31371 + if (ret) {
31372 + warning("edward-1369",
31373 + "encryption failed flags=%x\n", cdesc.flags);
31374 + goto exit3;
31375 + }
31376 + ret = crypto_hash_digest(&ddesc, &sg, sg.length, dmem);
31377 + if (ret) {
31378 + warning("edward-1602",
31379 + "digest failed flags=%x\n", ddesc.flags);
31380 + goto exit3;
31381 + }
31382 + memcpy(info->keyid, dmem, inode_digest_plugin(info->host)->fipsize);
31383 + exit3:
31384 + kfree(cmem);
31385 + exit2:
31386 + kfree(dmem);
31387 + exit1:
31388 + return ret;
31389 +}
31390 +#endif
31391 +
31392 +static void destroy_keyid(crypto_stat_t * info)
31393 +{
31394 + assert("edward-1370", info != NULL);
31395 + assert("edward-1371", info->keyid != NULL);
31396 + kfree(info->keyid);
31397 + return;
31398 +}
31399 +
31400 +static void __free_crypto_stat (struct inode * inode)
31401 +{
31402 + crypto_stat_t * info = inode_crypto_stat(inode);
31403 + assert("edward-1372", info != NULL);
31404 +
31405 + free_crypto_tfms(info);
31406 + destroy_keyid(info);
31407 + kfree(info);
31408 +}
31409 +
31410 +#if 0
31411 +static void instantiate_crypto_stat(crypto_stat_t * info)
31412 +{
31413 + assert("edward-1373", info != NULL);
31414 + assert("edward-1374", info->inst == 0);
31415 + info->inst = 1;
31416 +}
31417 +#endif
31418 +
31419 +static void uninstantiate_crypto_stat(crypto_stat_t * info)
31420 +{
31421 + assert("edward-1375", info != NULL);
31422 + info->inst = 0;
31423 +}
31424 +
31425 +static int crypto_stat_instantiated(crypto_stat_t * info)
31426 +{
31427 + return info->inst;
31428 +}
31429 +
31430 +static int inode_has_cipher_key(struct inode * inode)
31431 +{
31432 + assert("edward-1376", inode != NULL);
31433 + return inode_crypto_stat(inode) &&
31434 + crypto_stat_instantiated(inode_crypto_stat(inode));
31435 +}
31436 +
31437 +static void free_crypto_stat (struct inode * inode)
31438 +{
31439 + uninstantiate_crypto_stat(inode_crypto_stat(inode));
31440 + __free_crypto_stat(inode);
31441 +}
31442 +
31443 +static int need_cipher(struct inode * inode)
31444 +{
31445 + return inode_cipher_plugin(inode) !=
31446 + cipher_plugin_by_id(NONE_CIPHER_ID);
31447 +}
31448 +
31449 +/* Create a crypto-stat and attach the result to @object.
31450 + On success, the low-level cipher info contains
31451 + an instantiated key */
31452 +#if 0
31453 +crypto_stat_t *
31454 +create_crypto_stat(struct inode * object,
31455 + crypto_data_t * data /* this contains an (uninstantiated)
31456 + cipher key imported from user
31457 + space */)
31458 +{
31459 + int ret;
31460 + crypto_stat_t * info;
31461 +
31462 + assert("edward-1377", data != NULL);
31463 + assert("edward-1378", need_cipher(object));
31464 +
31465 + if (inode_file_plugin(object) !=
31466 + file_plugin_by_id(DIRECTORY_FILE_PLUGIN_ID))
31467 + return ERR_PTR(-EINVAL);
31468 +
31469 + info = reiser4_alloc_crypto_stat(object);
31470 + if (IS_ERR(info))
31471 + return info;
31472 + ret = alloc_crypto_tfms(info);
31473 + if (ret)
31474 + goto err;
31475 + /* instantiating a key */
31476 + ret = crypto_blkcipher_setkey(info_get_cipher(info),
31477 + data->key,
31478 + data->keysize);
31479 + if (ret) {
31480 + warning("edward-1379",
31481 + "setkey failed flags=%x\n",
31482 + crypto_blkcipher_get_flags(info_get_cipher(info)));
31483 + goto err;
31484 + }
31485 + info->keysize = data->keysize;
31486 + ret = create_keyid(info, data);
31487 + if (ret)
31488 + goto err;
31489 + instantiate_crypto_stat(info);
31490 + return info;
31491 + err:
31492 + __free_crypto_stat(object);
31493 + return ERR_PTR(ret);
31494 +}
31495 +#endif
31496 +
31497 +/* increment/decrement a load counter when
31498 + attaching/detaching the crypto-stat to any object */
31499 +static void load_crypto_stat(crypto_stat_t * info)
31500 +{
31501 + assert("edward-1380", info != NULL);
31502 + inc_keyload_count(info);
31503 +}
31504 +
31505 +static void unload_crypto_stat(struct inode * inode)
31506 +{
31507 + crypto_stat_t * info = inode_crypto_stat(inode);
31508 + assert("edward-1381", info->keyload_count > 0);
31509 +
31510 + dec_keyload_count(inode_crypto_stat(inode));
31511 + if (info->keyload_count == 0)
31512 + /* final release */
31513 + free_crypto_stat(inode);
31514 +}
31515 +
31516 +/* attach/detach an existing crypto-stat */
31517 +void reiser4_attach_crypto_stat(struct inode * inode, crypto_stat_t * info)
31518 +{
31519 + assert("edward-1382", inode != NULL);
31520 + assert("edward-1383", info != NULL);
31521 + assert("edward-1384", inode_crypto_stat(inode) == NULL);
31522 +
31523 + set_inode_crypto_stat(inode, info);
31524 + load_crypto_stat(info);
31525 +}
31526 +
31527 +/* returns true if a crypto-stat can be attached to the @host */
31528 +#if REISER4_DEBUG
31529 +static int host_allows_crypto_stat(struct inode * host)
31530 +{
31531 + int ret;
31532 + file_plugin * fplug = inode_file_plugin(host);
31533 +
31534 + switch (fplug->h.id) {
31535 + case CRYPTCOMPRESS_FILE_PLUGIN_ID:
31536 + ret = 1;
31537 + break;
31538 + default:
31539 + ret = 0;
31540 + }
31541 + return ret;
31542 +}
31543 +#endif /* REISER4_DEBUG */
31544 +
31545 +static void reiser4_detach_crypto_stat(struct inode * inode)
31546 +{
31547 + assert("edward-1385", inode != NULL);
31548 + assert("edward-1386", host_allows_crypto_stat(inode));
31549 +
31550 + if (inode_crypto_stat(inode))
31551 + unload_crypto_stat(inode);
31552 + set_inode_crypto_stat(inode, NULL);
31553 +}
31554 +
31555 +#if 0
31556 +
31557 +/* compare fingerprints of @child and @parent */
31558 +static int keyid_eq(crypto_stat_t * child, crypto_stat_t * parent)
31559 +{
31560 + return !memcmp(child->keyid, parent->keyid, info_digest_plugin(parent)->fipsize);
31561 +}
31562 +
31563 +/* check if a crypto-stat (which is bound to @parent) can be inherited */
31564 +int can_inherit_crypto_cryptcompress(struct inode *child, struct inode *parent)
31565 +{
31566 + if (!need_cipher(child))
31567 + return 0;
31568 + /* the child is created */
31569 + if (!inode_crypto_stat(child))
31570 + return 1;
31571 + /* the child is looked up */
31572 + if (!inode_crypto_stat(parent))
31573 + return 0;
31574 + return (inode_cipher_plugin(child) == inode_cipher_plugin(parent) &&
31575 + inode_digest_plugin(child) == inode_digest_plugin(parent) &&
31576 + inode_crypto_stat(child)->keysize == inode_crypto_stat(parent)->keysize &&
31577 + keyid_eq(inode_crypto_stat(child), inode_crypto_stat(parent)));
31578 +}
31579 +#endif
31580 +
31581 +/* helper functions for ->create() method of the cryptcompress plugin */
31582 +static int inode_set_crypto(struct inode * object)
31583 +{
31584 + reiser4_inode * info;
31585 + if (!inode_crypto_stat(object)) {
31586 + if (need_cipher(object))
31587 + return RETERR(-EINVAL);
31588 + /* the file is not to be encrypted */
31589 + return 0;
31590 + }
31591 + info = reiser4_inode_data(object);
31592 + info->extmask |= (1 << CRYPTO_STAT);
31593 + return 0;
31594 +}
31595 +
31596 +static int inode_init_compression(struct inode * object)
31597 +{
31598 + int result = 0;
31599 + assert("edward-1461", object != NULL);
31600 + if (inode_compression_plugin(object)->init)
31601 + result = inode_compression_plugin(object)->init();
31602 + return result;
31603 +}
31604 +
31605 +static int inode_check_cluster(struct inode * object)
31606 +{
31607 + assert("edward-696", object != NULL);
31608 +
31609 + if (inode_cluster_size(object) < PAGE_CACHE_SIZE) {
31610 + warning("edward-1320", "Can not support '%s' "
31611 + "logical clusters (less then page size)",
31612 + inode_cluster_plugin(object)->h.label);
31613 + return RETERR(-EINVAL);
31614 + }
31615 + return 0;
31616 +}
31617 +
31618 +/* ->destroy_inode() method of the cryptcompress plugin */
31619 +void destroy_inode_cryptcompress(struct inode * inode)
31620 +{
31621 + assert("edward-23", cryptcompress_inode_data(inode)->pgcount == 0);
31622 + reiser4_detach_crypto_stat(inode);
31623 + return;
31624 +}
31625 +
31626 +/* ->create() method of the cryptcompress plugin
31627 +
31628 +. install plugins
31629 +. attach crypto info if specified
31630 +. attach compression info if specified
31631 +. attach cluster info
31632 +*/
31633 +int
31634 +create_cryptcompress(struct inode *object, struct inode *parent,
31635 + reiser4_object_create_data * data)
31636 +{
31637 + int result;
31638 + reiser4_inode *info;
31639 +
31640 + assert("edward-23", object != NULL);
31641 + assert("edward-24", parent != NULL);
31642 + assert("edward-30", data != NULL);
31643 + assert("edward-26", reiser4_inode_get_flag(object, REISER4_NO_SD));
31644 + assert("edward-27", data->id == CRYPTCOMPRESS_FILE_PLUGIN_ID);
31645 +
31646 + info = reiser4_inode_data(object);
31647 +
31648 + assert("edward-29", info != NULL);
31649 +
31650 + /* set file bit */
31651 + info->plugin_mask |= (1 << PSET_FILE);
31652 +
31653 + /* set crypto */
31654 + result = inode_set_crypto(object);
31655 + if (result)
31656 + goto error;
31657 + /* set compression */
31658 + result = inode_init_compression(object);
31659 + if (result)
31660 + goto error;
31661 + /* set cluster */
31662 + result = inode_check_cluster(object);
31663 + if (result)
31664 + goto error;
31665 +
31666 + /* save everything in disk stat-data */
31667 + result = write_sd_by_inode_common(object);
31668 + if (!result)
31669 + return 0;
31670 + error:
31671 + reiser4_detach_crypto_stat(object);
31672 + return result;
31673 +}
31674 +
31675 +/* ->open() method of the cryptcompress plugin */
31676 +int open_object_cryptcompress(struct inode * inode, struct file * file)
31677 +{
31678 + int result;
31679 + struct inode * parent;
31680 +
31681 + assert("edward-1394", inode != NULL);
31682 + assert("edward-1395", file != NULL);
31683 + assert("edward-1396", file != NULL);
31684 + assert("edward-1397", file->f_dentry->d_inode == inode);
31685 + assert("edward-1398", file->f_dentry->d_parent != NULL);
31686 + assert("edward-1399", file->f_dentry->d_parent->d_inode != NULL);
31687 + assert("edward-698",
31688 + inode_file_plugin(inode) ==
31689 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
31690 + result = inode_check_cluster(inode);
31691 + if (result)
31692 + return result;
31693 + result = inode_init_compression(inode);
31694 + if (result)
31695 + return result;
31696 + if (!need_cipher(inode))
31697 + /* the file is not to be ciphered */
31698 + return 0;
31699 + parent = file->f_dentry->d_parent->d_inode;
31700 + if (!inode_has_cipher_key(inode))
31701 + return RETERR(-EINVAL);
31702 + return 0;
31703 +}
31704 +
31705 +/* returns the block size attribute of the cipher algorithm */
31706 +static unsigned int
31707 +cipher_blocksize(struct inode * inode)
31708 +{
31709 + assert("edward-758", need_cipher(inode));
31710 + assert("edward-1400", inode_crypto_stat(inode) != NULL);
31711 + return crypto_blkcipher_blocksize
31712 + (info_get_cipher(inode_crypto_stat(inode)));
31713 +}
31714 +
31715 +/* returns offset translated by scale factor of the crypto-algorithm */
31716 +static loff_t inode_scaled_offset (struct inode * inode,
31717 + const loff_t src_off /* input offset */)
31718 +{
31719 + assert("edward-97", inode != NULL);
31720 +
31721 + if (!need_cipher(inode) ||
31722 + src_off == get_key_offset(reiser4_min_key()) ||
31723 + src_off == get_key_offset(reiser4_max_key()))
31724 + return src_off;
31725 +
31726 + return inode_cipher_plugin(inode)->scale(inode,
31727 + cipher_blocksize(inode),
31728 + src_off);
31729 +}
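+
+/* Example (assuming the cipher plugin's ->scale() rounds offsets up to
+   a multiple of the cipher block size): with 16-byte cipher blocks an
+   offset of 1000 scales to 1008, while offsets already aligned to 16,
+   and the min/max key offsets handled above, pass through unchanged. */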
31730 +
31731 +/* returns disk cluster size */
31732 +size_t inode_scaled_cluster_size(struct inode * inode)
31733 +{
31734 + assert("edward-110", inode != NULL);
31735 +
31736 + return inode_scaled_offset(inode, inode_cluster_size(inode));
31737 +}
31738 +
31739 +static int new_cluster(reiser4_cluster_t * clust, struct inode *inode)
31740 +{
31741 + return (clust_to_off(clust->index, inode) >= inode->i_size);
31742 +}
31743 +
31744 +/* set number of cluster pages */
31745 +static void set_cluster_nrpages(reiser4_cluster_t * clust, struct inode *inode)
31746 +{
31747 + reiser4_slide_t *win;
31748 +
31749 + assert("edward-180", clust != NULL);
31750 + assert("edward-1040", inode != NULL);
31751 +
31752 + win = clust->win;
31753 + if (!win) {
31754 + /* NOTE-EDWARD: i_size should be protected */
31755 + clust->nr_pages =
31756 + count_to_nrpages(fsize_to_count(clust, inode));
31757 + return;
31758 + }
31759 + assert("edward-1176", clust->op != PCL_UNKNOWN);
31760 + assert("edward-1064", win->off + win->count + win->delta != 0);
31761 +
31762 + if (win->stat == HOLE_WINDOW &&
31763 + win->off == 0 && win->count == inode_cluster_size(inode)) {
31764 + /* special case: we start writing a hole from a fake cluster */
31765 + clust->nr_pages = 0;
31766 + return;
31767 + }
31768 + clust->nr_pages =
31769 + count_to_nrpages(max_count(win->off + win->count + win->delta,
31770 + fsize_to_count(clust, inode)));
31771 + return;
31772 +}
31773 +
31774 +/* ->key_by_inode() method of the cryptcompress plugin */
31775 +/* see plugin/plugin.h for details */
31776 +int
31777 +key_by_inode_cryptcompress(struct inode *inode, loff_t off, reiser4_key * key)
31778 +{
31779 + loff_t clust_off;
31780 +
31781 + assert("edward-64", inode != 0);
31782 + // assert("edward-112", ergo(off != get_key_offset(reiser4_max_key()), !off_to_cloff(off, inode)));
31783 + /* don't come here with other offsets */
31784 +
31785 + clust_off =
31786 + (off ==
31787 + get_key_offset(reiser4_max_key())? get_key_offset(reiser4_max_key()) :
31788 + off_to_clust_to_off(off, inode));
31789 +
31790 + key_by_inode_and_offset_common(inode, 0, key);
31791 + set_key_offset(key,
31792 + (__u64) (!inode_crypto_stat(inode) ? clust_off :
31793 + inode_scaled_offset(inode, clust_off)));
31794 + return 0;
31795 +}
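+
+/* Example (assuming 64 KB logical clusters and no cipher): an offset of
+   100000 lies in logical cluster 1, so clust_off becomes 65536 and so
+   does the key offset; every offset inside one logical cluster yields
+   the same key, which is what lets a disk cluster be addressed as a
+   single unit. */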
31796 +
31797 +/* plugin->flow_by_inode */
31798 +int
31799 +flow_by_inode_cryptcompress(struct inode *inode /* file to build flow for */ ,
31800 + const char __user *buf /* user level buffer */ ,
31801 + int user /* 1 if @buf is of user space, 0 - if it is
31802 + kernel space */ ,
31803 + loff_t size /* buffer size */ ,
31804 + loff_t off /* offset to start io from */ ,
31805 + rw_op op /* READ or WRITE */ ,
31806 + flow_t * f /* resulting flow */ )
31807 +{
31808 + assert("edward-436", f != NULL);
31809 + assert("edward-149", inode != NULL);
31810 + assert("edward-150", inode_file_plugin(inode) != NULL);
31811 +
31812 + f->length = size;
31813 + memcpy(&f->data, &buf, sizeof(buf));
31814 + f->user = user;
31815 + f->op = op;
31816 +
31817 + if (op == WRITE_OP && user == 1)
31818 + return 0;
31819 + return key_by_inode_cryptcompress(inode, off, &f->key);
31820 +}
31821 +
31822 +static int
31823 +cryptcompress_hint_validate(hint_t * hint, const reiser4_key * key,
31824 + znode_lock_mode lock_mode)
31825 +{
31826 + coord_t *coord;
31827 +
31828 + assert("edward-704", hint != NULL);
31829 + assert("edward-1089", !hint_is_valid(hint));
31830 + assert("edward-706", hint->lh.owner == NULL);
31831 +
31832 + coord = &hint->ext_coord.coord;
31833 +
31834 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
31835 + /* hint either not set or set by different operation */
31836 + return RETERR(-E_REPEAT);
31837 +
31838 + if (get_key_offset(key) != hint->offset)
31839 + /* hint is set for different key */
31840 + return RETERR(-E_REPEAT);
31841 +
31842 + assert("edward-707", reiser4_schedulable());
31843 +
31844 + return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord,
31845 + key, &hint->lh, lock_mode,
31846 + ZNODE_LOCK_LOPRI);
31847 +}
31848 +
31849 +/* reserve disk space when writing a logical cluster */
31850 +static int reserve4cluster(struct inode *inode, reiser4_cluster_t *clust)
31851 +{
31852 + int result = 0;
31853 +
31854 + assert("edward-965", reiser4_schedulable());
31855 + assert("edward-439", inode != NULL);
31856 + assert("edward-440", clust != NULL);
31857 + assert("edward-441", clust->pages != NULL);
31858 +
31859 + if (clust->nr_pages == 0) {
31860 + assert("edward-1152", clust->win != NULL);
31861 + assert("edward-1153", clust->win->stat == HOLE_WINDOW);
31862 + /* don't reserve space for a fake disk cluster */
31863 + return 0;
31864 + }
31865 + assert("edward-442", jprivate(clust->pages[0]) != NULL);
31866 +
31867 + result = reiser4_grab_space_force(estimate_insert_cluster(inode) +
31868 + estimate_update_cluster(inode),
31869 + BA_CAN_COMMIT);
31870 + if (result)
31871 + return result;
31872 + clust->reserved = 1;
31873 + grabbed2cluster_reserved(estimate_insert_cluster(inode) +
31874 + estimate_update_cluster(inode));
31875 +#if REISER4_DEBUG
31876 + clust->reserved_prepped = estimate_update_cluster(inode);
31877 + clust->reserved_unprepped = estimate_insert_cluster(inode);
31878 +#endif
31879 + /* there can be space grabbed by txnmgr_force_commit_all */
31880 + return 0;
31881 +}
31882 +
31883 +/* free reserved disk space if writing a logical cluster fails */
31884 +static void
31885 +free_reserved4cluster(struct inode *inode, reiser4_cluster_t * clust, int count)
31886 +{
31887 + assert("edward-967", clust->reserved == 1);
31888 +
31889 + cluster_reserved2free(count);
31890 + clust->reserved = 0;
31891 +}
31892 +
31893 +/* The core search procedure of the cryptcompress plugin.
31894 + If the returned value is not cbk_errored, the current znode is locked */
31895 +static int find_cluster_item(hint_t * hint,
31896 + const reiser4_key * key, /* key of the item we are
31897 + looking for */
31898 + znode_lock_mode lock_mode /* which lock */ ,
31899 + ra_info_t * ra_info, lookup_bias bias, __u32 flags)
31900 +{
31901 + int result;
31902 + reiser4_key ikey;
31903 + int went_right = 0;
31904 + coord_t *coord = &hint->ext_coord.coord;
31905 + coord_t orig = *coord;
31906 +
31907 + assert("edward-152", hint != NULL);
31908 +
31909 + if (!hint_is_valid(hint)) {
31910 + result = cryptcompress_hint_validate(hint, key, lock_mode);
31911 + if (result == -E_REPEAT)
31912 + goto traverse_tree;
31913 + else if (result) {
31914 + assert("edward-1216", 0);
31915 + return result;
31916 + }
31917 + hint_set_valid(hint);
31918 + }
31919 + assert("edward-709", znode_is_any_locked(coord->node));
31920 +
31921 + /* An in-place lookup is going on here; we just need to check
31922 + whether the next item at @coord matches the hinted @key */
31923 +
31924 + if (equal_to_rdk(coord->node, key)) {
31925 + result = goto_right_neighbor(coord, &hint->lh);
31926 + if (result == -E_NO_NEIGHBOR) {
31927 + assert("edward-1217", 0);
31928 + return RETERR(-EIO);
31929 + }
31930 + if (result)
31931 + return result;
31932 + assert("edward-1218", equal_to_ldk(coord->node, key));
31933 + went_right = 1;
31934 + } else {
31935 + coord->item_pos++;
31936 + coord->unit_pos = 0;
31937 + coord->between = AT_UNIT;
31938 + }
31939 + result = zload(coord->node);
31940 + if (result)
31941 + return result;
31942 + assert("edward-1219", !node_is_empty(coord->node));
31943 +
31944 + if (!coord_is_existing_item(coord)) {
31945 + zrelse(coord->node);
31946 + goto not_found;
31947 + }
31948 + item_key_by_coord(coord, &ikey);
31949 + zrelse(coord->node);
31950 + if (!keyeq(key, &ikey))
31951 + goto not_found;
31952 + /* Ok, item is found, update node counts */
31953 + if (went_right)
31954 + dclust_inc_extension_ncount(hint);
31955 + return CBK_COORD_FOUND;
31956 +
31957 + not_found:
31958 + assert("edward-1220", coord->item_pos > 0);
31959 + //coord->item_pos--;
31960 + /* roll back */
31961 + *coord = orig;
31962 + ON_DEBUG(coord_update_v(coord));
31963 + return CBK_COORD_NOTFOUND;
31964 +
31965 + traverse_tree:
31966 + assert("edward-713", hint->lh.owner == NULL);
31967 + assert("edward-714", reiser4_schedulable());
31968 +
31969 + reiser4_unset_hint(hint);
31970 + dclust_init_extension(hint);
31971 + coord_init_zero(coord);
31972 + result = coord_by_key(current_tree, key, coord, &hint->lh,
31973 + lock_mode, bias, LEAF_LEVEL, LEAF_LEVEL,
31974 + CBK_UNIQUE | flags, ra_info);
31975 + if (cbk_errored(result))
31976 + return result;
31977 + if(result == CBK_COORD_FOUND)
31978 + dclust_inc_extension_ncount(hint);
31979 + hint_set_valid(hint);
31980 + return result;
31981 +}
31982 +
31983 +/* This function is called by the deflate[inflate] manager when
31984 + creating a transformed/plain stream, to check whether we should
31985 + create/cut some overhead. If it returns true, then @oh
31986 + contains the size of that overhead.
31987 + */
31988 +static int
31989 +need_cut_or_align(struct inode * inode, reiser4_cluster_t * clust,
31990 + rw_op rw, int * oh)
31991 +{
31992 + tfm_cluster_t * tc = &clust->tc;
31993 + switch (rw) {
31994 + case WRITE_OP: /* estimate align */
31995 + *oh = tc->len % cipher_blocksize(inode);
31996 + if (*oh != 0)
31997 + return 1;
31998 + break;
31999 + case READ_OP: /* estimate cut */
32000 + *oh = *(tfm_output_data(clust) + tc->len - 1);
32001 + break;
32002 + default:
32003 + impossible("edward-1401", "bad option");
32004 + }
32005 + return (tc->len != tc->lsize);
32006 +}
32007 +
32008 +/* create/cut an overhead of transformed/plain stream */
32009 +static void
32010 +align_or_cut_overhead(struct inode * inode, reiser4_cluster_t * clust, rw_op rw)
32011 +{
32012 + int oh;
32013 + cipher_plugin * cplug = inode_cipher_plugin(inode);
32014 +
32015 + assert("edward-1402", need_cipher(inode));
32016 +
32017 + if (!need_cut_or_align(inode, clust, rw, &oh))
32018 + return;
32019 + switch (rw) {
32020 + case WRITE_OP: /* do align */
32021 + clust->tc.len +=
32022 + cplug->align_stream(tfm_input_data(clust) +
32023 + clust->tc.len, clust->tc.len,
32024 + cipher_blocksize(inode));
32025 + *(tfm_input_data(clust) + clust->tc.len - 1) =
32026 + cipher_blocksize(inode) - oh;
32027 + break;
32028 + case READ_OP: /* do cut */
32029 + assert("edward-1403", oh <= cipher_blocksize(inode));
32030 + clust->tc.len -= oh;
32031 + break;
32032 + default:
32033 + impossible("edward-1404", "bad option");
32034 + }
32035 + return;
32036 +}
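+
+/* Worked example (16-byte cipher blocks): on WRITE_OP a 100-byte stream
+   has oh = 100 % 16 = 4; ->align_stream() pads it to 112 bytes and the
+   last byte is set to 16 - 4 = 12, the overhead size. On READ_OP that
+   control byte is read back as oh = 12 and the stream is cut down to
+   the original 100 bytes. */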
32037 +
32038 +/* the following two functions are to evaluate results
32039 + of compression transform */
32040 +static unsigned
32041 +max_cipher_overhead(struct inode * inode)
32042 +{
32043 + if (!need_cipher(inode) || !inode_cipher_plugin(inode)->align_stream)
32044 + return 0;
32045 + return cipher_blocksize(inode);
32046 +}
32047 +
32048 +static int deflate_overhead(struct inode *inode)
32049 +{
32050 + return (inode_compression_plugin(inode)->
32051 + checksum ? DC_CHECKSUM_SIZE : 0);
32052 +}
32053 +
32054 +static unsigned deflate_overrun(struct inode * inode, int ilen)
32055 +{
32056 + return coa_overrun(inode_compression_plugin(inode), ilen);
32057 +}
32058 +
32059 +/* Estimate the compressibility of a logical cluster by the various
32060 + policies represented by the compression mode plugin.
32061 + If this returns false, then the compressor won't be called for
32062 + the cluster with index @index.
32063 +*/
32064 +static int should_compress(tfm_cluster_t * tc, cloff_t index,
32065 + struct inode *inode)
32066 +{
32067 + compression_plugin *cplug = inode_compression_plugin(inode);
32068 + compression_mode_plugin *mplug = inode_compression_mode_plugin(inode);
32069 +
32070 + assert("edward-1321", tc->len != 0);
32071 + assert("edward-1322", cplug != NULL);
32072 + assert("edward-1323", mplug != NULL);
32073 +
32074 + return /* estimate by size */
32075 + (cplug->min_size_deflate ?
32076 + tc->len >= cplug->min_size_deflate() :
32077 + 1) &&
32078 + /* estimate by compression mode plugin */
32079 + (mplug->should_deflate ?
32080 + mplug->should_deflate(inode, index) :
32081 + 1);
32082 +}
32083 +
32084 +/* Evaluating results of compression transform.
32085 + Returns true if we need to accept these results */
32086 +static int
32087 +save_compressed(int size_before, int size_after, struct inode * inode)
32088 +{
32089 + return (size_after + deflate_overhead(inode) +
32090 + max_cipher_overhead(inode) < size_before);
32091 +}
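+
+/* Example (assuming a compression plugin with a checksum and a cipher
+   with 16-byte blocks): a 65536-byte logical cluster compressed to
+   40000 bytes is accepted, since 40000 + 4 + 16 is still below 65536;
+   one that only shrinks to 65530 bytes is discarded as not worth
+   storing compressed. */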
32092 +
32093 +/* Guess result of the evaluation above */
32094 +static int
32095 +need_inflate(reiser4_cluster_t * clust, struct inode *inode,
32096 + int encrypted /* is cluster encrypted */ )
32097 +{
32098 + tfm_cluster_t *tc = &clust->tc;
32099 +
32100 + assert("edward-142", tc != 0);
32101 + assert("edward-143", inode != NULL);
32102 +
32103 + return tc->len <
32104 + (encrypted ?
32105 + inode_scaled_offset(inode, tc->lsize) :
32106 + tc->lsize);
32107 +}
32108 +
32109 +/* If results of compression were accepted, then we add
32110 + a checksum to catch possible disk cluster corruption.
32111 + The following is a format of the data stored in disk clusters:
32112 +
32113 + data This is (transformed) logical cluster.
32114 + cipher_overhead This is created by ->align() method
32115 + of cipher plugin. May be absent.
32116 + checksum (4) This is created by ->checksum method
32117 + of compression plugin to check
32118 + integrity. May be absent.
32119 +
32120 + Crypto overhead format:
32121 +
32122 + data
32123 + control_byte (1) contains aligned overhead size:
32124 + 1 <= overhead <= cipher_blksize
32125 +*/
32126 +/* Append a checksum at the end of a transformed stream */
32127 +static void dc_set_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
32128 +{
32129 + __u32 checksum;
32130 +
32131 + assert("edward-1309", tc != NULL);
32132 + assert("edward-1310", tc->len > 0);
32133 + assert("edward-1311", cplug->checksum != NULL);
32134 +
32135 + checksum = cplug->checksum(tfm_stream_data(tc, OUTPUT_STREAM), tc->len);
32136 + put_unaligned(cpu_to_le32(checksum),
32137 + (d32 *)(tfm_stream_data(tc, OUTPUT_STREAM) + tc->len));
32138 + tc->len += (int)DC_CHECKSUM_SIZE;
32139 +}
32140 +
32141 +/* Check a disk cluster checksum.
32142 + Returns 0 if checksum is correct, otherwise returns 1 */
32143 +static int dc_check_checksum(compression_plugin * cplug, tfm_cluster_t * tc)
32144 +{
32145 + assert("edward-1312", tc != NULL);
32146 + assert("edward-1313", tc->len > (int)DC_CHECKSUM_SIZE);
32147 + assert("edward-1314", cplug->checksum != NULL);
32148 +
32149 + if (cplug->checksum(tfm_stream_data(tc, INPUT_STREAM),
32150 + tc->len - (int)DC_CHECKSUM_SIZE) !=
32151 + le32_to_cpu(get_unaligned((d32 *)
32152 + (tfm_stream_data(tc, INPUT_STREAM)
32153 + + tc->len - (int)DC_CHECKSUM_SIZE)))) {
32154 + warning("edward-156",
32155 + "Bad disk cluster checksum %d, (should be %d) Fsck?\n",
32156 + (int)le32_to_cpu
32157 + (get_unaligned((d32 *)
32158 + (tfm_stream_data(tc, INPUT_STREAM) +
32159 + tc->len - (int)DC_CHECKSUM_SIZE))),
32160 + (int)cplug->checksum
32161 + (tfm_stream_data(tc, INPUT_STREAM),
32162 + tc->len - (int)DC_CHECKSUM_SIZE));
32163 + return 1;
32164 + }
32165 + tc->len -= (int)DC_CHECKSUM_SIZE;
32166 + return 0;
32167 +}
32168 +
32169 +/* get input/output stream for some transform action */
32170 +int grab_tfm_stream(struct inode * inode, tfm_cluster_t * tc,
32171 + tfm_stream_id id)
32172 +{
32173 + size_t size = inode_scaled_cluster_size(inode);
32174 +
32175 + assert("edward-901", tc != NULL);
32176 + assert("edward-1027", inode_compression_plugin(inode) != NULL);
32177 +
32178 + if (cluster_get_tfm_act(tc) == TFMA_WRITE)
32179 + size += deflate_overrun(inode, inode_cluster_size(inode));
32180 +
32181 + if (!tfm_stream(tc, id) && id == INPUT_STREAM)
32182 + alternate_streams(tc);
32183 + if (!tfm_stream(tc, id))
32184 + return alloc_tfm_stream(tc, size, id);
32185 +
32186 + assert("edward-902", tfm_stream_is_set(tc, id));
32187 +
32188 + if (tfm_stream_size(tc, id) < size)
32189 + return realloc_tfm_stream(tc, size, id);
32190 + return 0;
32191 +}
32192 +
32193 +/* Common deflate manager */
32194 +int reiser4_deflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
32195 +{
32196 + int result = 0;
32197 + int compressed = 0;
32198 + int encrypted = 0;
32199 + tfm_cluster_t * tc = &clust->tc;
32200 + compression_plugin * coplug;
32201 +
32202 + assert("edward-401", inode != NULL);
32203 + assert("edward-903", tfm_stream_is_set(tc, INPUT_STREAM));
32204 + assert("edward-1348", cluster_get_tfm_act(tc) == TFMA_WRITE);
32205 + assert("edward-498", !tfm_cluster_is_uptodate(tc));
32206 +
32207 + coplug = inode_compression_plugin(inode);
32208 + if (should_compress(tc, clust->index, inode)) {
32209 + /* try to compress, discard bad results */
32210 + __u32 dst_len;
32211 + compression_mode_plugin * mplug =
32212 + inode_compression_mode_plugin(inode);
32213 + assert("edward-602", coplug != NULL);
32214 + assert("edward-1423", coplug->compress != NULL);
32215 +
32216 + result = grab_coa(tc, coplug);
32217 + if (result) {
32218 + warning("edward-1424",
32219 + "alloc_coa failed with ret=%d, skipped compression",
32220 + result);
32221 + goto cipher;
32222 + }
32223 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32224 + if (result) {
32225 + warning("edward-1425",
32226 + "alloc stream failed with ret=%d, skipped compression",
32227 + result);
32228 + goto cipher;
32229 + }
32230 + dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
32231 + coplug->compress(get_coa(tc, coplug->h.id, tc->act),
32232 + tfm_input_data(clust), tc->len,
32233 + tfm_output_data(clust), &dst_len);
32234 + /* make sure we didn't overwrite extra bytes */
32235 + assert("edward-603",
32236 + dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
32237 +
32238 + /* evaluate results of compression transform */
32239 + if (save_compressed(tc->len, dst_len, inode)) {
32240 + /* good result, accept */
32241 + tc->len = dst_len;
32242 + if (mplug->accept_hook != NULL) {
32243 + result = mplug->accept_hook(inode, clust->index);
32244 + if (result)
32245 + warning("edward-1426",
32246 + "accept_hook failed with ret=%d",
32247 + result);
32248 + }
32249 + compressed = 1;
32250 + }
32251 + else {
32252 + /* bad result, discard */
32253 +#if REISER4_DEBUG
32254 + if (cluster_is_complete(clust, inode))
32255 + warning("edward-1338",
32256 + "incompressible cluster %lu (inode %llu)",
32257 + clust->index,
32258 + (unsigned long long)get_inode_oid(inode));
32259 +#endif
32260 + if (mplug->discard_hook != NULL &&
32261 + cluster_is_complete(clust, inode)) {
32262 + result = mplug->discard_hook(inode,
32263 + clust->index);
32264 + if (result)
32265 + warning("edward-1427",
32266 + "discard_hook failed with ret=%d",
32267 + result);
32268 + }
32269 + }
32270 + }
32271 + cipher:
32272 + if (need_cipher(inode)) {
32273 + cipher_plugin * ciplug;
32274 + struct blkcipher_desc desc;
32275 + struct scatterlist src;
32276 + struct scatterlist dst;
32277 +
32278 + ciplug = inode_cipher_plugin(inode);
32279 + desc.tfm = info_get_cipher(inode_crypto_stat(inode));
32280 + desc.flags = 0;
32281 + if (compressed)
32282 + alternate_streams(tc);
32283 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32284 + if (result)
32285 + return result;
32286 +
32287 + align_or_cut_overhead(inode, clust, WRITE_OP);
32288 + src.page = virt_to_page(tfm_input_data(clust));
32289 + src.offset = offset_in_page(tfm_input_data(clust));
32290 + src.length = tc->len;
32291 +
32292 + dst.page = virt_to_page(tfm_output_data(clust));
32293 + dst.offset = offset_in_page(tfm_output_data(clust));
32294 + dst.length = tc->len;
32295 +
32296 + result = crypto_blkcipher_encrypt(&desc, &dst, &src, tc->len);
32297 + if (result) {
32298 + warning("edward-1405",
32299 + "encryption failed flags=%x\n", desc.flags);
32300 + return result;
32301 + }
32302 + encrypted = 1;
32303 + }
32304 + if (compressed && coplug->checksum != NULL)
32305 + dc_set_checksum(coplug, tc);
32306 + if (!compressed && !encrypted)
32307 + alternate_streams(tc);
32308 + return result;
32309 +}
32310 +
32311 +/* Common inflate manager. */
32312 +int reiser4_inflate_cluster(reiser4_cluster_t * clust, struct inode * inode)
32313 +{
32314 + int result = 0;
32315 + int transformed = 0;
32316 + tfm_cluster_t * tc = &clust->tc;
32317 + compression_plugin * coplug;
32318 +
32319 + assert("edward-905", inode != NULL);
32320 + assert("edward-1178", clust->dstat == PREP_DISK_CLUSTER);
32321 + assert("edward-906", tfm_stream_is_set(&clust->tc, INPUT_STREAM));
32322 + assert("edward-1349", tc->act == TFMA_READ);
32323 + assert("edward-907", !tfm_cluster_is_uptodate(tc));
32324 +
32325 + /* Handle a checksum (if any) */
32326 + coplug = inode_compression_plugin(inode);
32327 + if (need_inflate(clust, inode, need_cipher(inode)) &&
32328 + coplug->checksum != NULL) {
32329 + result = dc_check_checksum(coplug, tc);
32330 + if (unlikely(result)) {
32331 + warning("edward-1460",
32332 + "Inode %llu: disk cluster %lu looks corrupted",
32333 + (unsigned long long)get_inode_oid(inode),
32334 + clust->index);
32335 + return RETERR(-EIO);
32336 + }
32337 + }
32338 + if (need_cipher(inode)) {
32339 + cipher_plugin * ciplug;
32340 + struct blkcipher_desc desc;
32341 + struct scatterlist src;
32342 + struct scatterlist dst;
32343 +
32344 + ciplug = inode_cipher_plugin(inode);
32345 + desc.tfm = info_get_cipher(inode_crypto_stat(inode));
32346 + desc.flags = 0;
32347 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32348 + if (result)
32349 + return result;
32350 + assert("edward-909", tfm_cluster_is_set(tc));
32351 +
32352 + src.page = virt_to_page(tfm_input_data(clust));
32353 + src.offset = offset_in_page(tfm_input_data(clust));
32354 + src.length = tc->len;
32355 +
32356 + dst.page = virt_to_page(tfm_output_data(clust));
32357 + dst.offset = offset_in_page(tfm_output_data(clust));
32358 + dst.length = tc->len;
32359 +
32360 + result = crypto_blkcipher_decrypt(&desc, &dst, &src, tc->len);
32361 + if (result) {
32362 + warning("edward-1600", "decrypt failed flags=%x\n",
32363 + desc.flags);
32364 + return result;
32365 + }
32366 + align_or_cut_overhead(inode, clust, READ_OP);
32367 + transformed = 1;
32368 + }
32369 + if (need_inflate(clust, inode, 0)) {
32370 + unsigned dst_len = inode_cluster_size(inode);
32371 + if(transformed)
32372 + alternate_streams(tc);
32373 +
32374 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
32375 + if (result)
32376 + return result;
32377 + assert("edward-1305", coplug->decompress != NULL);
32378 + assert("edward-910", tfm_cluster_is_set(tc));
32379 +
32380 + coplug->decompress(get_coa(tc, coplug->h.id, tc->act),
32381 + tfm_input_data(clust), tc->len,
32382 + tfm_output_data(clust), &dst_len);
32383 + /* check length */
32384 + tc->len = dst_len;
32385 + assert("edward-157", dst_len == tc->lsize);
32386 + transformed = 1;
32387 + }
32388 + if (!transformed)
32389 + alternate_streams(tc);
32390 + return result;
32391 +}
32392 +
32393 +/* This is the implementation of the readpage method of struct
32394 + address_space_operations for the cryptcompress plugin. */
32395 +int readpage_cryptcompress(struct file *file, struct page *page)
32396 +{
32397 + reiser4_context *ctx;
32398 + reiser4_cluster_t clust;
32399 + item_plugin *iplug;
32400 + int result;
32401 +
32402 + assert("edward-88", PageLocked(page));
32403 + assert("vs-976", !PageUptodate(page));
32404 + assert("edward-89", page->mapping && page->mapping->host);
32405 +
32406 + ctx = reiser4_init_context(page->mapping->host->i_sb);
32407 + if (IS_ERR(ctx)) {
32408 + unlock_page(page);
32409 + return PTR_ERR(ctx);
32410 + }
32411 + assert("edward-113",
32412 + ergo(file != NULL,
32413 + page->mapping == file->f_dentry->d_inode->i_mapping));
32414 +
32415 + if (PageUptodate(page)) {
32416 + warning("edward-1338", "page is already uptodate\n");
32417 + unlock_page(page);
32418 + reiser4_exit_context(ctx);
32419 + return 0;
32420 + }
32421 + cluster_init_read(&clust, NULL);
32422 + clust.file = file;
32423 + iplug = item_plugin_by_id(CTAIL_ID);
32424 + if (!iplug->s.file.readpage) {
32425 + unlock_page(page);
32426 + put_cluster_handle(&clust);
32427 + reiser4_exit_context(ctx);
32428 + return -EINVAL;
32429 + }
32430 + result = iplug->s.file.readpage(&clust, page);
32431 +
32432 + assert("edward-1459", !PageLocked(page));
32433 + assert("edward-64", ergo(result == 0, PageUptodate(page)));
32434 + put_cluster_handle(&clust);
32435 + reiser4_exit_context(ctx);
32436 + return result;
32437 +}
32438 +
32439 +/* how many pages will be captured */
32440 +static int cluster_nrpages_to_capture(reiser4_cluster_t * clust)
32441 +{
32442 + switch (clust->op) {
32443 + case PCL_APPEND:
32444 + return clust->nr_pages;
32445 + case PCL_TRUNCATE:
32446 + assert("edward-1179", clust->win != NULL);
32447 + return count_to_nrpages(clust->win->off + clust->win->count);
32448 + default:
32449 + impossible("edward-1180", "bad page cluster option");
32450 + return 0;
32451 + }
32452 +}
32453 +
32454 +static void set_cluster_pages_dirty(reiser4_cluster_t * clust)
32455 +{
32456 + int i;
32457 + struct page *pg;
32458 + int nrpages = cluster_nrpages_to_capture(clust);
32459 +
32460 + for (i = 0; i < nrpages; i++) {
32461 +
32462 + pg = clust->pages[i];
32463 + assert("edward-968", pg != NULL);
32464 + lock_page(pg);
32465 + assert("edward-1065", PageUptodate(pg));
32466 + reiser4_set_page_dirty_internal(pg);
32467 + unlock_page(pg);
32468 + mark_page_accessed(pg);
32469 + }
32470 +}
32471 +
32472 +static void clear_cluster_pages_dirty(reiser4_cluster_t * clust)
32473 +{
32474 + int i;
32475 + assert("edward-1275", clust != NULL);
32476 +
32477 + for (i = 0; i < clust->nr_pages; i++) {
32478 + assert("edward-1276", clust->pages[i] != NULL);
32479 +
32480 + lock_page(clust->pages[i]);
32481 + if (PageDirty(clust->pages[i])) {
32482 + assert("edward-1277", PageUptodate(clust->pages[i]));
32483 + cancel_dirty_page(clust->pages[i], PAGE_CACHE_SIZE);
32484 + }
32485 +#if REISER4_DEBUG
32486 + else
32487 + /* Race between flush and write:
32488 + some pages became clean while write() (or another
32489 + process which modifies data) was capturing the cluster. */
32490 + warning("edward-985", "Page of index %lu (inode %llu)"
32491 + " is not dirty\n", clust->pages[i]->index,
32492 + (unsigned long long)get_inode_oid(clust->
32493 + pages[i]->
32494 + mapping->
32495 + host));
32496 +#endif
32497 + unlock_page(clust->pages[i]);
32498 + }
32499 +}
32500 +
32501 +/* update i_size by window */
32502 +static void inode_set_new_size(reiser4_cluster_t * clust, struct inode *inode)
32503 +{
32504 + loff_t size;
32505 + reiser4_slide_t *win;
32506 +
32507 + assert("edward-1181", clust != NULL);
32508 + assert("edward-1182", inode != NULL);
32509 +
32510 + win = clust->win;
32511 + assert("edward-1183", win != NULL);
32512 + assert("edward-1183", win->count != 0);
32513 +
32514 + size = clust_to_off(clust->index, inode) + win->off;
32515 +
32516 + switch (clust->op) {
32517 + case PCL_APPEND:
32518 + if (size + win->count <= inode->i_size)
32519 + /* overwrite only */
32520 + return;
32521 + size += win->count;
32522 + break;
32523 + case PCL_TRUNCATE:
32524 + break;
32525 + default:
32526 + impossible("edward-1184", "bad page cluster option");
32527 + break;
32528 + }
32529 + inode_check_scale_nolock(inode, inode->i_size, size);
32530 + inode->i_size = size;
32531 + return;
32532 +}
32533 +
32534 +/* Check in page cluster modifications.
32535 + . Make jnode dirty, if it wasn't;
32536 + . Reserve space for a disk cluster update by flush algorithm, if needed;
32537 + . Clean up old references (if any).
32538 + . Put pages (grabbed in this thread) which will be truncated
32539 +*/
32540 +static void
32541 +make_cluster_jnode_dirty_locked(reiser4_cluster_t * clust, jnode * node,
32542 + loff_t * old_isize, struct inode *inode)
32543 +{
32544 + int i;
32545 + int old_nrpages;
32546 + int new_nrpages = cluster_nrpages_to_capture(clust);
32547 +
32548 + assert("edward-973", new_nrpages > 0);
32549 + assert("edward-221", node != NULL);
32550 + assert("edward-971", clust->reserved == 1);
32551 + assert_spin_locked(&(node->guard));
32552 + assert("edward-972", node->page_count <= cluster_nrpages(inode));
32553 + assert("edward-1263",
32554 + clust->reserved_prepped == estimate_update_cluster(inode));
32555 + assert("edward-1264", clust->reserved_unprepped == 0);
32556 +
32557 + if (JF_ISSET(node, JNODE_DIRTY)) {
32558 + /* someone has modified this cluster, but
32559 + the modifications are not committed yet */
32560 + old_nrpages =
32561 + count_to_nrpages(cnt_to_clcnt(*old_isize,
32562 + clust->index, inode));
32563 + /* free space which is already reserved */
32564 + free_reserved4cluster(inode, clust,
32565 + estimate_update_cluster(inode));
32566 + /* put old references */
32567 + for (i = 0; i < old_nrpages; i++) {
32568 + assert("edward-975", clust->pages[i]);
32569 + assert("edward-1185", PageUptodate(clust->pages[i]));
32570 +
32571 + page_cache_release(clust->pages[i]);
32572 +#if REISER4_DEBUG
32573 + cryptcompress_inode_data(inode)->pgcount --;
32574 +#endif
32575 + }
32576 + } else {
32577 + /* no captured pages */
32578 + assert("edward-1043", node->page_count == 0);
32579 + jnode_make_dirty_locked(node);
32580 + clust->reserved = 0;
32581 + }
32582 + /* put pages that will be truncated (if any) */
32583 + for (i = new_nrpages; i < clust->nr_pages; i++) {
32584 + assert("edward-1433", clust->pages[i]);
32585 + assert("edward-1434", PageUptodate(clust->pages[i]));
32586 + page_cache_release(clust->pages[i]);
32587 +#if REISER4_DEBUG
32588 + cryptcompress_inode_data(inode)->pgcount --;
32589 +#endif
32590 + }
32591 +#if REISER4_DEBUG
32592 + clust->reserved_prepped -= estimate_update_cluster(inode);
32593 + node->page_count = new_nrpages;
32594 +#endif
32595 + return;
32596 +}
32597 +
32598 +/* This function spawns a transaction and
32599 + is called by any thread as a final step in page cluster modification.
32600 +*/
32601 +static int try_capture_cluster(reiser4_cluster_t * clust, struct inode *inode)
32602 +{
32603 + int result = 0;
32604 + loff_t old_size;
32605 + jnode *node;
32606 +
32607 + assert("edward-1029", clust != NULL);
32608 + assert("edward-1030", clust->reserved == 1);
32609 + assert("edward-1031", clust->nr_pages != 0);
32610 + assert("edward-1032", clust->pages != NULL);
32611 + assert("edward-1033", clust->pages[0] != NULL);
32612 +
32613 + node = jprivate(clust->pages[0]);
32614 + assert("edward-1035", node != NULL);
32615 + assert("edward-1446", jnode_is_cluster_page(node));
32616 +
32617 + spin_lock_jnode(node);
32618 +
32619 + old_size = inode->i_size;
32620 + if (clust->win)
32621 + inode_set_new_size(clust, inode);
32622 +
32623 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
32624 + if (result)
32625 + goto exit;
32626 + make_cluster_jnode_dirty_locked(clust, node, &old_size, inode);
32627 + exit:
32628 + spin_unlock_jnode(node);
32629 + jput(node);
32630 + return result;
32631 +}
32632 +
32633 +/* Collect unlocked cluster pages for any modifications and attach a jnode.
32634 + We allocate only one jnode per cluster; this jnode is bound to the first
32635 + page of the cluster, so we hold an extra reference that lives as long as
32636 + the jnode, while the other references are cleaned up at flush time.
32637 +*/
32638 +static int
32639 +grab_cluster_pages_jnode(struct inode *inode, reiser4_cluster_t * clust)
32640 +{
32641 + int i;
32642 + int result = 0;
32643 + jnode *node = NULL;
32644 +
32645 + assert("edward-182", clust != NULL);
32646 + assert("edward-183", clust->pages != NULL);
32647 + assert("edward-184", clust->nr_pages <= cluster_nrpages(inode));
32648 +
32649 + if (clust->nr_pages == 0)
32650 + return 0;
32651 +
32652 + for (i = 0; i < clust->nr_pages; i++) {
32653 +
32654 + assert("edward-1044", clust->pages[i] == NULL);
32655 +
32656 + clust->pages[i] =
32657 + find_or_create_page(inode->i_mapping,
32658 + clust_to_pg(clust->index, inode) + i,
32659 + reiser4_ctx_gfp_mask_get());
32660 + if (!clust->pages[i]) {
32661 + result = RETERR(-ENOMEM);
32662 + break;
32663 + }
32664 + if (i == 0) {
32665 + node = jnode_of_page(clust->pages[i]);
32666 + if (IS_ERR(node)) {
32667 + result = PTR_ERR(node);
32668 + unlock_page(clust->pages[i]);
32669 + break;
32670 + }
32671 + JF_SET(node, JNODE_CLUSTER_PAGE);
32672 + unlock_page(clust->pages[i]);
32673 + assert("edward-919", node);
32674 + continue;
32675 + }
32676 + unlock_page(clust->pages[i]);
32677 + }
32678 + if (result) {
32679 + while (i)
32680 + page_cache_release(clust->pages[--i]);
32681 + if (node && !IS_ERR(node))
32682 + jput(node);
32683 + return result;
32684 + }
32685 + assert("edward-920", jprivate(clust->pages[0]));
32686 +#if REISER4_DEBUG
32687 + cryptcompress_inode_data(inode)->pgcount += clust->nr_pages;
32688 +#endif
32689 + return 0;
32690 +}
32691 +
32692 +/* Collect unlocked cluster pages for read only (they will not be modified) */
32693 +int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
32694 +{
32695 + int i;
32696 + int result = 0;
32697 +
32698 + assert("edward-1428", inode != NULL);
32699 + assert("edward-1429", inode->i_mapping != NULL);
32700 + assert("edward-787", clust != NULL);
32701 + assert("edward-788", clust->pages != NULL);
32702 + assert("edward-789", clust->nr_pages != 0);
32703 + assert("edward-790", clust->nr_pages <= cluster_nrpages(inode));
32704 +
32705 + for (i = 0; i < clust->nr_pages; i++) {
32706 + clust->pages[i] =
32707 + find_or_create_page(inode->i_mapping,
32708 + clust_to_pg(clust->index, inode) + i,
32709 + reiser4_ctx_gfp_mask_get());
32710 + if (!clust->pages[i]) {
32711 + result = RETERR(-ENOMEM);
32712 + break;
32713 + }
32714 + unlock_page(clust->pages[i]);
32715 + }
32716 + if (result)
32717 + while (i)
32718 + page_cache_release(clust->pages[--i]);
32719 + return result;
32720 +}
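+
+/* A stand-alone C illustration of the unwind pattern used by the two
+   grabbers above: take N resources and, on the first failure, release
+   the ones already taken, in reverse order. malloc() is only a stand-in
+   for find_or_create_page(); nothing here is the plugin's real API. */
+#include <stdlib.h>
+
+static int grab_all(void **slots, int n, size_t size)
+{
+	int i;
+	for (i = 0; i < n; i++) {
+		slots[i] = malloc(size);
+		if (!slots[i]) {
+			while (i)	/* release what we already took */
+				free(slots[--i]);
+			return -1;	/* -ENOMEM analogue */
+		}
+	}
+	return 0;
+}
+
+int main(void)
+{
+	void *slots[4];
+	if (grab_all(slots, 4, 4096) == 0) {
+		int i;
+		for (i = 4; i > 0; )
+			free(slots[--i]);
+	}
+	return 0;
+}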
32721 +
32722 +/* @node might be attached by reiser4_writepage(), not by
32723 + cryptcompress plugin code, but emergency flush should
32724 + understand that pages of cryptcompress files are not
32725 + flushable.
32726 +*/
32727 +#if 0
32728 +int jnode_of_cluster(const jnode * node, struct page * page)
32729 +{
32730 + assert("edward-1339", node != NULL);
32731 + assert("edward-1340", page != NULL);
32732 + assert("edward-1341", page->mapping != NULL);
32733 + assert("edward-1342", page->mapping->host != NULL);
32734 + assert("edward-1343",
32735 + ergo(jnode_is_unformatted(node),
32736 + get_inode_oid(page->mapping->host) ==
32737 + node->key.j.objectid));
32738 + if (inode_file_plugin(page->mapping->host) ==
32739 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID)) {
32740 +#if REISER4_DEBUG
32741 + if (!jnode_is_cluster_page(node))
32742 + warning("edward-1345",
32743 + "inode %llu: cluster page of index %lu became private",
32744 + (unsigned long long)get_inode_oid(page->mapping->host),
32745 + page->index);
32746 +#endif
32747 + return 1;
32748 + }
32749 + return 0;
32750 +}
32751 +#endif /* 0 */
32752 +
32753 +/* put cluster pages */
32754 +void reiser4_release_cluster_pages(reiser4_cluster_t * clust)
32755 +{
32756 + int i;
32757 +
32758 + assert("edward-447", clust != NULL);
32759 + for (i = 0; i < clust->nr_pages; i++) {
32760 +
32761 + assert("edward-449", clust->pages[i] != NULL);
32762 +
32763 + page_cache_release(clust->pages[i]);
32764 + }
32765 +}
32766 +
32767 +/* this is called when something has failed */
32768 +static void reiser4_release_cluster_pages_and_jnode(reiser4_cluster_t * clust)
32769 +{
32770 + jnode *node;
32771 +
32772 + assert("edward-445", clust != NULL);
32773 + assert("edward-922", clust->pages != NULL);
32774 + assert("edward-446", clust->pages[0] != NULL);
32775 +
32776 + node = jprivate(clust->pages[0]);
32777 +
32778 + assert("edward-447", node != NULL);
32779 +
32780 + reiser4_release_cluster_pages(clust);
32781 + jput(node);
32782 +}
32783 +
32784 +#if REISER4_DEBUG
32785 +static int window_ok(reiser4_slide_t * win, struct inode *inode)
32786 +{
32787 + assert("edward-1115", win != NULL);
32788 + assert("edward-1116", ergo(win->delta, win->stat == HOLE_WINDOW));
32789 +
32790 + return (win->off != inode_cluster_size(inode)) &&
32791 + (win->off + win->count + win->delta <= inode_cluster_size(inode));
32792 +}
32793 +
32794 +static int cluster_ok(reiser4_cluster_t * clust, struct inode *inode)
32795 +{
32796 + assert("edward-279", clust != NULL);
32797 +
32798 + if (!clust->pages)
32799 + return 0;
32800 + return (clust->win ? window_ok(clust->win, inode) : 1);
32801 +}
32802 +#endif
32803 +
32804 +/* guess next window stat */
32805 +static inline window_stat next_window_stat(reiser4_slide_t * win)
32806 +{
32807 + assert("edward-1130", win != NULL);
32808 + return ((win->stat == HOLE_WINDOW && win->delta == 0) ?
32809 + HOLE_WINDOW : DATA_WINDOW);
32810 +}
32811 +
32812 +/* guess next cluster index and window params */
32813 +static void
32814 +update_cluster(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
32815 + loff_t to_file)
32816 +{
32817 + reiser4_slide_t *win;
32818 +
32819 + assert("edward-185", clust != NULL);
32820 + assert("edward-438", clust->pages != NULL);
32821 + assert("edward-281", cluster_ok(clust, inode));
32822 +
32823 + win = clust->win;
32824 + if (!win)
32825 + return;
32826 +
32827 + switch (win->stat) {
32828 + case DATA_WINDOW:
32829 + /* increment window position */
32830 + clust->index++;
32831 + win->stat = DATA_WINDOW;
32832 + win->off = 0;
32833 + win->count = min_count(inode_cluster_size(inode), to_file);
32834 + break;
32835 + case HOLE_WINDOW:
32836 + switch (next_window_stat(win)) {
32837 + case HOLE_WINDOW:
32838 + /* set window to fit the offset we start write from */
32839 + clust->index = off_to_clust(file_off, inode);
32840 + win->stat = HOLE_WINDOW;
32841 + win->off = 0;
32842 + win->count = off_to_cloff(file_off, inode);
32843 + win->delta =
32844 + min_count(inode_cluster_size(inode) - win->count,
32845 + to_file);
32846 + break;
32847 + case DATA_WINDOW:
32848 +			/* do not move the window, just change its state;
32849 +			   off + count + delta stays invariant */
32850 + win->stat = DATA_WINDOW;
32851 + win->off = win->off + win->count;
32852 + win->count = win->delta;
32853 + win->delta = 0;
32854 + break;
32855 + default:
32856 + impossible("edward-282", "wrong next window state");
32857 + }
32858 + break;
32859 + default:
32860 + impossible("edward-283", "wrong current window state");
32861 + }
32862 + assert("edward-1068", cluster_ok(clust, inode));
32863 +}
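+
+/* A stand-alone C model of the window transitions implemented by
+   update_cluster() above. The types are simplified stand-ins, not the
+   plugin's real definitions. It shows the HOLE -> DATA transition inside
+   one cluster, which keeps off + count + delta constant. */
+#include <assert.h>
+#include <stddef.h>
+
+enum wstat { DATA_WINDOW, HOLE_WINDOW };
+
+struct slide {
+	enum wstat stat;
+	size_t off;	/* offset of the window within the cluster */
+	size_t count;	/* bytes covered by the window */
+	size_t delta;	/* user data glued right after a hole */
+};
+
+/* mirrors next_window_stat(): a hole is followed by another hole
+   only when no user data is glued to it */
+static enum wstat next_stat(const struct slide *w)
+{
+	return (w->stat == HOLE_WINDOW && w->delta == 0) ?
+		HOLE_WINDOW : DATA_WINDOW;
+}
+
+/* the HOLE -> DATA transition within one cluster */
+static void hole_to_data(struct slide *w)
+{
+	assert(w->stat == HOLE_WINDOW && w->delta != 0);
+	w->stat = DATA_WINDOW;
+	w->off += w->count;
+	w->count = w->delta;
+	w->delta = 0;
+}
+
+int main(void)
+{
+	struct slide w = { HOLE_WINDOW, 0, 4096, 512 };
+	assert(next_stat(&w) == DATA_WINDOW);
+	hole_to_data(&w);
+	assert(w.off == 4096 && w.count == 512 && w.delta == 0);
+	return 0;
+}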
32864 +
32865 +static int update_sd_cryptcompress(struct inode *inode)
32866 +{
32867 + int result = 0;
32868 +
32869 + assert("edward-978", reiser4_schedulable());
32870 +
32871 + result = reiser4_grab_space_force( /* one for stat data update */
32872 + estimate_update_common(inode),
32873 + BA_CAN_COMMIT);
32874 + if (result)
32875 + return result;
32876 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
32877 + result = reiser4_update_sd(inode);
32878 +
32879 + return result;
32880 +}
32881 +
32882 +/* NOTE-Edward: this is too similar to reiser4/txnmgr.c:uncapture_jnode() */
32883 +static void uncapture_cluster_jnode(jnode * node)
32884 +{
32885 + txn_atom *atom;
32886 +
32887 + assert_spin_locked(&(node->guard));
32888 +
32889 + /*jnode_make_clean(node); */
32890 + atom = jnode_get_atom(node);
32891 + if (atom == NULL) {
32892 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
32893 + spin_unlock_jnode(node);
32894 + return;
32895 + }
32896 +
32897 + reiser4_uncapture_block(node);
32898 + spin_unlock_atom(atom);
32899 + jput(node);
32900 +}
32901 +
32902 +static void forget_cluster_pages(struct page **pages, int nr)
32903 +{
32904 + int i;
32905 + for (i = 0; i < nr; i++) {
32906 +
32907 + assert("edward-1045", pages[i] != NULL);
32908 + page_cache_release(pages[i]);
32909 + }
32910 +}
32911 +
32912 +/* Check out the last modifications we are about to commit,
32913 +   and prepare the input stream for transform operations.
32914 +*/
32915 +int
32916 +flush_cluster_pages(reiser4_cluster_t * clust, jnode * node,
32917 + struct inode *inode)
32918 +{
32919 + int result = 0;
32920 + int i;
32921 + int nr_pages = 0;
32922 + tfm_cluster_t *tc = &clust->tc;
32923 +#if REISER4_DEBUG
32924 + int node_pgcount;
32925 +#endif
32926 + assert("edward-980", node != NULL);
32927 + assert("edward-236", inode != NULL);
32928 + assert("edward-237", clust != NULL);
32929 + assert("edward-240", !clust->win);
32930 + assert("edward-241", reiser4_schedulable());
32931 + assert("edward-718", cryptcompress_inode_ok(inode));
32932 +
32933 + result = grab_tfm_stream(inode, tc, INPUT_STREAM);
32934 + if (result) {
32935 + warning("edward-1430",
32936 + "alloc stream failed with ret=%d", result);
32937 + return result;
32938 + }
32939 + spin_lock_jnode(node);
32940 +#if REISER4_DEBUG
32941 + node_pgcount = node->page_count;
32942 +#endif
32943 + if (!JF_ISSET(node, JNODE_DIRTY)) {
32944 + /* race with another flush */
32945 +#if REISER4_DEBUG
32946 + assert("edward-981", node_pgcount == 0);
32947 + warning("edward-982", "flush_cluster_pages: jnode is not dirty "
32948 + "clust %lu, inode %llu\n",
32949 + clust->index, (unsigned long long)get_inode_oid(inode));
32950 +#endif
32951 + spin_unlock_jnode(node);
32952 + return RETERR(-E_REPEAT);
32953 + }
32954 +	/* Check out the size of the logical cluster and
32955 +	   set the number of cluster pages to commit. */
32956 + tc->len = tc->lsize = fsize_to_count(clust, inode);
32957 + clust->nr_pages = count_to_nrpages(tc->len);
32958 +
32959 +#if REISER4_DEBUG
32960 + node->page_count = 0;
32961 +#endif
32962 + cluster_reserved2grabbed(estimate_update_cluster(inode));
32963 + uncapture_cluster_jnode(node);
32964 +
32965 + assert("edward-1224", reiser4_schedulable());
32966 + /* Check out page cluster for commit */
32967 + nr_pages =
32968 + find_get_pages(inode->i_mapping, clust_to_pg(clust->index, inode),
32969 + clust->nr_pages, clust->pages);
32970 + if (nr_pages != clust->nr_pages)
32971 + goto checkout_failed;
32972 +
32973 + /* Try to construct input stream from the checked out pages */
32974 + for (i = 0; i < clust->nr_pages; i++) {
32975 + char *data;
32976 +
32977 + assert("edward-242", clust->pages[i] != NULL);
32978 + if (clust->pages[i]->index !=
32979 + clust_to_pg(clust->index, inode) + i)
32980 + goto checkout_failed;
32981 + BUG_ON(!PageUptodate(clust->pages[i]));
32982 +
32983 + /* flush the page into input transform stream */
32984 + lock_page(clust->pages[i]);
32985 + data = kmap(clust->pages[i]);
32986 +
32987 + assert("edward-986", cnt_to_pgcnt(tc->len, i) != 0);
32988 +
32989 + memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
32990 + data, cnt_to_pgcnt(tc->len, i));
32991 + kunmap(clust->pages[i]);
32992 + unlock_page(clust->pages[i]);
32993 + }
32994 + /* page cluster flushed successfully */
32995 +
32996 + clear_cluster_pages_dirty(clust);
32997 + reiser4_release_cluster_pages(clust);
32998 +#if REISER4_DEBUG
32999 + cryptcompress_inode_data(inode)->pgcount -= clust->nr_pages;
33000 +#endif
33001 + goto out;
33002 + checkout_failed:
33003 +#if REISER4_DEBUG
33004 + assert("edward-1282", node_pgcount == 0);
33005 + warning("edward-1435", "Inode %llu : checkout page cluster"
33006 + "of index %lu failed\n",
33007 + (unsigned long long)get_inode_oid(inode), clust->index);
33008 +#endif /* REISER4_DEBUG */
33009 + result = RETERR(-E_REPEAT);
33010 + out:
33011 + /* put pages that were found here */
33012 + forget_cluster_pages(clust->pages, nr_pages);
33013 + return result;
33014 +}
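+
+/* A stand-alone C sketch of the stream assembly performed above:
+   gathering len bytes spread over page-sized chunks into one contiguous
+   input buffer. bytes_in_page() plays the role of cnt_to_pgcnt(), and
+   PAGE_SZ is an assumed stand-in for PAGE_CACHE_SIZE. */
+#include <assert.h>
+#include <string.h>
+
+#define PAGE_SZ 4096
+
+static unsigned bytes_in_page(unsigned len, unsigned i)
+{
+	unsigned start = i * PAGE_SZ;	/* caller guarantees start < len */
+	return len - start < PAGE_SZ ? len - start : PAGE_SZ;
+}
+
+static void gather(char *stream, char pages[][PAGE_SZ],
+		   unsigned nr_pages, unsigned len)
+{
+	unsigned i;
+	for (i = 0; i < nr_pages; i++)
+		memcpy(stream + i * PAGE_SZ, pages[i],
+		       bytes_in_page(len, i));
+}
+
+int main(void)
+{
+	static char pages[2][PAGE_SZ], stream[2 * PAGE_SZ];
+	memset(pages, 'x', sizeof(pages));
+	gather(stream, pages, 2, PAGE_SZ + 10);	/* last page is partial */
+	assert(stream[PAGE_SZ + 9] == 'x' && stream[PAGE_SZ + 10] == 0);
+	return 0;
+}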
33015 +
33016 +/* set hint for the cluster of index @index */
33017 +static void set_hint_cluster(struct inode *inode, hint_t * hint,
33018 + cloff_t index, znode_lock_mode mode)
33019 +{
33020 + reiser4_key key;
33021 + assert("edward-722", cryptcompress_inode_ok(inode));
33022 + assert("edward-723",
33023 + inode_file_plugin(inode) ==
33024 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
33025 +
33026 + inode_file_plugin(inode)->key_by_inode(inode,
33027 + clust_to_off(index, inode),
33028 + &key);
33029 +
33030 + reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, &key);
33031 + hint->offset = get_key_offset(&key);
33032 + hint->mode = mode;
33033 +}
33034 +
33035 +void invalidate_hint_cluster(reiser4_cluster_t * clust)
33036 +{
33037 + assert("edward-1291", clust != NULL);
33038 + assert("edward-1292", clust->hint != NULL);
33039 +
33040 + done_lh(&clust->hint->lh);
33041 + hint_clr_valid(clust->hint);
33042 +}
33043 +
33044 +void put_hint_cluster(reiser4_cluster_t * clust, struct inode *inode,
33045 + znode_lock_mode mode)
33046 +{
33047 + assert("edward-1286", clust != NULL);
33048 + assert("edward-1287", clust->hint != NULL);
33049 +
33050 + set_hint_cluster(inode, clust->hint, clust->index + 1, mode);
33051 + invalidate_hint_cluster(clust);
33052 +}
33053 +
33054 +static int
33055 +balance_dirty_page_cluster(reiser4_cluster_t * clust, struct inode *inode,
33056 + loff_t off, loff_t to_file)
33057 +{
33058 + int result;
33059 +
33060 + assert("edward-724", inode != NULL);
33061 + assert("edward-725", cryptcompress_inode_ok(inode));
33062 +
33063 + /* set next window params */
33064 + update_cluster(inode, clust, off, to_file);
33065 +
33066 + result = update_sd_cryptcompress(inode);
33067 + if (result)
33068 + return result;
33069 + assert("edward-726", clust->hint->lh.owner == NULL);
33070 +
33071 + reiser4_throttle_write(inode);
33072 + return 0;
33073 +}
33074 +
33075 +/* write zeroes to the cluster, update it, and maybe try to capture its pages */
33076 +static int
33077 +write_hole(struct inode *inode, reiser4_cluster_t * clust, loff_t file_off,
33078 + loff_t to_file)
33079 +{
33080 + char *data;
33081 + int result = 0;
33082 + unsigned cl_off, cl_count = 0;
33083 + unsigned to_pg, pg_off;
33084 + reiser4_slide_t *win;
33085 +
33086 + assert("edward-190", clust != NULL);
33087 + assert("edward-1069", clust->win != NULL);
33088 + assert("edward-191", inode != NULL);
33089 + assert("edward-727", cryptcompress_inode_ok(inode));
33090 + assert("edward-1171", clust->dstat != INVAL_DISK_CLUSTER);
33091 + assert("edward-1154",
33092 + ergo(clust->dstat != FAKE_DISK_CLUSTER, clust->reserved == 1));
33093 +
33094 + win = clust->win;
33095 +
33096 + assert("edward-1070", win != NULL);
33097 + assert("edward-201", win->stat == HOLE_WINDOW);
33098 + assert("edward-192", cluster_ok(clust, inode));
33099 +
33100 + if (win->off == 0 && win->count == inode_cluster_size(inode)) {
33101 + /* the hole will be represented by fake disk cluster */
33102 + update_cluster(inode, clust, file_off, to_file);
33103 + return 0;
33104 + }
33105 + cl_count = win->count; /* number of zeroes to write */
33106 + cl_off = win->off;
33107 + pg_off = off_to_pgoff(win->off);
33108 +
33109 + while (cl_count) {
33110 + struct page *page;
33111 + page = clust->pages[off_to_pg(cl_off)];
33112 +
33113 + assert("edward-284", page != NULL);
33114 +
33115 + to_pg = min_count(PAGE_CACHE_SIZE - pg_off, cl_count);
33116 + lock_page(page);
33117 + data = kmap_atomic(page, KM_USER0);
33118 + memset(data + pg_off, 0, to_pg);
33119 + flush_dcache_page(page);
33120 + kunmap_atomic(data, KM_USER0);
33121 + SetPageUptodate(page);
33122 + unlock_page(page);
33123 +
33124 + cl_off += to_pg;
33125 + cl_count -= to_pg;
33126 + pg_off = 0;
33127 + }
33128 + if (!win->delta) {
33129 + /* only zeroes, try to capture */
33130 +
33131 + set_cluster_pages_dirty(clust);
33132 + result = try_capture_cluster(clust, inode);
33133 + if (result)
33134 + return result;
33135 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
33136 + result =
33137 + balance_dirty_page_cluster(clust, inode, file_off, to_file);
33138 + } else
33139 + update_cluster(inode, clust, file_off, to_file);
33140 + return result;
33141 +}
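+
+/* A stand-alone C sketch of the zero-filling loop in write_hole():
+   clearing count bytes starting at an arbitrary in-page offset, page by
+   page. PAGE_SZ is an assumed stand-in for PAGE_CACHE_SIZE. */
+#include <assert.h>
+#include <string.h>
+
+#define PAGE_SZ 4096
+
+static void zero_range(char pages[][PAGE_SZ], unsigned off, unsigned count)
+{
+	unsigned pg_off = off % PAGE_SZ;	/* offset inside first page */
+	unsigned i = off / PAGE_SZ;
+
+	while (count) {
+		/* bytes to clear in the current page */
+		unsigned to_pg = PAGE_SZ - pg_off < count ?
+			PAGE_SZ - pg_off : count;
+		memset(pages[i] + pg_off, 0, to_pg);
+		count -= to_pg;
+		pg_off = 0;	/* later pages are cleared from the start */
+		i++;
+	}
+}
+
+int main(void)
+{
+	static char pages[2][PAGE_SZ];
+	memset(pages, 0xff, sizeof(pages));
+	zero_range(pages, PAGE_SZ - 10, 20);	/* spans a page boundary */
+	assert(pages[0][PAGE_SZ - 1] == 0 && pages[1][9] == 0);
+	assert(pages[1][10] == (char)0xff);
+	return 0;
+}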
33142 +
33143 +/*
33144 +   The main disk search procedure for the cryptcompress plugin, which
33145 +   . scans all items of the disk cluster with lock mode @mode
33146 +   . reads each one, if @read is set
33147 +   . makes its znode dirty, if write lock mode was specified
33148 +
33149 + NOTE-EDWARD: Callers should handle the case when disk cluster
33150 + is incomplete (-EIO)
33151 +*/
33152 +int find_disk_cluster(reiser4_cluster_t * clust,
33153 + struct inode *inode, int read, znode_lock_mode mode)
33154 +{
33155 + flow_t f;
33156 + hint_t *hint;
33157 + int result = 0;
33158 + unsigned long cl_idx;
33159 + ra_info_t ra_info;
33160 + file_plugin *fplug;
33161 + item_plugin *iplug;
33162 + tfm_cluster_t *tc;
33163 + int was_grabbed;
33164 +
33165 + assert("edward-138", clust != NULL);
33166 + assert("edward-728", clust->hint != NULL);
33167 + assert("edward-226", reiser4_schedulable());
33168 + assert("edward-137", inode != NULL);
33169 + assert("edward-729", cryptcompress_inode_ok(inode));
33170 +
33171 + hint = clust->hint;
33172 + cl_idx = clust->index;
33173 + fplug = inode_file_plugin(inode);
33174 + was_grabbed = get_current_context()->grabbed_blocks;
33175 + tc = &clust->tc;
33176 +
33177 + assert("edward-462", !tfm_cluster_is_uptodate(tc));
33178 + assert("edward-461", ergo(read, tfm_stream_is_set(tc, INPUT_STREAM)));
33179 +
33180 + dclust_init_extension(hint);
33181 +
33182 + /* set key of the first disk cluster item */
33183 + fplug->flow_by_inode(inode,
33184 + (read ? (char __user *)tfm_stream_data(tc, INPUT_STREAM) : NULL),
33185 + 0 /* kernel space */ ,
33186 + inode_scaled_cluster_size(inode),
33187 + clust_to_off(cl_idx, inode), READ_OP, &f);
33188 + if (mode == ZNODE_WRITE_LOCK) {
33189 +		/* reserve space so that flush can make dirty all the leaf
33190 +		   nodes which contain the disk cluster */
33191 + result =
33192 + reiser4_grab_space_force(estimate_dirty_cluster(inode),
33193 + BA_CAN_COMMIT);
33194 + if (result)
33195 + goto out;
33196 + }
33197 +
33198 + ra_info.key_to_stop = f.key;
33199 + set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
33200 +
33201 + while (f.length) {
33202 + result = find_cluster_item(hint, &f.key, mode,
33203 + NULL, FIND_EXACT,
33204 + (mode == ZNODE_WRITE_LOCK ?
33205 + CBK_FOR_INSERT : 0));
33206 + switch (result) {
33207 + case CBK_COORD_NOTFOUND:
33208 + result = 0;
33209 + if (inode_scaled_offset
33210 + (inode,
33211 + clust_to_off(cl_idx,
33212 + inode)) == get_key_offset(&f.key)) {
33213 +				/* first item not found; this is treated
33214 +				   as the disk cluster being absent */
33215 + clust->dstat = FAKE_DISK_CLUSTER;
33216 + goto out;
33217 + }
33218 + /* we are outside the cluster, stop search here */
33219 + assert("edward-146",
33220 + f.length != inode_scaled_cluster_size(inode));
33221 + goto ok;
33222 + case CBK_COORD_FOUND:
33223 + assert("edward-148",
33224 + hint->ext_coord.coord.between == AT_UNIT);
33225 + assert("edward-460",
33226 + hint->ext_coord.coord.unit_pos == 0);
33227 +
33228 + coord_clear_iplug(&hint->ext_coord.coord);
33229 + result = zload_ra(hint->ext_coord.coord.node, &ra_info);
33230 + if (unlikely(result))
33231 + goto out;
33232 + iplug = item_plugin_by_coord(&hint->ext_coord.coord);
33233 + assert("edward-147",
33234 + item_id_by_coord(&hint->ext_coord.coord) ==
33235 + CTAIL_ID);
33236 +
33237 + result = iplug->s.file.read(NULL, &f, hint);
33238 + if (result) {
33239 + zrelse(hint->ext_coord.coord.node);
33240 + goto out;
33241 + }
33242 + if (mode == ZNODE_WRITE_LOCK) {
33243 +				/* Don't make dirty more nodes than was
33244 +				   estimated (see comments before
33245 +				   estimate_dirty_cluster). Missed nodes will be
33246 +				   read in at flush time if they have been
33247 +				   evicted from memory */
33248 + if (dclust_get_extension_ncount(hint) <=
33249 + estimate_dirty_cluster(inode))
33250 + znode_make_dirty(hint->ext_coord.coord.node);
33251 +
33252 + znode_set_convertible(hint->ext_coord.coord.
33253 + node);
33254 + }
33255 + zrelse(hint->ext_coord.coord.node);
33256 + break;
33257 + default:
33258 + goto out;
33259 + }
33260 + }
33261 + ok:
33262 + /* at least one item was found */
33263 + /* NOTE-EDWARD: Callers should handle the case
33264 + when disk cluster is incomplete (-EIO) */
33265 + tc->len = inode_scaled_cluster_size(inode) - f.length;
33266 + tc->lsize = fsize_to_count(clust, inode);
33267 + assert("edward-1196", tc->len > 0);
33268 + assert("edward-1406", tc->lsize > 0);
33269 +
33270 + if (hint_is_unprepped_dclust(clust->hint))
33271 + clust->dstat = UNPR_DISK_CLUSTER;
33272 + else {
33273 + dclust_set_extension_dsize(clust->hint, tc->len);
33274 + clust->dstat = PREP_DISK_CLUSTER;
33275 + }
33276 + out:
33277 + assert("edward-1339",
33278 + get_current_context()->grabbed_blocks >= was_grabbed);
33279 + grabbed2free(get_current_context(),
33280 + get_current_super_private(),
33281 + get_current_context()->grabbed_blocks - was_grabbed);
33282 + return result;
33283 +}
33284 +
33285 +int
33286 +get_disk_cluster_locked(reiser4_cluster_t * clust, struct inode *inode,
33287 + znode_lock_mode lock_mode)
33288 +{
33289 + reiser4_key key;
33290 + ra_info_t ra_info;
33291 +
33292 + assert("edward-730", reiser4_schedulable());
33293 + assert("edward-731", clust != NULL);
33294 + assert("edward-732", inode != NULL);
33295 +
33296 + if (hint_is_valid(clust->hint)) {
33297 + assert("edward-1293", clust->dstat != INVAL_DISK_CLUSTER);
33298 + assert("edward-1294",
33299 + znode_is_write_locked(clust->hint->lh.node));
33300 + /* already have a valid locked position */
33301 + return (clust->dstat ==
33302 + FAKE_DISK_CLUSTER ? CBK_COORD_NOTFOUND :
33303 + CBK_COORD_FOUND);
33304 + }
33305 + key_by_inode_cryptcompress(inode, clust_to_off(clust->index, inode),
33306 + &key);
33307 + ra_info.key_to_stop = key;
33308 + set_key_offset(&ra_info.key_to_stop, get_key_offset(reiser4_max_key()));
33309 +
33310 + return find_cluster_item(clust->hint, &key, lock_mode, NULL, FIND_EXACT,
33311 + CBK_FOR_INSERT);
33312 +}
33313 +
33314 +/* Read needed cluster pages before modifying.
33315 +   On success, @clust->hint contains a locked position in the tree.
33316 + Also:
33317 + . find and set disk cluster state
33318 + . make disk cluster dirty if its state is not FAKE_DISK_CLUSTER.
33319 +*/
33320 +static int
33321 +read_some_cluster_pages(struct inode *inode, reiser4_cluster_t * clust)
33322 +{
33323 + int i;
33324 + int result = 0;
33325 + item_plugin *iplug;
33326 + reiser4_slide_t *win = clust->win;
33327 + znode_lock_mode mode = ZNODE_WRITE_LOCK;
33328 +
33329 + iplug = item_plugin_by_id(CTAIL_ID);
33330 +
33331 + assert("edward-924", !tfm_cluster_is_uptodate(&clust->tc));
33332 +
33333 +#if REISER4_DEBUG
33334 + if (clust->nr_pages == 0) {
33335 + /* start write hole from fake disk cluster */
33336 + assert("edward-1117", win != NULL);
33337 + assert("edward-1118", win->stat == HOLE_WINDOW);
33338 + assert("edward-1119", new_cluster(clust, inode));
33339 + }
33340 +#endif
33341 + if (new_cluster(clust, inode)) {
33342 + /*
33343 +		   a new page cluster is about to be written; there is nothing to read
33344 + */
33345 + assert("edward-734", reiser4_schedulable());
33346 + assert("edward-735", clust->hint->lh.owner == NULL);
33347 +
33348 + if (clust->nr_pages) {
33349 + int off;
33350 + char *data;
33351 + struct page * pg;
33352 + assert("edward-1419", clust->pages != NULL);
33353 + pg = clust->pages[clust->nr_pages - 1];
33354 + assert("edward-1420", pg != NULL);
33355 + off = off_to_pgoff(win->off+win->count+win->delta);
33356 + if (off) {
33357 + lock_page(pg);
33358 + data = kmap_atomic(pg, KM_USER0);
33359 + memset(data + off, 0, PAGE_CACHE_SIZE - off);
33360 + flush_dcache_page(pg);
33361 + kunmap_atomic(data, KM_USER0);
33362 + unlock_page(pg);
33363 + }
33364 + }
33365 + clust->dstat = FAKE_DISK_CLUSTER;
33366 + return 0;
33367 + }
33368 + /*
33369 +	   Here we should search for the disk cluster to figure out its real state.
33370 +	   There is also one more important reason to do the disk search: we need
33371 +	   to make the disk cluster _dirty_ if it exists
33372 + */
33373 +
33374 +	/* if a window is specified, read only those pages
33375 +	   that will be partially modified */
33376 +
33377 + for (i = 0; i < clust->nr_pages; i++) {
33378 + struct page *pg = clust->pages[i];
33379 +
33380 + lock_page(pg);
33381 + if (PageUptodate(pg)) {
33382 + unlock_page(pg);
33383 + continue;
33384 + }
33385 + unlock_page(pg);
33386 +
33387 + if (win &&
33388 + i >= count_to_nrpages(win->off) &&
33389 + i < off_to_pg(win->off + win->count + win->delta))
33390 + /* page will be completely overwritten */
33391 + continue;
33392 +
33393 + if (win && (i == clust->nr_pages - 1) &&
33394 + /* the last page is
33395 + partially modified,
33396 + not uptodate .. */
33397 + (count_to_nrpages(inode->i_size) <= pg->index)) {
33398 + /* .. and appended,
33399 + so set zeroes to the rest */
33400 + char *data;
33401 + int offset;
33402 + lock_page(pg);
33403 + data = kmap_atomic(pg, KM_USER0);
33404 +
33405 + assert("edward-1260",
33406 + count_to_nrpages(win->off + win->count +
33407 + win->delta) - 1 == i);
33408 +
33409 + offset =
33410 + off_to_pgoff(win->off + win->count + win->delta);
33411 + memset(data + offset, 0, PAGE_CACHE_SIZE - offset);
33412 + flush_dcache_page(pg);
33413 + kunmap_atomic(data, KM_USER0);
33414 + unlock_page(pg);
33415 + /* still not uptodate */
33416 + break;
33417 + }
33418 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
33419 + result = ctail_read_disk_cluster(clust, inode, mode);
33420 + if (result)
33421 + goto out;
33422 + assert("edward-925",
33423 + tfm_cluster_is_uptodate(&clust->tc));
33424 + }
33425 + lock_page(pg);
33426 + result = do_readpage_ctail(inode, clust, pg, mode);
33427 + unlock_page(pg);
33428 + if (result) {
33429 + impossible("edward-219",
33430 + "do_readpage_ctail returned crap");
33431 + goto out;
33432 + }
33433 + }
33434 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
33435 +		/* disk cluster unclaimed, but we need to make its znodes dirty
33436 +		   so that flush update will convert its content */
33437 + result = find_disk_cluster(clust, inode, 0 /* do not read items */,
33438 + mode);
33439 + }
33440 + out:
33441 + tfm_cluster_clr_uptodate(&clust->tc);
33442 + return result;
33443 +}
33444 +
33445 +static int
33446 +should_create_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
33447 +{
33448 + assert("edward-737", clust != NULL);
33449 +
33450 + switch (clust->dstat) {
33451 + case PREP_DISK_CLUSTER:
33452 + case UNPR_DISK_CLUSTER:
33453 + return 0;
33454 + case FAKE_DISK_CLUSTER:
33455 + if (clust->win &&
33456 + clust->win->stat == HOLE_WINDOW && clust->nr_pages == 0) {
33457 + assert("edward-1172", new_cluster(clust, inode));
33458 + return 0;
33459 + }
33460 + return 1;
33461 + default:
33462 + impossible("edward-1173", "bad disk cluster state");
33463 + return 0;
33464 + }
33465 +}
33466 +
33467 +static int
33468 +cryptcompress_make_unprepped_cluster(reiser4_cluster_t * clust,
33469 + struct inode *inode)
33470 +{
33471 + int result;
33472 +
33473 + assert("edward-1123", reiser4_schedulable());
33474 + assert("edward-737", clust != NULL);
33475 + assert("edward-738", inode != NULL);
33476 + assert("edward-739", cryptcompress_inode_ok(inode));
33477 + assert("edward-1053", clust->hint != NULL);
33478 +
33479 + if (!should_create_unprepped_cluster(clust, inode)) {
33480 + if (clust->reserved) {
33481 + cluster_reserved2free(estimate_insert_cluster(inode));
33482 +#if REISER4_DEBUG
33483 + assert("edward-1267",
33484 + clust->reserved_unprepped ==
33485 + estimate_insert_cluster(inode));
33486 + clust->reserved_unprepped -=
33487 + estimate_insert_cluster(inode);
33488 +#endif
33489 + }
33490 + return 0;
33491 + }
33492 + assert("edward-1268", clust->reserved);
33493 + cluster_reserved2grabbed(estimate_insert_cluster(inode));
33494 +#if REISER4_DEBUG
33495 + assert("edward-1441",
33496 + clust->reserved_unprepped == estimate_insert_cluster(inode));
33497 + clust->reserved_unprepped -= estimate_insert_cluster(inode);
33498 +#endif
33499 + result = ctail_insert_unprepped_cluster(clust, inode);
33500 + if (result)
33501 + return result;
33502 +
33503 + inode_add_bytes(inode, inode_cluster_size(inode));
33504 +
33505 + assert("edward-743", cryptcompress_inode_ok(inode));
33506 + assert("edward-744", znode_is_write_locked(clust->hint->lh.node));
33507 +
33508 + clust->dstat = UNPR_DISK_CLUSTER;
33509 + return 0;
33510 +}
33511 +
33512 +#if REISER4_DEBUG
33513 +static int jnode_truncate_ok(struct inode *inode, cloff_t index)
33514 +{
33515 + jnode *node;
33516 + node =
33517 + jlookup(current_tree, get_inode_oid(inode),
33518 + clust_to_pg(index, inode));
33519 + if (likely(!node))
33520 + return 1;
33521 + /* someone got this jnode */
33522 + warning("edward-1315", "jnode %p is untruncated\n", node);
33523 + jput(node);
33524 + return (atomic_read(&node->x_count));
33525 +}
33526 +#endif
33527 +
33528 +/* Collect unlocked cluster pages and a jnode (the latter only in the
33529 +   case when the page cluster will be modified and captured) */
33530 +int
33531 +prepare_page_cluster(struct inode *inode, reiser4_cluster_t * clust,
33532 + int capture)
33533 +{
33534 + assert("edward-177", inode != NULL);
33535 + assert("edward-741", cryptcompress_inode_ok(inode));
33536 + assert("edward-740", clust->pages != NULL);
33537 +
33538 + set_cluster_nrpages(clust, inode);
33539 + reset_cluster_pgset(clust, cluster_nrpages(inode));
33540 + return (capture ?
33541 + grab_cluster_pages_jnode(inode, clust) :
33542 + grab_cluster_pages(inode, clust));
33543 +}
33544 +
33545 +/* Truncate all pages of the cluster of index @index.
33546 + This is called by ->kill_hook() method of item plugin */
33547 +void truncate_page_cluster_cryptcompress(struct inode *inode, cloff_t index,
33548 + int even_cows)
33549 +{
33550 + int i;
33551 + int found = 0;
33552 + int nr_pages;
33553 + jnode *node;
33554 + struct page *pages[MAX_CLUSTER_NRPAGES];
33555 +
33556 + node =
33557 + jlookup(current_tree, get_inode_oid(inode),
33558 + clust_to_pg(index, inode));
33559 +	/* jnode is absent; just drop the pages, which cannot
33560 +	   acquire a jnode because of our exclusive access */
33561 + if (!node)
33562 + goto truncate;
33563 + /* jnode is present and may be dirty */
33564 + nr_pages = count_to_nrpages(cnt_to_clcnt(inode->i_size, index, inode));
33565 +
33566 + found = find_get_pages(inode->i_mapping, clust_to_pg(index, inode),
33567 + nr_pages, pages);
33568 + spin_lock_jnode(node);
33569 +
33570 + if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS)
33571 + && index == 0)
33572 + /* converting to unix_file in progress */
33573 + JF_CLR(node, JNODE_CLUSTER_PAGE);
33574 + if (JF_ISSET(node, JNODE_DIRTY)) {
33575 + /* someone has done modifications which are not
33576 + yet committed, so we need to release some resources */
33577 +
33578 +		/* free disk space grabbed for disk cluster conversion */
33579 + cluster_reserved2grabbed(estimate_update_cluster(inode));
33580 + grabbed2free(get_current_context(),
33581 + get_current_super_private(),
33582 + estimate_update_cluster(inode));
33583 +
33584 + assert("edward-1198", found == nr_pages);
33585 + assert("edward-1199", node->page_count == nr_pages);
33586 +#if REISER4_DEBUG
33587 + node->page_count = 0;
33588 +#endif
33589 + /* This will clear dirty bit */
33590 + uncapture_cluster_jnode(node);
33591 +
33592 + /* put pages grabbed for last uncommitted modifications */
33593 + for (i = 0; i < nr_pages; i++) {
33594 + assert("edward-1200", PageUptodate(pages[i]));
33595 + page_cache_release(pages[i]);
33596 +#if REISER4_DEBUG
33597 +			cryptcompress_inode_data(inode)->pgcount--;
33598 +#endif
33599 + }
33600 + } else
33601 + spin_unlock_jnode(node);
33602 + /* FIXME-EDWARD: Use truncate_complete_page in the loop above instead */
33603 +
33604 + jput(node);
33605 + /* put pages found here */
33606 + forget_cluster_pages(pages, found);
33607 + truncate:
33608 + if (reiser4_inode_get_flag(inode, REISER4_FILE_CONV_IN_PROGRESS) &&
33609 + index == 0)
33610 + return;
33611 + reiser4_invalidate_pages(inode->i_mapping,
33612 + clust_to_pg(index, inode),
33613 + cluster_nrpages(inode),
33614 + even_cows);
33615 + assert("edward-1201",
33616 + ergo(!reiser4_inode_get_flag(inode,
33617 + REISER4_FILE_CONV_IN_PROGRESS),
33618 + jnode_truncate_ok(inode, index)));
33619 + return;
33620 +}
33621 +
33622 +/* Prepare the cluster handle before (or after) modifications
33623 + which are supposed to be committed.
33624 +
33625 + . grab cluster pages;
33626 + . reserve disk space;
33627 + . maybe read pages from disk and set the disk cluster dirty;
33628 + . maybe write hole;
33629 +   . maybe create an 'unprepped' disk cluster if the current one is fake
33630 +     (i.e. is not represented by any items)
33631 +*/
33632 +
33633 +static int
33634 +prepare_cluster(struct inode *inode,
33635 + loff_t file_off /* write position in the file */ ,
33636 + loff_t to_file, /* bytes of users data to write to the file */
33637 + reiser4_cluster_t * clust, page_cluster_op op)
33638 +{
33639 + int result = 0;
33640 + reiser4_slide_t *win = clust->win;
33641 +
33642 + reset_cluster_params(clust);
33643 + cluster_set_tfm_act(&clust->tc, TFMA_READ);
33644 +#if REISER4_DEBUG
33645 + clust->ctx = get_current_context();
33646 +#endif
33647 + assert("edward-1190", op != PCL_UNKNOWN);
33648 +
33649 + clust->op = op;
33650 +
33651 + result = prepare_page_cluster(inode, clust, 1);
33652 + if (result)
33653 + return result;
33654 + assert("edward-1447",
33655 + ergo(clust->nr_pages != 0, jprivate(clust->pages[0])));
33656 + assert("edward-1448",
33657 + ergo(clust->nr_pages != 0,
33658 + jnode_is_cluster_page(jprivate(clust->pages[0]))));
33659 +
33660 + result = reserve4cluster(inode, clust);
33661 + if (result)
33662 + goto err1;
33663 + result = read_some_cluster_pages(inode, clust);
33664 + if (result) {
33665 + free_reserved4cluster(inode,
33666 + clust,
33667 + estimate_update_cluster(inode) +
33668 + estimate_insert_cluster(inode));
33669 + goto err1;
33670 + }
33671 + assert("edward-1124", clust->dstat != INVAL_DISK_CLUSTER);
33672 +
33673 + result = cryptcompress_make_unprepped_cluster(clust, inode);
33674 + if (result)
33675 + goto err2;
33676 + if (win && win->stat == HOLE_WINDOW) {
33677 + result = write_hole(inode, clust, file_off, to_file);
33678 + if (result)
33679 + goto err2;
33680 + }
33681 + return 0;
33682 + err2:
33683 + free_reserved4cluster(inode, clust,
33684 + estimate_update_cluster(inode));
33685 + err1:
33686 + reiser4_release_cluster_pages_and_jnode(clust);
33687 + assert("edward-1125", result == -ENOSPC);
33688 + return result;
33689 +}
33690 +
33691 +/* set window by two offsets */
33692 +static void
33693 +set_window(reiser4_cluster_t * clust, reiser4_slide_t * win,
33694 + struct inode *inode, loff_t o1, loff_t o2)
33695 +{
33696 + assert("edward-295", clust != NULL);
33697 + assert("edward-296", inode != NULL);
33698 + assert("edward-1071", win != NULL);
33699 + assert("edward-297", o1 <= o2);
33700 +
33701 + clust->index = off_to_clust(o1, inode);
33702 +
33703 + win->off = off_to_cloff(o1, inode);
33704 + win->count = min_count(inode_cluster_size(inode) - win->off, o2 - o1);
33705 + win->delta = 0;
33706 +
33707 + clust->win = win;
33708 +}
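+
+/* A stand-alone C sketch of the offset arithmetic behind set_window():
+   splitting a file offset into (cluster index, in-cluster offset). It
+   assumes, as this plugin does elsewhere, a power-of-two cluster size
+   expressed as a shift; the _demo helpers are illustrative, not the
+   plugin's real off_to_clust()/off_to_cloff(). */
+#include <assert.h>
+
+typedef unsigned long long off64;
+
+static off64 off_to_clust_demo(off64 off, unsigned shift)
+{
+	return off >> shift;			/* cluster index */
+}
+
+static off64 off_to_cloff_demo(off64 off, unsigned shift)
+{
+	return off & ((1ULL << shift) - 1);	/* offset within the cluster */
+}
+
+int main(void)
+{
+	unsigned shift = 16;			/* 64K clusters, for example */
+	off64 o1 = (5ULL << shift) + 100;	/* a write starting here */
+	assert(off_to_clust_demo(o1, shift) == 5);
+	assert(off_to_cloff_demo(o1, shift) == 100);
+	return 0;
+}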
33709 +
33710 +static int
33711 +set_cluster_by_window(struct inode *inode, reiser4_cluster_t * clust,
33712 + reiser4_slide_t * win, flow_t * f, loff_t file_off)
33713 +{
33714 + int result;
33715 +
33716 + assert("edward-197", clust != NULL);
33717 + assert("edward-1072", win != NULL);
33718 + assert("edward-198", inode != NULL);
33719 +
33720 + result = alloc_cluster_pgset(clust, cluster_nrpages(inode));
33721 + if (result)
33722 + return result;
33723 +
33724 + if (file_off > inode->i_size) {
33725 + /* Uhmm, hole in cryptcompress file... */
33726 + loff_t hole_size;
33727 + hole_size = file_off - inode->i_size;
33728 +
33729 + set_window(clust, win, inode, inode->i_size, file_off);
33730 + win->stat = HOLE_WINDOW;
33731 + if (win->off + hole_size < inode_cluster_size(inode))
33732 + /* there is also user's data to append to the hole */
33733 + win->delta =
33734 + min_count(inode_cluster_size(inode) -
33735 + (win->off + win->count), f->length);
33736 + return 0;
33737 + }
33738 + set_window(clust, win, inode, file_off, file_off + f->length);
33739 + win->stat = DATA_WINDOW;
33740 + return 0;
33741 +}
33742 +
33743 +int set_cluster_by_page(reiser4_cluster_t * clust, struct page * page,
33744 + int count)
33745 +{
33746 + int result = 0;
33747 + int (*setting_actor)(reiser4_cluster_t * clust, int count);
33748 +
33749 + assert("edward-1358", clust != NULL);
33750 + assert("edward-1359", page != NULL);
33751 + assert("edward-1360", page->mapping != NULL);
33752 + assert("edward-1361", page->mapping->host != NULL);
33753 +
33754 + setting_actor = (clust->pages ? reset_cluster_pgset : alloc_cluster_pgset);
33755 + result = setting_actor(clust, count);
33756 + clust->index = pg_to_clust(page->index, page->mapping->host);
33757 + return result;
33758 +}
33759 +
33760 +/* reset all the params that do not get updated */
33761 +void reset_cluster_params(reiser4_cluster_t * clust)
33762 +{
33763 + assert("edward-197", clust != NULL);
33764 +
33765 + clust->dstat = INVAL_DISK_CLUSTER;
33766 + clust->tc.uptodate = 0;
33767 + clust->tc.len = 0;
33768 +}
33769 +
33770 +/* Core write procedure of the cryptcompress plugin, which slices the user's
33771 +   flow into logical clusters, maps them to the appropriate
33772 +   page clusters, and tries to capture them.
33773 +   If @buf != NULL, returns the number of successfully written bytes,
33774 +   otherwise returns an error
33775 +*/
33776 +static loff_t
33777 +write_cryptcompress_flow(struct file *file, struct inode *inode,
33778 + const char __user *buf, size_t count, loff_t pos,
33779 + int *conv_occured)
33780 +{
33781 + int i;
33782 + flow_t f;
33783 + hint_t *hint;
33784 + int result = 0;
33785 + size_t to_write = 0;
33786 + loff_t file_off;
33787 + reiser4_slide_t win;
33788 + reiser4_cluster_t clust;
33789 +
33790 + assert("edward-161", reiser4_schedulable());
33791 + assert("edward-748", cryptcompress_inode_ok(inode));
33792 + assert("edward-159", current_blocksize == PAGE_CACHE_SIZE);
33793 + assert("edward-1274", get_current_context()->grabbed_blocks == 0);
33794 +
33795 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
33796 + if (hint == NULL)
33797 + return RETERR(-ENOMEM);
33798 +
33799 + result = load_file_hint(file, hint);
33800 + if (result) {
33801 + kfree(hint);
33802 + return result;
33803 + }
33804 +
33805 + result =
33806 + flow_by_inode_cryptcompress(inode, buf, 1 /* user space */ ,
33807 + count, pos, WRITE_OP, &f);
33808 + if (result)
33809 + goto out;
33810 + to_write = f.length;
33811 +
33812 + /* current write position in file */
33813 + file_off = pos;
33814 + reiser4_slide_init(&win);
33815 + cluster_init_read(&clust, &win);
33816 + clust.hint = hint;
33817 +
33818 + result = set_cluster_by_window(inode, &clust, &win, &f, file_off);
33819 + if (result)
33820 + goto out;
33821 +
33822 + if (next_window_stat(&win) == HOLE_WINDOW) {
33823 + result = write_conversion_hook(file, inode, pos, &clust, NULL);
33824 + if (result)
33825 + goto out;
33826 + result =
33827 + prepare_cluster(inode, file_off, f.length, &clust,
33828 + PCL_APPEND);
33829 + if (result)
33830 + goto out;
33831 + }
33832 + do {
33833 + char *src;
33834 + unsigned page_off, page_count;
33835 +
33836 + assert("edward-750", reiser4_schedulable());
33837 +
33838 + result = write_conversion_hook(file, inode, pos, &clust,
33839 + conv_occured);
33840 + if (result || *conv_occured)
33841 + goto out;
33842 + result =
33843 + prepare_cluster(inode, file_off, f.length, &clust,
33844 + PCL_APPEND);
33845 + if (result)
33846 + goto out;
33847 +
33848 + assert("edward-751", cryptcompress_inode_ok(inode));
33849 + assert("edward-204", win.stat == DATA_WINDOW);
33850 + assert("edward-1288", hint_is_valid(clust.hint));
33851 + assert("edward-752",
33852 + znode_is_write_locked(hint->ext_coord.coord.node));
33853 +
33854 + put_hint_cluster(&clust, inode, ZNODE_WRITE_LOCK);
33855 +
33856 + /* set write position in page */
33857 + page_off = off_to_pgoff(win.off);
33858 +
33859 + /* copy user's data to cluster pages */
33860 + for (i = off_to_pg(win.off), src = f.data;
33861 + i < count_to_nrpages(win.off + win.count);
33862 + i++, src += page_count) {
33863 + page_count =
33864 + cnt_to_pgcnt(win.off + win.count, i) - page_off;
33865 +
33866 + assert("edward-1039",
33867 + page_off + page_count <= PAGE_CACHE_SIZE);
33868 + assert("edward-287", clust.pages[i] != NULL);
33869 +
33870 + lock_page(clust.pages[i]);
33871 + result =
33872 + __copy_from_user((char *)kmap(clust.pages[i]) +
33873 + page_off, (char __user *)src, page_count);
33874 + kunmap(clust.pages[i]);
33875 + if (unlikely(result)) {
33876 + unlock_page(clust.pages[i]);
33877 + result = -EFAULT;
33878 + goto err2;
33879 + }
33880 + SetPageUptodate(clust.pages[i]);
33881 + unlock_page(clust.pages[i]);
33882 + page_off = 0;
33883 + }
33884 + assert("edward-753", cryptcompress_inode_ok(inode));
33885 +
33886 + set_cluster_pages_dirty(&clust);
33887 +
33888 + result = try_capture_cluster(&clust, inode);
33889 + if (result)
33890 + goto err2;
33891 +
33892 + assert("edward-998", f.user == 1);
33893 +
33894 + move_flow_forward(&f, win.count);
33895 +
33896 + /* disk cluster may be already clean at this point */
33897 +
33898 + /* . update cluster
33899 + . set hint for new offset
33900 + . unlock znode
33901 + . update inode
33902 + . balance dirty pages
33903 + */
33904 + result = balance_dirty_page_cluster(&clust, inode, 0, f.length);
33905 + if (result)
33906 + goto err1;
33907 + assert("edward-755", hint->lh.owner == NULL);
33908 + reset_cluster_params(&clust);
33909 + continue;
33910 + err2:
33911 + reiser4_release_cluster_pages_and_jnode(&clust);
33912 + err1:
33913 + if (clust.reserved)
33914 + free_reserved4cluster(inode,
33915 + &clust,
33916 + estimate_update_cluster(inode));
33917 + break;
33918 + } while (f.length);
33919 + out:
33920 + done_lh(&hint->lh);
33921 + if (result == -EEXIST)
33922 + warning("edward-1407", "write returns EEXIST!\n");
33923 +
33924 + put_cluster_handle(&clust);
33925 + save_file_hint(file, hint);
33926 + kfree(hint);
33927 + if (buf) {
33928 +		/* if nothing was written, there must be an error */
33929 + assert("edward-195", ergo((to_write == f.length),
33930 + (result < 0 || *conv_occured)));
33931 + return (to_write - f.length) ? (to_write - f.length) : result;
33932 + }
33933 + return result;
33934 +}
33935 +
33936 +/**
33937 + * write_cryptcompress - write of struct file_operations
33938 + * @file: file to write to
33939 + * @buf: address of user-space buffer
33940 + * @count: number of bytes to write
33941 + * @off: position in file to write to
33942 + *
33943 + * This is implementation of vfs's write method of struct file_operations for
33944 + * cryptcompress plugin.
33945 + */
33946 +ssize_t write_cryptcompress(struct file *file, const char __user *buf,
33947 + size_t count, loff_t *off, int *conv)
33948 +{
33949 + ssize_t result;
33950 + struct inode *inode;
33951 + reiser4_context *ctx;
33952 + loff_t pos = *off;
33953 + cryptcompress_info_t *info;
33954 +
33955 + assert("edward-1449", *conv == 0);
33956 +
33957 + inode = file->f_dentry->d_inode;
33958 + assert("edward-196", cryptcompress_inode_ok(inode));
33959 +
33960 + info = cryptcompress_inode_data(inode);
33961 +
33962 + ctx = reiser4_init_context(inode->i_sb);
33963 + if (IS_ERR(ctx))
33964 + return PTR_ERR(ctx);
33965 +
33966 + mutex_lock(&inode->i_mutex);
33967 +
33968 + result = generic_write_checks(file, &pos, &count, 0);
33969 + if (unlikely(result != 0))
33970 + goto out;
33971 + if (unlikely(count == 0))
33972 + goto out;
33973 + result = remove_suid(file->f_dentry);
33974 + if (unlikely(result != 0))
33975 + goto out;
33976 + /* remove_suid might create a transaction */
33977 + reiser4_txn_restart(ctx);
33978 +
33979 + result = write_cryptcompress_flow(file, inode, buf, count, pos, conv);
33980 +
33981 + if (result < 0)
33982 + goto out;
33983 + /* update position in a file */
33984 + *off = pos + result;
33985 + out:
33986 + mutex_unlock(&inode->i_mutex);
33987 +
33988 + context_set_commit_async(ctx);
33989 + reiser4_exit_context(ctx);
33990 + return result;
33991 +}
33992 +
33993 +int readpages_cryptcompress(struct file *file, struct address_space *mapping,
33994 + struct list_head *pages, unsigned nr_pages)
33995 +{
33996 + reiser4_context * ctx;
33997 + int ret;
33998 +
33999 + ctx = reiser4_init_context(mapping->host->i_sb);
34000 + if (IS_ERR(ctx)) {
34001 + ret = PTR_ERR(ctx);
34002 + goto err;
34003 + }
34004 + /* crc files can be built of ctail items only */
34005 + ret = readpages_ctail(file, mapping, pages);
34006 + reiser4_exit_context(ctx);
34007 + if (ret) {
34008 +err:
34009 + put_pages_list(pages);
34010 + }
34011 + return ret;
34012 +}
34013 +
34014 +static reiser4_block_nr cryptcompress_estimate_read(struct inode *inode)
34015 +{
34016 + /* reserve one block to update stat data item */
34017 + assert("edward-1193",
34018 + inode_file_plugin(inode)->estimate.update ==
34019 + estimate_update_common);
34020 + return estimate_update_common(inode);
34021 +}
34022 +
34023 +/**
34024 + * read_cryptcompress - read of struct file_operations
34025 + * @file: file to read from
34026 + * @buf: address of user-space buffer
34027 + * @size: number of bytes to read
34028 + * @off: position in file to read from
34029 + *
34030 + * This is implementation of vfs's read method of struct file_operations for
34031 + * cryptcompress plugin.
34032 + */
34033 +ssize_t read_cryptcompress(struct file * file, char __user *buf, size_t size,
34034 + loff_t * off)
34035 +{
34036 + ssize_t result;
34037 + struct inode *inode;
34038 + reiser4_context *ctx;
34039 + cryptcompress_info_t *info;
34040 + reiser4_block_nr needed;
34041 +
34042 + inode = file->f_dentry->d_inode;
34043 + assert("edward-1194", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
34044 +
34045 + ctx = reiser4_init_context(inode->i_sb);
34046 + if (IS_ERR(ctx))
34047 + return PTR_ERR(ctx);
34048 +
34049 + info = cryptcompress_inode_data(inode);
34050 + needed = cryptcompress_estimate_read(inode);
34051 +
34052 + result = reiser4_grab_space(needed, BA_CAN_COMMIT);
34053 + if (result != 0) {
34054 + reiser4_exit_context(ctx);
34055 + return result;
34056 + }
34057 +
34058 + LOCK_CNT_INC(inode_sem_r);
34059 +
34060 + result = do_sync_read(file, buf, size, off);
34061 +
34062 + LOCK_CNT_DEC(inode_sem_r);
34063 +
34064 + context_set_commit_async(ctx);
34065 + reiser4_exit_context(ctx);
34066 +
34067 + return result;
34068 +}
34069 +
34070 +/* If @index > 0, find the real disk cluster of index (@index - 1);
34071 +   if @index == 0, find the real disk cluster of the object with maximal index.
34072 +   Keep the incremented index of the result in @found.
34073 +   If success was returned:
34074 +   (@index == 0 && @found == 0) means that the object doesn't have real disk
34075 +   clusters;
34076 +   (@index != 0 && @found == 0) means that the disk cluster of (@index - 1)
34077 +   doesn't exist.
34078 +*/
34079 +static int
34080 +find_real_disk_cluster(struct inode *inode, cloff_t * found, cloff_t index)
34081 +{
34082 + int result;
34083 + reiser4_key key;
34084 + loff_t offset;
34085 + hint_t *hint;
34086 + lock_handle *lh;
34087 + lookup_bias bias;
34088 + coord_t *coord;
34089 + item_plugin *iplug;
34090 +
34091 + assert("edward-1131", inode != NULL);
34092 + assert("edward-95", cryptcompress_inode_ok(inode));
34093 +
34094 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34095 + if (hint == NULL)
34096 + return RETERR(-ENOMEM);
34097 + hint_init_zero(hint);
34098 + lh = &hint->lh;
34099 +
34100 + bias = (index ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN);
34101 + offset =
34102 + (index ? clust_to_off(index, inode) -
34103 + 1 : get_key_offset(reiser4_max_key()));
34104 +
34105 + key_by_inode_cryptcompress(inode, offset, &key);
34106 +
34107 + /* find the last item of this object */
34108 + result =
34109 + find_cluster_item(hint, &key, ZNODE_READ_LOCK, NULL /* ra_info */,
34110 + bias, 0);
34111 + if (cbk_errored(result)) {
34112 + done_lh(lh);
34113 + kfree(hint);
34114 + return result;
34115 + }
34116 + if (result == CBK_COORD_NOTFOUND) {
34117 + /* no real disk clusters */
34118 + done_lh(lh);
34119 + kfree(hint);
34120 + *found = 0;
34121 + return 0;
34122 + }
34123 + /* disk cluster is found */
34124 + coord = &hint->ext_coord.coord;
34125 + coord_clear_iplug(coord);
34126 + result = zload(coord->node);
34127 + if (unlikely(result)) {
34128 + done_lh(lh);
34129 + kfree(hint);
34130 + return result;
34131 + }
34132 + iplug = item_plugin_by_coord(coord);
34133 + assert("edward-277", iplug == item_plugin_by_id(CTAIL_ID));
34134 + assert("edward-1202", ctail_ok(coord));
34135 +
34136 + item_key_by_coord(coord, &key);
34137 + *found = off_to_clust(get_key_offset(&key), inode) + 1;
34138 +
34139 + assert("edward-1132", ergo(index, index == *found));
34140 +
34141 + zrelse(coord->node);
34142 + done_lh(lh);
34143 + kfree(hint);
34144 + return 0;
34145 +}
34146 +
34147 +static int find_fake_appended(struct inode *inode, cloff_t * index)
34148 +{
34149 + return find_real_disk_cluster(inode, index,
34150 + 0 /* find last real one */ );
34151 +}
34152 +
34153 +/* Set the left coord when the unit was not found by node_lookup().
34154 + This takes into account that there can be holes in a sequence
34155 + of disk clusters */
34156 +
34157 +static void adjust_left_coord(coord_t * left_coord)
34158 +{
34159 + switch (left_coord->between) {
34160 + case AFTER_UNIT:
34161 + left_coord->between = AFTER_ITEM;
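+		/* fall through */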
34162 + case AFTER_ITEM:
34163 + case BEFORE_UNIT:
34164 + break;
34165 + default:
34166 + impossible("edward-1204", "bad left coord to cut");
34167 + }
34168 + return;
34169 +}
34170 +
34171 +#define CRC_CUT_TREE_MIN_ITERATIONS 64
34172 +int
34173 +cut_tree_worker_cryptcompress(tap_t * tap, const reiser4_key * from_key,
34174 + const reiser4_key * to_key,
34175 + reiser4_key * smallest_removed,
34176 + struct inode *object, int truncate, int *progress)
34177 +{
34178 + lock_handle next_node_lock;
34179 + coord_t left_coord;
34180 + int result;
34181 +
34182 + assert("edward-1158", tap->coord->node != NULL);
34183 + assert("edward-1159", znode_is_write_locked(tap->coord->node));
34184 + assert("edward-1160", znode_get_level(tap->coord->node) == LEAF_LEVEL);
34185 +
34186 + *progress = 0;
34187 + init_lh(&next_node_lock);
34188 +
34189 + while (1) {
34190 + znode *node; /* node from which items are cut */
34191 + node_plugin *nplug; /* node plugin for @node */
34192 +
34193 + node = tap->coord->node;
34194 +
34195 + /* Move next_node_lock to the next node on the left. */
34196 + result =
34197 + reiser4_get_left_neighbor(&next_node_lock, node,
34198 + ZNODE_WRITE_LOCK,
34199 + GN_CAN_USE_UPPER_LEVELS);
34200 + if (result != 0 && result != -E_NO_NEIGHBOR)
34201 + break;
34202 +		/* FIXME-EDWARD: Check whether we can delete the node as a whole. */
34203 + result = reiser4_tap_load(tap);
34204 + if (result)
34205 + return result;
34206 +
34207 + /* Prepare the second (right) point for cut_node() */
34208 + if (*progress)
34209 + coord_init_last_unit(tap->coord, node);
34210 +
34211 + else if (item_plugin_by_coord(tap->coord)->b.lookup == NULL)
34212 + /* set rightmost unit for the items without lookup method */
34213 + tap->coord->unit_pos = coord_last_unit_pos(tap->coord);
34214 +
34215 + nplug = node->nplug;
34216 +
34217 + assert("edward-1161", nplug);
34218 + assert("edward-1162", nplug->lookup);
34219 +
34220 + /* left_coord is leftmost unit cut from @node */
34221 + result = nplug->lookup(node, from_key, FIND_EXACT, &left_coord);
34222 +
34223 + if (IS_CBKERR(result))
34224 + break;
34225 +
34226 + if (result == CBK_COORD_NOTFOUND)
34227 + adjust_left_coord(&left_coord);
34228 +
34229 + /* adjust coordinates so that they are set to existing units */
34230 + if (coord_set_to_right(&left_coord)
34231 + || coord_set_to_left(tap->coord)) {
34232 + result = 0;
34233 + break;
34234 + }
34235 +
34236 + if (coord_compare(&left_coord, tap->coord) ==
34237 + COORD_CMP_ON_RIGHT) {
34238 + /* keys from @from_key to @to_key are not in the tree */
34239 + result = 0;
34240 + break;
34241 + }
34242 +
34243 + /* cut data from one node */
34244 + *smallest_removed = *reiser4_min_key();
34245 + result = kill_node_content(&left_coord,
34246 + tap->coord,
34247 + from_key,
34248 + to_key,
34249 + smallest_removed,
34250 + next_node_lock.node,
34251 + object, truncate);
34252 +#if REISER4_DEBUG
34253 + /*node_check(node, ~0U); */
34254 +#endif
34255 + reiser4_tap_relse(tap);
34256 +
34257 + if (result)
34258 + break;
34259 +
34260 + ++(*progress);
34261 +
34262 + /* Check whether all items with keys >= from_key were removed
34263 + * from the tree. */
34264 + if (keyle(smallest_removed, from_key))
34265 + /* result = 0; */
34266 + break;
34267 +
34268 + if (next_node_lock.node == NULL)
34269 + break;
34270 +
34271 + result = reiser4_tap_move(tap, &next_node_lock);
34272 + done_lh(&next_node_lock);
34273 + if (result)
34274 + break;
34275 +
34276 + /* Break long cut_tree operation (deletion of a large file) if
34277 + * atom requires commit. */
34278 + if (*progress > CRC_CUT_TREE_MIN_ITERATIONS
34279 + && current_atom_should_commit()) {
34280 + result = -E_REPEAT;
34281 + break;
34282 + }
34283 + }
34284 + done_lh(&next_node_lock);
34285 + return result;
34286 +}
34287 +
34288 +/* Append or expand a hole in two steps (exclusive access must be acquired!)
34289 + 1) write zeroes to the current real cluster,
34290 + 2) expand hole via fake clusters (just increase i_size) */
34291 +static int
34292 +cryptcompress_append_hole(struct inode *inode /*contains old i_size */ ,
34293 + loff_t new_size)
34294 +{
34295 + int result = 0;
34296 + hint_t *hint;
34297 + lock_handle *lh;
34298 + loff_t hole_size;
34299 + int nr_zeroes;
34300 + reiser4_slide_t win;
34301 + reiser4_cluster_t clust;
34302 +
34303 + assert("edward-1133", inode->i_size < new_size);
34304 + assert("edward-1134", reiser4_schedulable());
34305 + assert("edward-1135", cryptcompress_inode_ok(inode));
34306 + assert("edward-1136", current_blocksize == PAGE_CACHE_SIZE);
34307 + assert("edward-1333", off_to_cloff(inode->i_size, inode) != 0);
34308 +
34309 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34310 + if (hint == NULL)
34311 + return RETERR(-ENOMEM);
34312 + hint_init_zero(hint);
34313 + lh = &hint->lh;
34314 +
34315 + reiser4_slide_init(&win);
34316 + cluster_init_read(&clust, &win);
34317 + clust.hint = hint;
34318 +
34319 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34320 + if (result)
34321 + goto out;
34322 + if (off_to_cloff(inode->i_size, inode) == 0)
34323 + goto fake_append;
34324 + hole_size = new_size - inode->i_size;
34325 + nr_zeroes =
34326 + inode_cluster_size(inode) - off_to_cloff(inode->i_size, inode);
34327 + if (hole_size < nr_zeroes)
34328 + nr_zeroes = hole_size;
34329 + set_window(&clust, &win, inode, inode->i_size,
34330 + inode->i_size + nr_zeroes);
34331 + win.stat = HOLE_WINDOW;
34332 +
34333 + assert("edward-1137",
34334 + clust.index == off_to_clust(inode->i_size, inode));
34335 +
34336 + result = prepare_cluster(inode, 0, 0, &clust, PCL_APPEND);
34337 +
34338 + assert("edward-1271", !result || result == -ENOSPC);
34339 + if (result)
34340 + goto out;
34341 + assert("edward-1139",
34342 + clust.dstat == PREP_DISK_CLUSTER ||
34343 + clust.dstat == UNPR_DISK_CLUSTER);
34344 +
34345 + assert("edward-1431", hole_size >= nr_zeroes);
34346 + if (hole_size == nr_zeroes)
34347 + /* nothing to append anymore */
34348 + goto out;
34349 + fake_append:
34350 + INODE_SET_FIELD(inode, i_size, new_size);
34351 + out:
34352 + done_lh(lh);
34353 + kfree(hint);
34354 + put_cluster_handle(&clust);
34355 + return result;
34356 +}
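+
+/* A stand-alone C check of the first step above: how many zeroes go
+   into the partially filled last cluster before the remainder of the
+   hole is expressed by fake clusters (a pure i_size increase). The
+   power-of-two cluster size is an assumption of the demo. */
+#include <assert.h>
+
+static unsigned long long zeroes_to_write(unsigned long long i_size,
+					  unsigned long long new_size,
+					  unsigned long long csize)
+{
+	unsigned long long hole = new_size - i_size;
+	unsigned long long nr = csize - (i_size & (csize - 1));
+	return hole < nr ? hole : nr;	/* never zero past the hole */
+}
+
+int main(void)
+{
+	/* i_size sits 100 bytes into a 64K cluster, followed by a big hole:
+	   zero only up to the cluster boundary */
+	assert(zeroes_to_write(65636, 1000000, 65536) == 65436);
+	/* a 10-byte hole: zero just those 10 bytes */
+	assert(zeroes_to_write(65636, 65646, 65536) == 10);
+	return 0;
+}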
34357 +
34358 +#if REISER4_DEBUG
34359 +static int
34360 +pages_truncate_ok(struct inode *inode, loff_t old_size, pgoff_t start)
34361 +{
34362 + struct pagevec pvec;
34363 + int i;
34364 + int count;
34365 + int rest;
34366 +
34367 + rest = count_to_nrpages(old_size) - start;
34368 +
34369 + pagevec_init(&pvec, 0);
34370 + count = min_count(pagevec_space(&pvec), rest);
34371 +
34372 + while (rest) {
34373 + count = min_count(pagevec_space(&pvec), rest);
34374 + pvec.nr = find_get_pages(inode->i_mapping, start,
34375 + count, pvec.pages);
34376 + for (i = 0; i < pagevec_count(&pvec); i++) {
34377 + if (PageUptodate(pvec.pages[i])) {
34378 + warning("edward-1205",
34379 + "truncated page of index %lu is uptodate",
34380 + pvec.pages[i]->index);
34381 + return 0;
34382 + }
34383 + }
34384 + start += count;
34385 + rest -= count;
34386 + pagevec_release(&pvec);
34387 + }
34388 + return 1;
34389 +}
34390 +
34391 +static int body_truncate_ok(struct inode *inode, cloff_t aidx)
34392 +{
34393 + int result;
34394 + cloff_t raidx;
34395 +
34396 + result = find_fake_appended(inode, &raidx);
34397 + return !result && (aidx == raidx);
34398 +}
34399 +#endif
34400 +
34401 +static int
34402 +update_cryptcompress_size(struct inode *inode, reiser4_key * key, int update_sd)
34403 +{
34404 + return (get_key_offset(key) & ((loff_t) (inode_cluster_size(inode)) - 1)
34405 + ? 0 : reiser4_update_file_size(inode, key, update_sd));
34406 +}
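+
+/* The mask test above assumes inode_cluster_size() is a power of two
+   (illustrative numbers): with a 64K cluster,
+
+	131072 & (65536 - 1) == 0  -> cluster boundary, update file size,
+	131073 & (65536 - 1) != 0  -> mid-cluster offset, skip the update,
+
+   so i_size is only moved when a cut stops exactly on a cluster boundary;
+   a partially cut cluster leaves the size to be fixed up later. */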
34407 +
34408 +/* prune cryptcompress file in two steps (exclusive access should be acquired!)
34409 + 1) cut all disk clusters except the partially truncated last one,
34410 + 2) write zeroes to, and capture, the last partially truncated page cluster
34411 + if it exists; otherwise truncate via pruning fake clusters (just decrease i_size)
34412 +*/
34413 +static int
34414 +prune_cryptcompress(struct inode *inode, loff_t new_size, int update_sd,
34415 + cloff_t aidx)
34416 +{
34417 + int result = 0;
34418 + unsigned nr_zeroes;
34419 + loff_t to_prune;
34420 + loff_t old_size;
34421 + cloff_t ridx;
34422 +
34423 + hint_t *hint;
34424 + lock_handle *lh;
34425 + reiser4_slide_t win;
34426 + reiser4_cluster_t clust;
34427 +
34428 + assert("edward-1140", inode->i_size >= new_size);
34429 + assert("edward-1141", reiser4_schedulable());
34430 + assert("edward-1142", cryptcompress_inode_ok(inode));
34431 + assert("edward-1143", current_blocksize == PAGE_CACHE_SIZE);
34432 +
34433 + old_size = inode->i_size;
34434 +
34435 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34436 + if (hint == NULL)
34437 + return RETERR(-ENOMEM);
34438 + hint_init_zero(hint);
34439 + lh = &hint->lh;
34440 +
34441 + reiser4_slide_init(&win);
34442 + cluster_init_read(&clust, &win);
34443 + clust.hint = hint;
34444 +
34445 + /* rightmost completely truncated cluster */
34446 + ridx = count_to_nrclust(new_size, inode);
34447 +
34448 + assert("edward-1174", ridx <= aidx);
34450 + if (ridx != aidx) {
34451 + result = cut_file_items(inode,
34452 + clust_to_off(ridx, inode),
34453 + update_sd,
34454 + clust_to_off(aidx, inode),
34455 + update_cryptcompress_size);
34456 + if (result)
34457 + goto out;
34458 + }
34459 + if (!off_to_cloff(new_size, inode)) {
34460 + /* no partially truncated clusters */
34461 + assert("edward-1145", inode->i_size == new_size);
34462 + goto finish;
34463 + }
34464 + assert("edward-1146", new_size < inode->i_size);
34465 +
34466 + to_prune = inode->i_size - new_size;
34467 +
34468 + /* partial truncate of leftmost cluster,
34469 + first check if it is fake */
34470 + result = find_real_disk_cluster(inode, &aidx, ridx);
34471 + if (result)
34472 + goto out;
34473 + if (!aidx)
34474 + /* yup, this is fake one */
34475 + goto finish;
34476 +
34477 + assert("edward-1148", aidx == ridx);
34478 +
34479 + /* do partial truncate of the leftmost page cluster,
34480 + then try to capture this one */
34481 + result = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
34482 + if (result)
34483 + goto out;
34484 + nr_zeroes = (off_to_pgoff(new_size) ?
34485 + PAGE_CACHE_SIZE - off_to_pgoff(new_size) : 0);
34486 + set_window(&clust, &win, inode, new_size, new_size + nr_zeroes);
34487 + win.stat = HOLE_WINDOW;
34488 +
34489 + assert("edward-1149", clust.index == ridx - 1);
34490 +
34491 + result = prepare_cluster(inode, 0, 0, &clust, PCL_TRUNCATE);
34492 + if (result)
34493 + goto out;
34494 + assert("edward-1151",
34495 + clust.dstat == PREP_DISK_CLUSTER ||
34496 + clust.dstat == UNPR_DISK_CLUSTER);
34497 +
34498 + assert("edward-1191", inode->i_size == new_size);
34499 + assert("edward-1206", body_truncate_ok(inode, ridx));
34500 + finish:
34501 + /* drop all the pages that don't have jnodes (i.e. pages
34502 + which cannot be truncated by cut_file_items() because
34503 + of holes represented by fake disk clusters), including
34504 + the pages of the partially truncated cluster which was
34505 + released by prepare_cluster() */
34506 + truncate_inode_pages(inode->i_mapping, new_size);
34507 + INODE_SET_FIELD(inode, i_size, new_size);
34508 + out:
34509 + assert("edward-1334", !result || result == -ENOSPC);
34510 + assert("edward-1209",
34511 + pages_truncate_ok(inode, old_size, count_to_nrpages(new_size)));
34512 + done_lh(lh);
34513 + kfree(hint);
34514 + put_cluster_handle(&clust);
34515 + return result;
34516 +}
34517 +
34518 +/* Prepare cryptcompress file for truncate:
34519 + prune or append rightmost fake logical clusters (if any)
34520 +*/
34521 +static int
34522 +start_truncate_fake(struct inode *inode, cloff_t aidx, loff_t new_size,
34523 + int update_sd)
34524 +{
34525 + int result = 0;
34526 + int bytes;
34527 +
34528 + if (new_size > inode->i_size) {
34529 + /* append */
34530 + if (inode->i_size < clust_to_off(aidx, inode))
34531 + /* no fake bytes */
34532 + return 0;
34533 + bytes = new_size - inode->i_size;
34534 + INODE_SET_FIELD(inode, i_size, inode->i_size + bytes);
34535 + } else {
34536 + /* prune */
34537 + if (inode->i_size <= clust_to_off(aidx, inode))
34538 + /* no fake bytes */
34539 + return 0;
34540 + bytes =
34541 + inode->i_size - max_count(new_size,
34542 + clust_to_off(aidx, inode));
34543 + if (!bytes)
34544 + return 0;
34545 + INODE_SET_FIELD(inode, i_size, inode->i_size - bytes);
34546 + /* In the case of fake prune we need to drop page cluster.
34547 + There are only 2 cases for partially truncated page:
34548 + 1. If it is dirty, then it is anonymous
34549 + (was dirtied via mmap), and will be captured
34550 + later via ->capture().
34551 + 2. If it is clean, then it is filled with zeroes.
34552 + In both cases we don't need to make it dirty and
34553 + capture here.
34554 + */
34555 + truncate_inode_pages(inode->i_mapping, inode->i_size);
34556 + }
34557 + if (update_sd)
34558 + result = update_sd_cryptcompress(inode);
34559 + return result;
34560 +}
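+
+/* A worked example of a pure fake prune (assumed sizes, for illustration):
+   with 64K clusters, i_size = 200000 and aidx = 2, real disk clusters end
+   at clust_to_off(2) = 131072, so bytes 131072..199999 are fake. Pruning
+   to new_size = 150000 gives
+
+	bytes = 200000 - max_count(150000, 131072) = 50000,
+
+   i_size drops straight to 150000 with no tree modification at all, and
+   truncate_inode_pages() just drops the affected page cluster. */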
34561 +
34562 +/* This is called in setattr_cryptcompress when it is used to truncate,
34563 + and in delete_object_cryptcompress */
34564 +static int cryptcompress_truncate(struct inode *inode, /* contains old i_size */
34565 + loff_t new_size, /* new size */
34566 + int update_sd)
34567 +{
34568 + int result;
34569 + cloff_t aidx;
34570 +
34571 + result = find_fake_appended(inode, &aidx);
34572 + if (result)
34573 + return result;
34574 + assert("edward-1208",
34575 + ergo(aidx > 0, inode->i_size > clust_to_off(aidx - 1, inode)));
34576 +
34577 + result = start_truncate_fake(inode, aidx, new_size, update_sd);
34578 + if (result)
34579 + return result;
34580 + if (inode->i_size == new_size)
34581 + /* nothing to truncate anymore */
34582 + return 0;
34583 + result = (inode->i_size < new_size ?
34584 + cryptcompress_append_hole(inode, new_size) :
34585 + prune_cryptcompress(inode, new_size, update_sd, aidx));
34586 + if (!result && update_sd)
34587 + result = update_sd_cryptcompress(inode);
34588 + return result;
34589 +}
34590 +
34591 +static void clear_moved_tag_cluster(struct address_space * mapping,
34592 + reiser4_cluster_t * clust)
34593 +{
34594 + int i;
34595 + void * ret;
34596 + read_lock_irq(&mapping->tree_lock);
34597 + for (i = 0; i < clust->nr_pages; i++) {
34598 + assert("edward-1438", clust->pages[i] != NULL);
34599 + ret = radix_tree_tag_clear(&mapping->page_tree,
34600 + clust->pages[i]->index,
34601 + PAGECACHE_TAG_REISER4_MOVED);
34602 + assert("edward-1439", ret == clust->pages[i]);
34603 + }
34604 + read_unlock_irq(&mapping->tree_lock);
34605 +}
34606 +
34607 +/* Capture an anonymous page cluster. (A page cluster is
34608 + anonymous if it contains at least one anonymous page) */
34609 +static int
34610 +capture_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
34611 +{
34612 + int result;
34613 +
34614 + assert("edward-1073", clust != NULL);
34615 + assert("edward-1074", inode != NULL);
34616 + assert("edward-1075", clust->dstat == INVAL_DISK_CLUSTER);
34617 +
34618 + result = prepare_cluster(inode, 0, 0, clust, PCL_APPEND);
34619 + if (result)
34620 + return result;
34621 + set_cluster_pages_dirty(clust);
34622 + clear_moved_tag_cluster(inode->i_mapping, clust);
34623 +
34624 + result = try_capture_cluster(clust, inode);
34625 + put_hint_cluster(clust, inode, ZNODE_WRITE_LOCK);
34626 + if (unlikely(result)) {
34627 + /* set cleared tag back, so it will be
34628 + possible to capture it again later */
34629 + read_lock_irq(&inode->i_mapping->tree_lock);
34630 + radix_tree_tag_set(&inode->i_mapping->page_tree,
34631 + clust_to_pg(clust->index, inode),
34632 + PAGECACHE_TAG_REISER4_MOVED);
34633 + read_unlock_irq(&inode->i_mapping->tree_lock);
34634 +
34635 + reiser4_release_cluster_pages_and_jnode(clust);
34636 + }
34637 + return result;
34638 +}
34639 +
34640 +#define MAX_CLUSTERS_TO_CAPTURE(inode) (1024 >> cluster_nrpages_shift(inode))
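+
+/* Illustration (assumed sizes): with 4K pages and a 64K cluster,
+   cluster_nrpages_shift() is 4 and MAX_CLUSTERS_TO_CAPTURE is
+   1024 >> 4 = 64 clusters per pass -- i.e. the capture burst is pinned at
+   1024 pages no matter what cluster size the file is configured with. */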
34641 +
34642 +/* read lock should be acquired */
34643 +static int
34644 +capture_anonymous_clusters(struct address_space *mapping, pgoff_t * index,
34645 + int to_capture)
34646 +{
34647 + int result = 0;
34648 + int found;
34649 + struct page *page = NULL;
34650 + hint_t *hint;
34651 + lock_handle *lh;
34652 + reiser4_cluster_t clust;
34653 +
34654 + assert("edward-1127", mapping != NULL);
34655 + assert("edward-1128", mapping->host != NULL);
34656 + assert("edward-1440", mapping->host->i_mapping == mapping);
34657 +
34658 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
34659 + if (hint == NULL)
34660 + return RETERR(-ENOMEM);
34661 + hint_init_zero(hint);
34662 + lh = &hint->lh;
34663 +
34664 + cluster_init_read(&clust, NULL);
34665 + clust.hint = hint;
34666 +
34667 + result = alloc_cluster_pgset(&clust, cluster_nrpages(mapping->host));
34668 + if (result)
34669 + goto out;
34670 +
34671 + while (to_capture > 0) {
34672 + found =
34673 + find_get_pages_tag(mapping, index,
34674 + PAGECACHE_TAG_REISER4_MOVED, 1, &page);
34675 + if (!found) {
34676 + *index = (pgoff_t) - 1;
34677 + break;
34678 + }
34679 + assert("edward-1109", page != NULL);
34680 +
34681 + move_cluster_forward(&clust, mapping->host, page->index);
34682 + result = capture_page_cluster(&clust, mapping->host);
34683 + page_cache_release(page);
34684 + if (result)
34685 + break;
34686 + to_capture -= clust.nr_pages;
34687 + }
34688 + if (result) {
34689 + warning("edward-1077",
34690 + "Cannot capture anon pages: result=%i (captured=%d)\n",
34691 + result,
34692 + ((__u32) MAX_CLUSTERS_TO_CAPTURE(mapping->host)) -
34693 + to_capture);
34694 + } else {
34695 + /* something had to be found */
34696 + assert("edward-1078",
34697 + to_capture <= MAX_CLUSTERS_TO_CAPTURE(mapping->host));
34698 + if (to_capture <= 0)
34699 + /* there may be more pages left */
34700 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
34701 + }
34702 + out:
34703 + done_lh(lh);
34704 + kfree(hint);
34705 + put_cluster_handle(&clust);
34706 + return result;
34707 +}
34708 +
34709 +/* Check mapping for existence of dirty pages that were not captured.
34710 + This returns !0 if the page tree contains pages tagged
34711 + PAGECACHE_TAG_REISER4_MOVED */
34712 +static int cryptcompress_inode_has_anon_pages(struct inode *inode)
34713 +{
34714 + return mapping_tagged(inode->i_mapping, PAGECACHE_TAG_REISER4_MOVED);
34715 +}
34716 +
34717 +/* this is the implementation of the vfs ->writepages() method of struct
34718 + address_space_operations */
34719 +int
34720 +writepages_cryptcompress(struct address_space *mapping,
34721 + struct writeback_control *wbc)
34722 +{
34723 + int result;
34724 + int to_capture;
34725 + pgoff_t nrpages;
34726 + pgoff_t index = 0;
34727 + cryptcompress_info_t *info;
34728 + struct inode *inode;
34729 +
34730 + inode = mapping->host;
34731 + if (!cryptcompress_inode_has_anon_pages(inode)) {
34732 + result = 0;
34733 + goto end;
34734 + }
34735 +
34736 + info = cryptcompress_inode_data(inode);
34737 + nrpages = count_to_nrpages(i_size_read(inode));
34738 +
34739 + if (wbc->sync_mode != WB_SYNC_ALL)
34740 + to_capture =
34741 + min_count(wbc->nr_to_write, MAX_CLUSTERS_TO_CAPTURE(inode));
34742 + else
34743 + to_capture = MAX_CLUSTERS_TO_CAPTURE(inode);
34744 + do {
34745 + reiser4_context *ctx;
34746 +
34747 + ctx = reiser4_init_context(inode->i_sb);
34748 + if (IS_ERR(ctx)) {
34749 + result = PTR_ERR(ctx);
34750 + break;
34751 + }
34752 + ctx->nobalance = 1;
34753 +
34754 + assert("edward-1079",
34755 + lock_stack_isclean(get_current_lock_stack()));
34756 +
34757 + LOCK_CNT_INC(inode_sem_r);
34758 +
34759 + result =
34760 + capture_anonymous_clusters(inode->i_mapping, &index,
34761 + to_capture);
34762 +
34763 + if (result != 0 || wbc->sync_mode != WB_SYNC_ALL) {
34764 + reiser4_exit_context(ctx);
34765 + break;
34766 + }
34767 + result = txnmgr_force_commit_all(inode->i_sb, 0);
34768 + reiser4_exit_context(ctx);
34769 + } while (result == 0 && index < nrpages);
34770 +
34771 + end:
34772 + if (is_in_reiser4_context()) {
34773 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
34774 + /* there are already pages to flush, flush them out, do
34775 + not delay until end of reiser4_sync_inodes */
34776 + reiser4_writeout(inode->i_sb, wbc);
34777 + get_current_context()->nr_captured = 0;
34778 + }
34779 + }
34780 + return result;
34781 +}
34782 +
34783 +/* plugin->u.file.mmap */
34784 +int mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
34785 +{
34786 + int result;
34787 + struct inode *inode;
34788 + reiser4_context *ctx;
34789 +
34790 + inode = file->f_dentry->d_inode;
34791 + ctx = reiser4_init_context(inode->i_sb);
34792 + if (IS_ERR(ctx))
34793 + return PTR_ERR(ctx);
34794 + /*
34795 + * generic_file_mmap will do update_atime. Grab space for stat data
34796 + * update.
34797 + */
34798 + result = reiser4_grab_space_force
34799 + (inode_file_plugin(inode)->estimate.update(inode),
34800 + BA_CAN_COMMIT);
34801 + if (result) {
34802 + reiser4_exit_context(ctx);
34803 + return result;
34804 + }
34805 + result = generic_file_mmap(file, vma);
34806 + reiser4_exit_context(ctx);
34807 + return result;
34808 +}
34809 +
34810 +/* plugin->u.file.release */
34811 +/* plugin->u.file.get_block */
34812 +
34813 +/* this is the implementation of the delete method of the file plugin for
34814 + cryptcompress objects */
34815 +int delete_object_cryptcompress(struct inode *inode)
34816 +{
34817 + int result;
34818 +
34819 + assert("edward-429", inode->i_nlink == 0);
34820 +
34821 + reiser4_txn_restart_current();
34822 +
34823 + result = cryptcompress_truncate(inode, 0, 0);
34824 + if (result) {
34825 + warning("edward-430",
34826 + "cannot truncate cryptcompress file %lli: %i",
34827 + (unsigned long long)get_inode_oid(inode),
34828 + result);
34829 + }
34830 + truncate_inode_pages(inode->i_mapping, 0);
34831 + /* and remove stat data */
34832 + return reiser4_delete_object_common(inode);
34833 +}
34834 +
34835 +/* plugin->u.file.setattr method
34836 + This implements actual truncate (see comments in reiser4/page_cache.c) */
34837 +int setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
34838 +{
34839 + int result;
34840 + struct inode *inode;
34841 +
34842 + inode = dentry->d_inode;
34843 + if (attr->ia_valid & ATTR_SIZE) {
34844 + if (inode->i_size != attr->ia_size) {
34845 + reiser4_context *ctx;
34846 + loff_t old_size;
34847 +
34848 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
34849 + if (IS_ERR(ctx))
34850 + return PTR_ERR(ctx);
34851 +
34852 + inode_check_scale(inode, inode->i_size, attr->ia_size);
34853 +
34854 + old_size = inode->i_size;
34855 +
34856 + result =
34857 + cryptcompress_truncate(inode, attr->ia_size,
34858 + 1 /* update stat data */ );
34859 + if (result) {
34860 + warning("edward-1192",
34861 + "truncate_cryptcompress failed: oid %lli, "
34862 + "old size %lld, new size %lld, retval %d",
34863 + (unsigned long long)
34864 + get_inode_oid(inode), old_size,
34865 + attr->ia_size, result);
34866 + }
34867 + context_set_commit_async(ctx);
34868 + reiser4_exit_context(ctx);
34869 + } else
34870 + result = 0;
34871 + } else
34872 + result = reiser4_setattr_common(dentry, attr);
34873 + return result;
34874 +}
34875 +
34876 +/* sendfile_cryptcompress - sendfile of struct file_operations */
34877 +ssize_t
34878 +sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
34879 + read_actor_t actor, void *target)
34880 +{
34881 + reiser4_context *ctx;
34882 + ssize_t result;
34883 + struct inode *inode;
34884 + cryptcompress_info_t *info;
34885 +
34886 + inode = file->f_dentry->d_inode;
34887 + ctx = reiser4_init_context(inode->i_sb);
34888 + if (IS_ERR(ctx))
34889 + return PTR_ERR(ctx);
34890 + /*
34891 + * generic_file_sendfile may want to call update_atime. Grab space for
34892 + * stat data update
34893 + */
34894 + result = reiser4_grab_space(estimate_update_common(inode),
34895 + BA_CAN_COMMIT);
34896 + if (result)
34897 + goto exit;
34898 + info = cryptcompress_inode_data(inode);
34899 +
34900 + result = generic_file_sendfile(file, ppos, count, actor, target);
34901 + exit:
34902 + reiser4_exit_context(ctx);
34903 + return result;
34904 +}
34905 +
34906 +/*
34907 + * release_cryptcompress - release of struct file_operations
34908 + * @inode: inode of released file
34909 + * @file: file to release
34910 + */
34911 +int release_cryptcompress(struct inode *inode, struct file *file)
34912 +{
34913 + reiser4_context *ctx = reiser4_init_context(inode->i_sb);
34914 +
34915 + if (IS_ERR(ctx))
34916 + return PTR_ERR(ctx);
34917 + reiser4_free_file_fsdata(file);
34918 + reiser4_exit_context(ctx);
34919 + return 0;
34920 +}
34921 +
34922 +#if 0
34923 +int prepare_write_cryptcompress(struct file *file, struct page *page,
34924 + unsigned from, unsigned to)
34925 +{
34926 + return prepare_write_common(file, page, from, to);
34927 +}
34928 +#endif /* 0 */
34929 +
34930 +
34931 +/*
34932 + Local variables:
34933 + c-indentation-style: "K&R"
34934 + mode-name: "LC"
34935 + c-basic-offset: 8
34936 + tab-width: 8
34937 + fill-column: 80
34938 + scroll-step: 1
34939 + End:
34940 +*/
34941 diff --git a/fs/reiser4/plugin/file/cryptcompress.h b/fs/reiser4/plugin/file/cryptcompress.h
34942 new file mode 100644
34943 index 0000000..5f2d7fb
34944 --- /dev/null
34945 +++ b/fs/reiser4/plugin/file/cryptcompress.h
34946 @@ -0,0 +1,554 @@
34947 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
34948 +/* See http://www.namesys.com/cryptcompress_design.html */
34949 +
34950 +#if !defined( __FS_REISER4_CRYPTCOMPRESS_H__ )
34951 +#define __FS_REISER4_CRYPTCOMPRESS_H__
34952 +
34953 +#include "../../page_cache.h"
34954 +#include "../compress/compress.h"
34955 +#include "../crypto/cipher.h"
34956 +
34957 +#include <linux/pagemap.h>
34958 +
34959 +#define MIN_CLUSTER_SHIFT PAGE_CACHE_SHIFT
34960 +#define MAX_CLUSTER_SHIFT 16
34961 +#define MAX_CLUSTER_NRPAGES (1U << MAX_CLUSTER_SHIFT >> PAGE_CACHE_SHIFT)
34962 +#define DC_CHECKSUM_SIZE 4
34963 +
34964 +#define MIN_LATTICE_FACTOR 1
34965 +#define MAX_LATTICE_FACTOR 32
34966 +
34967 +/* this mask contains all non-standard plugins that might
34968 + be present in reiser4-specific part of inode managed by
34969 + cryptcompress file plugin */
34970 +#define cryptcompress_mask \
34971 + ((1 << PSET_FILE) | \
34972 + (1 << PSET_CLUSTER) | \
34973 + (1 << PSET_CIPHER) | \
34974 + (1 << PSET_DIGEST) | \
34975 + (1 << PSET_COMPRESSION) | \
34976 + (1 << PSET_COMPRESSION_MODE))
34977 +
34978 +static inline loff_t min_count(loff_t a, loff_t b)
34979 +{
34980 + return (a < b ? a : b);
34981 +}
34982 +
34983 +static inline loff_t max_count(loff_t a, loff_t b)
34984 +{
34985 + return (a > b ? a : b);
34986 +}
34987 +
34988 +#if REISER4_DEBUG
34989 +static inline int cluster_shift_ok(int shift)
34990 +{
34991 + return (shift >= MIN_CLUSTER_SHIFT) && (shift <= MAX_CLUSTER_SHIFT);
34992 +}
34993 +#endif
34994 +
34995 +typedef struct tfm_stream {
34996 + __u8 *data;
34997 + size_t size;
34998 +} tfm_stream_t;
34999 +
35000 +typedef enum {
35001 + INPUT_STREAM,
35002 + OUTPUT_STREAM,
35003 + LAST_STREAM
35004 +} tfm_stream_id;
35005 +
35006 +typedef tfm_stream_t *tfm_unit[LAST_STREAM];
35007 +
35008 +static inline __u8 *ts_data(tfm_stream_t * stm)
35009 +{
35010 + assert("edward-928", stm != NULL);
35011 + return stm->data;
35012 +}
35013 +
35014 +static inline size_t ts_size(tfm_stream_t * stm)
35015 +{
35016 + assert("edward-929", stm != NULL);
35017 + return stm->size;
35018 +}
35019 +
35020 +static inline void set_ts_size(tfm_stream_t * stm, size_t size)
35021 +{
35022 + assert("edward-930", stm != NULL);
35023 +
35024 + stm->size = size;
35025 +}
35026 +
35027 +static inline int alloc_ts(tfm_stream_t ** stm)
35028 +{
35029 + assert("edward-931", stm);
35030 + assert("edward-932", *stm == NULL);
35031 +
35032 + *stm = kmalloc(sizeof **stm, reiser4_ctx_gfp_mask_get());
35033 + if (*stm == NULL)
35034 + return -ENOMEM;
35035 + memset(*stm, 0, sizeof **stm);
35036 + return 0;
35037 +}
35038 +
35039 +static inline void free_ts(tfm_stream_t * stm)
35040 +{
35041 + assert("edward-933", !ts_data(stm));
35042 + assert("edward-934", !ts_size(stm));
35043 +
35044 + kfree(stm);
35045 +}
35046 +
35047 +static inline int alloc_ts_data(tfm_stream_t * stm, size_t size)
35048 +{
35049 + assert("edward-935", !ts_data(stm));
35050 + assert("edward-936", !ts_size(stm));
35051 + assert("edward-937", size != 0);
35052 +
35053 + stm->data = reiser4_vmalloc(size);
35054 + if (!stm->data)
35055 + return -ENOMEM;
35056 + set_ts_size(stm, size);
35057 + return 0;
35058 +}
35059 +
35060 +static inline void free_ts_data(tfm_stream_t * stm)
35061 +{
35062 + assert("edward-938", equi(ts_data(stm), ts_size(stm)));
35063 +
35064 + if (ts_data(stm))
35065 + vfree(ts_data(stm));
35066 + memset(stm, 0, sizeof *stm);
35067 +}
35068 +
35069 +/* Write modes for item conversion in flush convert phase */
35070 +typedef enum {
35071 + CRC_APPEND_ITEM = 1,
35072 + CRC_OVERWRITE_ITEM = 2,
35073 + CRC_CUT_ITEM = 3
35074 +} cryptcompress_write_mode_t;
35075 +
35076 +typedef enum {
35077 + PCL_UNKNOWN = 0, /* invalid option */
35078 + PCL_APPEND = 1, /* append and/or overwrite */
35079 + PCL_TRUNCATE = 2 /* truncate */
35080 +} page_cluster_op;
35081 +
35082 +/* Reiser4 file write/read transforms a page cluster into a disk cluster (and
35083 + back) using crypto/compression transforms implemented by reiser4 transform
35084 + plugins. Before each transform we allocate a pair of streams (tfm_unit) and
35085 + assemble the page cluster into the input one. After the transform we split
35086 + the output stream into a set of items (disk cluster).
35087 +*/
35088 +typedef struct tfm_cluster {
35089 + coa_set coa;
35090 + tfm_unit tun;
35091 + tfm_action act;
35092 + int uptodate;
35093 + int lsize; /* size of the logical cluster */
35094 + int len; /* length of the transform stream */
35095 +} tfm_cluster_t;
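+
+/* Relationship between the two sizes above, on a plausible compressed
+   write (illustrative numbers only): lsize = 65536 bytes are assembled
+   from the page cluster into the input stream; if they compress to
+   len = 20000, the disk cluster is built from those 20000 bytes. On read
+   the roles reverse: len bytes come off disk and inflate back to lsize. */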
35096 +
35097 +static inline coa_t get_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act)
35098 +{
35099 + return tc->coa[id][act];
35100 +}
35101 +
35102 +static inline void
35103 +set_coa(tfm_cluster_t * tc, reiser4_compression_id id, tfm_action act, coa_t coa)
35104 +{
35105 + tc->coa[id][act] = coa;
35106 +}
35107 +
35108 +static inline int
35109 +alloc_coa(tfm_cluster_t * tc, compression_plugin * cplug)
35110 +{
35111 + coa_t coa;
35112 +
35113 + coa = cplug->alloc(tc->act);
35114 + if (IS_ERR(coa))
35115 + return PTR_ERR(coa);
35116 + set_coa(tc, cplug->h.id, tc->act, coa);
35117 + return 0;
35118 +}
35119 +
35120 +static inline int
35121 +grab_coa(tfm_cluster_t * tc, compression_plugin * cplug)
35122 +{
35123 + return (cplug->alloc && !get_coa(tc, cplug->h.id, tc->act) ?
35124 + alloc_coa(tc, cplug) : 0);
35125 +}
35126 +
35127 +static inline void free_coa_set(tfm_cluster_t * tc)
35128 +{
35129 + tfm_action j;
35130 + reiser4_compression_id i;
35131 + compression_plugin *cplug;
35132 +
35133 + assert("edward-810", tc != NULL);
35134 +
35135 + for (j = 0; j < TFMA_LAST; j++)
35136 + for (i = 0; i < LAST_COMPRESSION_ID; i++) {
35137 + if (!get_coa(tc, i, j))
35138 + continue;
35139 + cplug = compression_plugin_by_id(i);
35140 + assert("edward-812", cplug->free != NULL);
35141 + cplug->free(get_coa(tc, i, j), j);
35142 + set_coa(tc, i, j, 0);
35143 + }
35144 + return;
35145 +}
35146 +
35147 +static inline tfm_stream_t *tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
35148 +{
35149 + return tc->tun[id];
35150 +}
35151 +
35152 +static inline void
35153 +set_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id, tfm_stream_t * ts)
35154 +{
35155 + tc->tun[id] = ts;
35156 +}
35157 +
35158 +static inline __u8 *tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id)
35159 +{
35160 + return ts_data(tfm_stream(tc, id));
35161 +}
35162 +
35163 +static inline void
35164 +set_tfm_stream_data(tfm_cluster_t * tc, tfm_stream_id id, __u8 * data)
35165 +{
35166 + tfm_stream(tc, id)->data = data;
35167 +}
35168 +
35169 +static inline size_t tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id)
35170 +{
35171 + return ts_size(tfm_stream(tc, id));
35172 +}
35173 +
35174 +static inline void
35175 +set_tfm_stream_size(tfm_cluster_t * tc, tfm_stream_id id, size_t size)
35176 +{
35177 + tfm_stream(tc, id)->size = size;
35178 +}
35179 +
35180 +static inline int
35181 +alloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
35182 +{
35183 + assert("edward-939", tc != NULL);
35184 + assert("edward-940", !tfm_stream(tc, id));
35185 +
35186 + tc->tun[id] = kmalloc(sizeof(tfm_stream_t), reiser4_ctx_gfp_mask_get());
35187 + if (!tc->tun[id])
35188 + return -ENOMEM;
35189 + memset(tfm_stream(tc, id), 0, sizeof(tfm_stream_t));
35190 + return alloc_ts_data(tfm_stream(tc, id), size);
35191 +}
35192 +
35193 +static inline int
35194 +realloc_tfm_stream(tfm_cluster_t * tc, size_t size, tfm_stream_id id)
35195 +{
35196 + assert("edward-941", tfm_stream_size(tc, id) < size);
35197 + free_ts_data(tfm_stream(tc, id));
35198 + return alloc_ts_data(tfm_stream(tc, id), size);
35199 +}
35200 +
35201 +static inline void free_tfm_stream(tfm_cluster_t * tc, tfm_stream_id id)
35202 +{
35203 + free_ts_data(tfm_stream(tc, id));
35204 + free_ts(tfm_stream(tc, id));
35205 + set_tfm_stream(tc, id, 0);
35206 +}
35207 +
35208 +static inline unsigned coa_overrun(compression_plugin * cplug, int ilen)
35209 +{
35210 + return (cplug->overrun != NULL ? cplug->overrun(ilen) : 0);
35211 +}
35212 +
35213 +static inline void free_tfm_unit(tfm_cluster_t * tc)
35214 +{
35215 + tfm_stream_id id;
35216 + for (id = 0; id < LAST_STREAM; id++) {
35217 + if (!tfm_stream(tc, id))
35218 + continue;
35219 + free_tfm_stream(tc, id);
35220 + }
35221 +}
35222 +
35223 +static inline void put_tfm_cluster(tfm_cluster_t * tc)
35224 +{
35225 + assert("edward-942", tc != NULL);
35226 + free_coa_set(tc);
35227 + free_tfm_unit(tc);
35228 +}
35229 +
35230 +static inline int tfm_cluster_is_uptodate(tfm_cluster_t * tc)
35231 +{
35232 + assert("edward-943", tc != NULL);
35233 + assert("edward-944", tc->uptodate == 0 || tc->uptodate == 1);
35234 + return (tc->uptodate == 1);
35235 +}
35236 +
35237 +static inline void tfm_cluster_set_uptodate(tfm_cluster_t * tc)
35238 +{
35239 + assert("edward-945", tc != NULL);
35240 + assert("edward-946", tc->uptodate == 0 || tc->uptodate == 1);
35241 + tc->uptodate = 1;
35242 + return;
35243 +}
35244 +
35245 +static inline void tfm_cluster_clr_uptodate(tfm_cluster_t * tc)
35246 +{
35247 + assert("edward-947", tc != NULL);
35248 + assert("edward-948", tc->uptodate == 0 || tc->uptodate == 1);
35249 + tc->uptodate = 0;
35250 + return;
35251 +}
35252 +
35253 +static inline int tfm_stream_is_set(tfm_cluster_t * tc, tfm_stream_id id)
35254 +{
35255 + return (tfm_stream(tc, id) &&
35256 + tfm_stream_data(tc, id) && tfm_stream_size(tc, id));
35257 +}
35258 +
35259 +static inline int tfm_cluster_is_set(tfm_cluster_t * tc)
35260 +{
35261 + int i;
35262 + for (i = 0; i < LAST_STREAM; i++)
35263 + if (!tfm_stream_is_set(tc, i))
35264 + return 0;
35265 + return 1;
35266 +}
35267 +
35268 +static inline void alternate_streams(tfm_cluster_t * tc)
35269 +{
35270 + tfm_stream_t *tmp = tfm_stream(tc, INPUT_STREAM);
35271 +
35272 + set_tfm_stream(tc, INPUT_STREAM, tfm_stream(tc, OUTPUT_STREAM));
35273 + set_tfm_stream(tc, OUTPUT_STREAM, tmp);
35274 +}
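+
+/* This is a ping-pong buffer swap: when transforms are chained (say,
+   compression followed by encryption), the output stream of one step
+   becomes the input stream of the next without copying. A sketch of the
+   assumed usage (transform names are hypothetical):
+
+	compress(tfm_input_data(clust), tfm_output_data(clust));
+	alternate_streams(&clust->tc);
+	encrypt(tfm_input_data(clust), tfm_output_data(clust));
+*/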
35275 +
35276 +/* a kind of data that we can write to the window */
35277 +typedef enum {
35278 + DATA_WINDOW, /* the data we copy from user space */
35279 + HOLE_WINDOW /* zeroes if we write a hole */
35280 +} window_stat;
35281 +
35282 +/* Sliding window of cluster size which should be set to the appropriate
35283 + position (defined by cluster index) in a file before page cluster
35284 + modification by file_write. Then we translate file size, offset to write
35285 + from, number of bytes to write, etc. into the following configuration,
35286 + which is needed to estimate the number of pages to read before write, etc.
35287 +*/
35288 +typedef struct reiser4_slide {
35289 + unsigned off; /* offset we start to write/truncate from */
35290 + unsigned count; /* number of bytes (zeroes) to write/truncate */
35291 + unsigned delta; /* number of bytes to append to the hole */
35292 + window_stat stat; /* a kind of data to write to the window */
35293 +} reiser4_slide_t;
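+
+/* Example window setup (a sketch; set_window() is assumed to translate a
+   byte range into these fields): for the hole append in
+   cryptcompress_append_hole() above, the range [i_size, i_size + nr_zeroes)
+   inside the last real cluster becomes roughly
+
+	win.off = off_to_cloff(i_size, inode);
+	win.count = nr_zeroes;
+	win.stat = HOLE_WINDOW;
+
+   telling prepare_cluster() which part of the page cluster must be zeroed
+   rather than read from disk or copied from user space. */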
35294 +
35295 +/* The following is a set of possible disk cluster states */
35296 +typedef enum {
35297 + INVAL_DISK_CLUSTER, /* unknown state */
35298 + PREP_DISK_CLUSTER, /* disk cluster got converted by flush
35299 + at least 1 time */
35300 + UNPR_DISK_CLUSTER, /* disk cluster just created and should be
35301 + converted by flush */
35302 + FAKE_DISK_CLUSTER /* disk cluster exists neither in memory
35303 + nor on disk */
35304 +} disk_cluster_stat;
35305 +
35306 +/*
35307 + While implementing all transforms (from page to disk cluster, and back)
35308 + the reiser4 cluster manager fills the following structure, encapsulating
35309 + pointers to all the clusters for the same index, including the sliding window above
35310 +*/
35311 +typedef struct reiser4_cluster {
35312 + tfm_cluster_t tc; /* transform cluster */
35313 + int nr_pages; /* number of pages */
35314 + struct page **pages; /* page cluster */
35315 + page_cluster_op op; /* page cluster operation */
35316 + struct file *file;
35317 + hint_t *hint; /* disk cluster item for traversal */
35318 + disk_cluster_stat dstat; /* state of the current disk cluster */
35319 + cloff_t index; /* offset in the units of cluster size */
35320 + int index_valid; /* to validate the index above, if needed */
35321 + reiser4_slide_t *win; /* sliding window of cluster size */
35322 + int reserved; /* this indicates that space for disk
35323 + cluster modification is reserved */
35324 +#if REISER4_DEBUG
35325 + reiser4_context *ctx;
35326 + int reserved_prepped;
35327 + int reserved_unprepped;
35328 +#endif
35329 +
35330 +} reiser4_cluster_t;
35331 +
35332 +static inline __u8 * tfm_input_data (reiser4_cluster_t * clust)
35333 +{
35334 + return tfm_stream_data(&clust->tc, INPUT_STREAM);
35335 +}
35336 +
35337 +static inline __u8 * tfm_output_data (reiser4_cluster_t * clust)
35338 +{
35339 + return tfm_stream_data(&clust->tc, OUTPUT_STREAM);
35340 +}
35341 +
35342 +static inline int reset_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
35343 +{
35344 + assert("edward-1057", clust->pages != NULL);
35345 + memset(clust->pages, 0, sizeof(*clust->pages) * nrpages);
35346 + return 0;
35347 +}
35348 +
35349 +static inline int alloc_cluster_pgset(reiser4_cluster_t * clust, int nrpages)
35350 +{
35351 + assert("edward-949", clust != NULL);
35352 + assert("edward-1362", clust->pages == NULL);
35353 + assert("edward-950", nrpages != 0 && nrpages <= MAX_CLUSTER_NRPAGES);
35354 +
35355 + clust->pages =
35356 + kmalloc(sizeof(*clust->pages) * nrpages,
35357 + reiser4_ctx_gfp_mask_get());
35358 + if (!clust->pages)
35359 + return RETERR(-ENOMEM);
35360 + reset_cluster_pgset(clust, nrpages);
35361 + return 0;
35362 +}
35363 +
35364 +static inline void free_cluster_pgset(reiser4_cluster_t * clust)
35365 +{
35366 + assert("edward-951", clust->pages != NULL);
35367 + kfree(clust->pages);
35368 + clust->pages = NULL;
35369 +}
35370 +
35371 +static inline void put_cluster_handle(reiser4_cluster_t * clust)
35372 +{
35373 + assert("edward-435", clust != NULL);
35374 +
35375 + put_tfm_cluster(&clust->tc);
35376 + if (clust->pages)
35377 + free_cluster_pgset(clust);
35378 + memset(clust, 0, sizeof *clust);
35379 +}
35380 +
35381 +static inline void inc_keyload_count(crypto_stat_t * data)
35382 +{
35383 + assert("edward-1410", data != NULL);
35384 + data->keyload_count++;
35385 +}
35386 +
35387 +static inline void dec_keyload_count(crypto_stat_t * data)
35388 +{
35389 + assert("edward-1411", data != NULL);
35390 + assert("edward-1412", data->keyload_count > 0);
35391 + data->keyload_count--;
35392 +}
35393 +
35394 +/* cryptcompress specific part of reiser4_inode */
35395 +typedef struct cryptcompress_info {
35396 + crypto_stat_t *crypt;
35397 + /* the following 2 fields are controlled by compression mode plugin */
35398 + int compress_toggle; /* current status of compressibility */
35399 + int lattice_factor; /* factor of dynamic lattice. FIXME: Have a
35400 + compression_toggle to keep the factor */
35401 +#if REISER4_DEBUG
35402 + int pgcount; /* number of captured pages */
35403 +#endif
35404 +} cryptcompress_info_t;
35405 +
35406 +static inline void set_compression_toggle (cryptcompress_info_t * info, int val)
35407 +{
35408 + info->compress_toggle = val;
35409 +}
35410 +
35411 +static inline int get_compression_toggle (cryptcompress_info_t * info)
35412 +{
35413 + return info->compress_toggle;
35414 +}
35415 +
35416 +static inline int compression_is_on(cryptcompress_info_t * info)
35417 +{
35418 + return get_compression_toggle(info) == 1;
35419 +}
35420 +
35421 +static inline void turn_on_compression(cryptcompress_info_t * info)
35422 +{
35423 + set_compression_toggle(info, 1);
35424 +}
35425 +
35426 +static inline void turn_off_compression(cryptcompress_info_t * info)
35427 +{
35428 + set_compression_toggle(info, 0);
35429 +}
35430 +
35431 +static inline void set_lattice_factor(cryptcompress_info_t * info, int val)
35432 +{
35433 + info->lattice_factor = val;
35434 +}
35435 +
35436 +static inline int get_lattice_factor(cryptcompress_info_t * info)
35437 +{
35438 + return info->lattice_factor;
35439 +}
35440 +
35441 +cryptcompress_info_t *cryptcompress_inode_data(const struct inode *);
35442 +int equal_to_rdk(znode *, const reiser4_key *);
35443 +int goto_right_neighbor(coord_t *, lock_handle *);
35444 +int cryptcompress_inode_ok(struct inode *inode);
35445 +int coord_is_unprepped_ctail(const coord_t * coord);
35446 +extern int ctail_read_disk_cluster (reiser4_cluster_t *, struct inode *,
35447 + znode_lock_mode mode);
35448 +extern int do_readpage_ctail(struct inode *, reiser4_cluster_t *,
35449 + struct page * page, znode_lock_mode mode);
35450 +extern int ctail_insert_unprepped_cluster(reiser4_cluster_t * clust,
35451 + struct inode * inode);
35452 +extern int readpages_cryptcompress(struct file*, struct address_space*,
35453 + struct list_head*, unsigned);
35454 +int bind_cryptcompress(struct inode *child, struct inode *parent);
35455 +void destroy_inode_cryptcompress(struct inode * inode);
35456 +int grab_cluster_pages(struct inode *inode, reiser4_cluster_t * clust);
35457 +int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
35458 + reiser4_cluster_t * clust, int * progress);
35459 +crypto_stat_t * inode_crypto_stat (struct inode * inode);
35460 +void inherit_crypto_stat_common(struct inode * parent, struct inode * object,
35461 + int (*can_inherit)(struct inode * child,
35462 + struct inode * parent));
35463 +void reiser4_attach_crypto_stat(struct inode * inode, crypto_stat_t * info);
35464 +void change_crypto_stat(struct inode * inode, crypto_stat_t * new);
35465 +crypto_stat_t * reiser4_alloc_crypto_stat (struct inode * inode);
35466 +
35467 +static inline struct crypto_blkcipher * info_get_cipher(crypto_stat_t * info)
35468 +{
35469 + return info->cipher;
35470 +}
35471 +
35472 +static inline void info_set_cipher(crypto_stat_t * info,
35473 + struct crypto_blkcipher * tfm)
35474 +{
35475 + info->cipher = tfm;
35476 +}
35477 +
35478 +static inline struct crypto_hash * info_get_digest(crypto_stat_t * info)
35479 +{
35480 + return info->digest;
35481 +}
35482 +
35483 +static inline void info_set_digest(crypto_stat_t * info,
35484 + struct crypto_hash * tfm)
35485 +{
35486 + info->digest = tfm;
35487 +}
35488 +
35489 +#endif /* __FS_REISER4_CRYPTCOMPRESS_H__ */
35490 +
35491 +/* Make Linus happy.
35492 + Local variables:
35493 + c-indentation-style: "K&R"
35494 + mode-name: "LC"
35495 + c-basic-offset: 8
35496 + tab-width: 8
35497 + fill-column: 120
35498 + scroll-step: 1
35499 + End:
35500 +*/
35501 diff --git a/fs/reiser4/plugin/file/file.c b/fs/reiser4/plugin/file/file.c
35502 new file mode 100644
35503 index 0000000..67501aa
35504 --- /dev/null
35505 +++ b/fs/reiser4/plugin/file/file.c
35506 @@ -0,0 +1,2820 @@
35507 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
35508 + * reiser4/README */
35509 +
35510 +/*
35511 + * this file contains implementations of inode/file/address_space/file plugin
35512 + * operations specific for "unix file plugin" (plugin id is
35513 + * UNIX_FILE_PLUGIN_ID). "Unix file" is either built of tail items only
35514 + * (FORMATTING_ID) or of extent items only (EXTENT_POINTER_ID) or empty (have
35515 + * no items but stat data)
35516 + */
35517 +
35518 +#include "../../inode.h"
35519 +#include "../../super.h"
35520 +#include "../../tree_walk.h"
35521 +#include "../../carry.h"
35522 +#include "../../page_cache.h"
35523 +#include "../../ioctl.h"
35524 +#include "../object.h"
35525 +#include "../../safe_link.h"
35526 +
35527 +#include <linux/writeback.h>
35528 +#include <linux/pagevec.h>
35529 +#include <linux/syscalls.h>
35530 +
35531 +
35532 +static int unpack(struct file *file, struct inode *inode, int forever);
35533 +static void drop_access(unix_file_info_t *);
35534 +static int hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
35535 + znode_lock_mode lock_mode);
35536 +
35537 +/* get unix file plugin specific portion of inode */
35538 +unix_file_info_t *unix_file_inode_data(const struct inode *inode)
35539 +{
35540 + return &reiser4_inode_data(inode)->file_plugin_data.unix_file_info;
35541 +}
35542 +
35543 +/**
35544 + * equal_to_rdk - compare key and znode's right delimiting key
35545 + * @node: node whose right delimiting key to compare with @key
35546 + * @key: key to compare with @node's right delimiting key
35547 + *
35548 + * Returns true if @key is equal to right delimiting key of @node.
35549 + */
35550 +int equal_to_rdk(znode *node, const reiser4_key *key)
35551 +{
35552 + int result;
35553 +
35554 + read_lock_dk(znode_get_tree(node));
35555 + result = keyeq(key, znode_get_rd_key(node));
35556 + read_unlock_dk(znode_get_tree(node));
35557 + return result;
35558 +}
35559 +
35560 +#if REISER4_DEBUG
35561 +
35562 +/**
35563 + * equal_to_ldk - compare key and znode's left delimiting key
35564 + * @node: node whose left delimiting key to compare with @key
35565 + * @key: key to compare with @node's left delimiting key
35566 + *
35567 + * Returns true if @key is equal to left delimiting key of @node.
35568 + */
35569 +int equal_to_ldk(znode *node, const reiser4_key *key)
35570 +{
35571 + int result;
35572 +
35573 + read_lock_dk(znode_get_tree(node));
35574 + result = keyeq(key, znode_get_ld_key(node));
35575 + read_unlock_dk(znode_get_tree(node));
35576 + return result;
35577 +}
35578 +
35579 +/**
35580 + * check_coord - check whether coord corresponds to key
35581 + * @coord: coord to check
35582 + * @key: key @coord has to correspond to
35583 + *
35584 + * Returns true if @coord is set as if it was set as result of lookup with @key
35585 + * in coord->node.
35586 + */
35587 +static int check_coord(const coord_t *coord, const reiser4_key *key)
35588 +{
35589 + coord_t twin;
35590 +
35591 + node_plugin_by_node(coord->node)->lookup(coord->node, key,
35592 + FIND_MAX_NOT_MORE_THAN, &twin);
35593 + return coords_equal(coord, &twin);
35594 +}
35595 +
35596 +#endif /* REISER4_DEBUG */
35597 +
35598 +/**
35599 + * init_uf_coord - initialize extended coord
35600 + * @uf_coord: extended coord to initialize
35601 + * @lh: lock handle to be attached to @uf_coord
35604 + */
35605 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh)
35606 +{
35607 + coord_init_zero(&uf_coord->coord);
35608 + coord_clear_iplug(&uf_coord->coord);
35609 + uf_coord->lh = lh;
35610 + init_lh(lh);
35611 + memset(&uf_coord->extension, 0, sizeof(uf_coord->extension));
35612 + uf_coord->valid = 0;
35613 +}
35614 +
35615 +static void validate_extended_coord(uf_coord_t *uf_coord, loff_t offset)
35616 +{
35617 + assert("vs-1333", uf_coord->valid == 0);
35618 +
35619 + if (coord_is_between_items(&uf_coord->coord))
35620 + return;
35621 +
35622 + assert("vs-1348",
35623 + item_plugin_by_coord(&uf_coord->coord)->s.file.
35624 + init_coord_extension);
35625 +
35626 + item_body_by_coord(&uf_coord->coord);
35627 + item_plugin_by_coord(&uf_coord->coord)->s.file.
35628 + init_coord_extension(uf_coord, offset);
35629 +}
35630 +
35631 +/**
35632 + * goto_right_neighbor - lock right neighbor, drop current node lock
35633 + * @coord:
35634 + * @lh:
35635 + *
35636 + * Obtain lock on right neighbor and drop lock on current node.
35637 + */
35638 +int goto_right_neighbor(coord_t *coord, lock_handle *lh)
35639 +{
35640 + int result;
35641 + lock_handle lh_right;
35642 +
35643 + assert("vs-1100", znode_is_locked(coord->node));
35644 +
35645 + init_lh(&lh_right);
35646 + result = reiser4_get_right_neighbor(&lh_right, coord->node,
35647 + znode_is_wlocked(coord->node) ?
35648 + ZNODE_WRITE_LOCK : ZNODE_READ_LOCK,
35649 + GN_CAN_USE_UPPER_LEVELS);
35650 + if (result) {
35651 + done_lh(&lh_right);
35652 + return result;
35653 + }
35654 +
35655 + /*
35656 + * we hold two longterm locks on neighboring nodes. Unlock left of
35657 + * them
35658 + */
35659 + done_lh(lh);
35660 +
35661 + coord_init_first_unit_nocheck(coord, lh_right.node);
35662 + move_lh(lh, &lh_right);
35663 +
35664 + return 0;
35665 +
35666 +}
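+
+/* This is lock coupling: the right neighbor is locked *before* the lock
+   on the current node is dropped, so no concurrent tree modification can
+   slip in between and invalidate the traversal. An assumed usage pattern
+   (helper names here are purely illustrative):
+
+	while (!done) {
+		process_items(coord);
+		if (coord_is_after_last_unit(coord))
+			result = goto_right_neighbor(coord, lh);
+	}
+*/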
35667 +
35668 +/**
35669 + * set_file_state
35670 + * @uf_info: unix file plugin specific part of inode
35671 + * @cbk_result: result of tree lookup
35672 + * @level: level of the tree at which the lookup stopped
35673 + *
35674 + * This is used by find_file_item and find_file_state to
35675 + * determine the real state of the file
35676 + */
35677 +static void set_file_state(unix_file_info_t *uf_info, int cbk_result,
35678 + tree_level level)
35679 +{
35680 + if (cbk_errored(cbk_result))
35681 + /* error happened in find_file_item */
35682 + return;
35683 +
35684 + assert("vs-1164", level == LEAF_LEVEL || level == TWIG_LEVEL);
35685 +
35686 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35687 + /*
35688 + * container is unknown, therefore conversion can not be in
35689 + * progress
35690 + */
35691 + assert("",
35692 + !reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
35693 + REISER4_PART_IN_CONV));
35694 + if (cbk_result == CBK_COORD_NOTFOUND)
35695 + uf_info->container = UF_CONTAINER_EMPTY;
35696 + else if (level == LEAF_LEVEL)
35697 + uf_info->container = UF_CONTAINER_TAILS;
35698 + else
35699 + uf_info->container = UF_CONTAINER_EXTENTS;
35700 + } else {
35701 + /*
35702 + * file state is known, check whether it is set correctly if
35703 + * file is not being tail converted
35704 + */
35705 + if (!reiser4_inode_get_flag(unix_file_info_to_inode(uf_info),
35706 + REISER4_PART_IN_CONV)) {
35707 + assert("vs-1162",
35708 + ergo(level == LEAF_LEVEL &&
35709 + cbk_result == CBK_COORD_FOUND,
35710 + uf_info->container == UF_CONTAINER_TAILS));
35711 + assert("vs-1165",
35712 + ergo(level == TWIG_LEVEL &&
35713 + cbk_result == CBK_COORD_FOUND,
35714 + uf_info->container == UF_CONTAINER_EXTENTS));
35715 + }
35716 + }
35717 +}
35718 +
35719 +int find_file_item_nohint(coord_t *coord, lock_handle *lh,
35720 + const reiser4_key *key, znode_lock_mode lock_mode,
35721 + struct inode *inode)
35722 +{
35723 + return reiser4_object_lookup(inode, key, coord, lh, lock_mode,
35724 + FIND_MAX_NOT_MORE_THAN,
35725 + TWIG_LEVEL, LEAF_LEVEL,
35726 + (lock_mode == ZNODE_READ_LOCK) ? CBK_UNIQUE :
35727 + (CBK_UNIQUE | CBK_FOR_INSERT),
35728 + NULL /* ra_info */ );
35729 +}
35730 +
35731 +/**
35732 + * find_file_item - look for file item in the tree
35733 + * @hint: provides coordinate, lock handle, seal
35734 + * @key: key for search
35735 + * @lock_mode: mode of lock to put on returned node
35736 + * @inode: inode of the file the item belongs to
35737 + *
35738 + * This finds the position in the tree corresponding to @key. It first tries
35739 + * to use @hint's seal if it is set.
35741 + */
35742 +int find_file_item(hint_t *hint, const reiser4_key *key,
35743 + znode_lock_mode lock_mode,
35744 + struct inode *inode)
35745 +{
35746 + int result;
35747 + coord_t *coord;
35748 + lock_handle *lh;
35749 +
35750 + assert("nikita-3030", reiser4_schedulable());
35751 + assert("vs-1707", hint != NULL);
35752 + assert("vs-47", inode != NULL);
35753 +
35754 + coord = &hint->ext_coord.coord;
35755 + lh = hint->ext_coord.lh;
35756 + init_lh(lh);
35757 +
35758 + result = hint_validate(hint, key, 1 /* check key */, lock_mode);
35759 + if (!result) {
35760 + if (coord->between == AFTER_UNIT &&
35761 + equal_to_rdk(coord->node, key)) {
35762 + result = goto_right_neighbor(coord, lh);
35763 + if (result == -E_NO_NEIGHBOR)
35764 + return RETERR(-EIO);
35765 + if (result)
35766 + return result;
35767 + assert("vs-1152", equal_to_ldk(coord->node, key));
35768 + /*
35769 + * we moved to different node. Invalidate coord
35770 + * extension, zload is necessary to init it again
35771 + */
35772 + hint->ext_coord.valid = 0;
35773 + }
35774 +
35775 + set_file_state(unix_file_inode_data(inode), CBK_COORD_FOUND,
35776 + znode_get_level(coord->node));
35777 +
35778 + return CBK_COORD_FOUND;
35779 + }
35780 +
35781 + coord_init_zero(coord);
35782 + result = find_file_item_nohint(coord, lh, key, lock_mode, inode);
35783 + set_file_state(unix_file_inode_data(inode), result,
35784 + znode_get_level(coord->node));
35785 +
35786 + /* FIXME: we might already have coord extension initialized */
35787 + hint->ext_coord.valid = 0;
35788 + return result;
35789 +}
35790 +
35791 +/* plugin->u.file.write_flow = NULL
35792 + plugin->u.file.read_flow = NULL */
35793 +
35794 +void hint_init_zero(hint_t * hint)
35795 +{
35796 + memset(hint, 0, sizeof(*hint));
35797 + init_lh(&hint->lh);
35798 + hint->ext_coord.lh = &hint->lh;
35799 +}
35800 +
35801 +static int find_file_state(struct inode *inode, unix_file_info_t *uf_info)
35802 +{
35803 + int result;
35804 + reiser4_key key;
35805 + coord_t coord;
35806 + lock_handle lh;
35807 +
35808 + assert("vs-1628", ea_obtained(uf_info));
35809 +
35810 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
35811 + key_by_inode_and_offset_common(inode, 0, &key);
35812 + init_lh(&lh);
35813 + result = find_file_item_nohint(&coord, &lh, &key,
35814 + ZNODE_READ_LOCK, inode);
35815 + set_file_state(uf_info, result, znode_get_level(coord.node));
35816 + done_lh(&lh);
35817 + if (!cbk_errored(result))
35818 + result = 0;
35819 + } else
35820 + result = 0;
35821 + assert("vs-1074",
35822 + ergo(result == 0, uf_info->container != UF_CONTAINER_UNKNOWN));
35823 + reiser4_txn_restart_current();
35824 + return result;
35825 +}
35826 +
35827 +/* estimate and reserve space needed to truncate a page which gets partially
35828 + truncated: one block for the page itself, a stat data update
35829 + (estimate_one_insert_into_item) and one item insertion
35830 + (estimate_one_insert_into_item), which may happen if the page corresponds
35831 + to a hole extent and an unallocated one will have to be created */
35830 +static int reserve_partial_page(reiser4_tree * tree)
35831 +{
35832 + grab_space_enable();
35833 + return reiser4_grab_reserved(reiser4_get_current_sb(),
35834 + 1 +
35835 + 2 * estimate_one_insert_into_item(tree),
35836 + BA_CAN_COMMIT);
35837 +}
35838 +
35839 +/* estimate and reserve space needed to cut one item and update one stat data */
35840 +static int reserve_cut_iteration(reiser4_tree * tree)
35841 +{
35842 + __u64 estimate = estimate_one_item_removal(tree)
35843 + + estimate_one_insert_into_item(tree);
35844 +
35845 + assert("nikita-3172", lock_stack_isclean(get_current_lock_stack()));
35846 +
35847 + grab_space_enable();
35848 + /* We need to double our estimate now that we can delete more than one
35849 + node. */
35850 + return reiser4_grab_reserved(reiser4_get_current_sb(), estimate * 2,
35851 + BA_CAN_COMMIT);
35852 +}
35853 +
35854 +int reiser4_update_file_size(struct inode *inode, reiser4_key * key,
35855 + int update_sd)
35856 +{
35857 + int result = 0;
35858 +
35859 + INODE_SET_FIELD(inode, i_size, get_key_offset(key));
35860 + if (update_sd) {
35861 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
35862 + result = reiser4_update_sd(inode);
35863 + }
35864 + return result;
35865 +}
35866 +
35867 +/* cut file items one by one starting from the last one until new file size (inode->i_size) is reached. Reserve space
35868 + and update file stat data on every single cut from the tree */
35869 +int
35870 +cut_file_items(struct inode *inode, loff_t new_size, int update_sd,
35871 + loff_t cur_size, int (*update_actor) (struct inode *,
35872 + reiser4_key *, int))
35873 +{
35874 + reiser4_key from_key, to_key;
35875 + reiser4_key smallest_removed;
35876 + file_plugin *fplug = inode_file_plugin(inode);
35877 + int result;
35878 + int progress = 0;
35879 +
35880 + assert("vs-1248",
35881 + fplug == file_plugin_by_id(UNIX_FILE_PLUGIN_ID) ||
35882 + fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
35883 +
35884 + fplug->key_by_inode(inode, new_size, &from_key);
35885 + to_key = from_key;
35886 + set_key_offset(&to_key, cur_size - 1 /*get_key_offset(reiser4_max_key()) */ );
35887 + /* this loop normally runs just once */
35888 + while (1) {
35889 + result = reserve_cut_iteration(reiser4_tree_by_inode(inode));
35890 + if (result)
35891 + break;
35892 +
35893 + result = reiser4_cut_tree_object(current_tree, &from_key, &to_key,
35894 + &smallest_removed, inode, 1,
35895 + &progress);
35896 + if (result == -E_REPEAT) {
35897 + /* -E_REPEAT is a signal to interrupt a long file truncation process */
35898 + if (progress) {
35899 + result =
35900 + update_actor(inode, &smallest_removed,
35901 + update_sd);
35902 + if (result)
35903 + break;
35904 + }
35905 +
35906 + /* the below does up(sbinfo->delete_mutex). Do not get fooled */
35907 + reiser4_release_reserved(inode->i_sb);
35908 +
35909 + /* reiser4_cut_tree_object() was interrupted probably because
35910 + * current atom requires commit, we have to release
35911 + * transaction handle to allow atom commit. */
35912 + reiser4_txn_restart_current();
35913 + continue;
35914 + }
35915 + if (result
35916 + && !(result == CBK_COORD_NOTFOUND && new_size == 0
35917 + && inode->i_size == 0))
35918 + break;
35919 +
35920 + set_key_offset(&smallest_removed, new_size);
35921 + /* Final sd update after the file gets its correct size */
35922 + result = update_actor(inode, &smallest_removed, update_sd);
35923 + break;
35924 + }
35925 +
35926 + /* the below does up(sbinfo->delete_mutex). Do not get fooled */
35927 + reiser4_release_reserved(inode->i_sb);
35928 +
35929 + return result;
35930 +}
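+
+/* The -E_REPEAT handling above bounds transaction size. Truncating a very
+   large file proceeds as (an illustrative trace of the assumed behaviour):
+
+	reserve -> cut a batch of items -> atom wants to commit -> -E_REPEAT
+	-> update sd to smallest_removed -> restart transaction -> reserve ...
+
+   until the cut reaches new_size, when the final stat data update records
+   the correct file size. */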
35931 +
35932 +int find_or_create_extent(struct page *page);
35933 +
35934 +/* part of truncate_file_body: it is called when truncate is used to make file
35935 + shorter */
35936 +static int shorten_file(struct inode *inode, loff_t new_size)
35937 +{
35938 + int result;
35939 + struct page *page;
35940 + int padd_from;
35941 + unsigned long index;
35942 + char *kaddr;
35943 + unix_file_info_t *uf_info;
35944 +
35945 + /*
35946 + * all items of an ordinary reiser4 file are grouped together. That is
35947 + * why we can use reiser4_cut_tree. Plan B files (for instance) cannot
35948 + * be truncated that simply
35949 + */
35950 + result = cut_file_items(inode, new_size, 1 /*update_sd */ ,
35951 + get_key_offset(reiser4_max_key()),
35952 + reiser4_update_file_size);
35953 + if (result)
35954 + return result;
35955 +
35956 + uf_info = unix_file_inode_data(inode);
35957 + assert("vs-1105", new_size == inode->i_size);
35958 + if (new_size == 0) {
35959 + uf_info->container = UF_CONTAINER_EMPTY;
35960 + return 0;
35961 + }
35962 +
35963 + result = find_file_state(inode, uf_info);
35964 + if (result)
35965 + return result;
35966 + if (uf_info->container == UF_CONTAINER_TAILS)
35967 + /*
35968 + * No need to worry about zeroing last page after new file
35969 + * end
35970 + */
35971 + return 0;
35972 +
35973 + padd_from = inode->i_size & (PAGE_CACHE_SIZE - 1);
35974 + if (!padd_from)
35975 + /* file is truncated to page boundary */
35976 + return 0;
35977 +
35978 + result = reserve_partial_page(reiser4_tree_by_inode(inode));
35979 + if (result) {
35980 + reiser4_release_reserved(inode->i_sb);
35981 + return result;
35982 + }
35983 +
35984 + /* last page is partially truncated - zero its content */
35985 + index = (inode->i_size >> PAGE_CACHE_SHIFT);
35986 + page = read_mapping_page(inode->i_mapping, index, NULL);
35987 + if (IS_ERR(page)) {
35988 + /*
35989 + * the below does up(sbinfo->delete_mutex). Do not get
35990 + * confused
35991 + */
35992 + reiser4_release_reserved(inode->i_sb);
35993 + if (likely(PTR_ERR(page) == -EINVAL)) {
35994 + /* looks like file is built of tail items */
35995 + return 0;
35996 + }
35997 + return PTR_ERR(page);
35998 + }
35999 + wait_on_page_locked(page);
36000 + if (!PageUptodate(page)) {
36001 + page_cache_release(page);
36002 + /*
36003 + * the below does up(sbinfo->delete_mutex). Do not get
36004 + * confused
36005 + */
36006 + reiser4_release_reserved(inode->i_sb);
36007 + return RETERR(-EIO);
36008 + }
36009 +
36010 + /*
36011 + * if the page corresponds to a hole extent unit - an unallocated one
36012 + * will be created here. This is not necessary
36013 + */
36014 + result = find_or_create_extent(page);
36015 +
36016 + /*
36017 + * FIXME: cut_file_items has already updated inode. Probably it would
36018 + * be better to update it here when file is really truncated
36019 + */
36020 + if (result) {
36021 + page_cache_release(page);
36022 + /*
36023 + * the below does up(sbinfo->delete_mutex). Do not get
36024 + * confused
36025 + */
36026 + reiser4_release_reserved(inode->i_sb);
36027 + return result;
36028 + }
36029 +
36030 + lock_page(page);
36031 + assert("vs-1066", PageLocked(page));
36032 + kaddr = kmap_atomic(page, KM_USER0);
36033 + memset(kaddr + padd_from, 0, PAGE_CACHE_SIZE - padd_from);
36034 + flush_dcache_page(page);
36035 + kunmap_atomic(kaddr, KM_USER0);
36036 + unlock_page(page);
36037 + page_cache_release(page);
36038 + /* the below does up(sbinfo->delete_mutex). Do not get confused */
36039 + reiser4_release_reserved(inode->i_sb);
36040 + return 0;
36041 +}
36042 +
36043 +/**
36044 + * should_have_notail
36045 + * @uf_info:
36046 + * @new_size:
36047 + *
36048 + * Calls formatting plugin to see whether file of size @new_size has to be
36049 + * stored in unformatted nodes or in tail items. 0 is returned for the latter case.
36050 + */
36051 +static int should_have_notail(const unix_file_info_t *uf_info, loff_t new_size)
36052 +{
36053 + if (!uf_info->tplug)
36054 + return 1;
36055 + return !uf_info->tplug->have_tail(unix_file_info_to_inode(uf_info),
36056 + new_size);
36057 +
36058 +}
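+
+/* For example (the actual threshold is formatting-plugin policy, nothing
+   is hardcoded here): a formatting plugin may keep small files as tail
+   items and store larger ones as extents, so should_have_notail()
+   returning 1 on an expanding truncate is what triggers the tail2extent()
+   conversion in truncate_file_body() below. */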
36059 +
36060 +/**
36061 + * truncate_file_body - change length of file
36062 + * @inode: inode of file
36063 + * @new_size: new file length
36064 + *
36065 + * Adjusts items file @inode is built of to match @new_size. It may either cut
36066 + * items or add them to represent a hole at the end of file. The caller has to
36067 + * obtain exclusive access to the file.
36068 + */
36069 +static int truncate_file_body(struct inode *inode, loff_t new_size)
36070 +{
36071 + int result;
36072 +
36073 + if (inode->i_size < new_size) {
36074 + /* expanding truncate */
36075 + struct dentry dentry;
36076 + struct file file;
36077 + unix_file_info_t *uf_info;
36078 +
36079 + dentry.d_inode = inode;
36080 + file.f_dentry = &dentry;
36081 + file.private_data = NULL;
36082 + file.f_pos = new_size;
36084 + uf_info = unix_file_inode_data(inode);
36085 + result = find_file_state(inode, uf_info);
36086 + if (result)
36087 + return result;
36088 +
36089 + if (should_have_notail(uf_info, new_size)) {
36090 + /*
36091 + * file of size @new_size has to be built of
36092 + * extents. If it is built of tails - convert to
36093 + * extents
36094 + */
36095 + if (uf_info->container == UF_CONTAINER_TAILS) {
36096 +				/*
36097 +				 * if the file is being converted by another
36098 +				 * process - wait until it completes
36099 +				 */
36100 + while (1) {
36101 + if (reiser4_inode_get_flag(inode,
36102 + REISER4_PART_IN_CONV)) {
36103 + drop_exclusive_access(uf_info);
36104 + schedule();
36105 + get_exclusive_access(uf_info);
36106 + continue;
36107 + }
36108 + break;
36109 + }
36110 +
36111 + if (uf_info->container == UF_CONTAINER_TAILS) {
36112 + result = tail2extent(uf_info);
36113 + if (result)
36114 + return result;
36115 + }
36116 + }
36117 + result = reiser4_write_extent(&file, NULL, 0,
36118 + &new_size);
36119 + if (result)
36120 + return result;
36121 + uf_info->container = UF_CONTAINER_EXTENTS;
36122 + } else {
36123 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
36124 + result = reiser4_write_extent(&file, NULL, 0,
36125 + &new_size);
36126 + if (result)
36127 + return result;
36128 + } else {
36129 + result = reiser4_write_tail(&file, NULL, 0,
36130 + &new_size);
36131 + if (result)
36132 + return result;
36133 + uf_info->container = UF_CONTAINER_TAILS;
36134 + }
36135 + }
36136 + BUG_ON(result > 0);
36137 + INODE_SET_FIELD(inode, i_size, new_size);
36138 + file_update_time(&file);
36139 + result = reiser4_update_sd(inode);
36140 + BUG_ON(result != 0);
36141 + reiser4_free_file_fsdata(&file);
36142 + } else
36143 + result = shorten_file(inode, new_size);
36144 + return result;
36145 +}
36146 +
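+/*
+ * Editorial sketch: both branches of truncate_file_body() can be driven
+ * from user space with ftruncate(2); the file name below is illustrative:
+ *
+ *	#include <fcntl.h>
+ *	#include <unistd.h>
+ *
+ *	int main(void)
+ *	{
+ *		int fd = open("/tmp/t", O_CREAT | O_RDWR, 0644);
+ *
+ *		if (fd < 0)
+ *			return 1;
+ *		// expanding truncate: items for a hole at EOF are added
+ *		if (ftruncate(fd, 1 << 20))
+ *			return 1;
+ *		// shortening truncate: items are cut and the now-last,
+ *		// partially truncated page is zero-padded (shorten_file)
+ *		if (ftruncate(fd, 4096 + 100))
+ *			return 1;
+ *		return close(fd);
+ *	}
+ */
+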
36147 +/* plugin->u.write_sd_by_inode = write_sd_by_inode_common */
36148 +
36149 +/**
36150 + * load_file_hint - copy hint from struct file to local variable
36151 + * @file: file to get hint from
36152 + * @hint: structure to fill
36153 + *
36154 + * The reiser4 specific portion of struct file may contain information (a hint)
36155 + * stored on exit from a previous read or write. That information includes a
36156 + * seal of a znode and the coord within that znode where the previous read or
36157 + * write stopped. This function copies that information to @hint if it was
36158 + * stored, or initializes @hint with zeros otherwise.
36159 + */
36160 +int load_file_hint(struct file *file, hint_t *hint)
36161 +{
36162 + reiser4_file_fsdata *fsdata;
36163 +
36164 + if (file) {
36165 + fsdata = reiser4_get_file_fsdata(file);
36166 + if (IS_ERR(fsdata))
36167 + return PTR_ERR(fsdata);
36168 +
36169 + spin_lock_inode(file->f_dentry->d_inode);
36170 + if (reiser4_seal_is_set(&fsdata->reg.hint.seal)) {
36171 + *hint = fsdata->reg.hint;
36172 + init_lh(&hint->lh);
36173 + hint->ext_coord.lh = &hint->lh;
36174 + spin_unlock_inode(file->f_dentry->d_inode);
36175 + /*
36176 + * force re-validation of the coord on the first
36177 + * iteration of the read/write loop.
36178 + */
36179 + hint->ext_coord.valid = 0;
36180 + assert("nikita-19892", coords_equal(&hint->seal.coord1,
36181 + &hint->ext_coord.
36182 + coord));
36183 + return 0;
36184 + }
36185 + memset(&fsdata->reg.hint, 0, sizeof(hint_t));
36186 + spin_unlock_inode(file->f_dentry->d_inode);
36187 + }
36188 + hint_init_zero(hint);
36189 + return 0;
36190 +}
36191 +
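+/*
+ * Editorial sketch: load_file_hint()/save_file_hint() implement a
+ * cache-validate-reuse pattern. A user-space analogue, assuming a
+ * generation counter stands in for the znode seal (all names below are
+ * hypothetical):
+ *
+ *	struct cache { long value; unsigned gen; };
+ *
+ *	extern unsigned current_gen(void);	// bumped on every tree change
+ *	extern long full_lookup(long key);	// expensive full search
+ *
+ *	static long cached_lookup(struct cache *c, long key)
+ *	{
+ *		if (c->gen != current_gen()) {	// "seal" broken: revalidate
+ *			c->value = full_lookup(key);
+ *			c->gen = current_gen();
+ *		}
+ *		return c->value;		// fast path while seal holds
+ *	}
+ */
+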
36192 +/**
36193 + * save_file_hint - copy hint to reiser4 private struct file's part
36194 + * @file: file to save hint in
36195 + * @hint: hint to save
36196 + *
36197 + * This copies @hint to reiser4 private part of struct file. It can help
36198 + * speedup future accesses to the file.
36199 + */
36200 +void save_file_hint(struct file *file, const hint_t *hint)
36201 +{
36202 + reiser4_file_fsdata *fsdata;
36203 +
36204 + assert("edward-1337", hint != NULL);
36205 +
36206 + if (!file || !reiser4_seal_is_set(&hint->seal))
36207 + return;
36208 + fsdata = reiser4_get_file_fsdata(file);
36209 + assert("vs-965", !IS_ERR(fsdata));
36210 + assert("nikita-19891",
36211 + coords_equal(&hint->seal.coord1, &hint->ext_coord.coord));
36212 + assert("vs-30", hint->lh.owner == NULL);
36213 + spin_lock_inode(file->f_dentry->d_inode);
36214 + fsdata->reg.hint = *hint;
36215 + spin_unlock_inode(file->f_dentry->d_inode);
36216 + return;
36217 +}
36218 +
36219 +void reiser4_unset_hint(hint_t * hint)
36220 +{
36221 + assert("vs-1315", hint);
36222 + hint->ext_coord.valid = 0;
36223 + reiser4_seal_done(&hint->seal);
36224 + done_lh(&hint->lh);
36225 +}
36226 +
36227 +/* the coord must already be set properly, so that reiser4_set_hint
36228 +   has nothing left to do */
36229 +void reiser4_set_hint(hint_t * hint, const reiser4_key * key,
36230 + znode_lock_mode mode)
36231 +{
36232 + ON_DEBUG(coord_t * coord = &hint->ext_coord.coord);
36233 + assert("vs-1207", WITH_DATA(coord->node, check_coord(coord, key)));
36234 +
36235 + reiser4_seal_init(&hint->seal, &hint->ext_coord.coord, key);
36236 + hint->offset = get_key_offset(key);
36237 + hint->mode = mode;
36238 + done_lh(&hint->lh);
36239 +}
36240 +
36241 +int hint_is_set(const hint_t * hint)
36242 +{
36243 + return reiser4_seal_is_set(&hint->seal);
36244 +}
36245 +
36246 +#if REISER4_DEBUG
36247 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
36248 +{
36249 + return (get_key_locality(k1) == get_key_locality(k2) &&
36250 + get_key_type(k1) == get_key_type(k2) &&
36251 + get_key_band(k1) == get_key_band(k2) &&
36252 + get_key_ordering(k1) == get_key_ordering(k2) &&
36253 + get_key_objectid(k1) == get_key_objectid(k2));
36254 +}
36255 +#endif
36256 +
36257 +static int
36258 +hint_validate(hint_t * hint, const reiser4_key * key, int check_key,
36259 + znode_lock_mode lock_mode)
36260 +{
36261 + if (!hint || !hint_is_set(hint) || hint->mode != lock_mode)
36262 + /* hint either not set or set by different operation */
36263 + return RETERR(-E_REPEAT);
36264 +
36265 + assert("vs-1277", all_but_offset_key_eq(key, &hint->seal.key));
36266 +
36267 + if (check_key && get_key_offset(key) != hint->offset)
36268 + /* hint is set for different key */
36269 + return RETERR(-E_REPEAT);
36270 +
36271 + assert("vs-31", hint->ext_coord.lh == &hint->lh);
36272 + return reiser4_seal_validate(&hint->seal, &hint->ext_coord.coord, key,
36273 + hint->ext_coord.lh, lock_mode,
36274 + ZNODE_LOCK_LOPRI);
36275 +}
36276 +
36277 +/**
36278 + * find_or_create_extent - make sure an extent exists for a page
36279 + * @page: page to create the extent for
36280 + *
36281 + * Looks for a place at the twig level for the extent corresponding to @page,
36282 + * calls the extent's writepage method to create an unallocated extent if it
36283 + * does not exist yet, initializes the jnode and captures the page.
36284 + */
36285 +int find_or_create_extent(struct page *page)
36286 +{
36287 + int result;
36288 + struct inode *inode;
36289 + int plugged_hole;
36290 +
36291 + jnode *node;
36292 +
36293 + assert("vs-1065", page->mapping && page->mapping->host);
36294 + inode = page->mapping->host;
36295 +
36296 + lock_page(page);
36297 + node = jnode_of_page(page);
36298 + if (IS_ERR(node)) {
36299 + unlock_page(page);
36300 + return PTR_ERR(node);
36301 + }
36302 + JF_SET(node, JNODE_WRITE_PREPARED);
36303 + unlock_page(page);
36304 +
36305 + if (node->blocknr == 0) {
36306 + plugged_hole = 0;
36307 + result = reiser4_update_extent(inode, node, page_offset(page),
36308 + &plugged_hole);
36309 + if (result) {
36310 + JF_CLR(node, JNODE_WRITE_PREPARED);
36311 + jput(node);
36312 + warning("", "reiser4_update_extent failed: %d", result);
36313 + return result;
36314 + }
36315 + if (plugged_hole)
36316 + reiser4_update_sd(inode);
36317 + } else {
36318 + spin_lock_jnode(node);
36319 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
36320 + BUG_ON(result != 0);
36321 + jnode_make_dirty_locked(node);
36322 + spin_unlock_jnode(node);
36323 + }
36324 +
36325 + BUG_ON(node->atom == NULL);
36326 + JF_CLR(node, JNODE_WRITE_PREPARED);
36327 + jput(node);
36328 +
36329 + if (get_current_context()->entd) {
36330 + entd_context *ent = get_entd_context(node->tree->super);
36331 +
36332 + if (ent->cur_request->page == page)
36333 + ent->cur_request->node = node;
36334 + }
36335 + return 0;
36336 +}
36337 +
36338 +/**
36339 + * has_anonymous_pages - check whether inode has pages dirtied via mmap
36340 + * @inode: inode to check
36341 + *
36342 + * Returns true if inode's mapping has dirty pages which do not belong to any
36343 + * atom. Those are either tagged PAGECACHE_TAG_REISER4_MOVED in mapping's page
36344 + * tree or were eflushed and can be found via jnodes tagged
36345 + * EFLUSH_TAG_ANONYMOUS in radix tree of jnodes.
36346 + */
36347 +static int has_anonymous_pages(struct inode *inode)
36348 +{
36349 + int result;
36350 +
36351 + read_lock_irq(&inode->i_mapping->tree_lock);
36352 + result = radix_tree_tagged(&inode->i_mapping->page_tree, PAGECACHE_TAG_REISER4_MOVED);
36353 + read_unlock_irq(&inode->i_mapping->tree_lock);
36354 + return result;
36355 +}
36356 +
36357 +/**
36358 + * capture_page_and_create_extent -
36359 + * @page: page to be captured
36360 + *
36361 + * Grabs space for extent creation and stat data update and calls function to
36362 + * do actual work.
36363 + */
36364 +static int capture_page_and_create_extent(struct page *page)
36365 +{
36366 + int result;
36367 + struct inode *inode;
36368 +
36369 + assert("vs-1084", page->mapping && page->mapping->host);
36370 + inode = page->mapping->host;
36371 + assert("vs-1139",
36372 + unix_file_inode_data(inode)->container == UF_CONTAINER_EXTENTS);
36373 + /* page belongs to file */
36374 + assert("vs-1393",
36375 + inode->i_size > page_offset(page));
36376 +
36377 +	/* page capture may require extent creation (if it does not exist yet)
36378 +	   and a stat data update (the number of blocks changes on extent
36379 +	   creation) */
36380 + grab_space_enable();
36381 + result = reiser4_grab_space(2 * estimate_one_insert_into_item
36382 + (reiser4_tree_by_inode(inode)),
36383 + BA_CAN_COMMIT);
36384 + if (likely(!result))
36385 + result = find_or_create_extent(page);
36386 +
36387 + if (result != 0)
36388 + SetPageError(page);
36389 + return result;
36390 +}
36391 +
36392 +/* this is implementation of method commit_write of struct
36393 + address_space_operations for unix file plugin */
36394 +int
36395 +commit_write_unix_file(struct file *file, struct page *page,
36396 + unsigned from, unsigned to)
36397 +{
36398 + reiser4_context *ctx;
36399 + struct inode *inode;
36400 + int result;
36401 +
36402 + assert("umka-3101", file != NULL);
36403 + assert("umka-3102", page != NULL);
36404 + assert("umka-3093", PageLocked(page));
36405 +
36406 + SetPageUptodate(page);
36407 +
36408 + inode = page->mapping->host;
36409 + ctx = reiser4_init_context(page->mapping->host->i_sb);
36410 + if (IS_ERR(ctx))
36411 + return PTR_ERR(ctx);
36412 + page_cache_get(page);
36413 + unlock_page(page);
36414 + result = capture_page_and_create_extent(page);
36415 + lock_page(page);
36416 + page_cache_release(page);
36417 +
36418 + /* don't commit transaction under inode semaphore */
36419 + context_set_commit_async(ctx);
36420 + reiser4_exit_context(ctx);
36421 + return result;
36422 +}
36423 +
36424 +/*
36425 + * Support for "anonymous" pages and jnodes.
36426 + *
36427 + * When a file is write-accessed through mmap, pages can be dirtied from
36428 + * user level. The kernel is not notified until one of the following happens:
36429 + *
36430 + * (1) msync()
36431 + *
36432 + * (2) truncate() (either explicit or through unlink)
36433 + *
36434 + * (3) VM scanner starts reclaiming mapped pages, dirtying them before
36435 + * starting write-back.
36436 + *
36437 + * As a result of (3), ->writepage may be called on a dirty page without a
36438 + * jnode. Such a page is called "anonymous" in reiser4. Certain workloads
36439 + * (iozone) generate a huge number of anonymous pages. Emergency flush handles
36440 + * this by creating a jnode for the anonymous page, starting IO on the page,
36441 + * and marking the jnode with the JNODE_KEEPME bit so that it is not thrown
36442 + * out of memory. Such a jnode is also called anonymous.
36443 + *
36444 + * reiser4_sync_sb() method tries to insert anonymous pages and jnodes into
36445 + * tree. This is done by capture_anonymous_*() functions below.
36446 + */
36447 +
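+/*
+ * Editorial sketch: case (1) above can be reproduced from user space;
+ * msync() is what finally tells the kernel about pages dirtied through
+ * the mapping. File name and sizes below are illustrative:
+ *
+ *	#include <fcntl.h>
+ *	#include <string.h>
+ *	#include <sys/mman.h>
+ *	#include <unistd.h>
+ *
+ *	int main(void)
+ *	{
+ *		int fd = open("/tmp/f", O_CREAT | O_RDWR, 0644);
+ *		char *p;
+ *
+ *		if (fd < 0 || ftruncate(fd, 4096))
+ *			return 1;
+ *		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
+ *			 fd, 0);
+ *		if (p == MAP_FAILED)
+ *			return 1;
+ *		memset(p, 'x', 4096);	 // dirties the page; kernel not told
+ *		msync(p, 4096, MS_SYNC); // page gets captured and written
+ *		munmap(p, 4096);
+ *		return close(fd);
+ *	}
+ */
+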
36448 +/**
36449 + * capture_anonymous_page - involve page into transaction
36450 + * @page: page to deal with
36451 + *
36452 + * Takes care that @page has corresponding metadata in the tree, creates jnode
36453 + * for @page and captures it. On success 1 is returned.
36454 + */
36455 +static int capture_anonymous_page(struct page *page)
36456 +{
36457 + int result;
36458 +
36459 + if (PageWriteback(page))
36460 + /* FIXME: do nothing? */
36461 + return 0;
36462 +
36463 + result = capture_page_and_create_extent(page);
36464 + if (result == 0) {
36465 + result = 1;
36466 + } else
36467 + warning("nikita-3329",
36468 + "Cannot capture anon page: %i", result);
36469 +
36470 + return result;
36471 +}
36472 +
36473 +/**
36474 + * capture_anonymous_pages - find and capture pages dirtied via mmap
36475 + * @mapping: address space where to look for pages
36476 + * @index: start index
36477 + * @to_capture: maximum number of pages to capture
36478 + *
36479 + * Looks for pages tagged REISER4_MOVED starting from the *@index-th page,
36480 + * captures them (involves them into an atom), returns the number of captured
36481 + * pages, and updates @index to the next page after the last captured one.
36482 + */
36483 +static int
36484 +capture_anonymous_pages(struct address_space *mapping, pgoff_t *index,
36485 + unsigned int to_capture)
36486 +{
36487 + int result;
36488 + struct pagevec pvec;
36489 + unsigned int i, count;
36490 + int nr;
36491 +
36492 + pagevec_init(&pvec, 0);
36493 + count = min(pagevec_space(&pvec), to_capture);
36494 + nr = 0;
36495 +
36496 + /* find pages tagged MOVED */
36497 + write_lock_irq(&mapping->tree_lock);
36498 + pvec.nr = radix_tree_gang_lookup_tag(&mapping->page_tree,
36499 + (void **)pvec.pages, *index, count,
36500 + PAGECACHE_TAG_REISER4_MOVED);
36501 + if (pagevec_count(&pvec) == 0) {
36502 + /*
36503 + * there are no pages tagged MOVED in mapping->page_tree
36504 + * starting from *index
36505 + */
36506 + write_unlock_irq(&mapping->tree_lock);
36507 + *index = (pgoff_t)-1;
36508 + return 0;
36509 + }
36510 +
36511 + /* clear MOVED tag for all found pages */
36512 + for (i = 0; i < pagevec_count(&pvec); i++) {
36513 + void *p;
36514 +
36515 + page_cache_get(pvec.pages[i]);
36516 + p = radix_tree_tag_clear(&mapping->page_tree, pvec.pages[i]->index,
36517 + PAGECACHE_TAG_REISER4_MOVED);
36518 + assert("vs-49", p == pvec.pages[i]);
36519 + }
36520 + write_unlock_irq(&mapping->tree_lock);
36521 +
36523 + *index = pvec.pages[i - 1]->index + 1;
36524 +
36525 + for (i = 0; i < pagevec_count(&pvec); i++) {
36526 + /*
36527 + * tag PAGECACHE_TAG_REISER4_MOVED will be cleared by
36528 + * reiser4_set_page_dirty_internal which is called when jnode is
36529 + * captured
36530 + */
36531 + result = capture_anonymous_page(pvec.pages[i]);
36532 + if (result == 1)
36533 + nr++;
36534 + else {
36535 + if (result < 0) {
36536 + warning("vs-1454",
36537 + "failed to capture page: "
36538 + "result=%d, captured=%d)\n",
36539 + result, i);
36540 +
36541 +				/*
36542 +				 * set the MOVED tag back on all pages left
36543 +				 * uncaptured
36544 +				 */
36545 + write_lock_irq(&mapping->tree_lock);
36546 + for (; i < pagevec_count(&pvec); i ++) {
36547 + radix_tree_tag_set(&mapping->page_tree,
36548 + pvec.pages[i]->index,
36549 + PAGECACHE_TAG_REISER4_MOVED);
36550 + }
36551 + write_unlock_irq(&mapping->tree_lock);
36552 +
36553 + pagevec_release(&pvec);
36554 + return result;
36555 + } else {
36556 +				/*
36557 +				 * result == 0. capture_anonymous_page returns
36558 +				 * 0 for a page under writeback. Set the MOVED
36559 +				 * tag back on that page
36560 +				 */
36561 + write_lock_irq(&mapping->tree_lock);
36562 + radix_tree_tag_set(&mapping->page_tree,
36563 + pvec.pages[i]->index,
36564 + PAGECACHE_TAG_REISER4_MOVED);
36565 + write_unlock_irq(&mapping->tree_lock);
36566 + if (i == 0)
36567 + *index = pvec.pages[0]->index;
36568 + else
36569 + *index = pvec.pages[i - 1]->index + 1;
36570 + }
36571 + }
36572 + }
36573 + pagevec_release(&pvec);
36574 + return nr;
36575 +}
36576 +
36577 +/**
36578 + * capture_anonymous_jnodes - find and capture anonymous jnodes
36579 + * @mapping: address space where to look for jnodes
36580 + * @from: start index
36581 + * @to: end index
36582 + * @to_capture: maximum number of jnodes to capture
36583 + *
36584 + * Looks for jnodes tagged EFLUSH_TAG_ANONYMOUS in inode's tree of jnodes in
36585 + * the range of indexes @from-@to and captures them, returns number of captured
36586 + * jnodes, updates @from to next jnode after the last captured one.
36587 + */
36588 +static int
36589 +capture_anonymous_jnodes(struct address_space *mapping,
36590 + pgoff_t *from, pgoff_t to, int to_capture)
36591 +{
36592 + *from = to;
36593 + return 0;
36594 +}
36595 +
36596 +/*
36597 + * Commit atom of the jnode of a page.
36598 + */
36599 +static int sync_page(struct page *page)
36600 +{
36601 + int result;
36602 + do {
36603 + jnode *node;
36604 + txn_atom *atom;
36605 +
36606 + lock_page(page);
36607 + node = jprivate(page);
36608 + if (node != NULL) {
36609 + spin_lock_jnode(node);
36610 + atom = jnode_get_atom(node);
36611 + spin_unlock_jnode(node);
36612 + } else
36613 + atom = NULL;
36614 + unlock_page(page);
36615 + result = reiser4_sync_atom(atom);
36616 + } while (result == -E_REPEAT);
36617 + /*
36618 + * ZAM-FIXME-HANS: document the logic of this loop, is it just to
36619 + * handle the case where more pages get added to the atom while we are
36620 + * syncing it?
36621 + */
36622 + assert("nikita-3485", ergo(result == 0,
36623 + get_current_context()->trans->atom == NULL));
36624 + return result;
36625 +}
36626 +
36627 +/*
36628 + * Commit the atoms of all pages in the inode's mapping:
36629 + * call sync_page for each page found in the mapping's page tree.
36630 + */
36631 +static int sync_page_list(struct inode *inode)
36632 +{
36633 + int result;
36634 + struct address_space *mapping;
36635 + unsigned long from; /* start index for radix_tree_gang_lookup */
36636 + unsigned int found; /* return value for radix_tree_gang_lookup */
36637 +
36638 + mapping = inode->i_mapping;
36639 + from = 0;
36640 + result = 0;
36641 + read_lock_irq(&mapping->tree_lock);
36642 + while (result == 0) {
36643 + struct page *page;
36644 +
36645 + found =
36646 + radix_tree_gang_lookup(&mapping->page_tree, (void **)&page,
36647 + from, 1);
36648 + assert("", found < 2);
36649 + if (found == 0)
36650 + break;
36651 +
36652 +		/* the page cannot leave the radix tree because it is protected
36653 +		   from truncation by inode->i_mutex, taken by sys_fsync */
36654 + page_cache_get(page);
36655 + read_unlock_irq(&mapping->tree_lock);
36656 +
36657 + from = page->index + 1;
36658 +
36659 + result = sync_page(page);
36660 +
36661 + page_cache_release(page);
36662 + read_lock_irq(&mapping->tree_lock);
36663 + }
36664 +
36665 + read_unlock_irq(&mapping->tree_lock);
36666 + return result;
36667 +}
36668 +
36669 +static int commit_file_atoms(struct inode *inode)
36670 +{
36671 + int result;
36672 + unix_file_info_t *uf_info;
36673 +
36674 + uf_info = unix_file_inode_data(inode);
36675 +
36676 + get_exclusive_access(uf_info);
36677 + /*
36678 + * find what items file is made from
36679 + */
36680 + result = find_file_state(inode, uf_info);
36681 + drop_exclusive_access(uf_info);
36682 + if (result != 0)
36683 + return result;
36684 +
36685 + /*
36686 + * file state cannot change because we are under ->i_mutex
36687 + */
36688 + switch (uf_info->container) {
36689 + case UF_CONTAINER_EXTENTS:
36690 +		/* find_file_state might open or join an atom */
36691 + reiser4_txn_restart_current();
36692 + result =
36693 + /*
36694 + * when we are called by
36695 + * filemap_fdatawrite->
36696 + * do_writepages()->
36697 + * reiser4_writepages()
36698 + *
36699 +		     * inode->i_mapping->dirty_pages are spliced into
36700 + * ->io_pages, leaving ->dirty_pages dirty.
36701 + *
36702 + * When we are called from
36703 + * reiser4_fsync()->sync_unix_file(), we have to
36704 + * commit atoms of all pages on the ->dirty_list.
36705 + *
36706 + * So for simplicity we just commit ->io_pages and
36707 + * ->dirty_pages.
36708 + */
36709 + sync_page_list(inode);
36710 + break;
36711 + case UF_CONTAINER_TAILS:
36712 + /*
36713 + * NOTE-NIKITA probably we can be smarter for tails. For now
36714 + * just commit all existing atoms.
36715 + */
36716 + result = txnmgr_force_commit_all(inode->i_sb, 0);
36717 + break;
36718 + case UF_CONTAINER_EMPTY:
36719 + result = 0;
36720 + break;
36721 + case UF_CONTAINER_UNKNOWN:
36722 + default:
36723 + result = -EIO;
36724 + break;
36725 + }
36726 +
36727 + /*
36728 + * commit current transaction: there can be captured nodes from
36729 + * find_file_state() and finish_conversion().
36730 + */
36731 + reiser4_txn_restart_current();
36732 + return result;
36733 +}
36734 +
36735 +/**
36736 + * writepages_unix_file - writepages of struct address_space_operations
36737 + * @mapping: address space to write pages of
36738 + * @wbc: writeback control
36739 + *
36740 + * This captures anonymous pages and anonymous jnodes. Anonymous pages are
36741 + * pages which are dirtied via mmap. Anonymous jnodes are ones which were
36742 + * created by reiser4_writepage.
36743 + */
36744 +int writepages_unix_file(struct address_space *mapping,
36745 + struct writeback_control *wbc)
36746 +{
36747 + int result;
36748 + unix_file_info_t *uf_info;
36749 + pgoff_t pindex, jindex, nr_pages;
36750 + long to_capture;
36751 + struct inode *inode;
36752 +
36753 + inode = mapping->host;
36754 + if (!has_anonymous_pages(inode)) {
36755 + result = 0;
36756 + goto end;
36757 + }
36758 + jindex = pindex = wbc->range_start >> PAGE_CACHE_SHIFT;
36759 + result = 0;
36760 + nr_pages =
36761 + (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
36762 + uf_info = unix_file_inode_data(inode);
36763 +
36764 + do {
36765 + reiser4_context *ctx;
36766 +
36767 + if (wbc->sync_mode != WB_SYNC_ALL)
36768 + to_capture = min(wbc->nr_to_write, CAPTURE_APAGE_BURST);
36769 + else
36770 + to_capture = CAPTURE_APAGE_BURST;
36771 +
36772 + ctx = reiser4_init_context(inode->i_sb);
36773 + if (IS_ERR(ctx)) {
36774 + result = PTR_ERR(ctx);
36775 + break;
36776 + }
36777 + /* avoid recursive calls to ->sync_inodes */
36778 + ctx->nobalance = 1;
36779 + assert("zam-760", lock_stack_isclean(get_current_lock_stack()));
36780 + assert("", LOCK_CNT_NIL(inode_sem_w));
36781 + assert("", LOCK_CNT_NIL(inode_sem_r));
36782 +
36783 + reiser4_txn_restart_current();
36784 +
36785 + /* we have to get nonexclusive access to the file */
36786 + if (get_current_context()->entd) {
36787 + /*
36788 + * use nonblocking version of nonexclusive_access to
36789 + * avoid deadlock which might look like the following:
36790 + * process P1 holds NEA on file F1 and called entd to
36791 + * reclaim some memory. Entd works for P1 and is going
36792 + * to capture pages of file F2. To do that entd has to
36793 + * get NEA to F2. F2 is held by process P2 which also
36794 + * called entd. But entd is serving P1 at the moment
36795 +			 * and P2 has to wait. Process P3 is trying to get EA
36796 +			 * on file F2. The pending EA request to file F2 makes
36797 +			 * it impossible for entd to get NEA on file F2.
36798 +			 * None of these processes can continue. Using the
36799 +			 * nonblocking version of getting NEA is supposed to
36800 +			 * avoid this deadlock.
36801 + */
36802 + if (try_to_get_nonexclusive_access(uf_info) == 0) {
36803 + result = RETERR(-EBUSY);
36804 + reiser4_exit_context(ctx);
36805 + break;
36806 + }
36807 + } else
36808 + get_nonexclusive_access(uf_info);
36809 +
36810 + while (to_capture > 0) {
36811 + pgoff_t start;
36812 +
36813 + assert("vs-1727", jindex <= pindex);
36814 + if (pindex == jindex) {
36815 + start = pindex;
36816 + result =
36817 + capture_anonymous_pages(inode->i_mapping,
36818 + &pindex,
36819 + to_capture);
36820 + if (result <= 0)
36821 + break;
36822 + to_capture -= result;
36823 + wbc->nr_to_write -= result;
36824 + if (start + result == pindex) {
36825 + jindex = pindex;
36826 + continue;
36827 + }
36828 + if (to_capture <= 0)
36829 + break;
36830 + }
36831 + /* deal with anonymous jnodes between jindex and pindex */
36832 + result =
36833 + capture_anonymous_jnodes(inode->i_mapping, &jindex,
36834 + pindex, to_capture);
36835 + if (result < 0)
36836 + break;
36837 + to_capture -= result;
36838 + get_current_context()->nr_captured += result;
36839 +
36840 + if (jindex == (pgoff_t) - 1) {
36841 + assert("vs-1728", pindex == (pgoff_t) - 1);
36842 + break;
36843 + }
36844 + }
36845 + if (to_capture <= 0)
36846 +			/* there may be more pages left */
36847 + __mark_inode_dirty(inode, I_DIRTY_PAGES);
36848 +
36849 + drop_nonexclusive_access(uf_info);
36850 + if (result < 0) {
36851 + /* error happened */
36852 + reiser4_exit_context(ctx);
36853 + return result;
36854 + }
36855 + if (wbc->sync_mode != WB_SYNC_ALL) {
36856 + reiser4_exit_context(ctx);
36857 + return 0;
36858 + }
36859 + result = commit_file_atoms(inode);
36860 + reiser4_exit_context(ctx);
36861 + if (pindex >= nr_pages && jindex == pindex)
36862 + break;
36863 + } while (1);
36864 +
36865 + end:
36866 + if (is_in_reiser4_context()) {
36867 + if (get_current_context()->nr_captured >= CAPTURE_APAGE_BURST) {
36868 +			/*
36869 +			 * there are already pages to flush; flush them out, do
36870 +			 * not delay until the end of reiser4_sync_inodes
36871 +			 */
36872 + reiser4_writeout(inode->i_sb, wbc);
36873 + get_current_context()->nr_captured = 0;
36874 + }
36875 + }
36876 + return result;
36877 +}
36878 +
36879 +/*
36880 + * ->sync() method for unix file.
36881 + *
36882 + * We are trying to be smart here. Instead of committing all atoms (original
36883 + * solution), we scan dirty pages of this file and commit all atoms they are
36884 + * part of.
36885 + *
36886 + * The situation is complicated by anonymous pages: i.e., extent-less pages
36887 + * dirtied through mmap. Fortunately sys_fsync() first calls
36888 + * filemap_fdatawrite() that will ultimately call reiser4_writepages(), insert
36889 + * all missing extents and capture anonymous pages.
36890 + */
36891 +int sync_unix_file(struct file *file, struct dentry *dentry, int datasync)
36892 +{
36893 + reiser4_context *ctx;
36894 + txn_atom *atom;
36895 + reiser4_block_nr reserve;
36896 +
36897 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
36898 + if (IS_ERR(ctx))
36899 + return PTR_ERR(ctx);
36900 +
36901 + reserve = estimate_update_common(dentry->d_inode);
36902 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
36903 + reiser4_exit_context(ctx);
36904 + return RETERR(-ENOSPC);
36905 + }
36906 + write_sd_by_inode_common(dentry->d_inode);
36907 +
36908 + atom = get_current_atom_locked();
36909 + spin_lock_txnh(ctx->trans);
36910 + force_commit_atom(ctx->trans);
36911 + reiser4_exit_context(ctx);
36912 + return 0;
36913 +}
36914 +
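+/*
+ * Editorial sketch: the path above is what a plain fsync(2) exercises:
+ * filemap_fdatawrite() first captures anonymous pages via ->writepages(),
+ * then this method commits the relevant atoms. File name is illustrative:
+ *
+ *	#include <fcntl.h>
+ *	#include <unistd.h>
+ *
+ *	int main(void)
+ *	{
+ *		int fd = open("/tmp/f", O_CREAT | O_WRONLY, 0644);
+ *
+ *		if (fd < 0)
+ *			return 1;
+ *		if (write(fd, "data", 4) != 4)
+ *			return 1;
+ *		if (fsync(fd))	// ends up in ->fsync, i.e. sync_unix_file
+ *			return 1;
+ *		return close(fd);
+ *	}
+ */
+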
36915 +/**
36916 + * readpage_unix_file - readpage of struct address_space_operations
36917 + * @file: file the page belongs to
36918 + * @page: page to read
36919 + *
36920 + * Compose a key and search for item containing information about @page
36921 + * data. If item is found - its readpage method is called.
36922 + */
36923 +int readpage_unix_file(struct file *file, struct page *page)
36924 +{
36925 + reiser4_context *ctx;
36926 + int result;
36927 + struct inode *inode;
36928 + reiser4_key key;
36929 + item_plugin *iplug;
36930 + hint_t *hint;
36931 + lock_handle *lh;
36932 + coord_t *coord;
36933 +
36934 + assert("vs-1062", PageLocked(page));
36935 + assert("vs-976", !PageUptodate(page));
36936 + assert("vs-1061", page->mapping && page->mapping->host);
36937 +
36938 + if (page->mapping->host->i_size <= page_offset(page)) {
36939 + /* page is out of file already */
36940 + unlock_page(page);
36941 + return -EINVAL;
36942 + }
36943 +
36944 + inode = page->mapping->host;
36945 + ctx = reiser4_init_context(inode->i_sb);
36946 + if (IS_ERR(ctx)) {
36947 + unlock_page(page);
36948 + return PTR_ERR(ctx);
36949 + }
36950 +
36951 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
36952 + if (hint == NULL) {
36953 + unlock_page(page);
36954 + reiser4_exit_context(ctx);
36955 + return RETERR(-ENOMEM);
36956 + }
36957 +
36958 + result = load_file_hint(file, hint);
36959 + if (result) {
36960 + kfree(hint);
36961 + unlock_page(page);
36962 + reiser4_exit_context(ctx);
36963 + return result;
36964 + }
36965 + lh = &hint->lh;
36966 +
36967 + /* get key of first byte of the page */
36968 + key_by_inode_and_offset_common(inode, page_offset(page), &key);
36969 +
36970 + /* look for file metadata corresponding to first byte of page */
36971 + page_cache_get(page);
36972 + unlock_page(page);
36973 + result = find_file_item(hint, &key, ZNODE_READ_LOCK, inode);
36974 + lock_page(page);
36975 + page_cache_release(page);
36976 +
36977 + if (page->mapping == NULL) {
36978 + /*
36979 + * readpage allows truncate to run concurrently. Page was
36980 + * truncated while it was not locked
36981 + */
36982 + done_lh(lh);
36983 + kfree(hint);
36984 + unlock_page(page);
36985 + reiser4_txn_restart(ctx);
36986 + reiser4_exit_context(ctx);
36987 + return -EINVAL;
36988 + }
36989 +
36990 + if (result != CBK_COORD_FOUND || hint->ext_coord.coord.between != AT_UNIT) {
36991 + if (result == CBK_COORD_FOUND &&
36992 + hint->ext_coord.coord.between != AT_UNIT)
36993 + /* file is truncated */
36994 + result = -EINVAL;
36995 + done_lh(lh);
36996 + kfree(hint);
36997 + unlock_page(page);
36998 + reiser4_txn_restart(ctx);
36999 + reiser4_exit_context(ctx);
37000 + return result;
37001 + }
37002 +
37003 + /*
37004 + * item corresponding to page is found. It can not be removed because
37005 + * znode lock is held
37006 + */
37007 + if (PageUptodate(page)) {
37008 + done_lh(lh);
37009 + kfree(hint);
37010 + unlock_page(page);
37011 + reiser4_txn_restart(ctx);
37012 + reiser4_exit_context(ctx);
37013 + return 0;
37014 + }
37015 +
37016 + coord = &hint->ext_coord.coord;
37017 + result = zload(coord->node);
37018 + if (result) {
37019 + done_lh(lh);
37020 + kfree(hint);
37021 + unlock_page(page);
37022 + reiser4_txn_restart(ctx);
37023 + reiser4_exit_context(ctx);
37024 + return result;
37025 + }
37026 +
37027 + validate_extended_coord(&hint->ext_coord, page_offset(page));
37028 +
37029 + if (!coord_is_existing_unit(coord)) {
37030 + /* this indicates corruption */
37031 + warning("vs-280",
37032 + "Looking for page %lu of file %llu (size %lli). "
37033 + "No file items found (%d). File is corrupted?\n",
37034 + page->index, (unsigned long long)get_inode_oid(inode),
37035 + inode->i_size, result);
37036 + zrelse(coord->node);
37037 + done_lh(lh);
37038 + kfree(hint);
37039 + unlock_page(page);
37040 + reiser4_txn_restart(ctx);
37041 + reiser4_exit_context(ctx);
37042 + return RETERR(-EIO);
37043 + }
37044 +
37045 +	/*
37046 +	 * get the plugin of the found item, or use the extent plugin if
37047 +	 * there is none
37048 +	 */
37049 + iplug = item_plugin_by_coord(coord);
37050 + if (iplug->s.file.readpage)
37051 + result = iplug->s.file.readpage(coord, page);
37052 + else
37053 + result = RETERR(-EINVAL);
37054 +
37055 + if (!result) {
37056 + set_key_offset(&key,
37057 + (loff_t) (page->index + 1) << PAGE_CACHE_SHIFT);
37058 + /* FIXME should call reiser4_set_hint() */
37059 + reiser4_unset_hint(hint);
37060 + } else {
37061 + unlock_page(page);
37062 + reiser4_unset_hint(hint);
37063 + }
37064 + assert("vs-979",
37065 + ergo(result == 0, (PageLocked(page) || PageUptodate(page))));
37066 + assert("vs-9791", ergo(result != 0, !PageLocked(page)));
37067 +
37068 + zrelse(coord->node);
37069 + done_lh(lh);
37070 +
37071 + save_file_hint(file, hint);
37072 + kfree(hint);
37073 +
37074 +	/*
37075 +	 * FIXME: explain why it is needed. HINT: page allocation in write
37076 +	 * cannot be done when the atom is not NULL, because reiser4_writepage
37077 +	 * cannot kick entd and has to eflush
37078 +	 */
37079 + reiser4_txn_restart(ctx);
37080 + reiser4_exit_context(ctx);
37081 + return result;
37082 +}
37083 +
37084 +struct uf_readpages_context {
37085 + lock_handle lh;
37086 + coord_t coord;
37087 +};
37088 +
37089 +/* A callback function for readpages_unix_file/read_cache_pages.
37090 + * If the file is built of tails, then an error is returned.
37091 + *
37092 + * @data -- a pointer to a struct uf_readpages_context object,
37093 + *            to save the twig lock and the coord between
37094 + *            read_cache_page iterations.
37095 + * @page -- page to start read.
37096 + */
37097 +static int uf_readpages_filler(void * data, struct page * page)
37098 +{
37099 + struct uf_readpages_context *rc = data;
37100 + jnode * node;
37101 + int ret = 0;
37102 + reiser4_extent *ext;
37103 + __u64 ext_index;
37104 + int cbk_done = 0;
37105 + struct address_space * mapping = page->mapping;
37106 +
37107 + if (PageUptodate(page)) {
37108 + unlock_page(page);
37109 + return 0;
37110 + }
37111 + if (rc->lh.node == 0) {
37112 + /* no twig lock - have to do tree search. */
37113 + reiser4_key key;
37114 + repeat:
37115 + unlock_page(page);
37116 + key_by_inode_and_offset_common(
37117 + mapping->host, page_offset(page), &key);
37118 + ret = coord_by_key(
37119 + &get_super_private(mapping->host->i_sb)->tree,
37120 + &key, &rc->coord, &rc->lh,
37121 + ZNODE_READ_LOCK, FIND_EXACT,
37122 + TWIG_LEVEL, TWIG_LEVEL, CBK_UNIQUE, NULL);
37123 + if (ret)
37124 + return ret;
37125 + lock_page(page);
37126 + cbk_done = 1;
37127 + }
37128 + ret = zload(rc->coord.node);
37129 + if (ret) {
37130 + unlock_page(page);
37131 + return ret;
37132 + }
37133 + if (!coord_is_existing_item(&rc->coord) ||
37134 + !item_is_extent(&rc->coord)) {
37135 + zrelse(rc->coord.node);
37136 + unlock_page(page);
37137 + return RETERR(-EIO);
37138 + }
37139 + ext = extent_by_coord(&rc->coord);
37140 + ext_index = extent_unit_index(&rc->coord);
37141 + if (page->index < ext_index ||
37142 + page->index >= ext_index + extent_get_width(ext)) {
37143 + /* the page index doesn't belong to the extent unit
37144 + which the coord points to - release the lock and
37145 + repeat with tree search. */
37146 + zrelse(rc->coord.node);
37147 + done_lh(&rc->lh);
37148 +		/* we can be here after a CBK call only in case of tree
37149 +		   corruption or a bug in the tree lookup algorithm. */
37150 + if (unlikely(cbk_done)) {
37151 + unlock_page(page);
37152 + return RETERR(-EIO);
37153 + }
37154 + goto repeat;
37155 + }
37156 + node = jnode_of_page(page);
37157 + if (unlikely(IS_ERR(node))) {
37158 + zrelse(rc->coord.node);
37159 + unlock_page(page);
37160 + return PTR_ERR(node);
37161 + }
37162 + ret = reiser4_do_readpage_extent(ext, page->index - ext_index, page);
37163 + jput(node);
37164 + zrelse(rc->coord.node);
37165 + if (ret)
37166 + unlock_page(page);
37167 + return ret;
37168 +}
37169 +
37170 +/**
37171 + * readpages_unix_file - called by the readahead code, starts reading for each
37172 + * page of a given list of pages
37173 + */
37174 +int readpages_unix_file(
37175 + struct file *file, struct address_space *mapping,
37176 + struct list_head *pages, unsigned nr_pages)
37177 +{
37178 + reiser4_context *ctx;
37179 + struct uf_readpages_context rc;
37180 + int ret;
37181 +
37182 + ctx = reiser4_init_context(mapping->host->i_sb);
37183 + if (IS_ERR(ctx)) {
37184 + put_pages_list(pages);
37185 + return PTR_ERR(ctx);
37186 + }
37187 + init_lh(&rc.lh);
37188 + ret = read_cache_pages(mapping, pages, uf_readpages_filler, &rc);
37189 + done_lh(&rc.lh);
37190 + context_set_commit_async(ctx);
37191 + /* close the transaction to protect further page allocation from deadlocks */
37192 + reiser4_txn_restart(ctx);
37193 + reiser4_exit_context(ctx);
37194 + return ret;
37195 +}
37196 +
37197 +static reiser4_block_nr unix_file_estimate_read(struct inode *inode,
37198 + loff_t count UNUSED_ARG)
37199 +{
37200 +	/* We should reserve one block, because the stat data item is
37201 +	   updated */
37202 + assert("vs-1249",
37203 + inode_file_plugin(inode)->estimate.update ==
37204 + estimate_update_common);
37205 + return estimate_update_common(inode);
37206 +}
37207 +
37208 +/* this is called with nonexclusive access obtained, the file's container cannot change */
37209 +static ssize_t read_file(hint_t *hint, struct file *file, /* file to read from */
37210 + char __user *buf, /* address of user-space buffer */
37211 + size_t count, /* number of bytes to read */
37212 + loff_t *off)
37213 +{
37214 + int result;
37215 + struct inode *inode;
37216 + flow_t flow;
37217 + int (*read_f) (struct file *, flow_t *, hint_t *);
37218 + coord_t *coord;
37219 + znode *loaded;
37220 +
37221 + inode = file->f_dentry->d_inode;
37222 +
37223 + /* build flow */
37224 + assert("vs-1250",
37225 + inode_file_plugin(inode)->flow_by_inode ==
37226 + flow_by_inode_unix_file);
37227 + result =
37228 + flow_by_inode_unix_file(inode, buf, 1 /* user space */ , count,
37229 + *off, READ_OP, &flow);
37230 + if (unlikely(result))
37231 + return result;
37232 +
37233 + /* get seal and coord sealed with it from reiser4 private data
37234 + of struct file. The coord will tell us where our last read
37235 + of this file finished, and the seal will help to determine
37236 + if that location is still valid.
37237 + */
37238 + coord = &hint->ext_coord.coord;
37239 + while (flow.length && result == 0) {
37240 + result =
37241 + find_file_item(hint, &flow.key, ZNODE_READ_LOCK, inode);
37242 + if (cbk_errored(result))
37243 + /* error happened */
37244 + break;
37245 +
37246 + if (coord->between != AT_UNIT) {
37247 + /* there were no items corresponding to given offset */
37248 + done_lh(hint->ext_coord.lh);
37249 + break;
37250 + }
37251 +
37252 + loaded = coord->node;
37253 + result = zload(loaded);
37254 + if (unlikely(result)) {
37255 + done_lh(hint->ext_coord.lh);
37256 + break;
37257 + }
37258 +
37259 + if (hint->ext_coord.valid == 0)
37260 + validate_extended_coord(&hint->ext_coord,
37261 + get_key_offset(&flow.key));
37262 +
37263 + assert("vs-4", hint->ext_coord.valid == 1);
37264 + assert("vs-33", hint->ext_coord.lh == &hint->lh);
37265 + /* call item's read method */
37266 + read_f = item_plugin_by_coord(coord)->s.file.read;
37267 + result = read_f(file, &flow, hint);
37268 + zrelse(loaded);
37269 + done_lh(hint->ext_coord.lh);
37270 + }
37271 +
37272 + return (count - flow.length) ? (count - flow.length) : result;
37273 +}
37274 +
37275 +static ssize_t read_unix_file_container_tails(struct file*, char __user*, size_t, loff_t*);
37276 +
37277 +/**
37278 + * read_unix_file - read of struct file_operations
37279 + * @file: file to read from
37280 + * @buf: address of user-space buffer
37281 + * @read_amount: number of bytes to read
37282 + * @off: position in file to read from
37283 + *
37284 + * This is implementation of vfs's read method of struct file_operations for
37285 + * unix file plugin.
37286 + */
37287 +ssize_t read_unix_file(struct file *file, char __user *buf, size_t read_amount,
37288 + loff_t *off)
37289 +{
37290 + reiser4_context *ctx;
37291 + ssize_t result;
37292 + struct inode *inode;
37293 + unix_file_info_t *uf_info;
37294 +
37295 + if (unlikely(read_amount == 0))
37296 + return 0;
37297 +
37298 + assert("umka-072", file != NULL);
37299 + assert("umka-074", off != NULL);
37300 + inode = file->f_dentry->d_inode;
37301 + assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37302 +
37303 + ctx = reiser4_init_context(inode->i_sb);
37304 + if (IS_ERR(ctx))
37305 + return PTR_ERR(ctx);
37306 + uf_info = unix_file_inode_data(inode);
37307 + if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37308 + get_exclusive_access(uf_info);
37309 + result = find_file_state(inode, uf_info);
37310 + if (unlikely(result != 0))
37311 + goto out;
37312 + } else
37313 + get_nonexclusive_access(uf_info);
37314 + result = reiser4_grab_space_force(unix_file_estimate_read(inode, read_amount),
37315 + BA_CAN_COMMIT);
37316 + if (unlikely(result != 0))
37317 + goto out;
37318 + if (uf_info->container == UF_CONTAINER_EXTENTS){
37319 + result = do_sync_read(file, buf, read_amount, off);
37320 + } else if (uf_info->container == UF_CONTAINER_TAILS ||
37321 + reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV) ||
37322 + reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37323 + result = read_unix_file_container_tails(file, buf, read_amount, off);
37324 + } else {
37325 + assert("zam-1085", uf_info->container == UF_CONTAINER_EMPTY);
37326 + result = 0;
37327 + }
37328 +out:
37329 + drop_access(uf_info);
37330 + context_set_commit_async(ctx);
37331 + reiser4_exit_context(ctx);
37332 + return result;
37333 +}
37334 +
37335 +static ssize_t read_unix_file_container_tails(
37336 + struct file *file, char __user *buf, size_t read_amount, loff_t *off)
37337 +{
37338 + int result;
37339 + struct inode *inode;
37340 + hint_t *hint;
37341 + unix_file_info_t *uf_info;
37342 +	size_t count, left; ssize_t read; /* signed: read_file may return an error */
37343 + loff_t size;
37344 +
37345 + assert("umka-072", file != NULL);
37346 + assert("umka-074", off != NULL);
37347 + inode = file->f_dentry->d_inode;
37348 + assert("vs-972", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37349 +
37350 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
37351 + if (hint == NULL)
37352 + return RETERR(-ENOMEM);
37353 +
37354 + result = load_file_hint(file, hint);
37355 + if (result) {
37356 + kfree(hint);
37357 + return result;
37358 + }
37359 +
37360 + left = read_amount;
37361 + count = 0;
37362 + uf_info = unix_file_inode_data(inode);
37363 + while (left > 0) {
37364 + reiser4_txn_restart_current();
37365 + size = i_size_read(inode);
37366 + if (*off >= size)
37367 + /* position to read from is past the end of file */
37368 + break;
37369 + if (*off + left > size)
37370 + left = size - *off;
37371 +		/* fault in user page */
37372 + result = fault_in_pages_writeable(buf, left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left);
37373 + if (result)
37374 + return RETERR(-EFAULT);
37375 +
37376 + read = read_file(hint, file, buf,
37377 + left > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : left,
37378 + off);
37379 + if (read < 0) {
37380 + result = read;
37381 + break;
37382 + }
37383 + left -= read;
37384 + buf += read;
37385 +
37386 + /* update position in a file */
37387 + *off += read;
37388 + /* total number of read bytes */
37389 + count += read;
37390 + }
37391 + done_lh(&hint->lh);
37392 + save_file_hint(file, hint);
37393 + kfree(hint);
37394 + if (count)
37395 + file_accessed(file);
37396 + /* return number of read bytes or error code if nothing is read */
37397 + return count ? count : result;
37398 +}
37399 +
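+/*
+ * Editorial sketch: because the loop above reads at most one page per
+ * iteration, callers see ordinary short-read semantics. The usual
+ * user-space pattern for consuming such reads:
+ *
+ *	#include <unistd.h>
+ *
+ *	// read exactly @len bytes unless EOF or an error is hit;
+ *	// returns the number of bytes read, or -1 on error
+ *	static ssize_t read_full(int fd, char *buf, size_t len)
+ *	{
+ *		size_t done = 0;
+ *
+ *		while (done < len) {
+ *			ssize_t n = read(fd, buf + done, len - done);
+ *
+ *			if (n < 0)
+ *				return -1;	// error
+ *			if (n == 0)
+ *				break;		// EOF
+ *			done += n;
+ *		}
+ *		return done;
+ *	}
+ */
+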
37400 +/* This function takes care of @file's pages. First of all it checks whether
37401 +   the filesystem is read-only and if so gets out. Otherwise, it throws out
37402 +   all pages of the file if it was mapped for read, is going to be mapped for
37403 +   write, and consists of tails. This is done in order not to manage two
37404 +   copies of the data (one in the page cache and one in the tails themselves)
37405 +   when mapping files that consist of tails.
37406 +
37407 +   Tail2extent conversion is also performed here if it is allowed and the
37408 +   file is going to be written or mapped for write. This function may be
37409 +   called from write_unix_file() or mmap_unix_file(). */
37410 +static int check_pages_unix_file(struct file *file, struct inode *inode)
37411 +{
37412 + reiser4_invalidate_pages(inode->i_mapping, 0,
37413 + (inode->i_size + PAGE_CACHE_SIZE -
37414 + 1) >> PAGE_CACHE_SHIFT, 0);
37415 + return unpack(file, inode, 0 /* not forever */ );
37416 +}
37417 +
37418 +/**
37419 + * mmap_unix_file - mmap of struct file_operations
37420 + * @file: file to mmap
37421 + * @vma:
37422 + *
37423 + * This is the implementation of vfs's mmap method of struct file_operations
37424 + * for the unix file plugin. It converts the file to extents if necessary and
37425 + * sets the REISER4_HAS_MMAP flag on the reiser4 inode.
37426 + */
37427 +int mmap_unix_file(struct file *file, struct vm_area_struct *vma)
37428 +{
37429 + reiser4_context *ctx;
37430 + int result;
37431 + struct inode *inode;
37432 + unix_file_info_t *uf_info;
37433 + reiser4_block_nr needed;
37434 +
37435 + inode = file->f_dentry->d_inode;
37436 + ctx = reiser4_init_context(inode->i_sb);
37437 + if (IS_ERR(ctx))
37438 + return PTR_ERR(ctx);
37439 +
37440 + uf_info = unix_file_inode_data(inode);
37441 +
37442 + get_exclusive_access(uf_info);
37443 +
37444 + if (!IS_RDONLY(inode) && (vma->vm_flags & (VM_MAYWRITE | VM_SHARED))) {
37445 + /*
37446 + * we need file built of extent items. If it is still built of
37447 + * tail items we have to convert it. Find what items the file
37448 + * is built of
37449 + */
37450 + result = find_file_state(inode, uf_info);
37451 + if (result != 0) {
37452 + drop_exclusive_access(uf_info);
37453 + reiser4_exit_context(ctx);
37454 + return result;
37455 + }
37456 +
37457 + assert("vs-1648", (uf_info->container == UF_CONTAINER_TAILS ||
37458 + uf_info->container == UF_CONTAINER_EXTENTS ||
37459 + uf_info->container == UF_CONTAINER_EMPTY));
37460 + if (uf_info->container == UF_CONTAINER_TAILS) {
37461 + /*
37462 + * invalidate all pages and convert file from tails to
37463 + * extents
37464 + */
37465 + result = check_pages_unix_file(file, inode);
37466 + if (result) {
37467 + drop_exclusive_access(uf_info);
37468 + reiser4_exit_context(ctx);
37469 + return result;
37470 + }
37471 + }
37472 + }
37473 +
37474 + /*
37475 + * generic_file_mmap will do update_atime. Grab space for stat data
37476 + * update.
37477 + */
37478 + needed = inode_file_plugin(inode)->estimate.update(inode);
37479 + result = reiser4_grab_space_force(needed, BA_CAN_COMMIT);
37480 + if (result) {
37481 + drop_exclusive_access(uf_info);
37482 + reiser4_exit_context(ctx);
37483 + return result;
37484 + }
37485 +
37486 + result = generic_file_mmap(file, vma);
37487 + if (result == 0) {
37488 + /* mark file as having mapping. */
37489 + reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
37490 + }
37491 +
37492 + drop_exclusive_access(uf_info);
37493 + reiser4_exit_context(ctx);
37494 + return result;
37495 +}
37496 +
37497 +/**
37498 + * find_first_item
37499 + * @inode: inode of the file to look at
37500 + *
37501 + * Finds file item which is responsible for first byte in the file.
37502 + */
37503 +static int find_first_item(struct inode *inode)
37504 +{
37505 + coord_t coord;
37506 + lock_handle lh;
37507 + reiser4_key key;
37508 + int result;
37509 +
37510 + coord_init_zero(&coord);
37511 + init_lh(&lh);
37512 + inode_file_plugin(inode)->key_by_inode(inode, 0, &key);
37513 + result = find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK,
37514 + inode);
37515 + if (result == CBK_COORD_FOUND) {
37516 + if (coord.between == AT_UNIT) {
37517 + result = zload(coord.node);
37518 + if (result == 0) {
37519 + result = item_id_by_coord(&coord);
37520 + zrelse(coord.node);
37521 + if (result != EXTENT_POINTER_ID &&
37522 + result != FORMATTING_ID)
37523 + result = RETERR(-EIO);
37524 + }
37525 + } else
37526 + result = RETERR(-EIO);
37527 + }
37528 + done_lh(&lh);
37529 + return result;
37530 +}
37531 +
37532 +/**
37533 + * open_unix_file
37534 + * @inode: inode of file being opened
37535 + * @file: file to open
37536 + *
37537 + * If the filesystem is not read-only, complete an interrupted tail
37538 + * conversion if there was one
37539 + */
37540 +int open_unix_file(struct inode *inode, struct file *file)
37541 +{
37542 + int result;
37543 + reiser4_context *ctx;
37544 + unix_file_info_t *uf_info;
37545 +
37546 + if (IS_RDONLY(inode))
37547 + return 0;
37548 +
37549 + if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED))
37550 + return 0;
37551 +
37552 + ctx = reiser4_init_context(inode->i_sb);
37553 + if (IS_ERR(ctx))
37554 + return PTR_ERR(ctx);
37555 +
37556 + uf_info = unix_file_inode_data(inode);
37557 + get_exclusive_access(uf_info);
37558 +
37559 + /*
37560 + * it may happen that another process is doing tail conversion. Wait
37561 + * until it completes
37562 + */
37563 + while (1) {
37564 + if (reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV)) {
37565 + drop_exclusive_access(uf_info);
37566 + schedule();
37567 + get_exclusive_access(uf_info);
37568 + continue;
37569 + }
37570 + break;
37571 + }
37572 +
37573 + if (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
37574 + /*
37575 + * other process completed the conversion
37576 + */
37577 + drop_exclusive_access(uf_info);
37578 + reiser4_exit_context(ctx);
37579 + return 0;
37580 + }
37581 +
37582 +	/*
37583 +	 * the file was left in a semi-converted state after an unclean
37584 +	 * shutdown, or another thread doing the conversion dropped exclusive
37585 +	 * access while balancing dirty pages. Complete the conversion
37586 +	 */
37587 + result = find_first_item(inode);
37588 + if (result == EXTENT_POINTER_ID)
37589 + /*
37590 + * first item is extent, therefore there was incomplete
37591 + * tail2extent conversion. Complete it
37592 + */
37593 + result = tail2extent(unix_file_inode_data(inode));
37594 + else if (result == FORMATTING_ID)
37595 + /*
37596 + * first item is formatting item, therefore there was
37597 + * incomplete extent2tail conversion. Complete it
37598 + */
37599 + result = extent2tail(unix_file_inode_data(inode));
37600 + else
37601 + result = -EIO;
37602 +
37603 + assert("vs-1712",
37604 + ergo(result == 0,
37605 + (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED) &&
37606 + !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV))));
37607 + drop_exclusive_access(uf_info);
37608 + reiser4_exit_context(ctx);
37609 + return result;
37610 +}
37611 +
37612 +#define NEITHER_OBTAINED 0
37613 +#define EA_OBTAINED 1
37614 +#define NEA_OBTAINED 2
37615 +
37616 +static void drop_access(unix_file_info_t *uf_info)
37617 +{
37618 + if (uf_info->exclusive_use)
37619 + drop_exclusive_access(uf_info);
37620 + else
37621 + drop_nonexclusive_access(uf_info);
37622 +}
37623 +
37624 +#define debug_wuf(format, ...) printk("%s: %d: %s: " format "\n", \
37625 + __FILE__, __LINE__, __FUNCTION__, ## __VA_ARGS__)
37626 +
37627 +/**
37628 + * write_unix_file - write of struct file_operations
37629 + * @file: file to write to
37630 + * @buf: address of user-space buffer
37631 + * @count: number of bytes to write
37632 + * @pos: position in file to write to
37633 + *
37634 + * This is implementation of vfs's write method of struct file_operations for
37635 + * unix file plugin.
37636 + */
37637 +ssize_t write_unix_file(struct file *file, const char __user *buf,
37638 + size_t count, loff_t *pos)
37639 +{
37640 + int result;
37641 + reiser4_context *ctx;
37642 + struct inode *inode;
37643 + unix_file_info_t *uf_info;
37644 + ssize_t written;
37645 + int try_free_space;
37646 + int to_write = PAGE_CACHE_SIZE * WRITE_GRANULARITY;
37647 + size_t left;
37648 + ssize_t (*write_op)(struct file *, const char __user *, size_t,
37649 + loff_t *pos);
37650 + int ea;
37651 + loff_t new_size;
37652 +
37653 + inode = file->f_dentry->d_inode;
37654 + ctx = reiser4_init_context(inode->i_sb);
37655 + if (IS_ERR(ctx))
37656 + return PTR_ERR(ctx);
37657 +
37658 + mutex_lock(&inode->i_mutex);
37659 +
37660 + assert("vs-947", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
37661 + assert("vs-9471", (!reiser4_inode_get_flag(inode, REISER4_PART_MIXED)));
37662 +
37663 + /* check amount of bytes to write and writing position */
37664 + result = generic_write_checks(file, pos, &count, 0);
37665 + if (result) {
37666 + mutex_unlock(&inode->i_mutex);
37667 + context_set_commit_async(ctx);
37668 + reiser4_exit_context(ctx);
37669 + return result;
37670 + }
37671 +
37672 + result = remove_suid(file->f_dentry);
37673 + if (result) {
37674 + mutex_unlock(&inode->i_mutex);
37675 + context_set_commit_async(ctx);
37676 + reiser4_exit_context(ctx);
37677 + return result;
37678 + }
37679 + /* remove_suid might create a transaction */
37680 + reiser4_txn_restart(ctx);
37681 +
37682 + uf_info = unix_file_inode_data(inode);
37683 +
37684 + current->backing_dev_info = inode->i_mapping->backing_dev_info;
37685 + written = 0;
37686 + try_free_space = 0;
37687 + left = count;
37688 + ea = NEITHER_OBTAINED;
37689 +
37690 + new_size = i_size_read(inode);
37691 + if (*pos + count > new_size)
37692 + new_size = *pos + count;
37693 +
37694 + while (left) {
37695 + if (left < to_write)
37696 + to_write = left;
37697 +
37698 + if (uf_info->container == UF_CONTAINER_EMPTY) {
37699 + get_exclusive_access(uf_info);
37700 + ea = EA_OBTAINED;
37701 + if (uf_info->container != UF_CONTAINER_EMPTY) {
37702 + /* file is made not empty by another process */
37703 + drop_exclusive_access(uf_info);
37704 + ea = NEITHER_OBTAINED;
37705 + continue;
37706 + }
37707 + } else if (uf_info->container == UF_CONTAINER_UNKNOWN) {
37708 +			/*
37709 +			 * get exclusive access directly just to not have to
37710 +			 * re-obtain it if the file turns out to be empty
37711 +			 */
37712 + get_exclusive_access(uf_info);
37713 + ea = EA_OBTAINED;
37714 + result = find_file_state(inode, uf_info);
37715 + if (result) {
37716 + drop_exclusive_access(uf_info);
37717 + ea = NEITHER_OBTAINED;
37718 + break;
37719 + }
37720 + } else {
37721 + get_nonexclusive_access(uf_info);
37722 + ea = NEA_OBTAINED;
37723 + }
37724 +
37725 + /* either EA or NEA is obtained. Choose item write method */
37726 + if (uf_info->container == UF_CONTAINER_EXTENTS) {
37727 + /* file is built of extent items */
37728 + write_op = reiser4_write_extent;
37729 + } else if (uf_info->container == UF_CONTAINER_EMPTY) {
37730 + /* file is empty */
37731 + if (should_have_notail(uf_info, new_size))
37732 + write_op = reiser4_write_extent;
37733 + else
37734 + write_op = reiser4_write_tail;
37735 + } else {
37736 + /* file is built of tail items */
37737 + if (should_have_notail(uf_info, new_size)) {
37738 + if (ea == NEA_OBTAINED) {
37739 + drop_nonexclusive_access(uf_info);
37740 + get_exclusive_access(uf_info);
37741 + ea = EA_OBTAINED;
37742 + }
37743 + if (uf_info->container == UF_CONTAINER_TAILS) {
37744 +				/*
37745 +				 * if the file is being converted by another
37746 +				 * process - wait until it completes
37747 +				 */
37748 + while (1) {
37749 + if (reiser4_inode_get_flag(inode,
37750 + REISER4_PART_IN_CONV)) {
37751 + drop_exclusive_access(uf_info);
37752 + schedule();
37753 + get_exclusive_access(uf_info);
37754 + continue;
37755 + }
37756 + break;
37757 + }
37758 + if (uf_info->container == UF_CONTAINER_TAILS) {
37759 + result = tail2extent(uf_info);
37760 + if (result)
37761 + break;
37762 + }
37763 + }
37764 + drop_exclusive_access(uf_info);
37765 + ea = NEITHER_OBTAINED;
37766 + continue;
37767 + }
37768 + write_op = reiser4_write_tail;
37769 + }
37770 +
37771 + written = write_op(file, buf, to_write, pos);
37772 + if (written == -ENOSPC && try_free_space) {
37773 + drop_access(uf_info);
37774 + txnmgr_force_commit_all(inode->i_sb, 0);
37775 + try_free_space = 0;
37776 + continue;
37777 + }
37778 + if (written < 0) {
37779 + drop_access(uf_info);
37780 + result = written;
37781 + break;
37782 + }
37783 + /* something is written. */
37784 + if (uf_info->container == UF_CONTAINER_EMPTY) {
37785 + assert("", ea == EA_OBTAINED);
37786 + uf_info->container =
37787 + (write_op == reiser4_write_extent) ?
37788 + UF_CONTAINER_EXTENTS : UF_CONTAINER_TAILS;
37789 + } else {
37790 + assert("", ergo(uf_info->container == UF_CONTAINER_EXTENTS,
37791 + write_op == reiser4_write_extent));
37792 + assert("", ergo(uf_info->container == UF_CONTAINER_TAILS,
37793 + write_op == reiser4_write_tail));
37794 + }
37795 + if (*pos + written > inode->i_size)
37796 + INODE_SET_FIELD(inode, i_size, *pos + written);
37797 + file_update_time(file);
37798 + result = reiser4_update_sd(inode);
37799 + if (result) {
37800 + mutex_unlock(&inode->i_mutex);
37801 + current->backing_dev_info = NULL;
37802 + drop_access(uf_info);
37803 + context_set_commit_async(ctx);
37804 + reiser4_exit_context(ctx);
37805 + return result;
37806 + }
37807 + drop_access(uf_info);
37808 + ea = NEITHER_OBTAINED;
37809 + reiser4_txn_restart(ctx);
37810 + current->journal_info = NULL;
37811 +		/*
37812 +		 * tell the VM how many pages were dirtied. Maybe pages which
37813 +		 * were already dirty should not be counted
37814 +		 */
37815 + balance_dirty_pages_ratelimited_nr(inode->i_mapping,
37816 + (written + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE);
37817 + current->journal_info = ctx;
37818 +
37819 + left -= written;
37820 + buf += written;
37821 + *pos += written;
37822 + }
37823 +
37824 + mutex_unlock(&inode->i_mutex);
37825 +
37826 + if (result == 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
37827 + reiser4_txn_restart_current();
37828 + grab_space_enable();
37829 + result = sync_unix_file(file, file->f_dentry,
37830 + 0 /* data and stat data */ );
37831 + if (result)
37832 + warning("reiser4-7", "failed to sync file %llu",
37833 + (unsigned long long)get_inode_oid(inode));
37834 + }
37835 +
37836 + current->backing_dev_info = NULL;
37837 +
37838 + reiser4_exit_context(ctx);
37839 +
37840 +	/*
37841 +	 * return the number of written bytes, or an error code if nothing was
37842 +	 * written. Note that this does not work correctly when
37843 +	 * sync_unix_file returns an error
37844 +	 */
37845 + return (count - left) ? (count - left) : result;
37846 +}
37847 +
37848 +/**
37849 + * release_unix_file - release of struct file_operations
37850 + * @inode: inode of released file
37851 + * @file: file to release
37852 + *
37853 + * Implementation of release method of struct file_operations for unix file
37854 + * plugin. If the last reference to the inode is released, convert all extent
37855 + * items into tail items if necessary. Frees reiser4-specific file data.
37856 + */
37857 +int release_unix_file(struct inode *inode, struct file *file)
37858 +{
37859 + reiser4_context *ctx;
37860 + unix_file_info_t *uf_info;
37861 + int result;
37862 + int in_reiser4;
37863 +
37864 + in_reiser4 = is_in_reiser4_context();
37865 +
37866 + ctx = reiser4_init_context(inode->i_sb);
37867 + if (IS_ERR(ctx))
37868 + return PTR_ERR(ctx);
37869 +
37870 + result = 0;
37871 + if (in_reiser4 == 0) {
37872 + uf_info = unix_file_inode_data(inode);
37873 +
37874 + get_exclusive_access(uf_info);
37875 + if (atomic_read(&file->f_dentry->d_count) == 1 &&
37876 + uf_info->container == UF_CONTAINER_EXTENTS &&
37877 + !should_have_notail(uf_info, inode->i_size) &&
37878 + !rofs_inode(inode)) {
37879 + result = extent2tail(uf_info);
37880 + if (result != 0) {
37881 + warning("nikita-3233",
37882 + "Failed (%d) to convert in %s (%llu)",
37883 + result, __FUNCTION__,
37884 + (unsigned long long)
37885 + get_inode_oid(inode));
37886 + }
37887 + }
37888 + drop_exclusive_access(uf_info);
37889 + } else {
37890 + /*
37891 +		   we are already within a reiser4 context. How is that
37892 +		   possible? Simple:
37893 +
37894 + (gdb) bt
37895 + #0 get_exclusive_access ()
37896 + #2 0xc01e56d3 in release_unix_file ()
37897 + #3 0xc01c3643 in reiser4_release ()
37898 + #4 0xc014cae0 in __fput ()
37899 + #5 0xc013ffc3 in remove_vm_struct ()
37900 + #6 0xc0141786 in exit_mmap ()
37901 + #7 0xc0118480 in mmput ()
37902 + #8 0xc0133205 in oom_kill ()
37903 + #9 0xc01332d1 in out_of_memory ()
37904 + #10 0xc013bc1d in try_to_free_pages ()
37905 + #11 0xc013427b in __alloc_pages ()
37906 + #12 0xc013f058 in do_anonymous_page ()
37907 + #13 0xc013f19d in do_no_page ()
37908 + #14 0xc013f60e in handle_mm_fault ()
37909 + #15 0xc01131e5 in do_page_fault ()
37910 + #16 0xc0104935 in error_code ()
37911 + #17 0xc025c0c6 in __copy_to_user_ll ()
37912 + #18 0xc01d496f in reiser4_read_tail ()
37913 + #19 0xc01e4def in read_unix_file ()
37914 + #20 0xc01c3504 in reiser4_read ()
37915 + #21 0xc014bd4f in vfs_read ()
37916 + #22 0xc014bf66 in sys_read ()
37917 + */
37918 + warning("vs-44", "out of memory?");
37919 + }
37920 +
37921 + reiser4_free_file_fsdata(file);
37922 +
37923 + reiser4_exit_context(ctx);
37924 + return result;
37925 +}
37926 +
37927 +static void set_file_notail(struct inode *inode)
37928 +{
37929 + reiser4_inode *state;
37930 + formatting_plugin *tplug;
37931 +
37932 + state = reiser4_inode_data(inode);
37933 + tplug = formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID);
37934 + force_plugin_pset(inode, PSET_FORMATTING, (reiser4_plugin *)tplug);
37935 +}
37936 +
37937 +/* if file is built of tails - convert it to extents */
37938 +static int unpack(struct file *filp, struct inode *inode, int forever)
37939 +{
37940 + int result = 0;
37941 + unix_file_info_t *uf_info;
37942 +
37943 + uf_info = unix_file_inode_data(inode);
37944 + assert("vs-1628", ea_obtained(uf_info));
37945 +
37946 + result = find_file_state(inode, uf_info);
37947 + if (result)
37948 + return result;
37949 + assert("vs-1074", uf_info->container != UF_CONTAINER_UNKNOWN);
37950 +
37951 + if (uf_info->container == UF_CONTAINER_TAILS) {
37952 + /*
37953 +		 * if file is being converted by another process - wait until it
37954 + * completes
37955 + */
37956 + while (1) {
37957 + if (reiser4_inode_get_flag(inode,
37958 + REISER4_PART_IN_CONV)) {
37959 + drop_exclusive_access(uf_info);
37960 + schedule();
37961 + get_exclusive_access(uf_info);
37962 + continue;
37963 + }
37964 + break;
37965 + }
37966 + if (uf_info->container == UF_CONTAINER_TAILS) {
37967 + result = tail2extent(uf_info);
37968 + if (result)
37969 + return result;
37970 + }
37971 + }
37972 + if (forever) {
37973 +		/* save new formatting plugin in stat data */
37974 + __u64 tograb;
37975 +
37976 + set_file_notail(inode);
37977 +
37978 + grab_space_enable();
37979 + tograb = inode_file_plugin(inode)->estimate.update(inode);
37980 +		result = reiser4_grab_space(tograb, BA_CAN_COMMIT);
37981 +		if (result == 0)
+			result = reiser4_update_sd(inode);
37982 + }
37983 +
37984 + return result;
37985 +}
37986 +
37987 +/* implementation of vfs' ioctl method of struct file_operations for unix file
37988 + plugin
37989 +*/
37990 +int
37991 +ioctl_unix_file(struct inode *inode, struct file *filp,
37992 + unsigned int cmd, unsigned long arg UNUSED_ARG)
37993 +{
37994 + reiser4_context *ctx;
37995 + int result;
37996 +
37997 + ctx = reiser4_init_context(inode->i_sb);
37998 + if (IS_ERR(ctx))
37999 + return PTR_ERR(ctx);
38000 +
38001 + switch (cmd) {
38002 + case REISER4_IOC_UNPACK:
38003 + get_exclusive_access(unix_file_inode_data(inode));
38004 + result = unpack(filp, inode, 1 /* forever */ );
38005 + drop_exclusive_access(unix_file_inode_data(inode));
38006 + break;
38007 +
38008 + default:
38009 + result = RETERR(-ENOSYS);
38010 + break;
38011 + }
38012 + reiser4_exit_context(ctx);
38013 + return result;
38014 +}
38015 +
38016 +/* implementation of vfs' bmap method of struct address_space_operations for unix
38017 + file plugin
38018 +*/
38019 +sector_t bmap_unix_file(struct address_space * mapping, sector_t lblock)
38020 +{
38021 + reiser4_context *ctx;
38022 + sector_t result;
38023 + reiser4_key key;
38024 + coord_t coord;
38025 + lock_handle lh;
38026 + struct inode *inode;
38027 + item_plugin *iplug;
38028 + sector_t block;
38029 +
38030 + inode = mapping->host;
38031 +
38032 + ctx = reiser4_init_context(inode->i_sb);
38033 + if (IS_ERR(ctx))
38034 + return PTR_ERR(ctx);
38035 + key_by_inode_and_offset_common(inode,
38036 + (loff_t) lblock * current_blocksize,
38037 + &key);
38038 +
38039 + init_lh(&lh);
38040 + result =
38041 + find_file_item_nohint(&coord, &lh, &key, ZNODE_READ_LOCK, inode);
38042 + if (cbk_errored(result)) {
38043 + done_lh(&lh);
38044 + reiser4_exit_context(ctx);
38045 + return result;
38046 + }
38047 +
38048 + result = zload(coord.node);
38049 + if (result) {
38050 + done_lh(&lh);
38051 + reiser4_exit_context(ctx);
38052 + return result;
38053 + }
38054 +
38055 + iplug = item_plugin_by_coord(&coord);
38056 + if (iplug->s.file.get_block) {
38057 + result = iplug->s.file.get_block(&coord, lblock, &block);
38058 + if (result == 0)
38059 + result = block;
38060 + } else
38061 + result = RETERR(-EINVAL);
38062 +
38063 + zrelse(coord.node);
38064 + done_lh(&lh);
38065 + reiser4_exit_context(ctx);
38066 + return result;
38067 +}
38068 +
38069 +/**
38070 + * flow_by_inode_unix_file - initialize structure flow
38071 + * @inode: inode of file for which read or write is about to happen
38072 + * @buf: buffer to perform read to or write from
38073 + * @user: flag showing whether @buf is user space or kernel space
38074 + * @size: size of buffer @buf
38075 + * @off: start offset for read or write
38076 + * @op: READ or WRITE
38077 + * @flow: flow to initialize
38078 + *
38079 + * Initializes fields of @flow: key, size of data, i/o mode (read or write).
38080 + */
38081 +int flow_by_inode_unix_file(struct inode *inode,
38082 + const char __user *buf, int user,
38083 + loff_t size, loff_t off,
38084 + rw_op op, flow_t *flow)
38085 +{
38086 + assert("nikita-1100", inode != NULL);
38087 +
38088 + flow->length = size;
38089 + memcpy(&flow->data, &buf, sizeof(buf));
38090 + flow->user = user;
38091 + flow->op = op;
38092 + assert("nikita-1931", inode_file_plugin(inode) != NULL);
38093 + assert("nikita-1932",
38094 + inode_file_plugin(inode)->key_by_inode ==
38095 + key_by_inode_and_offset_common);
38096 + /* calculate key of write position and insert it into flow->key */
38097 + return key_by_inode_and_offset_common(inode, off, &flow->key);
38098 +}
38099 +
38100 +/* plugin->u.file.set_plug_in_sd = NULL
38101 + plugin->u.file.set_plug_in_inode = NULL
38102 + plugin->u.file.create_blank_sd = NULL */
38103 +/* plugin->u.file.delete */
38104 +/*
38105 + plugin->u.file.add_link = reiser4_add_link_common
38106 + plugin->u.file.rem_link = NULL */
38107 +
38108 +/* plugin->u.file.owns_item
38109 + this is common_file_owns_item with assertion */
38110 +/* Audited by: green(2002.06.15) */
38111 +int
38112 +owns_item_unix_file(const struct inode *inode /* object to check against */ ,
38113 + const coord_t * coord /* coord to check */ )
38114 +{
38115 + int result;
38116 +
38117 + result = owns_item_common(inode, coord);
38118 + if (!result)
38119 + return 0;
38120 + if (!plugin_of_group(item_plugin_by_coord(coord),
38121 + UNIX_FILE_METADATA_ITEM_TYPE))
38122 + return 0;
38123 + assert("vs-547",
38124 + item_id_by_coord(coord) == EXTENT_POINTER_ID ||
38125 + item_id_by_coord(coord) == FORMATTING_ID);
38126 + return 1;
38127 +}
38128 +
38129 +static int setattr_truncate(struct inode *inode, struct iattr *attr)
38130 +{
38131 + int result;
38132 + int s_result;
38133 + loff_t old_size;
38134 + reiser4_tree *tree;
38135 +
38136 + inode_check_scale(inode, inode->i_size, attr->ia_size);
38137 +
38138 + old_size = inode->i_size;
38139 + tree = reiser4_tree_by_inode(inode);
38140 +
38141 + result = safe_link_grab(tree, BA_CAN_COMMIT);
38142 + if (result == 0)
38143 + result = safe_link_add(inode, SAFE_TRUNCATE);
38144 + if (result == 0)
38145 + result = truncate_file_body(inode, attr->ia_size);
38146 + if (result)
38147 + warning("vs-1588", "truncate_file failed: oid %lli, "
38148 + "old size %lld, new size %lld, retval %d",
38149 + (unsigned long long)get_inode_oid(inode),
38150 + old_size, attr->ia_size, result);
38151 +
38152 + s_result = safe_link_grab(tree, BA_CAN_COMMIT);
38153 + if (s_result == 0)
38154 + s_result =
38155 + safe_link_del(tree, get_inode_oid(inode), SAFE_TRUNCATE);
38156 + if (s_result != 0) {
38157 + warning("nikita-3417", "Cannot kill safelink %lli: %i",
38158 + (unsigned long long)get_inode_oid(inode), s_result);
38159 + }
38160 + safe_link_release(tree);
38161 + return result;
38162 +}
38163 +
38164 +/* plugin->u.file.setattr method */
38165 +/* This calls inode_setattr and if truncate is in effect it also takes
38166 + exclusive inode access to avoid races */
38167 +int setattr_unix_file(struct dentry *dentry, /* Object to change attributes */
38168 + struct iattr *attr /* change description */ )
38169 +{
38170 + int result;
38171 +
38172 + if (attr->ia_valid & ATTR_SIZE) {
38173 + reiser4_context *ctx;
38174 + unix_file_info_t *uf_info;
38175 +
38176 +		/* truncate does its own space reservation and requires
38177 +		   exclusive access */
38178 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
38179 + if (IS_ERR(ctx))
38180 + return PTR_ERR(ctx);
38181 +
38182 + uf_info = unix_file_inode_data(dentry->d_inode);
38183 + get_exclusive_access(uf_info);
38184 + result = setattr_truncate(dentry->d_inode, attr);
38185 + drop_exclusive_access(uf_info);
38186 + context_set_commit_async(ctx);
38187 + reiser4_exit_context(ctx);
38188 + } else
38189 + result = reiser4_setattr_common(dentry, attr);
38190 +
38191 + return result;
38192 +}
38193 +
38194 +/* plugin->u.file.init_inode_data */
38195 +void
38196 +init_inode_data_unix_file(struct inode *inode,
38197 + reiser4_object_create_data * crd, int create)
38198 +{
38199 + unix_file_info_t *data;
38200 +
38201 + data = unix_file_inode_data(inode);
38202 + data->container = create ? UF_CONTAINER_EMPTY : UF_CONTAINER_UNKNOWN;
38203 + init_rwsem(&data->latch);
38204 + data->tplug = inode_formatting_plugin(inode);
38205 + data->exclusive_use = 0;
38206 +
38207 +#if REISER4_DEBUG
38208 + data->ea_owner = NULL;
38209 + atomic_set(&data->nr_neas, 0);
38210 +#endif
38211 + init_inode_ordering(inode, crd, create);
38212 +}
38213 +
38214 +/**
38215 + * delete_object_unix_file - delete_object of file_plugin
38216 + * @inode: inode to be deleted
38217 + *
38218 + * Truncates file to length 0, removes stat data and safe link.
38219 + */
38220 +int delete_object_unix_file(struct inode *inode)
38221 +{
38222 + unix_file_info_t *uf_info;
38223 + int result;
38224 +
38225 + if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
38226 + return 0;
38227 +
38228 +	/* truncate file body first */
38229 + uf_info = unix_file_inode_data(inode);
38230 + get_exclusive_access(uf_info);
38231 + result = truncate_file_body(inode, 0 /* size */ );
38232 + drop_exclusive_access(uf_info);
38233 +
38234 + if (result)
38235 + warning("", "failed to truncate file (%llu) on removal: %d",
38236 + get_inode_oid(inode), result);
38237 +
38238 + /* remove stat data and safe link */
38239 + return reiser4_delete_object_common(inode);
38240 +}
38241 +
38242 +/**
38243 + * sendfile_unix_file - sendfile of struct file_operations
38244 + * @file: file to be sent
38245 + * @ppos: position to start from
38246 + * @count: number of bytes to send
38247 + * @actor: function to copy data
38248 + * @target: where to copy read data
38249 + *
38250 + * Reads @count bytes from @file and calls @actor for every page read. This is
38251 + * needed for loopback device support.
38252 + */
38253 +ssize_t
38254 +sendfile_unix_file(struct file *file, loff_t *ppos, size_t count,
38255 + read_actor_t actor, void *target)
38256 +{
38257 + reiser4_context *ctx;
38258 + ssize_t result;
38259 + struct inode *inode;
38260 + unix_file_info_t *uf_info;
38261 +
38262 + inode = file->f_dentry->d_inode;
38263 + ctx = reiser4_init_context(inode->i_sb);
38264 + if (IS_ERR(ctx))
38265 + return PTR_ERR(ctx);
38266 +
38267 + /*
38268 +	 * generic_file_sendfile may want to call update_atime. Grab space for
38269 + * stat data update
38270 + */
38271 + result = reiser4_grab_space(estimate_update_common(inode),
38272 + BA_CAN_COMMIT);
38273 + if (result)
38274 + goto error;
38275 + mutex_lock(&inode->i_mutex);
38276 + reiser4_inode_set_flag(inode, REISER4_HAS_MMAP);
38277 + mutex_unlock(&inode->i_mutex);
38278 +
38279 + uf_info = unix_file_inode_data(inode);
38280 + get_nonexclusive_access(uf_info);
38281 + result = generic_file_sendfile(file, ppos, count, actor, target);
38282 + drop_nonexclusive_access(uf_info);
38283 + error:
38284 + reiser4_exit_context(ctx);
38285 + return result;
38286 +}
38287 +
38288 +int
38289 +prepare_write_unix_file(struct file *file, struct page *page,
38290 + unsigned from, unsigned to)
38291 +{
38292 + reiser4_context *ctx;
38293 + unix_file_info_t *uf_info;
38294 + int ret;
38295 +
38296 + ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
38297 + if (IS_ERR(ctx))
38298 + return PTR_ERR(ctx);
38299 +
38300 + uf_info = unix_file_inode_data(file->f_dentry->d_inode);
38301 + get_exclusive_access(uf_info);
38302 + ret = find_file_state(file->f_dentry->d_inode, uf_info);
38303 + if (ret == 0) {
38304 + if (uf_info->container == UF_CONTAINER_TAILS)
38305 + ret = -EINVAL;
38306 + else
38307 + ret = do_prepare_write(file, page, from, to);
38308 + }
38309 + drop_exclusive_access(uf_info);
38310 +
38311 + /* don't commit transaction under inode semaphore */
38312 + context_set_commit_async(ctx);
38313 + reiser4_exit_context(ctx);
38314 + return ret;
38315 +}
38316 +
38317 +/*
38318 + * Local variables:
38319 + * c-indentation-style: "K&R"
38320 + * mode-name: "LC"
38321 + * c-basic-offset: 8
38322 + * tab-width: 8
38323 + * fill-column: 79
38324 + * scroll-step: 1
38325 + * End:
38326 + */
38327 diff --git a/fs/reiser4/plugin/file/file.h b/fs/reiser4/plugin/file/file.h
38328 new file mode 100644
38329 index 0000000..e486a88
38330 --- /dev/null
38331 +++ b/fs/reiser4/plugin/file/file.h
38332 @@ -0,0 +1,272 @@
38333 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
38334 + * reiser4/README */
38335 +
38336 +/* this file contains declarations of methods implementing
38337 + file plugins (UNIX_FILE_PLUGIN_ID, CRYPTCOMPRESS_FILE_PLUGIN_ID
38338 + and SYMLINK_FILE_PLUGIN_ID) */
38339 +
38340 +#if !defined( __REISER4_FILE_H__ )
38341 +#define __REISER4_FILE_H__
38342 +
38343 +/* declarations of functions implementing UNIX_FILE_PLUGIN_ID file plugin */
38344 +
38345 +/* inode operations */
38346 +int setattr_unix_file(struct dentry *, struct iattr *);
38347 +
38348 +/* file operations */
38349 +ssize_t read_unix_file(struct file *, char __user *buf, size_t read_amount,
38350 + loff_t *off);
38351 +ssize_t write_unix_file(struct file *, const char __user *buf, size_t write_amount,
38352 + loff_t * off);
38353 +int ioctl_unix_file(struct inode *, struct file *, unsigned int cmd,
38354 + unsigned long arg);
38355 +int mmap_unix_file(struct file *, struct vm_area_struct *);
38356 +int open_unix_file(struct inode *, struct file *);
38357 +int release_unix_file(struct inode *, struct file *);
38358 +int sync_unix_file(struct file *, struct dentry *, int datasync);
38359 +ssize_t sendfile_unix_file(struct file *, loff_t *ppos, size_t count,
38360 + read_actor_t, void *target);
38361 +
38362 +/* address space operations */
38363 +int readpage_unix_file(struct file *, struct page *);
38364 +int readpages_unix_file(struct file*, struct address_space*, struct list_head*, unsigned);
38365 +int writepages_unix_file(struct address_space *, struct writeback_control *);
38366 +int prepare_write_unix_file(struct file *, struct page *, unsigned from,
38367 + unsigned to);
38368 +int commit_write_unix_file(struct file *, struct page *, unsigned from,
38369 + unsigned to);
38370 +sector_t bmap_unix_file(struct address_space *, sector_t lblock);
38371 +
38372 +/* file plugin operations */
38373 +int flow_by_inode_unix_file(struct inode *, const char __user *buf,
38374 + int user, loff_t, loff_t, rw_op, flow_t *);
38375 +int owns_item_unix_file(const struct inode *, const coord_t *);
38376 +void init_inode_data_unix_file(struct inode *, reiser4_object_create_data *,
38377 + int create);
38378 +int delete_object_unix_file(struct inode *);
38379 +
38380 +/*
38381 + * all writes to a unix file are performed by the item write method. The
38382 + * write method of the unix file plugin only decides which item plugin
38383 + * (extent or tail) to call and in which mode (one of the enum below)
38384 + */
38385 +typedef enum {
38386 + FIRST_ITEM = 1,
38387 + APPEND_ITEM = 2,
38388 + OVERWRITE_ITEM = 3
38389 +} write_mode_t;
38390 +
38391 +/* unix file may be in one of the following states */
38392 +typedef enum {
38393 + UF_CONTAINER_UNKNOWN = 0,
38394 + UF_CONTAINER_TAILS = 1,
38395 + UF_CONTAINER_EXTENTS = 2,
38396 + UF_CONTAINER_EMPTY = 3
38397 +} file_container_t;
38398 +
38399 +struct formatting_plugin;
38400 +struct inode;
38401 +
38402 +/* unix file plugin specific part of reiser4 inode */
38403 +typedef struct unix_file_info {
38404 + /*
38405 + * this read-write lock protects file containerization change. Accesses
38406 + * which do not change file containerization (see file_container_t)
38407 + * (read, readpage, writepage, write (until tail conversion is
38408 + * involved)) take read-lock. Accesses which modify file
38409 + * containerization (truncate, conversion from tail to extent and back)
38410 + * take write-lock.
38411 + */
38412 + struct rw_semaphore latch;
38413 + /* this enum specifies which items are used to build the file */
38414 + file_container_t container;
38415 + /*
38416 + * plugin which controls when file is to be converted to extents and
38417 + * back to tail
38418 + */
38419 + struct formatting_plugin *tplug;
38420 + /* if this is set, file is in exclusive use */
38421 + int exclusive_use;
38422 +#if REISER4_DEBUG
38423 + /* pointer to task struct of thread owning exclusive access to file */
38424 + void *ea_owner;
38425 + atomic_t nr_neas;
38426 + void *last_reader;
38427 +#endif
38428 +} unix_file_info_t;
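+
+/* A minimal usage sketch (illustrative only): paths which leave the
+   container alone take the latch non-exclusively, while container-changing
+   paths take it exclusively, e.g.
+
+	get_nonexclusive_access(uf_info);
+	... read or write without changing the container ...
+	drop_nonexclusive_access(uf_info);
+
+	get_exclusive_access(uf_info);
+	result = tail2extent(uf_info);
+	drop_exclusive_access(uf_info);
+*/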
38429 +
38430 +struct unix_file_info *unix_file_inode_data(const struct inode *inode);
38431 +void get_exclusive_access(unix_file_info_t *);
38432 +void drop_exclusive_access(unix_file_info_t *);
38433 +void get_nonexclusive_access(unix_file_info_t *);
38434 +void drop_nonexclusive_access(unix_file_info_t *);
38435 +int try_to_get_nonexclusive_access(unix_file_info_t *);
38436 +int find_file_item(hint_t *, const reiser4_key *, znode_lock_mode,
38437 + struct inode *);
38438 +int find_file_item_nohint(coord_t *, lock_handle *,
38439 + const reiser4_key *, znode_lock_mode,
38440 + struct inode *);
38441 +
38442 +int load_file_hint(struct file *, hint_t *);
38443 +void save_file_hint(struct file *, const hint_t *);
38444 +
38445 +#include "../item/extent.h"
38446 +#include "../item/tail.h"
38447 +#include "../item/ctail.h"
38448 +
38449 +struct uf_coord {
38450 + coord_t coord;
38451 + lock_handle *lh;
38452 + int valid;
38453 + union {
38454 + extent_coord_extension_t extent;
38455 + tail_coord_extension_t tail;
38456 + ctail_coord_extension_t ctail;
38457 + } extension;
38458 +};
38459 +
38460 +#include "../../forward.h"
38461 +#include "../../seal.h"
38462 +#include "../../lock.h"
38463 +
38464 +/*
38465 + * This structure is used to speed up file operations (reads and writes). A
38466 + * hint is a suggestion about where a key resolved to last time. A seal
38467 + * indicates whether a node has been modified since a hint was last recorded.
38468 + * You check the seal, and if the seal is still valid, you can use the hint
38469 + * without traversing the tree again.
38470 + */
38471 +struct hint {
38472 + seal_t seal; /* a seal over last file item accessed */
38473 + uf_coord_t ext_coord;
38474 + loff_t offset;
38475 + znode_lock_mode mode;
38476 + lock_handle lh;
38477 +};
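+
+/* Sketch of the intended check-then-reuse pattern (assuming the seal API
+   declared in ../../seal.h; the exact validation signature may differ):
+
+	if (hint_is_valid(hint) &&
+	    reiser4_seal_validate(&hint->seal, ..., &hint->lh, ...) == 0)
+		reuse hint->ext_coord - no tree traversal needed;
+	else
+		do a full lookup, e.g. find_file_item(hint, key, mode, inode);
+*/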
38478 +
38479 +static inline int hint_is_valid(hint_t * hint)
38480 +{
38481 + return hint->ext_coord.valid;
38482 +}
38483 +
38484 +static inline void hint_set_valid(hint_t * hint)
38485 +{
38486 + hint->ext_coord.valid = 1;
38487 +}
38488 +
38489 +static inline void hint_clr_valid(hint_t * hint)
38490 +{
38491 + hint->ext_coord.valid = 0;
38492 +}
38493 +
38496 +void hint_init_zero(hint_t *);
38497 +void reiser4_set_hint(hint_t *, const reiser4_key *, znode_lock_mode);
38498 +int hint_is_set(const hint_t *);
38499 +void reiser4_unset_hint(hint_t *);
38500 +
38501 +int reiser4_update_file_size(struct inode *, reiser4_key *, int update_sd);
38502 +int cut_file_items(struct inode *, loff_t new_size, int update_sd,
38503 + loff_t cur_size, int (*update_actor) (struct inode *,
38504 + reiser4_key *, int));
38505 +#if REISER4_DEBUG
38506 +
38507 +/* return 1 if exclusive access is obtained, 0 otherwise */
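+/* (down_read_trylock() fails, roughly, only while a writer holds or waits
+   for the latch, which is when exclusive access is held - hence the
+   negated result) */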
38508 +static inline int ea_obtained(unix_file_info_t * uf_info)
38509 +{
38510 + int ret;
38511 +
38512 + ret = down_read_trylock(&uf_info->latch);
38513 + if (ret)
38514 + up_read(&uf_info->latch);
38515 + return !ret;
38516 +}
38517 +
38518 +#endif
38519 +
38520 +/* declarations of functions implementing SYMLINK_FILE_PLUGIN_ID file plugin */
38521 +int reiser4_create_symlink(struct inode *symlink, struct inode *dir,
38522 + reiser4_object_create_data *);
38523 +void destroy_inode_symlink(struct inode *);
38524 +
38525 +/* declarations of functions implementing CRYPTCOMPRESS_FILE_PLUGIN_ID
38526 + file plugin */
38527 +
38528 +/* inode operations */
38529 +int setattr_cryptcompress(struct dentry *, struct iattr *);
38530 +int prot_setattr_cryptcompress(struct dentry *, struct iattr *);
38531 +
38532 +/* file operations */
38533 +ssize_t read_cryptcompress(struct file *, char __user *buf, size_t read_amount,
38534 + loff_t * off);
38535 +ssize_t prot_read_cryptcompress(struct file *, char __user *buf,
38536 + size_t read_amount, loff_t * off);
38537 +
38538 +ssize_t write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38539 + loff_t * off, int * conv);
38540 +ssize_t prot_write_cryptcompress(struct file *, const char __user *buf, size_t write_amount,
38541 + loff_t * off);
38542 +int mmap_cryptcompress(struct file *, struct vm_area_struct *);
38543 +int prot_mmap_cryptcompress(struct file *, struct vm_area_struct *);
38544 +ssize_t sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38545 + read_actor_t actor, void *target);
38546 +ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos, size_t count,
38547 + read_actor_t actor, void *target);
38548 +
38549 +int release_cryptcompress(struct inode *, struct file *);
38550 +int prot_release_cryptcompress(struct inode *, struct file *);
38551 +
38552 +/* address space operations */
38553 +extern int readpage_cryptcompress(struct file *, struct page *);
38554 +extern int writepages_cryptcompress(struct address_space *,
38555 + struct writeback_control *);
38556 +/* file plugin operations */
38557 +int flow_by_inode_cryptcompress(struct inode *, const char __user *buf,
38558 + int user, loff_t, loff_t, rw_op, flow_t *);
38559 +int key_by_inode_cryptcompress(struct inode *, loff_t off, reiser4_key *);
38560 +int create_cryptcompress(struct inode *, struct inode *,
38561 + reiser4_object_create_data *);
38562 +int delete_object_cryptcompress(struct inode *);
38563 +void init_inode_data_cryptcompress(struct inode *, reiser4_object_create_data *,
38564 + int create);
38565 +int cut_tree_worker_cryptcompress(tap_t *, const reiser4_key * from_key,
38566 + const reiser4_key * to_key,
38567 + reiser4_key * smallest_removed,
38568 + struct inode *object, int truncate,
38569 + int *progress);
38570 +void destroy_inode_cryptcompress(struct inode *);
38571 +int open_object_cryptcompress(struct inode * inode, struct file * file);
38572 +
38573 +extern reiser4_plugin_ops cryptcompress_plugin_ops;
38574 +
38575 +#define WRITE_GRANULARITY 32
38576 +
38577 +int tail2extent(unix_file_info_t *);
38578 +int extent2tail(unix_file_info_t *);
38579 +
38580 +int goto_right_neighbor(coord_t *, lock_handle *);
38581 +int find_or_create_extent(struct page *);
38582 +int equal_to_ldk(znode *, const reiser4_key *);
38583 +
38584 +void init_uf_coord(uf_coord_t *uf_coord, lock_handle *lh);
38585 +
38586 +static inline int cbk_errored(int cbk_result)
38587 +{
38588 + return (cbk_result != CBK_COORD_NOTFOUND
38589 + && cbk_result != CBK_COORD_FOUND);
38590 +}
38591 +
38592 +/* __REISER4_FILE_H__ */
38593 +#endif
38594 +
38595 +/*
38596 + * Local variables:
38597 + * c-indentation-style: "K&R"
38598 + * mode-name: "LC"
38599 + * c-basic-offset: 8
38600 + * tab-width: 8
38601 + * fill-column: 79
38602 + * scroll-step: 1
38603 + * End:
38604 +*/
38605 diff --git a/fs/reiser4/plugin/file/file_conversion.c b/fs/reiser4/plugin/file/file_conversion.c
38606 new file mode 100644
38607 index 0000000..2e07b66
38608 --- /dev/null
38609 +++ b/fs/reiser4/plugin/file/file_conversion.c
38610 @@ -0,0 +1,594 @@
38611 +/* Copyright 2001, 2002, 2003 by Hans Reiser,
38612 + licensing governed by reiser4/README */
38613 +
38614 +/* This file contains hooks that convert (*) cryptcompress files to unix-files,
38615 + and a set of protected (**) methods of a cryptcompress file plugin to perform
38616 + such conversion.
38617 +
38618 +(*)
38619 + The conversion is performed for incompressible files to reduce cpu and memory
38620 + usage. If the first logical cluster (64K by default) of a file is
38621 + incompressible, then we decide that the whole file is incompressible.
38622 + The conversion can be enabled via installing a special compression mode
38623 + plugin (CONVX_COMPRESSION_MODE_ID, see plugin/compress/compress_mode.c for
38624 + details).
38625 +
38626 +(**)
38627 + The protection means serialization of critical sections (readers and writers
38628 + of @pset->file)
38629 +*/
38630 +
38631 +#include "../../inode.h"
38632 +#include "../cluster.h"
38633 +#include "file.h"
38634 +
38635 +#define conversion_enabled(inode) \
38636 + (inode_compression_mode_plugin(inode) == \
38637 + compression_mode_plugin_by_id(CONVX_COMPRESSION_MODE_ID))
38638 +
38639 +
38640 +/* Located sections (readers and writers of @pset->file) are not
38641 + permanently critical: cryptcompress file can be converted only
38642 + if the conversion is enabled (see the macro above). And we don't
38643 + convert unix files at all.
38644 + The following helper macro is a sanity check to decide if we
38645 + need to protect a located section.
38646 +*/
38647 +#define should_protect(inode) \
38648 + (inode_file_plugin(inode) == \
38649 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID) && \
38650 + conversion_enabled(inode))
38651 +
38652 +/* All protected methods have prefix "prot" in their names.
38653 + It is convenient to construct them from the usual (unprotected) ones
38654 + using the following common macros:
38655 +*/
38656 +
38657 +/* Macro for passive protection.
38658 + method_cryptcompress contains only readers */
38659 +#define PROT_PASSIVE(type, method, args) \
38660 +({ \
38661 + type _result; \
38662 + struct rw_semaphore * guard = \
38663 + &reiser4_inode_data(inode)->conv_sem; \
38664 + \
38665 + if (should_protect(inode)) { \
38666 + down_read(guard); \
38667 + if (!should_protect(inode)) \
38668 + up_read(guard); \
38669 + } \
38670 + if (inode_file_plugin(inode) == \
38671 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38672 + _result = method ## _unix_file args; \
38673 + else \
38674 + _result = method ## _cryptcompress args; \
38675 + if (should_protect(inode)) \
38676 + up_read(guard); \
38677 + _result; \
38678 +})
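+
+/* For example, PROT_PASSIVE(ssize_t, read, (file, buf, size, off))
+   evaluates, roughly, to:
+
+	ssize_t _result;
+	struct rw_semaphore *guard = &reiser4_inode_data(inode)->conv_sem;
+
+	if (should_protect(inode)) {
+		down_read(guard);
+		if (!should_protect(inode))
+			up_read(guard);
+	}
+	if (inode_file_plugin(inode) ==
+	    file_plugin_by_id(UNIX_FILE_PLUGIN_ID))
+		_result = read_unix_file(file, buf, size, off);
+	else
+		_result = read_cryptcompress(file, buf, size, off);
+	if (should_protect(inode))
+		up_read(guard);
+	_result;
+
+   i.e. conv_sem is taken shared around the dispatch, and is dropped early
+   if the file stopped needing protection (was converted) while the caller
+   slept on the semaphore. */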
38679 +
38680 +#define PROT_PASSIVE_VOID(method, args) \
38681 +({ \
38682 + struct rw_semaphore * guard = \
38683 + &reiser4_inode_data(inode)->conv_sem; \
38684 + \
38685 + if (should_protect(inode)) { \
38686 + down_read(guard); \
38687 + if (!should_protect(inode)) \
38688 + up_read(guard); \
38689 + } \
38690 + if (inode_file_plugin(inode) == \
38691 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38692 + method ## _unix_file args; \
38693 + else \
38694 + method ## _cryptcompress args; \
38695 + if (should_protect(inode)) \
38696 + up_read(guard); \
38697 +})
38698 +
38699 +/* Macro for active protection.
38700 + active_expr contains readers and writers; after its
38701 + evaluation conversion should be disabled */
38702 +#define PROT_ACTIVE(type, method, args, active_expr) \
38703 +({ \
38704 + type _result = 0; \
38705 + struct rw_semaphore * guard = \
38706 + &reiser4_inode_data(inode)->conv_sem; \
38707 + reiser4_context * ctx = reiser4_init_context(inode->i_sb); \
38708 + if (IS_ERR(ctx)) \
38709 + return PTR_ERR(ctx); \
38710 + \
38711 + if (should_protect(inode)) { \
38712 + down_write(guard); \
38713 + if (should_protect(inode)) \
38714 + _result = active_expr; \
38715 + up_write(guard); \
38716 + } \
38717 + if (_result == 0) { \
38718 + if (inode_file_plugin(inode) == \
38719 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID)) \
38720 + _result = method ## _unix_file args; \
38721 + else \
38722 + _result = method ## _cryptcompress args; \
38723 + } \
38724 + reiser4_exit_context(ctx); \
38725 + _result; \
38726 +})
38727 +
38728 +/* Pass management to the unix-file plugin with "notail" policy */
38729 +static int __cryptcompress2unixfile(struct file *file, struct inode * inode)
38730 +{
38731 + int result;
38732 + reiser4_inode *info;
38733 + unix_file_info_t * uf;
38734 + info = reiser4_inode_data(inode);
38735 +
38736 + result = aset_set_unsafe(&info->pset,
38737 + PSET_FILE,
38738 + (reiser4_plugin *)
38739 + file_plugin_by_id(UNIX_FILE_PLUGIN_ID));
38740 + if (result)
38741 + return result;
38742 + result = aset_set_unsafe(&info->pset,
38743 + PSET_FORMATTING,
38744 + (reiser4_plugin *)
38745 + formatting_plugin_by_id(NEVER_TAILS_FORMATTING_ID));
38746 + if (result)
38747 + return result;
38748 + /* get rid of non-standard plugins */
38749 + info->plugin_mask &= ~cryptcompress_mask;
38750 + /* get rid of plugin stat-data extension */
38751 + info->extmask &= ~(1 << PLUGIN_STAT);
38752 +
38753 + reiser4_inode_clr_flag(inode, REISER4_SDLEN_KNOWN);
38754 +
38755 + /* FIXME use init_inode_data_unix_file() instead,
38756 +	   but avoid init_inode_ordering() */
38757 + /* Init unix-file specific part of inode */
38758 + uf = unix_file_inode_data(inode);
38759 + uf->container = UF_CONTAINER_UNKNOWN;
38760 + init_rwsem(&uf->latch);
38761 + uf->tplug = inode_formatting_plugin(inode);
38762 + uf->exclusive_use = 0;
38763 +#if REISER4_DEBUG
38764 + uf->ea_owner = NULL;
38765 + atomic_set(&uf->nr_neas, 0);
38766 +#endif
38767 + inode->i_op =
38768 + &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->inode_ops;
38769 + inode->i_fop =
38770 + &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->file_ops;
38771 + inode->i_mapping->a_ops =
38772 + &file_plugin_by_id(UNIX_FILE_PLUGIN_ID)->as_ops;
38773 + file->f_op = inode->i_fop;
38774 + return 0;
38775 +}
38776 +
38777 +#if REISER4_DEBUG
38778 +static int disabled_conversion_inode_ok(struct inode * inode)
38779 +{
38780 + __u64 extmask = reiser4_inode_data(inode)->extmask;
38781 + __u16 plugin_mask = reiser4_inode_data(inode)->plugin_mask;
38782 +
38783 + return ((extmask & (1 << LIGHT_WEIGHT_STAT)) &&
38784 + (extmask & (1 << UNIX_STAT)) &&
38785 + (extmask & (1 << LARGE_TIMES_STAT)) &&
38786 + (extmask & (1 << PLUGIN_STAT)) &&
38787 + (plugin_mask & (1 << PSET_COMPRESSION_MODE)));
38788 +}
38789 +#endif
38790 +
38791 +/* Assign another mode that will control
38792 + compression at flush time only */
38793 +static int disable_conversion_no_update_sd(struct inode * inode)
38794 +{
38795 + int result;
38796 + result =
38797 + force_plugin_pset(inode,
38798 + PSET_COMPRESSION_MODE,
38799 + (reiser4_plugin *)compression_mode_plugin_by_id
38800 + (LATTD_COMPRESSION_MODE_ID));
38801 + assert("edward-1500",
38802 + ergo(!result, disabled_conversion_inode_ok(inode)));
38803 + return result;
38804 +}
38805 +
38806 +/* Disable future attempts to check/convert. This function is called by
38807 + conversion hooks. */
38808 +static int disable_conversion(struct inode * inode)
38809 +{
38810 + return disable_conversion_no_update_sd(inode);
38811 +}
38812 +
38813 +static int check_position(struct inode * inode,
38814 + loff_t pos /* initial position in the file */,
38815 + reiser4_cluster_t * clust,
38816 + int * check_compress)
38817 +{
38818 + assert("edward-1505", conversion_enabled(inode));
38819 + assert("edward-1506", inode->i_size <= inode_cluster_size(inode));
38820 +	/* if file size is more than cluster size, then compressible
38821 +	   status must have been figured out already (i.e. compression was
38822 +	   disabled, or the file plugin was converted to unix_file) */
38823 +
38824 + if (pos > inode->i_size)
38825 + /* first logical cluster will contain a (partial) hole */
38826 + return disable_conversion(inode);
38827 + if (inode->i_size == inode_cluster_size(inode))
38828 + *check_compress = 1;
38829 + return 0;
38830 +}
38831 +
38832 +static void start_check_compressibility(struct inode * inode,
38833 + reiser4_cluster_t * clust,
38834 + hint_t * hint)
38835 +{
38836 + assert("edward-1507", clust->index == 1);
38837 + assert("edward-1508", !tfm_cluster_is_uptodate(&clust->tc));
38838 + assert("edward-1509", cluster_get_tfm_act(&clust->tc) == TFMA_READ);
38839 +
38840 + hint_init_zero(hint);
38841 + clust->hint = hint;
38842 + clust->index --;
38843 + clust->nr_pages = count_to_nrpages(fsize_to_count(clust, inode));
38844 +
38845 + /* first logical cluster (of index #0) must be complete */
38846 + assert("edward-1510", fsize_to_count(clust, inode) ==
38847 + inode_cluster_size(inode));
38848 +}
38849 +
38850 +static void finish_check_compressibility(struct inode * inode,
38851 + reiser4_cluster_t * clust,
38852 + hint_t * hint)
38853 +{
38854 + reiser4_unset_hint(clust->hint);
38855 + clust->hint = hint;
38856 + clust->index ++;
38857 +}
38858 +
38859 +#if REISER4_DEBUG
38860 +static int prepped_dclust_ok(hint_t * hint)
38861 +{
38862 + reiser4_key key;
38863 + coord_t * coord = &hint->ext_coord.coord;
38864 +
38865 + item_key_by_coord(coord, &key);
38866 + return (item_id_by_coord(coord) == CTAIL_ID &&
38867 + !coord_is_unprepped_ctail(coord) &&
38868 + (get_key_offset(&key) + nr_units_ctail(coord) ==
38869 + dclust_get_extension_dsize(hint)));
38870 +}
38871 +#endif
38872 +
38873 +#define fifty_persent(size) (size >> 1)
38874 +/* evaluation of data compressibility */
38875 +#define data_is_compressible(osize, isize) \
38876 + (osize < fifty_persent(isize))
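+
+/* e.g. with the default 64K logical cluster, a compressed size of 30K
+   counts as compressible (30K < 32K), while 40K does not */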
38877 +
38878 +/* This is called only once in the life of a file.
38879 + Read first logical cluster (of index #0) and estimate its compressibility.
38880 + Save estimation result in @compressible */
38881 +static int read_check_compressibility(struct inode * inode,
38882 + reiser4_cluster_t * clust,
38883 + int * compressible)
38884 +{
38885 + int i;
38886 + int result;
38887 + __u32 dst_len;
38888 + hint_t tmp_hint;
38889 + hint_t * cur_hint = clust->hint;
38890 +
38891 + start_check_compressibility(inode, clust, &tmp_hint);
38892 +
38893 + result = grab_cluster_pages(inode, clust);
38894 + if (result)
38895 + return result;
38896 + /* Read page cluster here */
38897 + for (i = 0; i < clust->nr_pages; i++) {
38898 + struct page *page = clust->pages[i];
38899 + lock_page(page);
38900 + result = do_readpage_ctail(inode, clust, page,
38901 + ZNODE_READ_LOCK);
38902 + unlock_page(page);
38903 + if (result)
38904 + goto error;
38905 + }
38906 + tfm_cluster_clr_uptodate(&clust->tc);
38907 +
38908 + cluster_set_tfm_act(&clust->tc, TFMA_WRITE);
38909 +
38910 + if (hint_is_valid(&tmp_hint) && !hint_is_unprepped_dclust(&tmp_hint)) {
38911 +		/* length of compressed data is known, no need to compress */
38912 + assert("edward-1511",
38913 + znode_is_write_locked(tmp_hint.ext_coord.coord.node));
38914 + assert("edward-1512",
38915 + WITH_DATA(tmp_hint.ext_coord.coord.node,
38916 + prepped_dclust_ok(&tmp_hint)));
38917 + dst_len = dclust_get_extension_dsize(&tmp_hint);
38918 + }
38919 + else {
38920 + tfm_cluster_t * tc = &clust->tc;
38921 + compression_plugin * cplug = inode_compression_plugin(inode);
38922 + result = grab_tfm_stream(inode, tc, INPUT_STREAM);
38923 + if (result)
38924 + goto error;
38925 + for (i = 0; i < clust->nr_pages; i++) {
38926 + char *data;
38927 + lock_page(clust->pages[i]);
38928 + BUG_ON(!PageUptodate(clust->pages[i]));
38929 + data = kmap(clust->pages[i]);
38930 + memcpy(tfm_stream_data(tc, INPUT_STREAM) + pg_to_off(i),
38931 + data, PAGE_CACHE_SIZE);
38932 + kunmap(clust->pages[i]);
38933 + unlock_page(clust->pages[i]);
38934 + }
38935 + result = grab_tfm_stream(inode, tc, OUTPUT_STREAM);
38936 + if (result)
38937 + goto error;
38938 + result = grab_coa(tc, cplug);
38939 + if (result)
38940 + goto error;
38941 + tc->len = tc->lsize = fsize_to_count(clust, inode);
38942 + assert("edward-1513", tc->len == inode_cluster_size(inode));
38943 + dst_len = tfm_stream_size(tc, OUTPUT_STREAM);
38944 + cplug->compress(get_coa(tc, cplug->h.id, tc->act),
38945 + tfm_input_data(clust), tc->len,
38946 + tfm_output_data(clust), &dst_len);
38947 + assert("edward-1514",
38948 + dst_len <= tfm_stream_size(tc, OUTPUT_STREAM));
38949 + }
38950 + finish_check_compressibility(inode, clust, cur_hint);
38951 + *compressible = data_is_compressible(dst_len,
38952 + inode_cluster_size(inode));
38953 + return 0;
38954 + error:
38955 + reiser4_release_cluster_pages(clust);
38956 + return result;
38957 +}
38958 +
38959 +/* Cut disk cluster of index @idx */
38960 +static int cut_disk_cluster(struct inode * inode, cloff_t idx)
38961 +{
38962 + reiser4_key from, to;
38963 + assert("edward-1515", inode_file_plugin(inode) ==
38964 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
38965 + key_by_inode_cryptcompress(inode, clust_to_off(idx, inode), &from);
38966 + to = from;
38967 + set_key_offset(&to,
38968 + get_key_offset(&from) + inode_cluster_size(inode) - 1);
38969 + return reiser4_cut_tree(reiser4_tree_by_inode(inode),
38970 + &from, &to, inode, 0);
38971 +}
38972 +
38973 +static int reserve_cryptcompress2unixfile(struct inode *inode)
38974 +{
38975 + reiser4_block_nr unformatted_nodes;
38976 + reiser4_tree *tree;
38977 +
38978 + tree = reiser4_tree_by_inode(inode);
38979 +
38980 + /* number of unformatted nodes which will be created */
38981 + unformatted_nodes = cluster_nrpages(inode); /* N */
38982 +
38983 + /*
38984 +	 * space required for one iteration of cryptcompress->unixfile
38984 +	 * (ctail->extent) conversion:
38985 + *
38986 + * 1. kill ctail items
38987 + *
38988 + * 2. insert N unformatted nodes
38989 + *
38990 + * 3. insert N (worst-case single-block
38991 + * extents) extent units.
38992 + *
38993 + * 4. drilling to the leaf level by coord_by_key()
38994 + *
38995 + * 5. possible update of stat-data
38996 + *
38997 + */
38998 + grab_space_enable();
38999 + return reiser4_grab_space
39000 + (2 * tree->height +
39001 + unformatted_nodes +
39002 + unformatted_nodes * estimate_one_insert_into_item(tree) +
39003 + 1 + estimate_one_insert_item(tree) +
39004 + inode_file_plugin(inode)->estimate.update(inode),
39005 + BA_CAN_COMMIT);
39006 +}
39007 +
39008 +/* clear the flag that indicated conversion in progress and update
39009 +   stat-data with new (unix-file specific) info */
39010 +static int complete_file_conversion(struct inode *inode)
39011 +{
39012 + int result;
39013 +
39014 + grab_space_enable();
39015 + result =
39016 + reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
39017 + BA_CAN_COMMIT);
39018 + if (result == 0) {
39019 + reiser4_inode_clr_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
39020 + result = reiser4_update_sd(inode);
39021 + }
39022 + if (result)
39023 + warning("edward-1452",
39024 + "Converting %llu to unix-file: update sd failed (%i)",
39025 + (unsigned long long)get_inode_oid(inode), result);
39026 + return 0;
39027 +}
39028 +
39029 +
39030 +/* do conversion */
39031 +static int cryptcompress2unixfile(struct file *file, struct inode * inode,
39032 + reiser4_cluster_t * clust)
39033 +{
39034 + int i;
39035 + int result = 0;
39036 + cryptcompress_info_t *cr_info;
39037 + unix_file_info_t *uf_info;
39038 +
39039 + assert("edward-1516", clust->pages[0]->index == 0);
39040 + assert("edward-1517", clust->hint != NULL);
39041 +
39042 +	/* release all cryptcompress-specific resources */
39043 + cr_info = cryptcompress_inode_data(inode);
39044 + result = reserve_cryptcompress2unixfile(inode);
39045 + if (result)
39046 + goto out;
39047 + reiser4_inode_set_flag(inode, REISER4_FILE_CONV_IN_PROGRESS);
39048 + reiser4_unset_hint(clust->hint);
39049 + result = cut_disk_cluster(inode, 0);
39050 + if (result)
39051 + goto out;
39052 +	/* captured jnode of cluster and associated resources (pages,
39053 + reserved disk space) were released by ->kill_hook() method
39054 + of the item plugin */
39055 +
39056 + result = __cryptcompress2unixfile(file, inode);
39057 + if (result)
39058 + goto out;
39059 + /* At this point file is managed by unix file plugin */
39060 +
39061 + uf_info = unix_file_inode_data(inode);
39062 +
39063 + assert("edward-1518",
39064 + ergo(jprivate(clust->pages[0]),
39065 + !jnode_is_cluster_page(jprivate(clust->pages[0]))));
39066 + for(i = 0; i < clust->nr_pages; i++) {
39067 + assert("edward-1519", clust->pages[i]);
39068 + assert("edward-1520", PageUptodate(clust->pages[i]));
39069 +
39070 + result = find_or_create_extent(clust->pages[i]);
39071 + if (result)
39072 + break;
39073 + }
39074 + if (!result) {
39075 + uf_info->container = UF_CONTAINER_EXTENTS;
39076 + complete_file_conversion(inode);
39077 + }
39078 + out:
39079 + all_grabbed2free();
39080 + if (result)
39081 + warning("edward-1453", "Failed to convert file %llu: %i",
39082 + (unsigned long long)get_inode_oid(inode), result);
39083 + return result;
39084 +}
39085 +
39086 +/* Check, then perform or disable conversion if needed */
39087 +int write_conversion_hook(struct file *file, struct inode * inode, loff_t pos,
39088 + reiser4_cluster_t * clust, int * progress)
39089 +{
39090 + int result;
39091 + int check_compress = 0;
39092 + int compressible = 0;
39093 +
39094 + if (!conversion_enabled(inode))
39095 + return 0;
39096 + result = check_position(inode, pos, clust, &check_compress);
39097 + if (result || !check_compress)
39098 + return result;
39099 + result = read_check_compressibility(inode, clust, &compressible);
39100 + if (result)
39101 + return result;
39102 +
39103 + /* At this point page cluster is grabbed and uptodate */
39104 + if (!compressible) {
39105 + result = cryptcompress2unixfile(file, inode, clust);
39106 + if (result == 0)
39107 + *progress = 1;
39108 + }
39109 + else
39110 + result = disable_conversion(inode);
39111 +
39112 + reiser4_release_cluster_pages(clust);
39113 + return result;
39114 +}
39115 +
39116 +static int setattr_conversion_hook(struct inode * inode, struct iattr *attr)
39117 +{
39118 + return (attr->ia_valid & ATTR_SIZE ? disable_conversion(inode) : 0);
39119 +}
39120 +
39121 +/* Protected methods of cryptcompress file plugin constructed
39122 + by the macros above */
39123 +
39124 +/* Wrappers with active protection for:
39125 + . write_cryptcompress;
39126 + . setattr_cryptcompress;
39127 +*/
39128 +
39129 +ssize_t prot_write_cryptcompress(struct file *file, const char __user *buf,
39130 + size_t count, loff_t *off)
39131 +{
39132 + int prot = 0;
39133 + int conv = 0;
39134 + ssize_t written_cr = 0;
39135 + ssize_t written_uf = 0;
39136 + struct inode * inode = file->f_dentry->d_inode;
39137 + struct rw_semaphore * guard = &reiser4_inode_data(inode)->conv_sem;
39138 +
39139 + if (should_protect(inode)) {
39140 + prot = 1;
39141 + down_write(guard);
39142 + }
39143 + written_cr = write_cryptcompress(file, buf, count, off, &conv);
39144 + if (prot)
39145 + up_write(guard);
39146 + if (written_cr < 0)
39147 + return written_cr;
39148 + if (conv)
39149 + written_uf = write_unix_file(file, buf + written_cr,
39150 + count - written_cr, off);
39151 + return written_cr + (written_uf < 0 ? 0 : written_uf);
39152 +}
39153 +
39154 +int prot_setattr_cryptcompress(struct dentry *dentry, struct iattr *attr)
39155 +{
39156 + struct inode * inode = dentry->d_inode;
39157 + return PROT_ACTIVE(int, setattr, (dentry, attr),
39158 + setattr_conversion_hook(inode, attr));
39159 +}
39160 +
39161 +/* Wrappers with passive protection for:
39162 +   . read_cryptcompress;
39163 + . mmap_cryptcompress;
39164 + . release_cryptcompress;
39165 + . sendfile_cryptcompress;
39166 + . delete_object_cryptcompress.
39167 +*/
39168 +ssize_t prot_read_cryptcompress(struct file * file, char __user * buf,
39169 + size_t size, loff_t * off)
39170 +{
39171 + struct inode * inode = file->f_dentry->d_inode;
39172 + return PROT_PASSIVE(ssize_t, read, (file, buf, size, off));
39173 +}
39174 +
39175 +int prot_mmap_cryptcompress(struct file *file, struct vm_area_struct *vma)
39176 +{
39177 + struct inode *inode = file->f_dentry->d_inode;
39178 + return PROT_PASSIVE(int, mmap, (file, vma));
39179 +}
39180 +
39181 +int prot_release_cryptcompress(struct inode *inode, struct file *file)
39182 +{
39183 + return PROT_PASSIVE(int, release, (inode, file));
39184 +}
39185 +
39186 +ssize_t prot_sendfile_cryptcompress(struct file *file, loff_t *ppos,
39187 + size_t count, read_actor_t actor,
39188 + void *target)
39189 +{
39190 + struct inode * inode = file->f_dentry->d_inode;
39191 + return PROT_PASSIVE(ssize_t, sendfile,
39192 + (file, ppos, count, actor, target));
39193 +}
39194 +
39195 +/*
39196 + Local variables:
39197 + c-indentation-style: "K&R"
39198 + mode-name: "LC"
39199 + c-basic-offset: 8
39200 + tab-width: 8
39201 + fill-column: 80
39202 + scroll-step: 1
39203 + End:
39204 +*/
39205 diff --git a/fs/reiser4/plugin/file/invert.c b/fs/reiser4/plugin/file/invert.c
39206 new file mode 100644
39207 index 0000000..7349878
39208 --- /dev/null
39209 +++ b/fs/reiser4/plugin/file/invert.c
39210 @@ -0,0 +1,493 @@
39211 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39212 +
39213 +/* Suppose you want to conveniently read and write a large variety of small files within a single emacs
39214 + buffer, without having a separate buffer for each 8 byte or so file. Inverts are the way to do that. An invert
39215 + provides you with the contents of a set of subfiles plus its own contents. It is a file which inherits other files
39216 + when you read it, and allows you to write to it and through it to the files that it inherits from. In order for it
39217 + to know which subfiles each part of your write should go into, there must be delimiters indicating that. It tries to
39218 + make that easy for you by providing those delimiters in what you read from it.
39219 +
39220 + When you read it, an invert performs an inverted assignment. Instead of taking an assignment command and writing a
39221 + bunch of files, it takes a bunch of files and composes an assignment command for you to read from it that if executed
39222 + would create those files. But which files? Well, that must be specified in the body of the invert using a special
39223 + syntax, and that specification is called the invert of the assignment.
39224 +
39225 + When written to, an invert performs the assignment command that is written
39226 + to it, and modifies its own body to contain the invert of that
39227 + assignment.
39228 +
39229 + In other words, writing to an invert file what you have read from it
39230 + is the identity operation.
39231 +
39232 + Malformed assignments cause write errors. Partial writes are not
39233 + supported in v4.0, but will be.
39234 +
39235 + Example:
39236 +
39237 + If an invert contains:
39238 +
39239 + /filenameA/<>+"(some text stored in the invert)+/filenameB/<>
39240 +
39241 +======================
39242 +Each element in this definition should be an invert, and all files
39243 +should be called recursively - too. This is bad. If one of the
39244 +included files is not a regular or invert file, then we can't read
39245 +main file.
39246 +
39247 +I think it is possible to make this easier:
39248 +
39249 +internal structure of invert file should be like symlink file. But
39250 +read and write methods should be explicitly indicated in the i/o operation.
39251 +
39252 +By default we read and write (if possible) as a symlink, and if we
39253 +specify ..invert at read time, we can likewise specify it at write time.
39254 +
39255 +example:
39256 +/my_invert_file/..invert<- ( (/filenameA<-"(The contents of filenameA))+"(some text stored in the invert)+(/filenameB<-"(The contents of filenameB) ) )
39257 +will create /my_invert_file as an invert, and will create /filenameA and /filenameB with the specified bodies.
39258 +
39259 +read of /my_invert_file/..invert will be
39260 +/filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
39261 +
39262 +but read of /my_invert_file/ will be
39263 +The contents of filenameAsome text stored in the invertThe contents of filenameB
39264 +
39265 +We can also create this file as
39266 +/my_invert_file/<-/filenameA+"(some text stored in the invert)+/filenameB
39267 +will create /my_invert_file, and use the existing files /filenameA and /filenameB.
39268 +
39269 +and when we read it, it will behave as the invert file described previously.
39270 +
39271 +Is this correct?
39272 +
39273 + vv
39274 +DEMIDOV-FIXME-HANS:
39275 +
39276 +Maybe you are right, but then you must disable writes to /my_invert_file/ and only allow writes to /my_invert_file/..invert
39277 +
39278 +Do you agree? Discuss it on reiserfs-list....
39279 +
39280 +-Hans
39281 +=======================
39282 +
39283 + Then a read will return:
39284 +
39285 + /filenameA<-"(The contents of filenameA)+"(some text stored in the invert)+/filenameB<-"(The contents of filenameB)
39286 +
39287 + and a write of the line above to the invert will set the contents of
39288 + the invert and filenameA and filenameB to their original values.
39289 +
39290 + Note that the contents of an invert have no influence on the effect
39291 + of a write unless the write is a partial write (and a write of a
39292 + shorter file without using truncate first is a partial write).
39293 +
39294 + truncate() has no effect on filenameA and filenameB, it merely
39295 + resets the value of the invert.
39296 +
39297 + Writes to subfiles via the invert are implemented by preceding them
39298 + with truncates.
39299 +
39300 + Parse failures cause write failures.
39301 +
39302 + Questions to ponder: should the invert be acted on prior to file
39303 + close when writing to an open filedescriptor?
39304 +
39305 + Example:
39306 +
39307 + If an invert contains:
39308 +
39309 + "(This text and a pair of quotes are all that is here.)
39310 +
39311 +Then a read will return:
39312 +
39313 + "(This text and a pair of quotes are all that is here.)
39314 +
39315 +*/
39316 +
39317 +/* OPEN method places a struct file in memory associated with invert body
39318 + and returns something like file descriptor to the user for the future access
39319 + to the invert file.
39320 +   During opening we parse the body of invert and get a list of the 'entries'
39321 +   (that describe all its subfiles) and place a pointer to the first struct in
39322 +   the reiser4-specific part of the invert inode (arbitrary decision).
39323 +
39324 +   Each subfile is described by the struct inv_entry that has a pointer @sd to
39325 +   in-core based stat-data and a pointer @f to struct file (if we find that the
39326 +   subfile uses more than one unformatted node (arbitrary decision), we load
39327 + struct file in memory, otherwise we load base stat-data (and maybe 1-2 bytes
39328 + of some other information we need)
39329 +
39330 + Since READ and WRITE methods for inverts were formulated in assignment
39331 + language, they don't contain arguments 'size' and 'offset' that make sense
39332 + only in ordinary read/write methods.
39333 +
39334 + READ method is a combination of two methods:
39335 +   1) ordinary read method (with offset=0, length = @f->...->i_size) for entries
39336 + with @f != 0, this method uses pointer on struct file as an argument
39337 + 2) read method for inode-less files with @sd != 0, this method uses
39338 + in-core based stat-data instead struct file as an argument.
39339 + in the first case we don't use pagecache, just copy data that we got after
39340 + cbk() into userspace.
39341 +
39342 + WRITE method for invert files is more complex.
39343 +   Besides the WRITE interface declared in assignment language above, we need
39344 + to have an opportunity to edit unwrapped body of invert file with some
39345 +   text editor, which means we need a GENERIC WRITE METHOD for the invert file:
39346 +
39347 + my_invert_file/..invert <- "string"
39348 +
39349 + this method parses "string" and looks for correct subfile signatures, also
39350 + the parsing process splits this "string" on the set of flows in accordance
39351 + with the set of subfiles specified by this signarure.
39352 + The found list of signatures #S is compared with the opened one #I of invert
39353 + file. If it doesn't have this one (#I==0, it will be so for instance if we
39354 +   have just created this invert file) the write method assigns the found signature
39355 + (#I=#S;) to the invert file. Then if #I==#S, generic write method splits
39356 +   itself into write methods for ordinary or light-weight files, or calls
39357 +   itself recursively for invert files with the corresponding flows.
39358 +   I am not sure, but the list of signatures looks like what Mr. Demidov means
39359 + by 'delimiters'.
39360 +
39361 + The cases when #S<#I (#I<#S) (in the sense of set-theory) are also available
39362 +   and cause deletion (creation) of subfiles (arbitrary decision - it may look
39363 +   too complex, but this interface will be the most complete). The order of entries
39364 + of list #S (#I) and inherited order on #I (#S) must coincide.
39365 +   The other parsing results give a malformed signature that aborts the READ method
39366 + and releases all resources.
39367 +
39368 + Format of subfile (entry) signature:
39369 +
39370 + "START_MAGIC"<>(TYPE="...",LOOKUP_ARG="...")SUBFILE_BODY"END_MAGIC"
39371 +
39372 + Legend:
39373 +
39374 + START_MAGIC - keyword indicates the start of subfile signature;
39375 +
39376 + <> indicates the start of 'subfile metadata', that is the pair
39377 + (TYPE="...",LOOKUP_ARG="...") in parenthesis separated by comma.
39378 +
39379 + TYPE - the string "type" indicates the start of one of the three words:
39380 + - ORDINARY_FILE,
39381 + - LIGHT_WEIGHT_FILE,
39382 + - INVERT_FILE;
39383 +
39384 + LOOKUP_ARG - lookup argument depends on previous type:
39385 + */
39386 +
39387 + /************************************************************/
39388 + /* TYPE * LOOKUP ARGUMENT */
39389 + /************************************************************/
39390 +  /* LIGHT_WEIGHT_FILE * stat-data key */
39391 + /************************************************************/
39392 + /* ORDINARY_FILE * filename */
39393 + /************************************************************/
39394 + /* INVERT_FILE * filename */
39395 + /************************************************************/
39396 +
39397 + /* where:
39398 + *stat-data key - the string containing the stat-data key of this subfile; it
39399 + will be passed to the fast-access lookup method for light-weight files;
39400 + *filename - pathname of this subfile; it will be passed to VFS lookup methods
39401 + for ordinary and invert files;
39402 +
39403 + SUBFILE_BODY - data of this subfile (it will go to the flow)
39404 + END_MAGIC - the keyword indicates the end of subfile signature.
39405 +
39406 + The other symbols inside the signature are interpreted as 'unformatted
39407 + content', which is available with VFS's read_link() (arbitrary decision).
39408 +
39409 + NOTE: The parse method for the body of an invert file uses the signatures
39410 + described above _without_ subfile bodies.
39411 +
39412 + Now the only unclear thing is WRITE to a regular light-weight subfile A, which
39413 + we can describe only in assignment language:
39414 +
39415 + A <- "some_string"
39416 +
39417 + I guess we don't want to change the stat-data and body items of file A
39418 + if this file exists and size(A) != size("some_string"), because this operation
39419 + is expensive; so we only do a partial write if size(A) > size("some_string"),
39420 + and truncate "some_string" and then do A <- "truncated string" if
39421 + size(A) < size("some_string"). This decision is also arbitrary.
39422 + */
39423 +
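  To make the generic WRITE algorithm above concrete, here is a sketch of its
  control flow. This is an editorial illustration, not code from the patch:
  every helper in it (parse_signatures, opened_signatures, sig_diff, and the
  sig_list type) is hypothetical and stands in for the parsing machinery the
  comment describes.

/* sketch only: generic WRITE for an invert file, per the comment above;
 * all helpers and the sig_list type are hypothetical */
static int invert_generic_write(struct inode *invert, const char *string)
{
	sig_list *S, *I;

	S = parse_signatures(string);	/* split "string" into flows */
	if (S == NULL)
		return RETERR(-EINVAL);	/* malformed signature: abort */
	I = opened_signatures(invert);
	if (sig_list_empty(I))
		sig_list_assign(I, S);	/* #I = #S for a fresh invert */
	create_subfiles(sig_diff(S, I));	/* #S \ #I: create new subfiles */
	delete_subfiles(sig_diff(I, S));	/* #I \ #S: delete subfiles */
	return write_flows(invert, S);	/* dispatch per subfile type */
}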
39424 +/* here is infrastructure for formatted flows */
39425 +
39426 +#define SUBFILE_HEADER_MAGIC 0x19196605
39427 +#define FLOW_HEADER_MAGIC 0x01194304
39428 +
39429 +#include "../plugin.h"
39430 +#include "../../debug.h"
39431 +#include "../../forward.h"
39432 +#include "../object.h"
39433 +#include "../item/item.h"
39434 +#include "../item/static_stat.h"
39435 +#include "../../dformat.h"
39436 +#include "../znode.h"
39437 +#include "../inode.h"
39438 +
39439 +#include <linux/types.h>
39440 +#include <linux/fs.h> /* for struct file */
39441 +#include <linux/list.h> /* for struct list_head */
39442 +
39443 +typedef enum {
39444 + LIGHT_WEIGHT_FILE,
39445 + ORDINARY_FILE,
39446 + INVERT_FILE
39447 +} inv_entry_type;
39448 +
39449 +typedef struct flow_header {
39450 +	d32 fl_magic;
39451 +	d16 fl_nr; /* number of subfiles in the flow */
39452 +} flow_header;
39453 +
39454 +typedef struct subfile_header {
39455 +	d32 sh_magic; /* subfile magic */
39456 +	d16 sh_type; /* type of subfile: light-weight, ordinary, invert */
39457 +	d16 sh_arg_len; /* length of lookup argument (filename, key) */
39458 +	d32 sh_body_len; /* length of subfile body */
39459 +} subfile_header;
39460 +
39461 +/* functions to get/set fields of flow header */
39462 +
39463 +static void fl_set_magic(flow_header * fh, __u32 value)
39464 +{
39465 +	cputod32(value, &fh->fl_magic);
39466 +}
39467 +
39468 +static __u32 fl_get_magic(flow_header * fh)
39469 +{
39470 +	return d32tocpu(&fh->fl_magic);
39471 +}
39472 +static void fl_set_number(flow_header * fh, __u16 value)
39473 +{
39474 +	cputod16(value, &fh->fl_nr);
39475 +}
39476 +static unsigned fl_get_number(flow_header * fh)
39477 +{
39478 +	return d16tocpu(&fh->fl_nr);
39479 +}
39480 +
39481 +/* functions to get/set fields of subfile header */
39482 +
39483 +static void sh_set_magic(subfile_header * sh, __u32 value)
39484 +{
39485 + cputod32(value, &sh->sh_magic);
39486 +}
39487 +
39488 +static __u32 sh_get_magic(subfile_header * sh)
39489 +{
39490 + return d32tocpu(&sh->sh_magic);
39491 +}
39492 +static void sh_set_type(subfile_header * sh, __u16 value)
39493 +{
39494 +	cputod16(value, &sh->sh_type);
39495 +}
39496 +static unsigned sh_get_type(subfile_header * sh)
39497 +{
39498 +	return d16tocpu(&sh->sh_type);
39499 +}
39500 +static void sh_set_arg_len(subfile_header * sh, __u16 value)
39501 +{
39502 + cputod16(value, &sh->sh_arg_len);
39503 +}
39504 +static unsigned sh_get_arg_len(subfile_header * sh)
39505 +{
39506 + return d16tocpu(&sh->sh_arg_len);
39507 +}
39508 +static void sh_set_body_len(subfile_header * sh, __u32 value)
39509 +{
39510 + cputod32(value, &sh->sh_body_len);
39511 +}
39512 +
39513 +static __u32 sh_get_body_len(subfile_header * sh)
39514 +{
39515 + return d32tocpu(&sh->sh_body_len);
39516 +}
39517 +
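  As a usage illustration of the accessors above, here is a sketch of how a
  parser might walk a serialized flow buffer. The back-to-back layout (flow
  header, then for each subfile: header, lookup argument, body) is an
  assumption made for this illustration, not something the patch pins down.

/* illustration only: walk the subfile headers of a flow buffer using the
 * accessors above (assumed layout: header, argument, body, repeated) */
static int walk_flow(char *buf)
{
	flow_header *fh = (flow_header *) buf;
	char *p = buf + sizeof(flow_header);
	unsigned i;

	if (fl_get_magic(fh) != FLOW_HEADER_MAGIC)
		return RETERR(-EINVAL);
	for (i = 0; i < fl_get_number(fh); i++) {
		subfile_header *sh = (subfile_header *) p;

		if (sh_get_magic(sh) != SUBFILE_HEADER_MAGIC)
			return RETERR(-EINVAL);
		p += sizeof(subfile_header) +
		    sh_get_arg_len(sh) + sh_get_body_len(sh);
	}
	return 0;
}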
39518 +/* in-core minimal stat-data, light-weight analog of inode */
39519 +
39520 +struct incore_sd_base {
39521 + umode_t isd_mode;
39522 + nlink_t isd_nlink;
39523 + loff_t isd_size;
39524 + char *isd_data; /* 'subflow' to write */
39525 +};
39526 +
39527 +/* opening an invert creates a list of invert entries,
39528 +   every entry is represented by struct inv_entry */
39529 +
39530 +struct inv_entry {
39531 +	struct list_head ie_list;
39532 +	struct file *ie_file; /* this is NULL if the file doesn't
39533 +				 have unformatted nodes */
39534 +	struct incore_sd_base *ie_sd; /* inode-less analog of struct file */
39535 +};
39536 +
39537 +/* allocate and init invert entry */
39538 +
39539 +static struct inv_entry *allocate_inv_entry(void)
39540 +{
39541 + struct inv_entry *inv_entry;
39542 +
39543 + inv_entry = reiser4_kmalloc(sizeof(struct inv_entry), GFP_KERNEL);
39544 + if (!inv_entry)
39545 + return ERR_PTR(RETERR(-ENOMEM));
39546 + inv_entry->ie_file = NULL;
39547 + inv_entry->ie_sd = NULL;
39548 + INIT_LIST_HEAD(&inv_entry->ie_list);
39549 + return inv_entry;
39550 +}
39551 +
39552 +static int put_inv_entry(struct inv_entry *ientry)
39553 +{
39554 +	int result = 0;
39555 +
39556 +	assert("edward-96", ientry != NULL);
39557 +
39558 +	list_del(&ientry->ie_list);
39559 +	/* close the file and free the in-core stat-data (if any)
39560 +	   before freeing the entry itself */
39561 +	if (ientry->ie_file != NULL)
39562 +		result = filp_close(ientry->ie_file, NULL);
39563 +	if (ientry->ie_sd != NULL)
39564 +		kfree(ientry->ie_sd);
39565 +	kfree(ientry);
39566 +	return result;
39567 +}
39568 +
39569 +static int allocate_incore_sd_base(struct inv_entry *inv_entry)
39570 +{
39571 +	struct incore_sd_base *isd_base;
39572 +	assert("edward-98", inv_entry != NULL);
39573 +	assert("edward-99", inv_entry->ie_file == NULL);
39574 +	assert("edward-100", inv_entry->ie_sd == NULL);
39575 +
39576 +	isd_base = reiser4_kmalloc(sizeof(struct incore_sd_base), GFP_KERNEL);
39577 +	if (!isd_base)
39578 +		return RETERR(-ENOMEM);
39579 +	inv_entry->ie_sd = isd_base;
39580 +	return 0;
39581 +}
39581 +
39582 +/* this can be installed as ->init_inv_entry () method of
39583 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
39584 +   Copies data from the on-disk stat-data format into a light-weight analog of
39585 +   inode. Doesn't handle stat-data extensions. */
39586 +
39587 +static void sd_base_load(struct inv_entry *inv_entry, char *sd)
39588 +{
39589 + reiser4_stat_data_base *sd_base;
39590 +
39591 + assert("edward-101", inv_entry != NULL);
39592 + assert("edward-101", inv_entry->ie_sd != NULL);
39593 + assert("edward-102", sd != NULL);
39594 +
39595 + sd_base = (reiser4_stat_data_base *) sd;
39596 +	inv_entry->ie_sd->isd_mode = d16tocpu(&sd_base->mode);
39597 +	inv_entry->ie_sd->isd_nlink = d32tocpu(&sd_base->nlink);
39598 +	inv_entry->ie_sd->isd_size = d64tocpu(&sd_base->size);
39599 +	inv_entry->ie_sd->isd_data = NULL;
39600 +}
39601 +
39602 +/* initialise incore stat-data */
39603 +
39604 +static void init_incore_sd_base(struct inv_entry *inv_entry, coord_t * coord)
39605 +{
39606 +	item_plugin *plugin = item_plugin_by_coord(coord);
39607 + void *body = item_body_by_coord(coord);
39608 +
39609 + assert("edward-103", inv_entry != NULL);
39610 + assert("edward-104", plugin != NULL);
39611 + assert("edward-105", body != NULL);
39612 +
39613 + sd_base_load(inv_entry, body);
39614 +}
39615 +
39616 +/* takes a key or filename, allocates a new inv_entry,
39617 +   initializes it and adds it to the list;
39618 +   we use lookup_sd_by_key() for light-weight files and VFS lookup by filename */
39619 +
39620 +int get_inv_entry(struct inode *invert_inode, /* inode of invert's body */
39621 + inv_entry_type type, /* LIGHT-WEIGHT or ORDINARY */
39622 + const reiser4_key * key, /* key of invert entry stat-data */
39623 + char *filename, /* filename of the file to be opened */
39624 + int flags, int mode)
39625 +{
39626 + int result;
39627 + struct inv_entry *ientry;
39628 +
39629 + assert("edward-107", invert_inode != NULL);
39630 +
39631 + ientry = allocate_inv_entry();
39632 + if (IS_ERR(ientry))
39633 + return (PTR_ERR(ientry));
39634 +
39635 + if (type == LIGHT_WEIGHT_FILE) {
39636 + coord_t coord;
39637 + lock_handle lh;
39638 +
39639 + assert("edward-108", key != NULL);
39640 +
39641 + init_coord(&coord);
39642 + init_lh(&lh);
39643 + result =
39644 + lookup_sd_by_key(reiser4_tree_by_inode(invert_inode),
39645 + ZNODE_READ_LOCK, &coord, &lh, key);
39646 + if (result == 0)
39647 +			init_incore_sd_base(ientry, &coord);
39648 +
39649 + done_lh(&lh);
39650 + done_coord(&coord);
39651 + return (result);
39652 +	} else {
39653 +		struct file *file;
39654 +		assert("edward-108", filename != NULL);
39655 +		file = filp_open(filename, flags, mode);
39656 +		/* FIXME_EDWARD here we need to check that we
39657 +		   didn't follow any mount point */
39658 +
39659 + if (IS_ERR(file))
39660 + return (PTR_ERR(file));
39661 + ientry->ie_file = file;
39662 + return 0;
39663 + }
39664 +}
39665 +
39666 +/* takes inode of invert, reads the body of this invert, parses it,
39667 +   opens all invert entries and returns a pointer to the first inv_entry */
39668 +
39669 +struct inv_entry *open_invert(struct file *invert_file)
39670 +{
39671 +
39672 +}
39673 +
39674 +ssize_t subfile_read(struct inv_entry *invert_entry, flow * f)
39675 +{
39676 +
39677 +}
39678 +
39679 +ssize_t subfile_write(struct inv_entry *invert_entry, flow * f)
39680 +{
39681 +
39682 +}
39683 +
39684 +ssize_t invert_read(struct file *file, flow * f)
39685 +{
39686 +
39687 +}
39688 +
39689 +ssize_t invert_write(struct file *file, flow * f)
39690 +{
39691 +
39692 +}
39693 +
39694 +/* Make Linus happy.
39695 + Local variables:
39696 + c-indentation-style: "K&R"
39697 + mode-name: "LC"
39698 + c-basic-offset: 8
39699 + tab-width: 8
39700 + fill-column: 120
39701 + scroll-step: 1
39702 + End:
39703 +*/
39704 diff --git a/fs/reiser4/plugin/file/symfile.c b/fs/reiser4/plugin/file/symfile.c
39705 new file mode 100644
39706 index 0000000..814dfb8
39707 --- /dev/null
39708 +++ b/fs/reiser4/plugin/file/symfile.c
39709 @@ -0,0 +1,87 @@
39710 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39711 +
39712 +/* Symfiles are a generalization of Unix symlinks.
39713 +
39714 + A symfile when read behaves as though you took its contents and
39715 + substituted them into the reiser4 naming system as the right hand side
39716 + of an assignment, and then read that which you had assigned to it.
39717 +
39718 + A key issue for symfiles is how to implement writes through to
39719 + subfiles. In general, one must have some method of determining what
39720 + of that which is written to the symfile is written to what subfile.
39721 + This can be done by use of custom plugin methods written by users, or
39722 + by using a few general methods we provide for those willing to endure
39723 + the insertion of delimiters into what is read.
39724 +
39725 + Writing to symfiles without delimiters to denote what is written to
39726 + what subfile is not supported by any plugins we provide in this
39727 + release. Our most sophisticated support for writes is that embodied
39728 + by the invert plugin (see invert.c).
39729 +
39730 + A read only version of the /etc/passwd file might be
39731 + constructed as a symfile whose contents are as follows:
39732 +
39733 + /etc/passwd/userlines/*
39734 +
39735 + or
39736 +
39737 + /etc/passwd/userlines/demidov+/etc/passwd/userlines/edward+/etc/passwd/userlines/reiser+/etc/passwd/userlines/root
39738 +
39739 + or
39740 +
39741 + /etc/passwd/userlines/(demidov+edward+reiser+root)
39742 +
39743 + A symfile with contents
39744 +
39745 + /filenameA+"(some text stored in the uninvertable symfile)+/filenameB
39746 +
39747 + will return when read
39748 +
39749 + The contents of filenameAsome text stored in the uninvertable symfileThe contents of filenameB
39750 +
39751 + and write of what has been read will not be possible to implement as
39752 + an identity operation because there are no delimiters denoting the
39753 + boundaries of what is to be written to what subfile.
39754 +
39755 + Note that one could make this a read/write symfile if one specified
39756 + delimiters, and the write method understood that those delimiters marked
39757 + the boundaries of what was written to which subfiles.
39758 +
39759 + So, specifying the symfile in a manner that allows writes:
39760 +
39761 + /etc/passwd/userlines/demidov+"(
39762 + )+/etc/passwd/userlines/edward+"(
39763 + )+/etc/passwd/userlines/reiser+"(
39764 + )+/etc/passwd/userlines/root+"(
39765 + )
39766 +
39767 + or
39768 +
39769 + /etc/passwd/userlines/(demidov+"(
39770 + )+edward+"(
39771 + )+reiser+"(
39772 + )+root+"(
39773 + ))
39774 +
39775 + and the file demidov might be specified as:
39776 +
39777 + /etc/passwd/userlines/demidov/username+"(:)+/etc/passwd/userlines/demidov/password+"(:)+/etc/passwd/userlines/demidov/userid+"(:)+/etc/passwd/userlines/demidov/groupid+"(:)+/etc/passwd/userlines/demidov/gecos+"(:)+/etc/passwd/userlines/demidov/home+"(:)+/etc/passwd/userlines/demidov/shell
39778 +
39779 + or
39780 +
39781 + /etc/passwd/userlines/demidov/(username+"(:)+password+"(:)+userid+"(:)+groupid+"(:)+gecos+"(:)+home+"(:)+shell)
39782 +
39783 + Notice that if the file demidov has a carriage return in it, the
39784 + parsing fails, but then if you put carriage returns in the wrong place
39785 + in a normal /etc/passwd file it breaks things also.
39786 +
39787 + Note that it is forbidden to have no text between two interpolations
39788 + if one wants to be able to define what parts of a write go to what
39789 + subfiles referenced in an interpolation.
39790 +
39791 + If one wants to be able to add new lines by writing to the file, one
39792 + must either write a custom plugin for /etc/passwd that knows how to
39793 + name an added line, or one must use an invert, or one must use a more
39794 + sophisticated symfile syntax that we are not planning to write for
39795 + version 4.0.
39796 +*/
39797 diff --git a/fs/reiser4/plugin/file/symlink.c b/fs/reiser4/plugin/file/symlink.c
39798 new file mode 100644
39799 index 0000000..bcf3ef8
39800 --- /dev/null
39801 +++ b/fs/reiser4/plugin/file/symlink.c
39802 @@ -0,0 +1,95 @@
39803 +/* Copyright 2002, 2003, 2005 by Hans Reiser, licensing governed by reiser4/README */
39804 +
39805 +#include "../../inode.h"
39806 +
39807 +#include <linux/types.h>
39808 +#include <linux/fs.h>
39809 +
39810 +/* file plugin methods specific for symlink files
39811 + (SYMLINK_FILE_PLUGIN_ID) */
39812 +
39813 +/* this is implementation of create_object method of file plugin for
39814 + SYMLINK_FILE_PLUGIN_ID
39815 + */
39816 +
39817 +/**
39818 + * reiser4_create_symlink - create_object of file plugin for SYMLINK_FILE_PLUGIN_ID
39819 + * @symlink: inode of symlink object
39820 + * @dir: inode of parent directory
39821 + * @info: parameters of new object
39822 + *
39823 + * Inserts stat data with symlink extension into the tree.
39824 + */
39825 +int reiser4_create_symlink(struct inode *symlink,
39826 + struct inode *dir UNUSED_ARG,
39827 + reiser4_object_create_data *data /* info passed to us
39828 + * this is filled by
39829 + * reiser4() syscall
39830 + * in particular */)
39831 +{
39832 + int result;
39833 +
39834 + assert("nikita-680", symlink != NULL);
39835 + assert("nikita-681", S_ISLNK(symlink->i_mode));
39836 + assert("nikita-685", reiser4_inode_get_flag(symlink, REISER4_NO_SD));
39837 + assert("nikita-682", dir != NULL);
39838 + assert("nikita-684", data != NULL);
39839 + assert("nikita-686", data->id == SYMLINK_FILE_PLUGIN_ID);
39840 +
39841 + /*
39842 + * stat data of symlink has symlink extension in which we store
39843 + * symlink content, that is, path symlink is pointing to.
39844 + */
39845 + reiser4_inode_data(symlink)->extmask |= (1 << SYMLINK_STAT);
39846 +
39847 + assert("vs-838", symlink->i_private == NULL);
39848 + symlink->i_private = (void *)data->name;
39849 +
39850 + assert("vs-843", symlink->i_size == 0);
39851 + INODE_SET_FIELD(symlink, i_size, strlen(data->name));
39852 +
39853 + /* insert stat data appended with data->name */
39854 + result = inode_file_plugin(symlink)->write_sd_by_inode(symlink);
39855 + if (result) {
39856 + /* FIXME-VS: Make sure that symlink->i_private is not attached
39857 + to kmalloced data */
39858 + INODE_SET_FIELD(symlink, i_size, 0);
39859 + } else {
39860 + assert("vs-849", symlink->i_private
39861 + && reiser4_inode_get_flag(symlink,
39862 + REISER4_GENERIC_PTR_USED));
39863 + assert("vs-850",
39864 + !memcmp((char *)symlink->i_private, data->name,
39865 + (size_t) symlink->i_size + 1));
39866 + }
39867 + return result;
39868 +}
39869 +
39870 +/* this is implementation of destroy_inode method of file plugin for
39871 + SYMLINK_FILE_PLUGIN_ID
39872 + */
39873 +void destroy_inode_symlink(struct inode *inode)
39874 +{
39875 + assert("edward-799",
39876 + inode_file_plugin(inode) ==
39877 + file_plugin_by_id(SYMLINK_FILE_PLUGIN_ID));
39878 + assert("edward-800", !is_bad_inode(inode) && is_inode_loaded(inode));
39879 + assert("edward-801", reiser4_inode_get_flag(inode,
39880 + REISER4_GENERIC_PTR_USED));
39881 + assert("vs-839", S_ISLNK(inode->i_mode));
39882 +
39883 + kfree(inode->i_private);
39884 + inode->i_private = NULL;
39885 + reiser4_inode_clr_flag(inode, REISER4_GENERIC_PTR_USED);
39886 +}
39887 +
39888 +/*
39889 + Local variables:
39890 + c-indentation-style: "K&R"
39891 + mode-name: "LC"
39892 + c-basic-offset: 8
39893 + tab-width: 8
39894 + fill-column: 80
39895 + scroll-step: 1
39896 + End:
39897 +*/
39898 diff --git a/fs/reiser4/plugin/file/tail_conversion.c b/fs/reiser4/plugin/file/tail_conversion.c
39899 new file mode 100644
39900 index 0000000..b57776f
39901 --- /dev/null
39902 +++ b/fs/reiser4/plugin/file/tail_conversion.c
39903 @@ -0,0 +1,726 @@
39904 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
39905 +
39906 +#include "../../inode.h"
39907 +#include "../../super.h"
39908 +#include "../../page_cache.h"
39909 +#include "../../carry.h"
39910 +#include "../../safe_link.h"
39911 +#include "../../vfs_ops.h"
39912 +
39913 +#include <linux/writeback.h>
39914 +
39915 +/* this file contains:
39916 + tail2extent and extent2tail */
39917 +
39918 +/* exclusive access to a file is acquired when file state changes: tail2extent, empty2tail, extent2tail, etc */
39919 +void get_exclusive_access(unix_file_info_t * uf_info)
39920 +{
39921 + assert("nikita-3028", reiser4_schedulable());
39922 + assert("nikita-3047", LOCK_CNT_NIL(inode_sem_w));
39923 + assert("nikita-3048", LOCK_CNT_NIL(inode_sem_r));
39924 + /*
39925 + * "deadlock avoidance": sometimes we commit a transaction under
39926 + * rw-semaphore on a file. Such commit can deadlock with another
39927 + * thread that captured some block (hence preventing atom from being
39928 + * committed) and waits on rw-semaphore.
39929 + */
39930 + reiser4_txn_restart_current();
39931 + LOCK_CNT_INC(inode_sem_w);
39932 + down_write(&uf_info->latch);
39933 + uf_info->exclusive_use = 1;
39934 + assert("vs-1713", uf_info->ea_owner == NULL);
39935 + assert("vs-1713", atomic_read(&uf_info->nr_neas) == 0);
39936 + ON_DEBUG(uf_info->ea_owner = current);
39937 +}
39938 +
39939 +void drop_exclusive_access(unix_file_info_t * uf_info)
39940 +{
39941 + assert("vs-1714", uf_info->ea_owner == current);
39942 + assert("vs-1715", atomic_read(&uf_info->nr_neas) == 0);
39943 + ON_DEBUG(uf_info->ea_owner = NULL);
39944 + uf_info->exclusive_use = 0;
39945 + up_write(&uf_info->latch);
39946 + assert("nikita-3049", LOCK_CNT_NIL(inode_sem_r));
39947 + assert("nikita-3049", LOCK_CNT_GTZ(inode_sem_w));
39948 + LOCK_CNT_DEC(inode_sem_w);
39949 + reiser4_txn_restart_current();
39950 +}
39951 +
39952 +/**
39953 + * nea_grabbed - do something when file semaphore is down_read-ed
39954 + * @uf_info:
39955 + *
39956 + * This is called when nonexclusive access is obtained on a file. Everything
39957 + * it does is for debugging purposes.
39958 + */
39959 +static void nea_grabbed(unix_file_info_t *uf_info)
39960 +{
39961 +#if REISER4_DEBUG
39962 + LOCK_CNT_INC(inode_sem_r);
39963 + assert("vs-1716", uf_info->ea_owner == NULL);
39964 + atomic_inc(&uf_info->nr_neas);
39965 + uf_info->last_reader = current;
39966 +#endif
39967 +}
39968 +
39969 +/**
39970 + * get_nonexclusive_access - get nonexclusive access to a file
39971 + * @uf_info: unix file specific part of inode to obtain access to
39972 + *
39973 + * Nonexclusive access is obtained on a file before read, write, readpage.
39974 + */
39975 +void get_nonexclusive_access(unix_file_info_t *uf_info)
39976 +{
39977 + assert("nikita-3029", reiser4_schedulable());
39978 + assert("nikita-3361", get_current_context()->trans->atom == NULL);
39979 +
39980 + down_read(&uf_info->latch);
39981 + nea_grabbed(uf_info);
39982 +}
39983 +
39984 +/**
39985 + * try_to_get_nonexclusive_access - try to get nonexclusive access to a file
39986 + * @uf_info: unix file specific part of inode to obtain access to
39987 + *
39988 + * Non-blocking version of nonexclusive access obtaining.
39989 + */
39990 +int try_to_get_nonexclusive_access(unix_file_info_t *uf_info)
39991 +{
39992 + int result;
39993 +
39994 + result = down_read_trylock(&uf_info->latch);
39995 + if (result)
39996 + nea_grabbed(uf_info);
39997 + return result;
39998 +}
39999 +
40000 +void drop_nonexclusive_access(unix_file_info_t * uf_info)
40001 +{
40002 + assert("vs-1718", uf_info->ea_owner == NULL);
40003 + assert("vs-1719", atomic_read(&uf_info->nr_neas) > 0);
40004 + ON_DEBUG(atomic_dec(&uf_info->nr_neas));
40005 +
40006 + up_read(&uf_info->latch);
40007 +
40008 + LOCK_CNT_DEC(inode_sem_r);
40009 + reiser4_txn_restart_current();
40010 +}
40011 +
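  As a usage note, the intended pairing of these helpers (inferred from the
  comments above, not spelled out in this hunk) is that readers and writers of
  the file body bracket their work with non-exclusive access, while
  state-changing operations take exclusive access:

	/* sketch of the inferred locking pattern */
	get_nonexclusive_access(uf_info);
	/* ... read or write the file body ... */
	drop_nonexclusive_access(uf_info);

	get_exclusive_access(uf_info);
	/* ... change file state, e.g. tail2extent(uf_info) ... */
	drop_exclusive_access(uf_info);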
40012 +/* part of tail2extent. Cut all items covering @count bytes starting from
40013 + @offset */
40014 +/* Audited by: green(2002.06.15) */
40015 +static int cut_formatting_items(struct inode *inode, loff_t offset, int count)
40016 +{
40017 + reiser4_key from, to;
40018 +
40019 +	/* AUDIT: How about putting an assertion here that would check
40020 +	   that the whole provided range is covered by tail items only? */
40021 + /* key of first byte in the range to be cut */
40022 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
40023 +
40024 + /* key of last byte in that range */
40025 + to = from;
40026 + set_key_offset(&to, (__u64) (offset + count - 1));
40027 +
40028 + /* cut everything between those keys */
40029 + return reiser4_cut_tree(reiser4_tree_by_inode(inode), &from, &to,
40030 + inode, 0);
40031 +}
40032 +
40033 +static void release_all_pages(struct page **pages, unsigned nr_pages)
40034 +{
40035 + unsigned i;
40036 +
40037 + for (i = 0; i < nr_pages; i++) {
40038 + if (pages[i] == NULL) {
40039 + unsigned j;
40040 + for (j = i + 1; j < nr_pages; j++)
40041 + assert("vs-1620", pages[j] == NULL);
40042 + break;
40043 + }
40044 + page_cache_release(pages[i]);
40045 + pages[i] = NULL;
40046 + }
40047 +}
40048 +
40049 +/* part of tail2extent. Replace tail items with an extent item. The content
40050 +   of tail items (@count bytes) being cut has already been copied into
40051 +   pages. find_or_create_extent() is called to create extents corresponding
40052 +   to those pages */
40053 +static int replace(struct inode *inode, struct page **pages, unsigned nr_pages, int count)
40054 +{
40055 + int result;
40056 + unsigned i;
40057 + STORE_COUNTERS;
40058 +
40059 + if (nr_pages == 0)
40060 + return 0;
40061 +
40062 + assert("vs-596", pages[0]);
40063 +
40064 + /* cut copied items */
40065 + result = cut_formatting_items(inode, page_offset(pages[0]), count);
40066 + if (result)
40067 + return result;
40068 +
40069 + CHECK_COUNTERS;
40070 +
40071 + /* put into tree replacement for just removed items: extent item, namely */
40072 + for (i = 0; i < nr_pages; i++) {
40073 + result = add_to_page_cache_lru(pages[i], inode->i_mapping,
40074 + pages[i]->index,
40075 + mapping_gfp_mask(inode->
40076 + i_mapping));
40077 + if (result)
40078 + break;
40079 + unlock_page(pages[i]);
40080 + result = find_or_create_extent(pages[i]);
40081 + if (result)
40082 + break;
40083 + SetPageUptodate(pages[i]);
40084 + }
40085 + return result;
40086 +}
40087 +
40088 +#define TAIL2EXTENT_PAGE_NUM 3 /* number of pages to fill before cutting tail
40089 + * items */
40090 +
40091 +static int reserve_tail2extent_iteration(struct inode *inode)
40092 +{
40093 + reiser4_block_nr unformatted_nodes;
40094 + reiser4_tree *tree;
40095 +
40096 + tree = reiser4_tree_by_inode(inode);
40097 +
40098 + /* number of unformatted nodes which will be created */
40099 + unformatted_nodes = TAIL2EXTENT_PAGE_NUM;
40100 +
40101 + /*
40102 +	 * space required for one iteration of tail->extent conversion:
40103 + *
40104 + * 1. kill N tail items
40105 + *
40106 + * 2. insert TAIL2EXTENT_PAGE_NUM unformatted nodes
40107 + *
40108 + * 3. insert TAIL2EXTENT_PAGE_NUM (worst-case single-block
40109 + * extents) extent units.
40110 + *
40111 + * 4. drilling to the leaf level by coord_by_key()
40112 + *
40113 + * 5. possible update of stat-data
40114 + *
40115 + */
40116 + grab_space_enable();
40117 + return reiser4_grab_space
40118 + (2 * tree->height +
40119 + TAIL2EXTENT_PAGE_NUM +
40120 + TAIL2EXTENT_PAGE_NUM * estimate_one_insert_into_item(tree) +
40121 + 1 + estimate_one_insert_item(tree) +
40122 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
40123 +}
40124 +
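  As a back-of-envelope check of the reservation above (illustration only,
  with assumed estimate values):

/* illustration: with tree->height == 5 and both insert estimates == 6
 * (assumed numbers), one tail2extent iteration grabs
 *
 *	2*5 + 3 + 3*6 + 1 + 6 + sd-update = 38 + sd-update blocks,
 *
 * where sd-update is inode_file_plugin(inode)->estimate.update(inode).
 */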
40125 +/* clear stat-data's flag indicating that the file is being converted */
40126 +static int complete_conversion(struct inode *inode)
40127 +{
40128 + int result;
40129 +
40130 + grab_space_enable();
40131 + result =
40132 + reiser4_grab_space(inode_file_plugin(inode)->estimate.update(inode),
40133 + BA_CAN_COMMIT);
40134 + if (result == 0) {
40135 + reiser4_inode_clr_flag(inode, REISER4_PART_MIXED);
40136 + result = reiser4_update_sd(inode);
40137 + }
40138 + if (result)
40139 + warning("vs-1696", "Failed to clear converting bit of %llu: %i",
40140 + (unsigned long long)get_inode_oid(inode), result);
40141 + return 0;
40142 +}
40143 +
40144 +/**
40145 + * find_start
40146 + * @inode:
40147 + * @id:
40148 + * @offset:
40149 + *
40150 + * this is used by tail2extent and extent2tail to detect where a previous
40151 + * incomplete conversion stopped
40152 + */
40153 +static int find_start(struct inode *inode, reiser4_plugin_id id, __u64 *offset)
40154 +{
40155 + int result;
40156 + lock_handle lh;
40157 + coord_t coord;
40158 + unix_file_info_t *ufo;
40159 + int found;
40160 + reiser4_key key;
40161 +
40162 + ufo = unix_file_inode_data(inode);
40163 + init_lh(&lh);
40164 + result = 0;
40165 + found = 0;
40166 + inode_file_plugin(inode)->key_by_inode(inode, *offset, &key);
40167 + do {
40168 + init_lh(&lh);
40169 + result = find_file_item_nohint(&coord, &lh, &key,
40170 + ZNODE_READ_LOCK, inode);
40171 +
40172 + if (result == CBK_COORD_FOUND) {
40173 + if (coord.between == AT_UNIT) {
40174 + /*coord_clear_iplug(&coord); */
40175 + result = zload(coord.node);
40176 + if (result == 0) {
40177 + if (item_id_by_coord(&coord) == id)
40178 + found = 1;
40179 + else
40180 + item_plugin_by_coord(&coord)->s.
40181 + file.append_key(&coord,
40182 + &key);
40183 + zrelse(coord.node);
40184 + }
40185 + } else
40186 + result = RETERR(-ENOENT);
40187 + }
40188 + done_lh(&lh);
40189 + } while (result == 0 && !found);
40190 + *offset = get_key_offset(&key);
40191 + return result;
40192 +}
40193 +
40194 +/**
40195 + * tail2extent
40196 + * @uf_info:
40197 + *
40198 + *
40199 + */
40200 +int tail2extent(unix_file_info_t *uf_info)
40201 +{
40202 + int result;
40203 + reiser4_key key; /* key of next byte to be moved to page */
40204 + char *p_data; /* data of page */
40205 + unsigned page_off = 0, /* offset within the page where to copy data */
40206 + count; /* number of bytes of item which can be
40207 + * copied to page */
40208 + struct page *pages[TAIL2EXTENT_PAGE_NUM];
40209 + struct page *page;
40210 + int done; /* set to 1 when all file is read */
40211 + char *item;
40212 + int i;
40213 + struct inode *inode;
40214 + int first_iteration;
40215 + int bytes;
40216 + __u64 offset;
40217 +
40218 + assert("nikita-3362", ea_obtained(uf_info));
40219 + inode = unix_file_info_to_inode(uf_info);
40220 + assert("nikita-3412", !IS_RDONLY(inode));
40221 + assert("vs-1649", uf_info->container != UF_CONTAINER_EXTENTS);
40222 + assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
40223 +
40224 + offset = 0;
40225 + first_iteration = 1;
40226 + result = 0;
40227 + if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
40228 + /*
40229 +		 * file is marked on disk to indicate that a conversion did
40230 +		 * not complete due to either a crash or some error. Find the
40231 +		 * offset at which the previous conversion stopped
40232 + */
40233 + result = find_start(inode, FORMATTING_ID, &offset);
40234 + if (result == -ENOENT) {
40235 + /* no tail items found, everything is converted */
40236 + uf_info->container = UF_CONTAINER_EXTENTS;
40237 + complete_conversion(inode);
40238 + return 0;
40239 + } else if (result != 0)
40240 + /* some other error */
40241 + return result;
40242 + first_iteration = 0;
40243 + }
40244 +
40245 + reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
40246 +
40247 + /* get key of first byte of a file */
40248 + inode_file_plugin(inode)->key_by_inode(inode, offset, &key);
40249 +
40250 + done = 0;
40251 + while (done == 0) {
40252 + memset(pages, 0, sizeof(pages));
40253 + result = reserve_tail2extent_iteration(inode);
40254 + if (result != 0)
40255 + goto out;
40256 + if (first_iteration) {
40257 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
40258 + reiser4_update_sd(inode);
40259 + first_iteration = 0;
40260 + }
40261 + bytes = 0;
40262 + for (i = 0; i < sizeof_array(pages) && done == 0; i++) {
40263 + assert("vs-598",
40264 + (get_key_offset(&key) & ~PAGE_CACHE_MASK) == 0);
40265 + page = alloc_page(reiser4_ctx_gfp_mask_get());
40266 + if (!page) {
40267 + result = RETERR(-ENOMEM);
40268 + goto error;
40269 + }
40270 +
40271 + page->index =
40272 + (unsigned long)(get_key_offset(&key) >>
40273 + PAGE_CACHE_SHIFT);
40274 + /*
40275 +			 * usually one who is going to long-term lock a znode
40276 +			 * (as find_file_item does, for instance) must not
40277 +			 * hold locked pages. However, there is an exception
40278 +			 * for the tail2extent case. Pages appearing here are
40279 +			 * not reachable by anyone else, they are clean, and
40280 +			 * they do not have jnodes attached, so keeping them
40281 +			 * locked does not risk deadlock
40282 + */
40283 + assert("vs-983", !PagePrivate(page));
40284 + reiser4_invalidate_pages(inode->i_mapping, page->index,
40285 + 1, 0);
40286 +
40287 + for (page_off = 0; page_off < PAGE_CACHE_SIZE;) {
40288 + coord_t coord;
40289 + lock_handle lh;
40290 +
40291 + /* get next item */
40292 + /* FIXME: we might want to readahead here */
40293 + init_lh(&lh);
40294 + result =
40295 + find_file_item_nohint(&coord, &lh, &key,
40296 + ZNODE_READ_LOCK,
40297 + inode);
40298 + if (result != CBK_COORD_FOUND) {
40299 + /*
40300 +					 * either an error happened or no
40301 +					 * items of the file were found
40302 + */
40303 + done_lh(&lh);
40304 + page_cache_release(page);
40305 + goto error;
40306 + }
40307 +
40308 + if (coord.between == AFTER_UNIT) {
40309 + /*
40310 +					 * end of file is reached. Pad the page
40311 + * with zeros
40312 + */
40313 + done_lh(&lh);
40314 + done = 1;
40315 + p_data = kmap_atomic(page, KM_USER0);
40316 + memset(p_data + page_off, 0,
40317 + PAGE_CACHE_SIZE - page_off);
40318 + kunmap_atomic(p_data, KM_USER0);
40319 + break;
40320 + }
40321 +
40322 + result = zload(coord.node);
40323 + if (result) {
40324 + page_cache_release(page);
40325 + done_lh(&lh);
40326 + goto error;
40327 + }
40328 + assert("vs-856", coord.between == AT_UNIT);
40329 + item = ((char *)item_body_by_coord(&coord)) +
40330 + coord.unit_pos;
40331 +
40332 + /* how many bytes to copy */
40333 + count =
40334 + item_length_by_coord(&coord) -
40335 + coord.unit_pos;
40336 + /* limit length of copy to end of page */
40337 + if (count > PAGE_CACHE_SIZE - page_off)
40338 + count = PAGE_CACHE_SIZE - page_off;
40339 +
40340 + /*
40341 + * copy item (as much as will fit starting from
40342 + * the beginning of the item) into the page
40343 + */
40344 + p_data = kmap_atomic(page, KM_USER0);
40345 + memcpy(p_data + page_off, item, count);
40346 + kunmap_atomic(p_data, KM_USER0);
40347 +
40348 + page_off += count;
40349 + bytes += count;
40350 + set_key_offset(&key,
40351 + get_key_offset(&key) + count);
40352 +
40353 + zrelse(coord.node);
40354 + done_lh(&lh);
40355 +			} /* end of loop which fills one page with content of
40356 + * formatting items */
40357 +
40358 + if (page_off) {
40359 + /* something was copied into page */
40360 + pages[i] = page;
40361 + } else {
40362 + page_cache_release(page);
40363 + assert("vs-1648", done == 1);
40364 + break;
40365 + }
40366 + } /* end of loop through pages of one conversion iteration */
40367 +
40368 + if (i > 0) {
40369 + result = replace(inode, pages, i, bytes);
40370 + release_all_pages(pages, sizeof_array(pages));
40371 + if (result)
40372 + goto error;
40373 + /*
40374 + * we have to drop exclusive access to avoid deadlock
40375 +			 * which may happen because capture_unix_file, called
40376 +			 * by reiser4_writepages, requires non-exclusive
40377 +			 * access to a file. It is safe to drop
40378 + * EA in the middle of tail2extent conversion because
40379 + * write_unix_file/unix_setattr(truncate)/release_unix_file(extent2tail)
40380 + * are serialized by reiser4_inode->mutex_write semaphore and
40381 + * because read_unix_file works (should at least) on
40382 + * partially converted files
40383 + */
40384 + drop_exclusive_access(uf_info);
40385 + /* throttle the conversion */
40386 + reiser4_throttle_write(inode);
40387 + get_exclusive_access(uf_info);
40388 +
40389 + /*
40390 + * nobody is allowed to complete conversion but a
40391 + * process which started it
40392 + */
40393 + assert("", reiser4_inode_get_flag(inode,
40394 + REISER4_PART_MIXED));
40395 + }
40396 + }
40397 +
40398 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
40399 +
40400 + if (result == 0) {
40401 + /* file is converted to extent items */
40402 + assert("vs-1697", reiser4_inode_get_flag(inode,
40403 + REISER4_PART_MIXED));
40404 +
40405 + uf_info->container = UF_CONTAINER_EXTENTS;
40406 + complete_conversion(inode);
40407 + } else {
40408 + /*
40409 + * conversion is not complete. Inode was already marked as
40410 +		 * REISER4_PART_MIXED and stat-data were updated at the first
40411 + * iteration of the loop above.
40412 + */
40413 + error:
40414 + release_all_pages(pages, sizeof_array(pages));
40415 + warning("nikita-2282", "Partial conversion of %llu: %i",
40416 + (unsigned long long)get_inode_oid(inode), result);
40417 + }
40418 +
40419 + out:
40420 + return result;
40421 +}
40422 +
40423 +static int reserve_extent2tail_iteration(struct inode *inode)
40424 +{
40425 + reiser4_tree *tree;
40426 +
40427 + tree = reiser4_tree_by_inode(inode);
40428 + /*
40429 + * reserve blocks for (in this order):
40430 + *
40431 + * 1. removal of extent item
40432 + *
40433 + * 2. insertion of tail by insert_flow()
40434 + *
40435 + * 3. drilling to the leaf level by coord_by_key()
40436 + *
40437 + * 4. possible update of stat-data
40438 + */
40439 + grab_space_enable();
40440 + return reiser4_grab_space
40441 + (estimate_one_item_removal(tree) +
40442 + estimate_insert_flow(tree->height) +
40443 + 1 + estimate_one_insert_item(tree) +
40444 + inode_file_plugin(inode)->estimate.update(inode), BA_CAN_COMMIT);
40445 +}
40446 +
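  The same kind of back-of-envelope check applies here (illustration only,
  with assumed estimate values):

/* illustration: with estimate_one_item_removal(tree) == 6,
 * estimate_insert_flow(tree->height) == 30 and
 * estimate_one_insert_item(tree) == 6 (assumed numbers), one extent2tail
 * iteration grabs 6 + 30 + 1 + 6 + sd-update = 43 + sd-update blocks.
 */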
40447 +/* for every page of the file: read the page, cut the part of the extent
40448 +   pointing to this page, put the page's data into the tree as a tail item */
40449 +int extent2tail(unix_file_info_t *uf_info)
40450 +{
40451 + int result;
40452 + struct inode *inode;
40453 + struct page *page;
40454 + unsigned long num_pages, i;
40455 + unsigned long start_page;
40456 + reiser4_key from;
40457 + reiser4_key to;
40458 + unsigned count;
40459 + __u64 offset;
40460 +
40461 + assert("nikita-3362", ea_obtained(uf_info));
40462 + inode = unix_file_info_to_inode(uf_info);
40463 + assert("nikita-3412", !IS_RDONLY(inode));
40464 + assert("vs-1649", uf_info->container != UF_CONTAINER_TAILS);
40465 + assert("", !reiser4_inode_get_flag(inode, REISER4_PART_IN_CONV));
40466 +
40467 + offset = 0;
40468 + if (reiser4_inode_get_flag(inode, REISER4_PART_MIXED)) {
40469 + /*
40470 +		 * file is marked on disk to indicate that a conversion did
40471 +		 * not complete due to either a crash or some error. Find the
40472 +		 * offset at which the previous conversion stopped
40473 + */
40474 + result = find_start(inode, EXTENT_POINTER_ID, &offset);
40475 + if (result == -ENOENT) {
40476 + /* no extent found, everything is converted */
40477 + uf_info->container = UF_CONTAINER_TAILS;
40478 + complete_conversion(inode);
40479 + return 0;
40480 + } else if (result != 0)
40481 + /* some other error */
40482 + return result;
40483 + }
40484 +
40485 + reiser4_inode_set_flag(inode, REISER4_PART_IN_CONV);
40486 +
40487 + /* number of pages in the file */
40488 + num_pages =
40489 +	    (inode->i_size - offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
40490 + start_page = offset >> PAGE_CACHE_SHIFT;
40491 +
40492 + inode_file_plugin(inode)->key_by_inode(inode, offset, &from);
40493 + to = from;
40494 +
40495 + result = 0;
40496 + for (i = 0; i < num_pages; i++) {
40497 + __u64 start_byte;
40498 +
40499 + result = reserve_extent2tail_iteration(inode);
40500 + if (result != 0)
40501 + break;
40502 + if (i == 0 && offset == 0) {
40503 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
40504 + reiser4_update_sd(inode);
40505 + }
40506 +
40507 + page = read_mapping_page(inode->i_mapping,
40508 + (unsigned)(i + start_page), NULL);
40509 + if (IS_ERR(page)) {
40510 + result = PTR_ERR(page);
40511 + break;
40512 + }
40513 +
40514 + wait_on_page_locked(page);
40515 +
40516 + if (!PageUptodate(page)) {
40517 + page_cache_release(page);
40518 + result = RETERR(-EIO);
40519 + break;
40520 + }
40521 +
40522 + /* cut part of file we have read */
40523 +		start_byte = ((__u64) (i + start_page)) << PAGE_CACHE_SHIFT;
40524 + set_key_offset(&from, start_byte);
40525 + set_key_offset(&to, start_byte + PAGE_CACHE_SIZE - 1);
40526 + /*
40527 + * reiser4_cut_tree_object() returns -E_REPEAT to allow atom
40528 + * commits during over-long truncates. But
40529 + * extent->tail conversion should be performed in one
40530 + * transaction.
40531 + */
40532 + result = reiser4_cut_tree(reiser4_tree_by_inode(inode), &from,
40533 + &to, inode, 0);
40534 +
40535 + if (result) {
40536 + page_cache_release(page);
40537 + break;
40538 + }
40539 +
40540 + /* put page data into tree via tail_write */
40541 + count = PAGE_CACHE_SIZE;
40542 + if ((i == (num_pages - 1)) &&
40543 + (inode->i_size & ~PAGE_CACHE_MASK))
40544 +			/* last page can be incomplete */
40545 + count = (inode->i_size & ~PAGE_CACHE_MASK);
40546 + while (count) {
40547 + struct dentry dentry;
40548 + struct file file;
40549 + loff_t pos;
40550 +
40551 + dentry.d_inode = inode;
40552 + file.f_dentry = &dentry;
40553 +			file.private_data = NULL;
40554 +			file.f_pos = start_byte;
40555 +			pos = start_byte;
40556 +
40557 + result = reiser4_write_tail(&file,
40558 + (char __user *)kmap(page),
40559 + count, &pos);
40560 + reiser4_free_file_fsdata(&file);
40561 + if (result <= 0) {
40562 + warning("", "reiser4_write_tail failed");
40563 + page_cache_release(page);
40564 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
40565 + return result;
40566 + }
40567 + count -= result;
40568 + }
40569 +
40570 + /* release page */
40571 + lock_page(page);
40572 + /* page is already detached from jnode and mapping. */
40573 + assert("vs-1086", page->mapping == NULL);
40574 + assert("nikita-2690",
40575 + (!PagePrivate(page) && jprivate(page) == 0));
40576 + /* waiting for writeback completion with page lock held is
40577 + * perfectly valid. */
40578 + wait_on_page_writeback(page);
40579 + reiser4_drop_page(page);
40580 + /* release reference taken by read_cache_page() above */
40581 + page_cache_release(page);
40582 +
40583 + drop_exclusive_access(uf_info);
40584 + /* throttle the conversion */
40585 + reiser4_throttle_write(inode);
40586 + get_exclusive_access(uf_info);
40587 + /*
40588 + * nobody is allowed to complete conversion but a process which
40589 + * started it
40590 + */
40591 + assert("", reiser4_inode_get_flag(inode, REISER4_PART_MIXED));
40592 + }
40593 +
40594 + reiser4_inode_clr_flag(inode, REISER4_PART_IN_CONV);
40595 +
40596 + if (i == num_pages) {
40597 + /* file is converted to formatted items */
40598 + assert("vs-1698", reiser4_inode_get_flag(inode,
40599 + REISER4_PART_MIXED));
40600 + assert("vs-1260",
40601 + inode_has_no_jnodes(reiser4_inode_data(inode)));
40602 +
40603 + uf_info->container = UF_CONTAINER_TAILS;
40604 + complete_conversion(inode);
40605 + return 0;
40606 + }
40607 + /*
40608 + * conversion is not complete. Inode was already marked as
40609 +	 * REISER4_PART_MIXED and stat-data were updated at the first
40610 + * iteration of the loop above.
40611 + */
40612 + warning("nikita-2282",
40613 + "Partial conversion of %llu: %lu of %lu: %i",
40614 + (unsigned long long)get_inode_oid(inode), i,
40615 + num_pages, result);
40616 +
40617 + return result;
40618 +}
40619 +
40620 +/*
40621 + * Local variables:
40622 + * c-indentation-style: "K&R"
40623 + * mode-name: "LC"
40624 + * c-basic-offset: 8
40625 + * tab-width: 8
40626 + * fill-column: 79
40627 + * scroll-step: 1
40628 + * End:
40629 + */
40630 diff --git a/fs/reiser4/plugin/file_ops.c b/fs/reiser4/plugin/file_ops.c
40631 new file mode 100644
40632 index 0000000..ef8ba9d
40633 --- /dev/null
40634 +++ b/fs/reiser4/plugin/file_ops.c
40635 @@ -0,0 +1,168 @@
40636 +/* Copyright 2005 by Hans Reiser, licensing governed by
40637 + reiser4/README */
40638 +
40639 +/* this file contains typical implementations for some of methods of
40640 + struct file_operations and of struct address_space_operations
40641 +*/
40642 +
40643 +#include "../inode.h"
40644 +#include "object.h"
40645 +
40646 +/* file operations */
40647 +
40648 +/* implementation of vfs's llseek method of struct file_operations for
40649 + typical directory can be found in readdir_common.c
40650 +*/
40651 +loff_t reiser4_llseek_dir_common(struct file *, loff_t, int origin);
40652 +
40653 +/* implementation of vfs's readdir method of struct file_operations for
40654 + typical directory can be found in readdir_common.c
40655 +*/
40656 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
40657 +
40658 +/**
40659 + * reiser4_release_dir_common - release method of struct file_operations
40660 + * @inode: inode of released file
40661 + * @file: file to release
40662 + *
40663 + * Implementation of release method of struct file_operations for typical
40664 + * directory. All it does is free reiser4-specific file data.
40665 +*/
40666 +int reiser4_release_dir_common(struct inode *inode, struct file *file)
40667 +{
40668 + reiser4_context *ctx;
40669 +
40670 + ctx = reiser4_init_context(inode->i_sb);
40671 + if (IS_ERR(ctx))
40672 + return PTR_ERR(ctx);
40673 + reiser4_free_file_fsdata(file);
40674 + reiser4_exit_context(ctx);
40675 + return 0;
40676 +}
40677 +
40678 +/* this is common implementation of vfs's fsync method of struct
40679 + file_operations
40680 +*/
40681 +int reiser4_sync_common(struct file *file, struct dentry *dentry, int datasync)
40682 +{
40683 + reiser4_context *ctx;
40684 + int result;
40685 +
40686 + ctx = reiser4_init_context(dentry->d_inode->i_sb);
40687 + if (IS_ERR(ctx))
40688 + return PTR_ERR(ctx);
40689 + result = txnmgr_force_commit_all(dentry->d_inode->i_sb, 0);
40690 +
40691 + context_set_commit_async(ctx);
40692 + reiser4_exit_context(ctx);
40693 + return result;
40694 +}
40695 +
40696 +/* this is common implementation of vfs's sendfile method of struct
40697 + file_operations
40698 +
40699 + Reads @count bytes from @file and calls @actor for every page read. This is
40700 + needed for loopback device support.
40701 +*/
40702 +#if 0
40703 +ssize_t
40704 +sendfile_common(struct file *file, loff_t *ppos, size_t count,
40705 + read_actor_t actor, void *target)
40706 +{
40707 + reiser4_context *ctx;
40708 + ssize_t result;
40709 +
40710 + ctx = reiser4_init_context(file->f_dentry->d_inode->i_sb);
40711 + if (IS_ERR(ctx))
40712 + return PTR_ERR(ctx);
40713 + result = generic_file_sendfile(file, ppos, count, actor, target);
40714 + reiser4_exit_context(ctx);
40715 + return result;
40716 +}
40717 +#endif /* 0 */
40718 +
40719 +/* address space operations */
40720 +
40721 +/* this is common implementation of vfs's prepare_write method of struct
40722 + address_space_operations
40723 +*/
40724 +int
40725 +prepare_write_common(struct file *file, struct page *page, unsigned from,
40726 + unsigned to)
40727 +{
40728 + reiser4_context *ctx;
40729 + int result;
40730 +
40731 +	ctx = reiser4_init_context(page->mapping->host->i_sb);
40732 +	if (IS_ERR(ctx))
40733 +		return PTR_ERR(ctx);
40734 +	result = do_prepare_write(file, page, from, to);
40735 +	/* don't commit transaction under inode semaphore */
40736 +	context_set_commit_async(ctx);
40737 +	reiser4_exit_context(ctx);
40738 +	return result;
40739 +}
40740 +
40741 +/* this is helper for prepare_write_common and prepare_write_unix_file
40742 + */
40743 +int
40744 +do_prepare_write(struct file *file, struct page *page, unsigned from,
40745 + unsigned to)
40746 +{
40747 + int result;
40748 + file_plugin *fplug;
40749 + struct inode *inode;
40750 +
40751 + assert("umka-3099", file != NULL);
40752 + assert("umka-3100", page != NULL);
40753 + assert("umka-3095", PageLocked(page));
40754 +
40755 + if (to - from == PAGE_CACHE_SIZE || PageUptodate(page))
40756 + return 0;
40757 +
40758 + inode = page->mapping->host;
40759 + fplug = inode_file_plugin(inode);
40760 +
40761 + if (page->mapping->a_ops->readpage == NULL)
40762 + return RETERR(-EINVAL);
40763 +
40764 + result = page->mapping->a_ops->readpage(file, page);
40765 + if (result != 0) {
40766 + SetPageError(page);
40767 + ClearPageUptodate(page);
40768 + /* All reiser4 readpage() implementations should return the
40769 + * page locked in case of error. */
40770 + assert("nikita-3472", PageLocked(page));
40771 + } else {
40772 + /*
40773 + * ->readpage() either:
40774 + *
40775 + * 1. starts IO against @page. @page is locked for IO in
40776 + * this case.
40777 + *
40778 + * 2. doesn't start IO. @page is unlocked.
40779 + *
40780 + * In either case, page should be locked.
40781 + */
40782 + lock_page(page);
40783 + /*
40784 + * IO (if any) is completed at this point. Check for IO
40785 + * errors.
40786 + */
40787 + if (!PageUptodate(page))
40788 + result = RETERR(-EIO);
40789 + }
40790 + assert("umka-3098", PageLocked(page));
40791 + return result;
40792 +}
40793 +
40794 +/*
40795 + * Local variables:
40796 + * c-indentation-style: "K&R"
40797 + * mode-name: "LC"
40798 + * c-basic-offset: 8
40799 + * tab-width: 8
40800 + * fill-column: 79
40801 + * scroll-step: 1
40802 + * End:
40803 + */
40804 diff --git a/fs/reiser4/plugin/file_ops_readdir.c b/fs/reiser4/plugin/file_ops_readdir.c
40805 new file mode 100644
40806 index 0000000..2bd7826
40807 --- /dev/null
40808 +++ b/fs/reiser4/plugin/file_ops_readdir.c
40809 @@ -0,0 +1,657 @@
40810 +/* Copyright 2005 by Hans Reiser, licensing governed by
40811 + * reiser4/README */
40812 +
40813 +#include "../inode.h"
40814 +
40815 +/* return true iff @coord points to a valid directory item that is part of
40816 + * the @inode directory. */
40817 +static int is_valid_dir_coord(struct inode *inode, coord_t * coord)
40818 +{
40819 + return plugin_of_group(item_plugin_by_coord(coord),
40820 + DIR_ENTRY_ITEM_TYPE) &&
40821 + inode_file_plugin(inode)->owns_item(inode, coord);
40822 +}
40823 +
40824 +/* compare two logical positions within the same directory */
40825 +static cmp_t dir_pos_cmp(const dir_pos * p1, const dir_pos * p2)
40826 +{
40827 + cmp_t result;
40828 +
40829 + assert("nikita-2534", p1 != NULL);
40830 + assert("nikita-2535", p2 != NULL);
40831 +
40832 + result = de_id_cmp(&p1->dir_entry_key, &p2->dir_entry_key);
40833 + if (result == EQUAL_TO) {
40834 + int diff;
40835 +
40836 + diff = p1->pos - p2->pos;
40837 + result =
40838 + (diff < 0) ? LESS_THAN : (diff ? GREATER_THAN : EQUAL_TO);
40839 + }
40840 + return result;
40841 +}
40842 +
40843 +/* see comment before reiser4_readdir_common() for overview of why "adjustment" is
40844 + * necessary. */
40845 +static void
40846 +adjust_dir_pos(struct file *dir,
40847 + readdir_pos * readdir_spot, const dir_pos * mod_point, int adj)
40848 +{
40849 + dir_pos *pos;
40850 +
40851 + /*
40852 + * new directory entry was added (adj == +1) or removed (adj == -1) at
40853 + * the @mod_point. Directory file descriptor @dir is doing readdir and
40854 + * is currently positioned at @readdir_spot. Latter has to be updated
40855 + * to maintain stable readdir.
40856 + */
40857 + /* directory is positioned to the beginning. */
40858 + if (readdir_spot->entry_no == 0)
40859 + return;
40860 +
40861 + pos = &readdir_spot->position;
40862 + switch (dir_pos_cmp(mod_point, pos)) {
40863 + case LESS_THAN:
40864 +		/* @mod_point is _before_ @readdir_spot, that is, entry was
40865 + * added/removed on the left (in key order) of current
40866 + * position. */
40867 + /* logical number of directory entry readdir is "looking" at
40868 + * changes */
40869 + readdir_spot->entry_no += adj;
40870 + assert("nikita-2577",
40871 + ergo(dir != NULL, reiser4_get_dir_fpos(dir) + adj >= 0));
40872 + if (de_id_cmp(&pos->dir_entry_key,
40873 + &mod_point->dir_entry_key) == EQUAL_TO) {
40874 + assert("nikita-2575", mod_point->pos < pos->pos);
40875 + /*
40876 + * if entry added/removed has the same key as current
40877 + * for readdir, update counter of duplicate keys in
40878 + * @readdir_spot.
40879 + */
40880 + pos->pos += adj;
40881 + }
40882 + break;
40883 + case GREATER_THAN:
40884 + /* directory is modified after @pos: nothing to do. */
40885 + break;
40886 + case EQUAL_TO:
40887 + /* cannot insert an entry readdir is looking at, because it
40888 + already exists. */
40889 + assert("nikita-2576", adj < 0);
40890 + /* directory entry to which @pos points to is being
40891 + removed.
40892 +
40893 + NOTE-NIKITA: Right thing to do is to update @pos to point
40894 + to the next entry. This is complex (we are under spin-lock
40895 + for one thing). Just rewind it to the beginning. Next
40896 + readdir will have to scan the beginning of
40897 + directory. Proper solution is to use semaphore in
40898 + spin lock's stead and use rewind_right() here.
40899 +
40900 + NOTE-NIKITA: now, semaphore is used, so...
40901 + */
40902 + memset(readdir_spot, 0, sizeof *readdir_spot);
40903 + }
40904 +}
40905 +
40906 +/* scan all file-descriptors for this directory and adjust their
40907 + positions respectively. Should be used by implementations of
40908 + add_entry and rem_entry of dir plugin */
40909 +void reiser4_adjust_dir_file(struct inode *dir, const struct dentry *de,
40910 + int offset, int adj)
40911 +{
40912 + reiser4_file_fsdata *scan;
40913 + dir_pos mod_point;
40914 +
40915 + assert("nikita-2536", dir != NULL);
40916 + assert("nikita-2538", de != NULL);
40917 + assert("nikita-2539", adj != 0);
40918 +
40919 + build_de_id(dir, &de->d_name, &mod_point.dir_entry_key);
40920 + mod_point.pos = offset;
40921 +
40922 + spin_lock_inode(dir);
40923 +
40924 + /*
40925 + * new entry was added/removed in directory @dir. Scan all file
40926 + * descriptors for @dir that are currently involved into @readdir and
40927 + * update them.
40928 + */
40929 +
40930 + list_for_each_entry(scan, get_readdir_list(dir), dir.linkage)
40931 + adjust_dir_pos(scan->back, &scan->dir.readdir, &mod_point, adj);
40932 +
40933 + spin_unlock_inode(dir);
40934 +}
40935 +
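  As a usage note (inferred from the comment above; the callers are not part
  of this hunk): a dir plugin's add_entry would call this helper with
  adj == +1 after inserting an entry at @offset, and rem_entry with adj == -1
  after removing one:

	/* sketch, hypothetical call sites */
	reiser4_adjust_dir_file(dir, de, offset, +1);	/* after add_entry */
	reiser4_adjust_dir_file(dir, de, offset, -1);	/* after rem_entry */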
40936 +/*
40937 + * traverse tree to start/continue readdir from the readdir position @pos.
40938 + */
40939 +static int dir_go_to(struct file *dir, readdir_pos * pos, tap_t * tap)
40940 +{
40941 + reiser4_key key;
40942 + int result;
40943 + struct inode *inode;
40944 +
40945 + assert("nikita-2554", pos != NULL);
40946 +
40947 + inode = dir->f_dentry->d_inode;
40948 + result = inode_dir_plugin(inode)->build_readdir_key(dir, &key);
40949 + if (result != 0)
40950 + return result;
40951 + result = reiser4_object_lookup(inode,
40952 + &key,
40953 + tap->coord,
40954 + tap->lh,
40955 + tap->mode,
40956 + FIND_EXACT,
40957 + LEAF_LEVEL, LEAF_LEVEL,
40958 + 0, &tap->ra_info);
40959 + if (result == CBK_COORD_FOUND)
40960 + result = rewind_right(tap, (int)pos->position.pos);
40961 + else {
40962 + tap->coord->node = NULL;
40963 + done_lh(tap->lh);
40964 + result = RETERR(-EIO);
40965 + }
40966 + return result;
40967 +}
40968 +
40969 +/*
40970 + * handling of non-unique keys: calculate at what ordinal position within
40971 + * sequence of directory items with identical keys @pos is.
40972 + */
40973 +static int set_pos(struct inode *inode, readdir_pos * pos, tap_t * tap)
40974 +{
40975 + int result;
40976 + coord_t coord;
40977 + lock_handle lh;
40978 + tap_t scan;
40979 + de_id *did;
40980 + reiser4_key de_key;
40981 +
40982 + coord_init_zero(&coord);
40983 + init_lh(&lh);
40984 + reiser4_tap_init(&scan, &coord, &lh, ZNODE_READ_LOCK);
40985 + reiser4_tap_copy(&scan, tap);
40986 + reiser4_tap_load(&scan);
40987 + pos->position.pos = 0;
40988 +
40989 + did = &pos->position.dir_entry_key;
40990 +
40991 + if (is_valid_dir_coord(inode, scan.coord)) {
40992 +
40993 + build_de_id_by_key(unit_key_by_coord(scan.coord, &de_key), did);
40994 +
40995 + while (1) {
40996 +
40997 + result = go_prev_unit(&scan);
40998 + if (result != 0)
40999 + break;
41000 +
41001 + if (!is_valid_dir_coord(inode, scan.coord)) {
41002 + result = -EINVAL;
41003 + break;
41004 + }
41005 +
41006 + /* get key of directory entry */
41007 + unit_key_by_coord(scan.coord, &de_key);
41008 + if (de_id_key_cmp(did, &de_key) != EQUAL_TO) {
41009 + /* duplicate-sequence is over */
41010 + break;
41011 + }
41012 + pos->position.pos++;
41013 + }
41014 + } else
41015 + result = RETERR(-ENOENT);
41016 + reiser4_tap_relse(&scan);
41017 + reiser4_tap_done(&scan);
41018 + return result;
41019 +}
41020 +
41021 +/*
41022 + * "rewind" directory to @offset, i.e., set @pos and @tap correspondingly.
41023 + */
41024 +static int dir_rewind(struct file *dir, readdir_pos * pos, tap_t * tap)
41025 +{
41026 + __u64 destination;
41027 + __s64 shift;
41028 + int result;
41029 + struct inode *inode;
41030 + loff_t dirpos;
41031 +
41032 + assert("nikita-2553", dir != NULL);
41033 + assert("nikita-2548", pos != NULL);
41034 + assert("nikita-2551", tap->coord != NULL);
41035 + assert("nikita-2552", tap->lh != NULL);
41036 +
41037 + dirpos = reiser4_get_dir_fpos(dir);
41038 + shift = dirpos - pos->fpos;
41039 + /* this is logical directory entry within @dir which we are rewinding
41040 + * to */
41041 + destination = pos->entry_no + shift;
41042 +
41043 + inode = dir->f_dentry->d_inode;
41044 + if (dirpos < 0)
41045 + return RETERR(-EINVAL);
41046 + else if (destination == 0ll || dirpos == 0) {
41047 + /* rewind to the beginning of directory */
41048 + memset(pos, 0, sizeof *pos);
41049 + return dir_go_to(dir, pos, tap);
41050 + } else if (destination >= inode->i_size)
41051 + return RETERR(-ENOENT);
41052 +
41053 + if (shift < 0) {
41054 + /* I am afraid of negative numbers */
41055 + shift = -shift;
41056 + /* rewinding to the left */
41057 + if (shift <= (int)pos->position.pos) {
41058 + /* destination is within sequence of entries with
41059 + duplicate keys. */
41060 + result = dir_go_to(dir, pos, tap);
41061 + } else {
41062 + shift -= pos->position.pos;
41063 + while (1) {
41064 + /* repetitions: deadlock is possible when
41065 + going to the left. */
41066 + result = dir_go_to(dir, pos, tap);
41067 + if (result == 0) {
41068 + result = rewind_left(tap, shift);
41069 + if (result == -E_DEADLOCK) {
41070 + reiser4_tap_done(tap);
41071 + continue;
41072 + }
41073 + }
41074 + break;
41075 + }
41076 + }
41077 + } else {
41078 + /* rewinding to the right */
41079 + result = dir_go_to(dir, pos, tap);
41080 + if (result == 0)
41081 + result = rewind_right(tap, shift);
41082 + }
41083 + if (result == 0) {
41084 + result = set_pos(inode, pos, tap);
41085 + if (result == 0) {
41086 + /* update pos->position.pos */
41087 + pos->entry_no = destination;
41088 + pos->fpos = dirpos;
41089 + }
41090 + }
41091 + return result;
41092 +}
41093 +
41094 +/*
41095 + * Function that is called by common_readdir() on each directory entry while
41096 + * doing readdir. ->filldir callback may block, so we have to release the long term
41097 + * lock while calling it. To avoid repeating tree traversal, seal is used. If
41098 + * seal is broken, we return -E_REPEAT. Node is unlocked in this case.
41099 + *
41100 + * Whether node is unlocked in case of any other error is undefined. It is
41101 + * guaranteed to be still locked if success (0) is returned.
41102 + *
41103 + * When ->filldir() wants no more, feed_entry() returns 1, and node is
41104 + * unlocked.
41105 + */
41106 +static int
41107 +feed_entry(struct file *f,
41108 + readdir_pos * pos, tap_t * tap, filldir_t filldir, void *dirent)
41109 +{
41110 + item_plugin *iplug;
41111 + char *name;
41112 + reiser4_key sd_key;
41113 + int result;
41114 + char buf[DE_NAME_BUF_LEN];
41115 + char name_buf[32];
41116 + char *local_name;
41117 + unsigned file_type;
41118 + seal_t seal;
41119 + coord_t *coord;
41120 + reiser4_key entry_key;
41121 +
41122 + coord = tap->coord;
41123 + iplug = item_plugin_by_coord(coord);
41124 +
41125 + /* pointer to name within the node */
41126 + name = iplug->s.dir.extract_name(coord, buf);
41127 + assert("nikita-1371", name != NULL);
41128 +
41129 + /* key of object the entry points to */
41130 + if (iplug->s.dir.extract_key(coord, &sd_key) != 0)
41131 + return RETERR(-EIO);
41132 +
41133 +	/* we must release the long-term znode lock before calling filldir to
41134 +	   avoid a deadlock which may happen if filldir causes a page fault. So,
41135 +	   copy the name to an intermediate buffer */
41136 + if (strlen(name) + 1 > sizeof(name_buf)) {
41137 + local_name = kmalloc(strlen(name) + 1,
41138 + reiser4_ctx_gfp_mask_get());
41139 + if (local_name == NULL)
41140 + return RETERR(-ENOMEM);
41141 + } else
41142 + local_name = name_buf;
41143 +
41144 + strcpy(local_name, name);
41145 + file_type = iplug->s.dir.extract_file_type(coord);
41146 +
41147 + unit_key_by_coord(coord, &entry_key);
41148 + reiser4_seal_init(&seal, coord, &entry_key);
41149 +
41150 + longterm_unlock_znode(tap->lh);
41151 +
41152 + /*
41153 + * send information about directory entry to the ->filldir() filler
41154 + * supplied to us by caller (VFS).
41155 + *
41156 + * ->filldir is entitled to do weird things. For example, ->filldir
41157 + * supplied by knfsd re-enters file system. Make sure no locks are
41158 + * held.
41159 + */
41160 + assert("nikita-3436", lock_stack_isclean(get_current_lock_stack()));
41161 +
41162 + reiser4_txn_restart_current();
41163 + result = filldir(dirent, name, (int)strlen(name),
41164 + /* offset of this entry */
41165 + f->f_pos,
41166 +			 /* inode number of object bound by this entry */
41167 + oid_to_uino(get_key_objectid(&sd_key)), file_type);
41168 + if (local_name != name_buf)
41169 + kfree(local_name);
41170 + if (result < 0)
41171 + /* ->filldir() is satisfied. (no space in buffer, IOW) */
41172 + result = 1;
41173 + else
41174 + result = reiser4_seal_validate(&seal, coord, &entry_key,
41175 + tap->lh, tap->mode,
41176 + ZNODE_LOCK_HIPRI);
41177 + return result;
41178 +}
41179 +
41180 +static void move_entry(readdir_pos * pos, coord_t * coord)
41181 +{
41182 + reiser4_key de_key;
41183 + de_id *did;
41184 +
41185 + /* update @pos */
41186 + ++pos->entry_no;
41187 + did = &pos->position.dir_entry_key;
41188 +
41189 + /* get key of directory entry */
41190 + unit_key_by_coord(coord, &de_key);
41191 +
41192 + if (de_id_key_cmp(did, &de_key) == EQUAL_TO)
41193 + /* we are within sequence of directory entries
41194 + with duplicate keys. */
41195 + ++pos->position.pos;
41196 + else {
41197 + pos->position.pos = 0;
41198 + build_de_id_by_key(&de_key, did);
41199 + }
41200 + ++pos->fpos;
41201 +}
41202 +
41203 +/*
41204 + * STATELESS READDIR
41205 + *
41206 + * readdir support in reiser4 relies on the ability to update the readdir_pos
41207 + * embedded in reiser4_file_fsdata on each directory modification (name
41208 + * insertion and removal); see reiser4_readdir_common() below. This doesn't
41209 + * work when reiser4 is accessed over NFS, because NFS doesn't keep any state
41210 + * across client READDIR requests for the same directory.
41211 + *
41212 + * To address this we maintain a "pool" of detached reiser4_file_fsdata
41213 + * (d_cursor). Whenever an NFS readdir request comes, we detect this and try
41214 + * to find the detached reiser4_file_fsdata corresponding to the previous
41215 + * request. In other words, additional state is maintained on the
41216 + * server. (This is somewhat contrary to the design goals of NFS protocol.)
41217 + *
41218 + * To efficiently detect when our ->readdir() method is called by the NFS
41219 + * server, the dentry is marked as "stateless" in reiser4_decode_fh() (this is
41220 + * checked by the file_is_stateless() function).
41221 + *
41222 + * To look up a d_cursor in the pool, we encode the client id (cid) in the
41223 + * highest bits of the NFS readdir cookie: when the first readdir request for
41224 + * a given directory comes from a given client, the cookie is set to 0. This
41225 + * is detected, and the global cid_counter is incremented and stored in the
41226 + * highest bits of all direntry offsets returned to the client, including the
41227 + * last one. As the only valid cookie is one obtained as direntry->offset,
41228 + * the next readdir request (continuing the current one) is guaranteed to
41229 + * carry the current cid in the highest bits of its starting cookie. All
41230 + * d_cursors are hashed into a per-super-block hash table keyed by (oid, cid).
41231 + *
41232 + * In addition, d_cursors are placed into a per-super-block radix tree where
41233 + * they are keyed by oid alone. This is necessary to efficiently remove them
41234 + * during rmdir.
41235 + *
41236 + * Finally, currently unused d_cursors are linked into a special list. This
41237 + * list is used by d_cursor_shrink to reclaim d_cursors on memory pressure.
41238 + *
41239 + */
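To make the cookie layout just described concrete, here is a minimal sketch of
the encode/decode step. The exact field split (16 bits of cid above 48 bits of
position) and the helper names are assumptions for illustration only; the real
encoding lives with the d_cursor code (fsdata.c) and may divide the 64 bits
differently.

	/* illustrative sketch; the 16/48 split is an assumption, not the
	 * actual layout used by the patch */
	static inline __u64 encode_readdir_cookie(__u64 cid, __u64 pos)
	{
		return (cid << 48) | (pos & ((1ULL << 48) - 1));
	}

	static inline __u64 cookie_cid(__u64 cookie)
	{
		/* client id recovered from the highest bits */
		return cookie >> 48;
	}

	static inline __u64 cookie_pos(__u64 cookie)
	{
		/* logical position within the directory */
		return cookie & ((1ULL << 48) - 1);
	}

A starting cookie of 0 then unambiguously means "first request from this
client": a fresh cid is drawn from the global counter, and (oid, cid) selects
the d_cursor in the per-super-block hash table.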
41240 +
41241 +/*
41242 + * prepare for readdir.
41243 + */
41244 +static int dir_readdir_init(struct file *f, tap_t * tap, readdir_pos ** pos)
41245 +{
41246 + struct inode *inode;
41247 + reiser4_file_fsdata *fsdata;
41248 + int result;
41249 +
41250 + assert("nikita-1359", f != NULL);
41251 + inode = f->f_dentry->d_inode;
41252 + assert("nikita-1360", inode != NULL);
41253 +
41254 + if (!S_ISDIR(inode->i_mode))
41255 + return RETERR(-ENOTDIR);
41256 +
41257 + /* try to find detached readdir state */
41258 + result = reiser4_attach_fsdata(f, inode);
41259 + if (result != 0)
41260 + return result;
41261 +
41262 + fsdata = reiser4_get_file_fsdata(f);
41263 + assert("nikita-2571", fsdata != NULL);
41264 + if (IS_ERR(fsdata))
41265 + return PTR_ERR(fsdata);
41266 +
41267 +	/* add file descriptor to the readdir list hanging off the directory
41268 + * inode. This list is used to scan "readdirs-in-progress" while
41269 + * inserting or removing names in the directory. */
41270 + spin_lock_inode(inode);
41271 + if (list_empty_careful(&fsdata->dir.linkage))
41272 + list_add(&fsdata->dir.linkage, get_readdir_list(inode));
41273 + *pos = &fsdata->dir.readdir;
41274 + spin_unlock_inode(inode);
41275 +
41276 + /* move @tap to the current position */
41277 + return dir_rewind(f, *pos, tap);
41278 +}
41279 +
41280 +/* this is implementation of vfs's llseek method of struct file_operations for
41281 + typical directory
41282 + See comment before reiser4_readdir_common() for explanation.
41283 +*/
41284 +loff_t reiser4_llseek_dir_common(struct file * file, loff_t off, int origin)
41285 +{
41286 + reiser4_context *ctx;
41287 + loff_t result;
41288 + struct inode *inode;
41289 +
41290 + inode = file->f_dentry->d_inode;
41291 +
41292 + ctx = reiser4_init_context(inode->i_sb);
41293 + if (IS_ERR(ctx))
41294 + return PTR_ERR(ctx);
41295 +
41296 + mutex_lock(&inode->i_mutex);
41297 +
41298 + /* update ->f_pos */
41299 + result = default_llseek(file, off, origin);
41300 + if (result >= 0) {
41301 + int ff;
41302 + coord_t coord;
41303 + lock_handle lh;
41304 + tap_t tap;
41305 + readdir_pos *pos;
41306 +
41307 + coord_init_zero(&coord);
41308 + init_lh(&lh);
41309 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
41310 +
41311 + ff = dir_readdir_init(file, &tap, &pos);
41312 + reiser4_detach_fsdata(file);
41313 + if (ff != 0)
41314 + result = (loff_t) ff;
41315 + reiser4_tap_done(&tap);
41316 + }
41317 + reiser4_detach_fsdata(file);
41318 + mutex_unlock(&inode->i_mutex);
41319 +
41320 + reiser4_exit_context(ctx);
41321 + return result;
41322 +}
41323 +
41324 +/* this is common implementation of vfs's readdir method of struct
41325 + file_operations
41326 +
41327 + readdir problems:
41328 +
41329 + readdir(2)/getdents(2) interface is based on implicit assumption that
41330 + readdir can be restarted from any particular point by supplying file system
41331 + with off_t-full of data. That is, file system fills ->d_off field in struct
41332 + dirent and later user passes ->d_off to the seekdir(3), which is, actually,
41333 + implemented by glibc as lseek(2) on directory.
41334 +
41335 +   Reiser4 cannot restart readdir from 64 bits of data, because the two last
41336 +   components of the key of a directory entry are unknown, and they comprise
41337 +   128 bits: the locality and type fields in the key of a directory entry are
41338 +   always known; to start readdir() from a given point, the objectid and offset
41339 +   fields have to be filled in.
41340 +
41341 +   Traditional UNIX API for scanning through directory
41342 +   (readdir/seekdir/telldir/opendir/closedir/rewinddir/getdents) is based on the
41343 +   assumption that a directory is structured very much like a regular file; in
41344 +   particular, it is implied that each name within a given directory (directory
41345 +   entry) can be uniquely identified by a scalar offset and that such offset is
41346 +   stable across the life-time of the name it identifies.
41347 +
41348 +   This is manifestly not so for reiser4. In reiser4 the only stable unique
41349 +   identifier of a directory entry is its key, which doesn't fit into the
41350 +   seekdir/telldir API.
41351 +
41352 + solution:
41353 +
41354 +   For each file descriptor participating in readdir-ing of a directory, a
41355 +   plugin/dir/dir.h:readdir_pos is maintained. This structure keeps track of
41356 +   the "current" directory entry that the file descriptor looks at. It contains
41357 +   a key of the directory entry (plus some additional info to deal with
41358 +   non-unique keys that we won't dwell on here) and a logical position of this
41359 +   directory entry starting from the beginning of the directory, that is, the
41360 +   ordinal number of this entry in the readdir order.
41361 +
41362 + Obviously this logical position is not stable in the face of directory
41363 + modifications. To work around this, on each addition or removal of directory
41364 + entry all file descriptors for directory inode are scanned and their
41365 + readdir_pos are updated accordingly (adjust_dir_pos()).
41366 +*/
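The shape of readdir_pos can be reconstructed from the field accesses above
(dir_rewind(), set_pos() and move_entry() all dereference it); the following is
a sketch under that assumption, not the authoritative definition, which is in
plugin/dir/dir.h:

	/* sketch reconstructed from usage in this file, not the actual
	 * definition from plugin/dir/dir.h */
	struct readdir_pos {
		__u64 fpos;     /* ->f_pos value this position corresponds to */
		__u64 entry_no; /* ordinal number of the entry in readdir order */
		struct {
			de_id dir_entry_key; /* key of the current directory entry */
			unsigned pos;        /* index within a run of entries
			                      * with equal keys */
		} position;
	};

On each name insertion or removal, adjust_dir_pos() walks such positions for
all file descriptors attached to the directory inode (the readdir list that
dir_readdir_init() fills) and shifts entry_no so the logical position stays
stable.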
41367 +int reiser4_readdir_common(struct file *f /* directory file being read */,
41368 + void *dirent /* opaque data passed to us by VFS */,
41369 + filldir_t filld /* filler function passed to us
41370 + * by VFS */)
41371 +{
41372 + reiser4_context *ctx;
41373 + int result;
41374 + struct inode *inode;
41375 + coord_t coord;
41376 + lock_handle lh;
41377 + tap_t tap;
41378 + readdir_pos *pos;
41379 +
41380 + assert("nikita-1359", f != NULL);
41381 + inode = f->f_dentry->d_inode;
41382 + assert("nikita-1360", inode != NULL);
41383 +
41384 + if (!S_ISDIR(inode->i_mode))
41385 + return RETERR(-ENOTDIR);
41386 +
41387 + ctx = reiser4_init_context(inode->i_sb);
41388 + if (IS_ERR(ctx))
41389 + return PTR_ERR(ctx);
41390 +
41391 + coord_init_zero(&coord);
41392 + init_lh(&lh);
41393 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
41394 +
41395 + reiser4_readdir_readahead_init(inode, &tap);
41396 +
41397 + repeat:
41398 + result = dir_readdir_init(f, &tap, &pos);
41399 + if (result == 0) {
41400 + result = reiser4_tap_load(&tap);
41401 + /* scan entries one by one feeding them to @filld */
41402 + while (result == 0) {
41403 + coord_t *coord;
41404 +
41405 + coord = tap.coord;
41406 + assert("nikita-2572", coord_is_existing_unit(coord));
41407 + assert("nikita-3227", is_valid_dir_coord(inode, coord));
41408 +
41409 + result = feed_entry(f, pos, &tap, filld, dirent);
41410 + if (result > 0) {
41411 + break;
41412 + } else if (result == 0) {
41413 + ++f->f_pos;
41414 + result = go_next_unit(&tap);
41415 + if (result == -E_NO_NEIGHBOR ||
41416 + result == -ENOENT) {
41417 + result = 0;
41418 + break;
41419 + } else if (result == 0) {
41420 + if (is_valid_dir_coord(inode, coord))
41421 + move_entry(pos, coord);
41422 + else
41423 + break;
41424 + }
41425 + } else if (result == -E_REPEAT) {
41426 + /* feed_entry() had to restart. */
41427 + ++f->f_pos;
41428 + reiser4_tap_relse(&tap);
41429 + goto repeat;
41430 + } else
41431 + warning("vs-1617",
41432 + "reiser4_readdir_common: unexpected error %d",
41433 + result);
41434 + }
41435 + reiser4_tap_relse(&tap);
41436 +
41437 + if (result >= 0)
41438 + f->f_version = inode->i_version;
41439 + } else if (result == -E_NO_NEIGHBOR || result == -ENOENT)
41440 + result = 0;
41441 + reiser4_tap_done(&tap);
41442 + reiser4_detach_fsdata(f);
41443 +
41444 + /* try to update directory's atime */
41445 + if (reiser4_grab_space_force(inode_file_plugin(inode)->estimate.update(inode),
41446 + BA_CAN_COMMIT) != 0)
41447 + warning("", "failed to update atime on readdir: %llu",
41448 + get_inode_oid(inode));
41449 + else
41450 + file_accessed(f);
41451 +
41452 + context_set_commit_async(ctx);
41453 + reiser4_exit_context(ctx);
41454 +
41455 + return (result <= 0) ? result : 0;
41456 +}
41457 +
41458 +/*
41459 + * Local variables:
41460 + * c-indentation-style: "K&R"
41461 + * mode-name: "LC"
41462 + * c-basic-offset: 8
41463 + * tab-width: 8
41464 + * fill-column: 79
41465 + * End:
41466 + */
41467 diff --git a/fs/reiser4/plugin/file_plugin_common.c b/fs/reiser4/plugin/file_plugin_common.c
41468 new file mode 100644
41469 index 0000000..55d9047
41470 --- /dev/null
41471 +++ b/fs/reiser4/plugin/file_plugin_common.c
41472 @@ -0,0 +1,1007 @@
41473 +/* Copyright 2005 by Hans Reiser, licensing governed by
41474 + reiser4/README */
41475 +
41476 +/* this file contains typical implementations for most of the methods of
41477 + file plugin
41478 +*/
41479 +
41480 +#include "../inode.h"
41481 +#include "object.h"
41482 +#include "../safe_link.h"
41483 +
41484 +#include <linux/quotaops.h>
41485 +
41486 +static int insert_new_sd(struct inode *inode);
41487 +static int update_sd(struct inode *inode);
41488 +
41489 +/* this is common implementation of write_sd_by_inode method of file plugin
41490 +   it either inserts stat data or updates it
41491 + */
41492 +int write_sd_by_inode_common(struct inode *inode /* object to save */ )
41493 +{
41494 + int result;
41495 +
41496 + assert("nikita-730", inode != NULL);
41497 +
41498 + if (reiser4_inode_get_flag(inode, REISER4_NO_SD))
41499 + /* object doesn't have stat-data yet */
41500 + result = insert_new_sd(inode);
41501 + else
41502 + result = update_sd(inode);
41503 + if (result != 0 && result != -ENAMETOOLONG && result != -ENOMEM)
41504 + /* Don't issue warnings about "name is too long" */
41505 + warning("nikita-2221", "Failed to save sd for %llu: %i",
41506 + (unsigned long long)get_inode_oid(inode), result);
41507 + return result;
41508 +}
41509 +
41510 +/* this is common implementation of key_by_inode method of file plugin
41511 + */
41512 +int
41513 +key_by_inode_and_offset_common(struct inode *inode, loff_t off,
41514 + reiser4_key * key)
41515 +{
41516 + reiser4_key_init(key);
41517 + set_key_locality(key, reiser4_inode_data(inode)->locality_id);
41518 + set_key_ordering(key, get_inode_ordering(inode));
41519 + set_key_objectid(key, get_inode_oid(inode)); /*FIXME: inode->i_ino */
41520 + set_key_type(key, KEY_BODY_MINOR);
41521 + set_key_offset(key, (__u64) off);
41522 + return 0;
41523 +}
41524 +
41525 +/* this is common implementation of set_plug_in_inode method of file plugin
41526 + */
41527 +int set_plug_in_inode_common(struct inode *object /* inode to set plugin on */ ,
41528 + struct inode *parent /* parent object */ ,
41529 + reiser4_object_create_data * data /* creational
41530 + * data */ )
41531 +{
41532 + __u64 mask;
41533 +
41534 + object->i_mode = data->mode;
41535 + /* this should be plugin decision */
41536 + object->i_uid = current->fsuid;
41537 + object->i_mtime = object->i_atime = object->i_ctime = CURRENT_TIME;
41538 +
41539 + /* support for BSD style group-id assignment. See mount's manual page
41540 + description of bsdgroups ext2 mount options for more details */
41541 + if (reiser4_is_set(object->i_sb, REISER4_BSD_GID))
41542 + object->i_gid = parent->i_gid;
41543 + else if (parent->i_mode & S_ISGID) {
41544 +		/* parent directory has the sgid bit */
41545 + object->i_gid = parent->i_gid;
41546 + if (S_ISDIR(object->i_mode))
41547 +			/* sgid is inherited by sub-directories */
41548 + object->i_mode |= S_ISGID;
41549 + } else
41550 + object->i_gid = current->fsgid;
41551 +
41552 + /* this object doesn't have stat-data yet */
41553 + reiser4_inode_set_flag(object, REISER4_NO_SD);
41554 +#if 0
41555 + /* this is now called after all inode plugins are initialized:
41556 + do_create_vfs_child after adjust_to_parent */
41557 + /* setup inode and file-operations for this inode */
41558 + setup_inode_ops(object, data);
41559 +#endif
41560 + object->i_nlink = 0;
41561 + reiser4_seal_init(&reiser4_inode_data(object)->sd_seal, NULL, NULL);
41562 + mask = (1 << UNIX_STAT) | (1 << LIGHT_WEIGHT_STAT);
41563 + if (!reiser4_is_set(object->i_sb, REISER4_32_BIT_TIMES))
41564 + mask |= (1 << LARGE_TIMES_STAT);
41565 +
41566 + reiser4_inode_data(object)->extmask = mask;
41567 + return 0;
41568 +}
41569 +
41570 +/* this is common implementation of adjust_to_parent method of file plugin for
41571 + regular files
41572 + */
41573 +int adjust_to_parent_common(struct inode *object /* new object */ ,
41574 + struct inode *parent /* parent directory */ ,
41575 + struct inode *root /* root directory */ )
41576 +{
41577 + assert("nikita-2165", object != NULL);
41578 + if (parent == NULL)
41579 + parent = root;
41580 + assert("nikita-2069", parent != NULL);
41581 +
41582 + /*
41583 + * inherit missing plugins from parent
41584 + */
41585 +
41586 + grab_plugin_pset(object, parent, PSET_FILE);
41587 + grab_plugin_pset(object, parent, PSET_SD);
41588 + grab_plugin_pset(object, parent, PSET_FORMATTING);
41589 + grab_plugin_pset(object, parent, PSET_PERM);
41590 + return 0;
41591 +}
41592 +
41593 +/* this is common implementation of adjust_to_parent method of file plugin for
41594 + typical directories
41595 + */
41596 +int adjust_to_parent_common_dir(struct inode *object /* new object */ ,
41597 + struct inode *parent /* parent directory */ ,
41598 + struct inode *root /* root directory */ )
41599 +{
41600 + int result = 0;
41601 + pset_member memb;
41602 +
41603 + assert("nikita-2166", object != NULL);
41604 + if (parent == NULL)
41605 + parent = root;
41606 + assert("nikita-2167", parent != NULL);
41607 +
41608 + /*
41609 + * inherit missing plugins from parent
41610 + */
41611 + for (memb = 0; memb < PSET_LAST; ++memb) {
41612 + result = grab_plugin_pset(object, parent, memb);
41613 + if (result != 0)
41614 + break;
41615 + }
41616 + return result;
41617 +}
41618 +
41619 +int adjust_to_parent_cryptcompress(struct inode *object /* new object */ ,
41620 + struct inode *parent /* parent directory */,
41621 + struct inode *root /* root directory */)
41622 +{
41623 + int result;
41624 + result = adjust_to_parent_common(object, parent, root);
41625 + if (result)
41626 + return result;
41627 + assert("edward-1416", parent != NULL);
41628 +
41629 + grab_plugin_pset(object, parent, PSET_CLUSTER);
41630 + grab_plugin_pset(object, parent, PSET_CIPHER);
41631 + grab_plugin_pset(object, parent, PSET_DIGEST);
41632 + grab_plugin_pset(object, parent, PSET_COMPRESSION);
41633 + grab_plugin_pset(object, parent, PSET_COMPRESSION_MODE);
41634 +
41635 + return 0;
41636 +}
41637 +
41638 +/* this is common implementation of create_object method of file plugin
41639 + */
41640 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
41641 + reiser4_object_create_data * data)
41642 +{
41643 + reiser4_block_nr reserve;
41644 + assert("nikita-744", object != NULL);
41645 + assert("nikita-745", parent != NULL);
41646 + assert("nikita-747", data != NULL);
41647 + assert("nikita-748", reiser4_inode_get_flag(object, REISER4_NO_SD));
41648 +
41649 + reserve = estimate_create_common(object);
41650 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
41651 + return RETERR(-ENOSPC);
41652 + return write_sd_by_inode_common(object);
41653 +}
41654 +
41655 +static int common_object_delete_no_reserve(struct inode *inode);
41656 +
41657 +/**
41658 + * reiser4_delete_object_common - delete_object of file_plugin
41659 + * @inode: inode to be deleted
41660 + *
41661 + * This is common implementation of delete_object method of file_plugin. It
41662 + * applies to objects whose deletion consists of removing two items - stat
41663 + * data and safe-link.
41664 + */
41665 +int reiser4_delete_object_common(struct inode *inode)
41666 +{
41667 + int result;
41668 +
41669 + assert("nikita-1477", inode != NULL);
41670 + /* FIXME: if file body deletion failed (i/o error, for instance),
41671 + inode->i_size can be != 0 here */
41672 + assert("nikita-3420", inode->i_size == 0 || S_ISLNK(inode->i_mode));
41673 + assert("nikita-3421", inode->i_nlink == 0);
41674 +
41675 + if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
41676 + reiser4_block_nr reserve;
41677 +
41678 + /* grab space which is needed to remove 2 items from the tree:
41679 + stat data and safe-link */
41680 + reserve = 2 *
41681 + estimate_one_item_removal(reiser4_tree_by_inode(inode));
41682 + if (reiser4_grab_space_force(reserve,
41683 + BA_RESERVED | BA_CAN_COMMIT))
41684 + return RETERR(-ENOSPC);
41685 + result = common_object_delete_no_reserve(inode);
41686 + } else
41687 + result = 0;
41688 + return result;
41689 +}
41690 +
41691 +/**
41692 + * reiser4_delete_dir_common - delete_object of file_plugin
41693 + * @inode: inode to be deleted
41694 + *
41695 + * This is common implementation of delete_object method of file_plugin for
41696 + * typical directory. It calls done method of dir_plugin to remove "." and
41697 + * removes stat data and safe-link.
41698 + */
41699 +int reiser4_delete_dir_common(struct inode *inode)
41700 +{
41701 + int result;
41702 + dir_plugin *dplug;
41703 +
41704 + assert("", (get_current_context() &&
41705 + get_current_context()->trans->atom == NULL));
41706 +
41707 + dplug = inode_dir_plugin(inode);
41708 + assert("vs-1101", dplug && dplug->done);
41709 +
41710 + /* kill cursors which might be attached to inode */
41711 + reiser4_kill_cursors(inode);
41712 +
41713 +	/* grab enough space for removing two items */
41714 + if (reiser4_grab_space
41715 + (2 * estimate_one_item_removal(reiser4_tree_by_inode(inode)),
41716 + BA_RESERVED | BA_CAN_COMMIT))
41717 + return RETERR(-ENOSPC);
41718 +
41719 + result = dplug->done(inode);
41720 + if (!result)
41721 + result = common_object_delete_no_reserve(inode);
41722 + return result;
41723 +}
41724 +
41725 +/* this is common implementation of add_link method of file plugin
41726 + */
41727 +int reiser4_add_link_common(struct inode *object, struct inode *parent)
41728 +{
41729 + /*
41730 + * increment ->i_nlink and update ->i_ctime
41731 + */
41732 +
41733 + INODE_INC_FIELD(object, i_nlink);
41734 + object->i_ctime = CURRENT_TIME;
41735 + return 0;
41736 +}
41737 +
41738 +/* this is common implementation of rem_link method of file plugin
41739 + */
41740 +int reiser4_rem_link_common(struct inode *object, struct inode *parent)
41741 +{
41742 + assert("nikita-2021", object != NULL);
41743 + assert("nikita-2163", object->i_nlink > 0);
41744 +
41745 + /*
41746 + * decrement ->i_nlink and update ->i_ctime
41747 + */
41748 +
41749 + INODE_DEC_FIELD(object, i_nlink);
41750 + object->i_ctime = CURRENT_TIME;
41751 + return 0;
41752 +}
41753 +
41754 +/* this is common implementation of rem_link method of file plugin for typical
41755 + directory
41756 +*/
41757 +int rem_link_common_dir(struct inode *object, struct inode *parent UNUSED_ARG)
41758 +{
41759 + assert("nikita-20211", object != NULL);
41760 + assert("nikita-21631", object->i_nlink > 0);
41761 +
41762 + /*
41763 + * decrement ->i_nlink and update ->i_ctime
41764 + */
41765 + INODE_DEC_FIELD(object, i_nlink);
41766 + if (object->i_nlink == 1)
41767 + INODE_DEC_FIELD(object, i_nlink);
41768 + object->i_ctime = CURRENT_TIME;
41769 + return 0;
41770 +}
41771 +
41772 +/* this is common implementation of owns_item method of file plugin
41773 + compare objectids of keys in inode and coord */
41774 +int owns_item_common(const struct inode *inode, /* object to check
41775 + * against */
41776 + const coord_t * coord /* coord to check */ )
41777 +{
41778 + reiser4_key item_key;
41779 + reiser4_key file_key;
41780 +
41781 + assert("nikita-760", inode != NULL);
41782 + assert("nikita-761", coord != NULL);
41783 +
41784 + return coord_is_existing_item(coord) &&
41785 + (get_key_objectid(build_sd_key(inode, &file_key)) ==
41786 + get_key_objectid(item_key_by_coord(coord, &item_key)));
41787 +}
41788 +
41789 +/* this is common implementation of owns_item method of file plugin
41790 + for typical directory
41791 +*/
41792 +int owns_item_common_dir(const struct inode *inode, /* object to check against */
41793 + const coord_t * coord /* coord of item to check */ )
41794 +{
41795 + reiser4_key item_key;
41796 +
41797 + assert("nikita-1335", inode != NULL);
41798 + assert("nikita-1334", coord != NULL);
41799 +
41800 + if (plugin_of_group(item_plugin_by_coord(coord), DIR_ENTRY_ITEM_TYPE))
41801 + return get_key_locality(item_key_by_coord(coord, &item_key)) ==
41802 + get_inode_oid(inode);
41803 + else
41804 + return owns_item_common(inode, coord);
41805 +}
41806 +
41807 +/* this is common implementation of can_add_link method of file plugin
41808 +   checks whether yet another hard link to this object can be added
41809 +*/
41810 +int can_add_link_common(const struct inode *object /* object to check */ )
41811 +{
41812 + assert("nikita-732", object != NULL);
41813 +
41814 + /* inode->i_nlink is unsigned int, so just check for integer
41815 + overflow */
41816 + return object->i_nlink + 1 != 0;
41817 +}
41818 +
41819 +/* this is common implementation of can_rem_link method of file plugin for
41820 + typical directory
41821 +*/
41822 +int can_rem_link_common_dir(const struct inode *inode)
41823 +{
41824 +	/* is_dir_empty() returns 0 if dir is empty */
41825 + return !is_dir_empty(inode);
41826 +}
41827 +
41828 +/* this is common implementation of detach method of file plugin for typical
41829 + directory
41830 +*/
41831 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent)
41832 +{
41833 + dir_plugin *dplug;
41834 +
41835 + dplug = inode_dir_plugin(child);
41836 + assert("nikita-2883", dplug != NULL);
41837 + assert("nikita-2884", dplug->detach != NULL);
41838 + return dplug->detach(child, parent);
41839 +}
41840 +
41841 +/* this is common implementation of bind method of file plugin for typical
41842 + directory
41843 +*/
41844 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent)
41845 +{
41846 + dir_plugin *dplug;
41847 +
41848 + dplug = inode_dir_plugin(child);
41849 + assert("nikita-2646", dplug != NULL);
41850 + return dplug->attach(child, parent);
41851 +}
41852 +
41853 +static int process_truncate(struct inode *, __u64 size);
41854 +
41855 +/* this is common implementation of safelink method of file plugin
41856 + */
41857 +int safelink_common(struct inode *object, reiser4_safe_link_t link, __u64 value)
41858 +{
41859 + int result;
41860 +
41861 + assert("vs-1705", get_current_context()->trans->atom == NULL);
41862 + if (link == SAFE_UNLINK)
41863 + /* nothing to do. iput() in the caller (process_safelink) will
41864 + * finish with file */
41865 + result = 0;
41866 + else if (link == SAFE_TRUNCATE)
41867 + result = process_truncate(object, value);
41868 + else {
41869 + warning("nikita-3438", "Unrecognized safe-link type: %i", link);
41870 + result = RETERR(-EIO);
41871 + }
41872 + return result;
41873 +}
41874 +
41875 +/* this is common implementation of estimate.create method of file plugin
41876 + can be used when object creation involves insertion of one item (usually stat
41877 + data) into tree
41878 +*/
41879 +reiser4_block_nr estimate_create_common(const struct inode * object)
41880 +{
41881 + return estimate_one_insert_item(reiser4_tree_by_inode(object));
41882 +}
41883 +
41884 +/* this is common implementation of estimate.create method of file plugin for
41885 + typical directory
41886 + can be used when directory creation involves insertion of two items (usually
41887 + stat data and item containing "." and "..") into tree
41888 +*/
41889 +reiser4_block_nr estimate_create_common_dir(const struct inode * object)
41890 +{
41891 + return 2 * estimate_one_insert_item(reiser4_tree_by_inode(object));
41892 +}
41893 +
41894 +/* this is common implementation of estimate.update method of file plugin
41895 + can be used when stat data update does not do more than inserting a unit
41896 +   into a stat data item, which is probably true for most cases
41897 +*/
41898 +reiser4_block_nr estimate_update_common(const struct inode * inode)
41899 +{
41900 + return estimate_one_insert_into_item(reiser4_tree_by_inode(inode));
41901 +}
41902 +
41903 +/* this is common implementation of estimate.unlink method of file plugin
41904 + */
41905 +reiser4_block_nr
41906 +estimate_unlink_common(const struct inode * object UNUSED_ARG,
41907 + const struct inode * parent UNUSED_ARG)
41908 +{
41909 + return 0;
41910 +}
41911 +
41912 +/* this is common implementation of estimate.unlink method of file plugin for
41913 + typical directory
41914 +*/
41915 +reiser4_block_nr
41916 +estimate_unlink_common_dir(const struct inode * object,
41917 + const struct inode * parent)
41918 +{
41919 + dir_plugin *dplug;
41920 +
41921 + dplug = inode_dir_plugin(object);
41922 + assert("nikita-2888", dplug != NULL);
41923 + assert("nikita-2887", dplug->estimate.unlink != NULL);
41924 + return dplug->estimate.unlink(object, parent);
41925 +}
41926 +
41927 +char *wire_write_common(struct inode *inode, char *start)
41928 +{
41929 + return build_inode_onwire(inode, start);
41930 +}
41931 +
41932 +char *wire_read_common(char *addr, reiser4_object_on_wire * obj)
41933 +{
41934 + return extract_obj_key_id_from_onwire(addr, &obj->u.std.key_id);
41935 +}
41936 +
41937 +struct dentry *wire_get_common(struct super_block *sb,
41938 + reiser4_object_on_wire * obj)
41939 +{
41940 + struct inode *inode;
41941 + struct dentry *dentry;
41942 + reiser4_key key;
41943 +
41944 + extract_key_from_id(&obj->u.std.key_id, &key);
41945 + inode = reiser4_iget(sb, &key, 1);
41946 + if (!IS_ERR(inode)) {
41947 + reiser4_iget_complete(inode);
41948 + dentry = d_alloc_anon(inode);
41949 + if (dentry == NULL) {
41950 + iput(inode);
41951 + dentry = ERR_PTR(-ENOMEM);
41952 + } else
41953 + dentry->d_op = &get_super_private(sb)->ops.dentry;
41954 + } else if (PTR_ERR(inode) == -ENOENT)
41955 + /*
41956 + * inode wasn't found at the key encoded in the file
41957 + * handle. Hence, file handle is stale.
41958 + */
41959 + dentry = ERR_PTR(RETERR(-ESTALE));
41960 + else
41961 + dentry = (void *)inode;
41962 + return dentry;
41963 +}
41964 +
41965 +int wire_size_common(struct inode *inode)
41966 +{
41967 + return inode_onwire_size(inode);
41968 +}
41969 +
41970 +void wire_done_common(reiser4_object_on_wire * obj)
41971 +{
41972 + /* nothing to do */
41973 +}
41974 +
41975 +/* helper function to print errors */
41976 +static void key_warning(const reiser4_key * key /* key to print */ ,
41977 + const struct inode *inode,
41978 + int code /* error code to print */ )
41979 +{
41980 + assert("nikita-716", key != NULL);
41981 +
41982 + if (code != -ENOMEM) {
41983 + warning("nikita-717", "Error for inode %llu (%i)",
41984 + (unsigned long long)get_key_objectid(key), code);
41985 + reiser4_print_key("for key", key);
41986 + }
41987 +}
41988 +
41989 +/* NIKITA-FIXME-HANS: perhaps this function belongs in another file? */
41990 +#if REISER4_DEBUG
41991 +static void
41992 +check_inode_seal(const struct inode *inode,
41993 + const coord_t * coord, const reiser4_key * key)
41994 +{
41995 + reiser4_key unit_key;
41996 +
41997 + unit_key_by_coord(coord, &unit_key);
41998 + assert("nikita-2752",
41999 + WITH_DATA_RET(coord->node, 1, keyeq(key, &unit_key)));
42000 + assert("nikita-2753", get_inode_oid(inode) == get_key_objectid(key));
42001 +}
42002 +
42003 +static void check_sd_coord(coord_t * coord, const reiser4_key * key)
42004 +{
42005 + reiser4_key ukey;
42006 +
42007 + coord_clear_iplug(coord);
42008 + if (zload(coord->node))
42009 + return;
42010 +
42011 + if (!coord_is_existing_unit(coord) ||
42012 + !item_plugin_by_coord(coord) ||
42013 + !keyeq(unit_key_by_coord(coord, &ukey), key) ||
42014 + (znode_get_level(coord->node) != LEAF_LEVEL) ||
42015 + !item_is_statdata(coord)) {
42016 + warning("nikita-1901", "Conspicuous seal");
42017 + reiser4_print_key("key", key);
42018 + print_coord("coord", coord, 1);
42019 + impossible("nikita-2877", "no way");
42020 + }
42021 + zrelse(coord->node);
42022 +}
42023 +
42024 +#else
42025 +#define check_inode_seal(inode, coord, key) noop
42026 +#define check_sd_coord(coord, key) noop
42027 +#endif
42028 +
42029 +/* insert new stat-data into tree. Called with inode state
42030 + locked. Return inode state locked. */
42031 +static int insert_new_sd(struct inode *inode /* inode to create sd for */ )
42032 +{
42033 + int result;
42034 + reiser4_key key;
42035 + coord_t coord;
42036 + reiser4_item_data data;
42037 + char *area;
42038 + reiser4_inode *ref;
42039 + lock_handle lh;
42040 + oid_t oid;
42041 +
42042 + assert("nikita-723", inode != NULL);
42043 + assert("nikita-3406", reiser4_inode_get_flag(inode, REISER4_NO_SD));
42044 +
42045 + ref = reiser4_inode_data(inode);
42046 + spin_lock_inode(inode);
42047 +
42048 + if (ref->plugin_mask != 0)
42049 + /* inode has non-standard plugins */
42050 + inode_set_extension(inode, PLUGIN_STAT);
42051 + /*
42052 + * prepare specification of new item to be inserted
42053 + */
42054 +
42055 + data.iplug = inode_sd_plugin(inode);
42056 + data.length = data.iplug->s.sd.save_len(inode);
42057 + spin_unlock_inode(inode);
42058 +
42059 + data.data = NULL;
42060 + data.user = 0;
42061 +/* could be optimized for case where there is only one node format in
42062 + * use in the filesystem, probably there are lots of such
42063 + * places we could optimize for only one node layout.... -Hans */
42064 + if (data.length > reiser4_tree_by_inode(inode)->nplug->max_item_size()){
42065 +		/* This is a silly check, but we don't know the actual node
42066 +		   the insertion will go into. */
42067 + return RETERR(-ENAMETOOLONG);
42068 + }
42069 + oid = oid_allocate(inode->i_sb);
42070 +/* NIKITA-FIXME-HANS: what is your opinion on whether this error check should be encapsulated into oid_allocate? */
42071 + if (oid == ABSOLUTE_MAX_OID)
42072 + return RETERR(-EOVERFLOW);
42073 +
42074 + set_inode_oid(inode, oid);
42075 +
42076 + coord_init_zero(&coord);
42077 + init_lh(&lh);
42078 +
42079 + result = insert_by_key(reiser4_tree_by_inode(inode),
42080 + build_sd_key(inode, &key), &data, &coord, &lh,
42081 + /* stat data lives on a leaf level */
42082 + LEAF_LEVEL, CBK_UNIQUE);
42083 +
42084 + /* we don't want to re-check that somebody didn't insert
42085 +	   stat-data while we were doing io, because if somebody did,
42086 +	   insert_by_key() would have returned an error. */
42087 + /* but what _is_ possible is that plugin for inode's stat-data,
42088 + list of non-standard plugins or their state would change
42089 + during io, so that stat-data wouldn't fit into sd. To avoid
42090 + this race we keep inode_state lock. This lock has to be
42091 + taken each time you access inode in a way that would cause
42092 + changes in sd size: changing plugins etc.
42093 + */
42094 +
42095 + if (result == IBK_INSERT_OK) {
42096 + coord_clear_iplug(&coord);
42097 + result = zload(coord.node);
42098 + if (result == 0) {
42099 + /* have we really inserted stat data? */
42100 + assert("nikita-725", item_is_statdata(&coord));
42101 +
42102 + /* inode was just created. It is inserted into hash
42103 + table, but no directory entry was yet inserted into
42104 + parent. So, inode is inaccessible through
42105 + ->lookup(). All places that directly grab inode
42106 + from hash-table (like old knfsd), should check
42107 + IMMUTABLE flag that is set by common_create_child.
42108 + */
42109 + assert("nikita-3240", data.iplug != NULL);
42110 + assert("nikita-3241", data.iplug->s.sd.save != NULL);
42111 + area = item_body_by_coord(&coord);
42112 + result = data.iplug->s.sd.save(inode, &area);
42113 + znode_make_dirty(coord.node);
42114 + if (result == 0) {
42115 + /* object has stat-data now */
42116 + reiser4_inode_clr_flag(inode, REISER4_NO_SD);
42117 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
42118 + /* initialise stat-data seal */
42119 + reiser4_seal_init(&ref->sd_seal, &coord, &key);
42120 + ref->sd_coord = coord;
42121 + check_inode_seal(inode, &coord, &key);
42122 + } else if (result != -ENOMEM)
42123 + /*
42124 + * convert any other error code to -EIO to
42125 + * avoid confusing user level with unexpected
42126 + * errors.
42127 + */
42128 + result = RETERR(-EIO);
42129 + zrelse(coord.node);
42130 + }
42131 + }
42132 + done_lh(&lh);
42133 +
42134 + if (result != 0)
42135 + key_warning(&key, inode, result);
42136 + else
42137 + oid_count_allocated();
42138 +
42139 + return result;
42140 +}
42141 +
42142 +/* find sd of inode in a tree, deal with errors */
42143 +int lookup_sd(struct inode *inode /* inode to look sd for */ ,
42144 + znode_lock_mode lock_mode /* lock mode */ ,
42145 + coord_t * coord /* resulting coord */ ,
42146 + lock_handle * lh /* resulting lock handle */ ,
42147 + const reiser4_key * key /* resulting key */ ,
42148 + int silent)
42149 +{
42150 + int result;
42151 + __u32 flags;
42152 +
42153 + assert("nikita-1692", inode != NULL);
42154 + assert("nikita-1693", coord != NULL);
42155 + assert("nikita-1694", key != NULL);
42156 +
42157 + /* look for the object's stat data in a tree.
42158 +	   This returns in "node" a pointer to a locked znode and in "pos"
42159 +	   the position of an item found in the node. Both are only valid if
42160 + coord_found is returned. */
42161 + flags = (lock_mode == ZNODE_WRITE_LOCK) ? CBK_FOR_INSERT : 0;
42162 + flags |= CBK_UNIQUE;
42163 + /*
42164 + * traverse tree to find stat data. We cannot use vroot here, because
42165 + * it only covers _body_ of the file, and stat data don't belong
42166 + * there.
42167 + */
42168 + result = coord_by_key(reiser4_tree_by_inode(inode),
42169 + key,
42170 + coord,
42171 + lh,
42172 + lock_mode,
42173 + FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL, flags, NULL);
42174 + if (REISER4_DEBUG && result == 0)
42175 + check_sd_coord(coord, key);
42176 +
42177 + if (result != 0 && !silent)
42178 + key_warning(key, inode, result);
42179 + return result;
42180 +}
42181 +
42182 +static int
42183 +locate_inode_sd(struct inode *inode,
42184 + reiser4_key * key, coord_t * coord, lock_handle * lh)
42185 +{
42186 + reiser4_inode *state;
42187 + seal_t seal;
42188 + int result;
42189 +
42190 + assert("nikita-3483", inode != NULL);
42191 +
42192 + state = reiser4_inode_data(inode);
42193 + spin_lock_inode(inode);
42194 + *coord = state->sd_coord;
42195 + coord_clear_iplug(coord);
42196 + seal = state->sd_seal;
42197 + spin_unlock_inode(inode);
42198 +
42199 + build_sd_key(inode, key);
42200 + if (reiser4_seal_is_set(&seal)) {
42201 + /* first, try to use seal */
42202 + result = reiser4_seal_validate(&seal,
42203 + coord,
42204 + key,
42205 + lh, ZNODE_WRITE_LOCK,
42206 + ZNODE_LOCK_LOPRI);
42207 + if (result == 0)
42208 + check_sd_coord(coord, key);
42209 + } else
42210 + result = -E_REPEAT;
42211 +
42212 + if (result != 0) {
42213 + coord_init_zero(coord);
42214 + result = lookup_sd(inode, ZNODE_WRITE_LOCK, coord, lh, key, 0);
42215 + }
42216 + return result;
42217 +}
42218 +
42219 +#if REISER4_DEBUG
42220 +static int all_but_offset_key_eq(const reiser4_key * k1, const reiser4_key * k2)
42221 +{
42222 + return (get_key_locality(k1) == get_key_locality(k2) &&
42223 + get_key_type(k1) == get_key_type(k2) &&
42224 + get_key_band(k1) == get_key_band(k2) &&
42225 + get_key_ordering(k1) == get_key_ordering(k2) &&
42226 + get_key_objectid(k1) == get_key_objectid(k2));
42227 +}
42228 +
42229 +#include "../tree_walk.h"
42230 +
42231 +/* make some checks before and after stat-data resize operation */
42232 +static int check_sd_resize(struct inode * inode, coord_t * coord,
42233 + int length, int progress /* 1 means after resize */)
42234 +{
42235 + int ret = 0;
42236 + lock_handle left_lock;
42237 + coord_t left_coord;
42238 + reiser4_key left_key;
42239 + reiser4_key key;
42240 +
42241 + if (inode_file_plugin(inode) !=
42242 + file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID))
42243 + return 0;
42244 + if (!length)
42245 + return 0;
42246 + if (coord->item_pos != 0)
42247 + return 0;
42248 +
42249 + init_lh(&left_lock);
42250 + ret = reiser4_get_left_neighbor(&left_lock,
42251 + coord->node,
42252 + ZNODE_WRITE_LOCK,
42253 + GN_CAN_USE_UPPER_LEVELS);
42254 + if (ret == -E_REPEAT || ret == -E_NO_NEIGHBOR ||
42255 + ret == -ENOENT || ret == -EINVAL
42256 + || ret == -E_DEADLOCK) {
42257 + ret = 0;
42258 + goto exit;
42259 + }
42260 + ret = zload(left_lock.node);
42261 + if (ret)
42262 + goto exit;
42263 + coord_init_last_unit(&left_coord, left_lock.node);
42264 + item_key_by_coord(&left_coord, &left_key);
42265 + item_key_by_coord(coord, &key);
42266 +
42267 + if (all_but_offset_key_eq(&key, &left_key))
42268 +		/* corruption occurred */
42269 + ret = 1;
42270 + zrelse(left_lock.node);
42271 + exit:
42272 + done_lh(&left_lock);
42273 + return ret;
42274 +}
42275 +#endif
42276 +
42277 +/* update stat-data at @coord */
42278 +static int
42279 +update_sd_at(struct inode *inode, coord_t * coord, reiser4_key * key,
42280 + lock_handle * lh)
42281 +{
42282 + int result;
42283 + reiser4_item_data data;
42284 + char *area;
42285 + reiser4_inode *state;
42286 + znode *loaded;
42287 +
42288 + state = reiser4_inode_data(inode);
42289 +
42290 + coord_clear_iplug(coord);
42291 + result = zload(coord->node);
42292 + if (result != 0)
42293 + return result;
42294 + loaded = coord->node;
42295 +
42296 + spin_lock_inode(inode);
42297 + assert("nikita-728", inode_sd_plugin(inode) != NULL);
42298 + data.iplug = inode_sd_plugin(inode);
42299 +
42300 + /* if inode has non-standard plugins, add appropriate stat data
42301 + * extension */
42302 + if (state->extmask & (1 << PLUGIN_STAT)) {
42303 + if (state->plugin_mask == 0)
42304 + inode_clr_extension(inode, PLUGIN_STAT);
42305 + } else if (state->plugin_mask != 0)
42306 + inode_set_extension(inode, PLUGIN_STAT);
42307 +
42308 + if (state->extmask & (1 << HEIR_STAT)) {
42309 + if (state->heir_mask == 0)
42310 + inode_clr_extension(inode, HEIR_STAT);
42311 + } else if (state->heir_mask != 0)
42312 + inode_set_extension(inode, HEIR_STAT);
42313 +
42314 + /* data.length is how much space to add to (or remove
42315 + from if negative) sd */
42316 + if (!reiser4_inode_get_flag(inode, REISER4_SDLEN_KNOWN)) {
42317 + /* recalculate stat-data length */
42318 + data.length =
42319 + data.iplug->s.sd.save_len(inode) -
42320 + item_length_by_coord(coord);
42321 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
42322 + } else
42323 + data.length = 0;
42324 + spin_unlock_inode(inode);
42325 +
42326 + /* if on-disk stat data is of different length than required
42327 + for this inode, resize it */
42328 +
42329 + if (data.length != 0) {
42330 + data.data = NULL;
42331 + data.user = 0;
42332 +
42333 + assert("edward-1441",
42334 + !check_sd_resize(inode, coord,
42335 + data.length, 0/* before resize */));
42336 +
42337 +		/* insertion code requires that the insertion point (coord) be
42338 +		 * between units. */
42339 + coord->between = AFTER_UNIT;
42340 + result = reiser4_resize_item(coord, &data, key, lh,
42341 + COPI_DONT_SHIFT_LEFT);
42342 + if (result != 0) {
42343 + key_warning(key, inode, result);
42344 + zrelse(loaded);
42345 + return result;
42346 + }
42347 + if (loaded != coord->node) {
42348 + /* reiser4_resize_item moved coord to another node.
42349 + Zload it */
42350 + zrelse(loaded);
42351 + coord_clear_iplug(coord);
42352 + result = zload(coord->node);
42353 + if (result != 0)
42354 + return result;
42355 + loaded = coord->node;
42356 + }
42357 + assert("edward-1442",
42358 + !check_sd_resize(inode, coord,
42359 + data.length, 1/* after resize */));
42360 + }
42361 + area = item_body_by_coord(coord);
42362 + spin_lock_inode(inode);
42363 + result = data.iplug->s.sd.save(inode, &area);
42364 + znode_make_dirty(coord->node);
42365 +
42366 + /* re-initialise stat-data seal */
42367 +
42368 + /*
42369 + * coord.between was possibly skewed from AT_UNIT when stat-data size
42370 + * was changed and new extensions were pasted into item.
42371 + */
42372 + coord->between = AT_UNIT;
42373 + reiser4_seal_init(&state->sd_seal, coord, key);
42374 + state->sd_coord = *coord;
42375 + spin_unlock_inode(inode);
42376 + check_inode_seal(inode, coord, key);
42377 + zrelse(loaded);
42378 + return result;
42379 +}
42380 +
42381 +/* Update existing stat-data in a tree. Called with inode state locked. Return
42382 + inode state locked. */
42383 +static int update_sd(struct inode *inode /* inode to update sd for */ )
42384 +{
42385 + int result;
42386 + reiser4_key key;
42387 + coord_t coord;
42388 + lock_handle lh;
42389 +
42390 + assert("nikita-726", inode != NULL);
42391 +
42392 + /* no stat-data, nothing to update?! */
42393 + assert("nikita-3482", !reiser4_inode_get_flag(inode, REISER4_NO_SD));
42394 +
42395 + init_lh(&lh);
42396 +
42397 + result = locate_inode_sd(inode, &key, &coord, &lh);
42398 + if (result == 0)
42399 + result = update_sd_at(inode, &coord, &key, &lh);
42400 + done_lh(&lh);
42401 +
42402 + return result;
42403 +}
42404 +
42405 +/* helper for reiser4_delete_object_common and reiser4_delete_dir_common.
42406 +   Remove object stat data. Space for that must be reserved by the caller beforehand.
42407 +*/
42408 +static int
42409 +common_object_delete_no_reserve(struct inode *inode /* object to remove */ )
42410 +{
42411 + int result;
42412 +
42413 + assert("nikita-1477", inode != NULL);
42414 +
42415 + if (!reiser4_inode_get_flag(inode, REISER4_NO_SD)) {
42416 + reiser4_key sd_key;
42417 +
42418 + DQUOT_FREE_INODE(inode);
42419 + DQUOT_DROP(inode);
42420 +
42421 + build_sd_key(inode, &sd_key);
42422 + result =
42423 + reiser4_cut_tree(reiser4_tree_by_inode(inode),
42424 + &sd_key, &sd_key, NULL, 0);
42425 + if (result == 0) {
42426 + reiser4_inode_set_flag(inode, REISER4_NO_SD);
42427 + result = oid_release(inode->i_sb, get_inode_oid(inode));
42428 + if (result == 0) {
42429 + oid_count_released();
42430 +
42431 + result = safe_link_del(reiser4_tree_by_inode(inode),
42432 + get_inode_oid(inode),
42433 + SAFE_UNLINK);
42434 + }
42435 + }
42436 + } else
42437 + result = 0;
42438 + return result;
42439 +}
42440 +
42441 +/* helper for safelink_common */
42442 +static int process_truncate(struct inode *inode, __u64 size)
42443 +{
42444 + int result;
42445 + struct iattr attr;
42446 + file_plugin *fplug;
42447 + reiser4_context *ctx;
42448 + struct dentry dentry;
42449 +
42450 + assert("vs-21", is_in_reiser4_context());
42451 + ctx = reiser4_init_context(inode->i_sb);
42452 + assert("vs-22", !IS_ERR(ctx));
42453 +
42454 + attr.ia_size = size;
42455 + attr.ia_valid = ATTR_SIZE | ATTR_CTIME;
42456 + fplug = inode_file_plugin(inode);
42457 +
42458 + mutex_lock(&inode->i_mutex);
42459 + assert("vs-1704", get_current_context()->trans->atom == NULL);
42460 + dentry.d_inode = inode;
42461 + result = inode->i_op->setattr(&dentry, &attr);
42462 + mutex_unlock(&inode->i_mutex);
42463 +
42464 + context_set_commit_async(ctx);
42465 + reiser4_exit_context(ctx);
42466 +
42467 + return result;
42468 +}
42469 +
42470 +/*
42471 + Local variables:
42472 + c-indentation-style: "K&R"
42473 + mode-name: "LC"
42474 + c-basic-offset: 8
42475 + tab-width: 8
42476 + fill-column: 80
42477 + scroll-step: 1
42478 + End:
42479 +*/
42480 diff --git a/fs/reiser4/plugin/hash.c b/fs/reiser4/plugin/hash.c
42481 new file mode 100644
42482 index 0000000..70f1e40
42483 --- /dev/null
42484 +++ b/fs/reiser4/plugin/hash.c
42485 @@ -0,0 +1,353 @@
42486 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
42487 + * reiser4/README */
42488 +
42489 +/* Hash functions */
42490 +
42491 +#include "../debug.h"
42492 +#include "plugin_header.h"
42493 +#include "plugin.h"
42494 +#include "../super.h"
42495 +#include "../inode.h"
42496 +
42497 +#include <linux/types.h>
42498 +
42499 +/* old rupasov (yura) hash */
42500 +static __u64 hash_rupasov(const unsigned char *name /* name to hash */ ,
42501 + int len /* @name's length */ )
42502 +{
42503 + int i;
42504 + int j;
42505 + int pow;
42506 + __u64 a;
42507 + __u64 c;
42508 +
42509 + assert("nikita-672", name != NULL);
42510 + assert("nikita-673", len >= 0);
42511 +
42512 + for (pow = 1, i = 1; i < len; ++i)
42513 + pow = pow * 10;
42514 +
42515 + if (len == 1)
42516 + a = name[0] - 48;
42517 + else
42518 + a = (name[0] - 48) * pow;
42519 +
42520 + for (i = 1; i < len; ++i) {
42521 + c = name[i] - 48;
42522 + for (pow = 1, j = i; j < len - 1; ++j)
42523 + pow = pow * 10;
42524 + a = a + c * pow;
42525 + }
42526 + for (; i < 40; ++i) {
42527 + c = '0' - 48;
42528 + for (pow = 1, j = i; j < len - 1; ++j)
42529 + pow = pow * 10;
42530 + a = a + c * pow;
42531 + }
42532 +
42533 + for (; i < 256; ++i) {
42534 + c = i;
42535 + for (pow = 1, j = i; j < len - 1; ++j)
42536 + pow = pow * 10;
42537 + a = a + c * pow;
42538 + }
42539 +
42540 + a = a << 7;
42541 + return a;
42542 +}
42543 +
42544 +/* r5 hash */
42545 +static __u64 hash_r5(const unsigned char *name /* name to hash */ ,
42546 + int len UNUSED_ARG /* @name's length */ )
42547 +{
42548 + __u64 a = 0;
42549 +
42550 + assert("nikita-674", name != NULL);
42551 + assert("nikita-675", len >= 0);
42552 +
42553 + while (*name) {
42554 + a += *name << 4;
42555 + a += *name >> 4;
42556 + a *= 11;
42557 + name++;
42558 + }
42559 + return a;
42560 +}
42561 +
42562 +/* Keyed 32-bit hash function using TEA in a Davies-Meyer function
42563 + H0 = Key
42564 + Hi = E Mi(Hi-1) + Hi-1
42565 +
42566 + (see Applied Cryptography, 2nd edition, p448).
42567 +
42568 + Jeremy Fitzhardinge <jeremy@zip.com.au> 1998
42569 +
42570 + Jeremy has agreed to the contents of reiserfs/README. -Hans
42571 +
42572 + This code was blindly upgraded to __u64 by s/__u32/__u64/g.
42573 +*/
42574 +static __u64 hash_tea(const unsigned char *name /* name to hash */ ,
42575 + int len /* @name's length */ )
42576 +{
42577 + __u64 k[] = { 0x9464a485u, 0x542e1a94u, 0x3e846bffu, 0xb75bcfc3u };
42578 +
42579 + __u64 h0 = k[0], h1 = k[1];
42580 + __u64 a, b, c, d;
42581 + __u64 pad;
42582 + int i;
42583 +
42584 + assert("nikita-676", name != NULL);
42585 + assert("nikita-677", len >= 0);
42586 +
42587 +#define DELTA 0x9E3779B9u
42588 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
42589 +#define PARTROUNDS 6 /* 6 gets complete mixing */
42590 +
42591 +/* a, b, c, d - data; h0, h1 - accumulated hash */
42592 +#define TEACORE(rounds) \
42593 + do { \
42594 + __u64 sum = 0; \
42595 + int n = rounds; \
42596 + __u64 b0, b1; \
42597 + \
42598 + b0 = h0; \
42599 + b1 = h1; \
42600 + \
42601 + do \
42602 + { \
42603 + sum += DELTA; \
42604 + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \
42605 + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \
42606 + } while(--n); \
42607 + \
42608 + h0 += b0; \
42609 + h1 += b1; \
42610 + } while(0)
42611 +
42612 + pad = (__u64) len | ((__u64) len << 8);
42613 + pad |= pad << 16;
42614 +
42615 + while (len >= 16) {
42616 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42617 + 16 | (__u64) name[3] << 24;
42618 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42619 + 16 | (__u64) name[7] << 24;
42620 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42621 + 16 | (__u64) name[11] << 24;
42622 + d = (__u64) name[12] | (__u64) name[13] << 8 | (__u64) name[14]
42623 + << 16 | (__u64) name[15] << 24;
42624 +
42625 + TEACORE(PARTROUNDS);
42626 +
42627 + len -= 16;
42628 + name += 16;
42629 + }
42630 +
42631 + if (len >= 12) {
42632 + //assert(len < 16);
42633 + if (len >= 16)
42634 + *(int *)0 = 0;
42635 +
42636 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42637 + 16 | (__u64) name[3] << 24;
42638 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42639 + 16 | (__u64) name[7] << 24;
42640 + c = (__u64) name[8] | (__u64) name[9] << 8 | (__u64) name[10] <<
42641 + 16 | (__u64) name[11] << 24;
42642 +
42643 + d = pad;
42644 + for (i = 12; i < len; i++) {
42645 + d <<= 8;
42646 + d |= name[i];
42647 + }
42648 + } else if (len >= 8) {
42649 + //assert(len < 12);
42650 + if (len >= 12)
42651 + *(int *)0 = 0;
42652 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42653 + 16 | (__u64) name[3] << 24;
42654 + b = (__u64) name[4] | (__u64) name[5] << 8 | (__u64) name[6] <<
42655 + 16 | (__u64) name[7] << 24;
42656 +
42657 + c = d = pad;
42658 + for (i = 8; i < len; i++) {
42659 + c <<= 8;
42660 + c |= name[i];
42661 + }
42662 + } else if (len >= 4) {
42663 + //assert(len < 8);
42664 + if (len >= 8)
42665 + *(int *)0 = 0;
42666 + a = (__u64) name[0] | (__u64) name[1] << 8 | (__u64) name[2] <<
42667 + 16 | (__u64) name[3] << 24;
42668 +
42669 + b = c = d = pad;
42670 + for (i = 4; i < len; i++) {
42671 + b <<= 8;
42672 + b |= name[i];
42673 + }
42674 + } else {
42675 + //assert(len < 4);
42676 + if (len >= 4)
42677 + *(int *)0 = 0;
42678 + a = b = c = d = pad;
42679 + for (i = 0; i < len; i++) {
42680 + a <<= 8;
42681 + a |= name[i];
42682 + }
42683 + }
42684 +
42685 + TEACORE(FULLROUNDS);
42686 +
42687 +/* return 0;*/
42688 + return h0 ^ h1;
42689 +
42690 +}
42691 +
42692 +/* classical 64 bit Fowler/Noll/Vo-1 (FNV-1) hash.
42693 +
42694 + See http://www.isthe.com/chongo/tech/comp/fnv/ for details.
42695 +
42696 + Excerpts:
42697 +
42698 + FNV hashes are designed to be fast while maintaining a low collision
42699 + rate.
42700 +
42701 + [This version also seems to preserve lexicographical order locally.]
42702 +
42703 + FNV hash algorithms and source code have been released into the public
42704 + domain.
42705 +
42706 +*/
42707 +static __u64 hash_fnv1(const unsigned char *name /* name to hash */ ,
42708 + int len UNUSED_ARG /* @name's length */ )
42709 +{
42710 + unsigned long long a = 0xcbf29ce484222325ull;
42711 + const unsigned long long fnv_64_prime = 0x100000001b3ull;
42712 +
42713 + assert("nikita-678", name != NULL);
42714 + assert("nikita-679", len >= 0);
42715 +
42716 + /* FNV-1 hash each octet in the buffer */
42717 + for (; *name; ++name) {
42718 + /* multiply by the 32 bit FNV magic prime mod 2^64 */
42719 + a *= fnv_64_prime;
42720 + /* xor the bottom with the current octet */
42721 + a ^= (unsigned long long)(*name);
42722 + }
42723 + /* return our new hash value */
42724 + return a;
42725 +}
42726 +
42727 +/* degenerate hash function used to simplify testing of non-unique key
42728 + handling */
42729 +static __u64 hash_deg(const unsigned char *name UNUSED_ARG /* name to hash */ ,
42730 + int len UNUSED_ARG /* @name's length */ )
42731 +{
42732 + return 0xc0c0c0c010101010ull;
42733 +}
42734 +
42735 +static int change_hash(struct inode *inode,
42736 + reiser4_plugin * plugin,
42737 + pset_member memb)
42738 +{
42739 + int result;
42740 +
42741 + assert("nikita-3503", inode != NULL);
42742 + assert("nikita-3504", plugin != NULL);
42743 +
42744 + assert("nikita-3505", is_reiser4_inode(inode));
42745 + assert("nikita-3507", plugin->h.type_id == REISER4_HASH_PLUGIN_TYPE);
42746 +
42747 + if (!plugin_of_group(inode_file_plugin(inode), REISER4_DIRECTORY_FILE))
42748 + return RETERR(-EINVAL);
42749 +
42750 + result = 0;
42751 + if (inode_hash_plugin(inode) == NULL ||
42752 + inode_hash_plugin(inode)->h.id != plugin->h.id) {
42753 + if (is_dir_empty(inode) == 0)
42754 + result = aset_set_unsafe(&reiser4_inode_data(inode)->pset,
42755 + PSET_HASH, plugin);
42756 + else
42757 + result = RETERR(-ENOTEMPTY);
42758 +
42759 + }
42760 + return result;
42761 +}
42762 +
42763 +static reiser4_plugin_ops hash_plugin_ops = {
42764 + .init = NULL,
42765 + .load = NULL,
42766 + .save_len = NULL,
42767 + .save = NULL,
42768 + .change = change_hash
42769 +};
42770 +
42771 +/* hash plugins */
42772 +hash_plugin hash_plugins[LAST_HASH_ID] = {
42773 + [RUPASOV_HASH_ID] = {
42774 + .h = {
42775 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42776 + .id = RUPASOV_HASH_ID,
42777 + .pops = &hash_plugin_ops,
42778 + .label = "rupasov",
42779 + .desc = "Original Yura's hash",
42780 + .linkage = {NULL, NULL}
42781 + },
42782 + .hash = hash_rupasov
42783 + },
42784 + [R5_HASH_ID] = {
42785 + .h = {
42786 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42787 + .id = R5_HASH_ID,
42788 + .pops = &hash_plugin_ops,
42789 + .label = "r5",
42790 + .desc = "r5 hash",
42791 + .linkage = {NULL, NULL}
42792 + },
42793 + .hash = hash_r5
42794 + },
42795 + [TEA_HASH_ID] = {
42796 + .h = {
42797 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42798 + .id = TEA_HASH_ID,
42799 + .pops = &hash_plugin_ops,
42800 + .label = "tea",
42801 + .desc = "tea hash",
42802 + .linkage = {NULL, NULL}
42803 + },
42804 + .hash = hash_tea
42805 + },
42806 + [FNV1_HASH_ID] = {
42807 + .h = {
42808 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42809 + .id = FNV1_HASH_ID,
42810 + .pops = &hash_plugin_ops,
42811 + .label = "fnv1",
42812 + .desc = "fnv1 hash",
42813 + .linkage = {NULL, NULL}
42814 + },
42815 + .hash = hash_fnv1
42816 + },
42817 + [DEGENERATE_HASH_ID] = {
42818 + .h = {
42819 + .type_id = REISER4_HASH_PLUGIN_TYPE,
42820 + .id = DEGENERATE_HASH_ID,
42821 + .pops = &hash_plugin_ops,
42822 + .label = "degenerate hash",
42823 + .desc = "Degenerate hash: only for testing",
42824 + .linkage = {NULL, NULL}
42825 + },
42826 + .hash = hash_deg
42827 + }
42828 +};
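+
+/*
+ * Editor's sketch, not part of the original patch: how a caller might
+ * pick a hash through the table above; R5_HASH_ID and the name "foo"
+ * are arbitrary choices for illustration.
+ */
+#if 0
+static __u64 hash_name_example(void)
+{
+	hash_plugin *hplug = &hash_plugins[R5_HASH_ID];
+
+	/* ->hash takes the name and its length, cf. hash_fnv1() above */
+	return hplug->hash((const unsigned char *)"foo", 3);
+}
+#endif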
42829 +
42830 +/* Make Linus happy.
42831 + Local variables:
42832 + c-indentation-style: "K&R"
42833 + mode-name: "LC"
42834 + c-basic-offset: 8
42835 + tab-width: 8
42836 + fill-column: 120
42837 + End:
42838 +*/
42839 diff --git a/fs/reiser4/plugin/inode_ops.c b/fs/reiser4/plugin/inode_ops.c
42840 new file mode 100644
42841 index 0000000..48430f7
42842 --- /dev/null
42843 +++ b/fs/reiser4/plugin/inode_ops.c
42844 @@ -0,0 +1,897 @@
42845 +/*
42846 + * Copyright 2005 by Hans Reiser, licensing governed by reiser4/README
42847 + */
42848 +
42849 +/*
42850 + * this file contains typical implementations for most of methods of struct
42851 + * inode_operations
42852 + */
42853 +
42854 +#include "../inode.h"
42855 +#include "../safe_link.h"
42856 +
42857 +#include <linux/quotaops.h>
42858 +#include <linux/namei.h>
42859 +
42860 +static int create_vfs_object(struct inode *parent, struct dentry *dentry,
42861 + reiser4_object_create_data *data);
42862 +
42863 +/**
42864 + * reiser4_create_common - create of inode operations
42865 + * @parent: inode of parent directory
42866 + * @dentry: dentry of new object to create
42867 + * @mode: the permissions to use
42868 + * @nameidata: unused
42869 + *
42870 + * This is common implementation of vfs's create method of struct
42871 + * inode_operations.
42872 + * Creates regular file using file plugin from parent directory plugin set.
42873 + */
42874 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
42875 + int mode, struct nameidata *nameidata)
42876 +{
42877 + reiser4_object_create_data data;
42878 + file_plugin *fplug;
42879 +
42880 + memset(&data, 0, sizeof data);
42881 + data.mode = S_IFREG | mode;
42882 + fplug = child_create_plugin(parent) ? : inode_create_plugin(parent);
42883 + if (!plugin_of_group(fplug, REISER4_REGULAR_FILE)) {
42884 + warning("vpf-1900", "'%s' is not a regular file plugin.",
42885 + fplug->h.label);
42886 + return RETERR(-EIO);
42887 + }
42888 + data.id = fplug->h.id;
42889 + return create_vfs_object(parent, dentry, &data);
42890 +}
42891 +
42892 +int reiser4_lookup_name(struct inode *dir, struct dentry *, reiser4_key *);
42893 +void check_light_weight(struct inode *inode, struct inode *parent);
42894 +
42895 +/**
42896 + * reiser4_lookup_common - lookup of inode operations
42897 + * @parent: inode of directory to lookup into
42898 + * @dentry: name to look for
42899 + * @nameidata: unused
42900 + *
42901 + * This is common implementation of vfs's lookup method of struct
42902 + * inode_operations.
42903 + */
42904 +struct dentry *reiser4_lookup_common(struct inode *parent,
42905 + struct dentry *dentry,
42906 + struct nameidata *nameidata)
42907 +{
42908 + reiser4_context *ctx;
42909 + int result;
42910 + struct dentry *new;
42911 + struct inode *inode;
42912 + reiser4_dir_entry_desc entry;
42913 +
42914 + ctx = reiser4_init_context(parent->i_sb);
42915 + if (IS_ERR(ctx))
42916 + return (struct dentry *)ctx;
42917 +
42918 + /* set up operations on dentry. */
42919 + dentry->d_op = &get_super_private(parent->i_sb)->ops.dentry;
42920 +
42921 + result = reiser4_lookup_name(parent, dentry, &entry.key);
42922 + if (result) {
42923 + context_set_commit_async(ctx);
42924 + reiser4_exit_context(ctx);
42925 + if (result == -ENOENT) {
42926 + /* object not found */
42927 + if (!IS_DEADDIR(parent))
42928 + d_add(dentry, NULL);
42929 + return NULL;
42930 + }
42931 + return ERR_PTR(result);
42932 + }
42933 +
42934 + inode = reiser4_iget(parent->i_sb, &entry.key, 0);
42935 + if (IS_ERR(inode)) {
42936 + context_set_commit_async(ctx);
42937 + reiser4_exit_context(ctx);
42938 + return ERR_PTR(PTR_ERR(inode));
42939 + }
42940 +
42941 + /* success */
42942 + check_light_weight(inode, parent);
42943 + new = d_splice_alias(inode, dentry);
42944 + reiser4_iget_complete(inode);
42945 +
42946 + /* prevent balance_dirty_pages() from being called: we don't want to
42947 + * do this under directory i_mutex. */
42948 + context_set_commit_async(ctx);
42949 + reiser4_exit_context(ctx);
42950 + return new;
42951 +}
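+
+/*
+ * Editor's note, not part of the original patch: the pair
+ * "context_set_commit_async(ctx); reiser4_exit_context(ctx);" recurs on
+ * every exit path in this file. A hypothetical helper would make the
+ * intent (commit asynchronously, never under directory i_mutex) explicit:
+ */
+#if 0
+static void exit_context_async(reiser4_context *ctx)
+{
+	context_set_commit_async(ctx);
+	reiser4_exit_context(ctx);
+}
+#endif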
42952 +
42953 +static reiser4_block_nr common_estimate_link(struct inode *parent,
42954 + struct inode *object);
42955 +int reiser4_update_dir(struct inode *);
42956 +
42957 +/**
42958 + * reiser4_link_common - link of inode operations
42959 + * @existing: dentry of object which is to get new name
42960 + * @parent: directory where new name is to be created
42961 + * @newname: new name
42962 + *
42963 + * This is common implementation of vfs's link method of struct
42964 + * inode_operations.
42965 + */
42966 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
42967 + struct dentry *newname)
42968 +{
42969 + reiser4_context *ctx;
42970 + int result;
42971 + struct inode *object;
42972 + dir_plugin *parent_dplug;
42973 + reiser4_dir_entry_desc entry;
42974 + reiser4_object_create_data data;
42975 + reiser4_block_nr reserve;
42976 +
42977 + ctx = reiser4_init_context(parent->i_sb);
42978 + if (IS_ERR(ctx))
42979 + return PTR_ERR(ctx);
42980 +
42981 + assert("nikita-1431", existing != NULL);
42982 + assert("nikita-1432", parent != NULL);
42983 + assert("nikita-1433", newname != NULL);
42984 +
42985 + object = existing->d_inode;
42986 + assert("nikita-1434", object != NULL);
42987 +
42988 + /* check for race with create_object() */
42989 + if (reiser4_inode_get_flag(object, REISER4_IMMUTABLE)) {
42990 + context_set_commit_async(ctx);
42991 + reiser4_exit_context(ctx);
42992 + return RETERR(-E_REPEAT);
42993 + }
42994 +
42995 + parent_dplug = inode_dir_plugin(parent);
42996 +
42997 + memset(&entry, 0, sizeof entry);
42998 + entry.obj = object;
42999 +
43000 + data.mode = object->i_mode;
43001 + data.id = inode_file_plugin(object)->h.id;
43002 +
43003 + reserve = common_estimate_link(parent, existing->d_inode);
43004 + if ((__s64) reserve < 0) {
43005 + context_set_commit_async(ctx);
43006 + reiser4_exit_context(ctx);
43007 + return reserve;
43008 + }
43009 +
43010 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
43011 + context_set_commit_async(ctx);
43012 + reiser4_exit_context(ctx);
43013 + return RETERR(-ENOSPC);
43014 + }
43015 +
43016 + /*
43017 + * Subtle race handling: sys_link() doesn't take i_mutex on @parent. It
43018 + * means that link(2) can race against unlink(2) or rename(2), and
43019 + * inode is dead (->i_nlink == 0) when reiser4_link() is entered.
43020 + *
43021 + * For such inode we have to undo special processing done in
43022 + * reiser4_unlink() viz. creation of safe-link.
43023 + */
43024 + if (unlikely(object->i_nlink == 0)) {
43025 + result = safe_link_del(reiser4_tree_by_inode(object),
43026 + get_inode_oid(object), SAFE_UNLINK);
43027 + if (result != 0) {
43028 + context_set_commit_async(ctx);
43029 + reiser4_exit_context(ctx);
43030 + return result;
43031 + }
43032 + }
43033 +
43034 + /* increment nlink of @existing and update its stat data */
43035 + result = reiser4_add_nlink(object, parent, 1);
43036 + if (result == 0) {
43037 + /* add entry to the parent */
43038 + result =
43039 + parent_dplug->add_entry(parent, newname, &data, &entry);
43040 + if (result != 0) {
43041 + /* failed to add entry to the parent, decrement nlink
43042 + of @existing */
43043 + reiser4_del_nlink(object, parent, 1);
43044 + /*
43045 + * now, if that failed, we have a file with too big
43046 + * nlink---space leak, much better than directory
43047 + * entry pointing to nowhere
43048 + */
43049 + }
43050 + }
43051 + if (result == 0) {
43052 + atomic_inc(&object->i_count);
43053 + /*
43054 + * Upon successful completion, link() shall mark for update
43055 + * the st_ctime field of the file. Also, the st_ctime and
43056 + * st_mtime fields of the directory that contains the new
43057 + * entry shall be marked for update. --SUS
43058 + */
43059 + result = reiser4_update_dir(parent);
43060 + }
43061 + if (result == 0)
43062 + d_instantiate(newname, existing->d_inode);
43063 +
43064 + context_set_commit_async(ctx);
43065 + reiser4_exit_context(ctx);
43066 + return result;
43067 +}
43068 +
43069 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim);
43070 +
43071 +/**
43072 + * reiser4_unlink_common - unlink of inode operations
43073 + * @parent: inode of directory to remove name from
43074 + * @victim: name to be removed
43075 + *
43076 + * This is common implementation of vfs's unlink method of struct
43077 + * inode_operations.
43078 + */
43079 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim)
43080 +{
43081 + reiser4_context *ctx;
43082 + int result;
43083 + struct inode *object;
43084 + file_plugin *fplug;
43085 +
43086 + ctx = reiser4_init_context(parent->i_sb);
43087 + if (IS_ERR(ctx))
43088 + return PTR_ERR(ctx);
43089 +
43090 + object = victim->d_inode;
43091 + fplug = inode_file_plugin(object);
43092 + assert("nikita-2882", fplug->detach != NULL);
43093 +
43094 + result = unlink_check_and_grab(parent, victim);
43095 + if (result != 0) {
43096 + context_set_commit_async(ctx);
43097 + reiser4_exit_context(ctx);
43098 + return result;
43099 + }
43100 +
43101 + result = fplug->detach(object, parent);
43102 + if (result == 0) {
43103 + dir_plugin *parent_dplug;
43104 + reiser4_dir_entry_desc entry;
43105 +
43106 + parent_dplug = inode_dir_plugin(parent);
43107 + memset(&entry, 0, sizeof entry);
43108 +
43109 + /* first, delete directory entry */
43110 + result = parent_dplug->rem_entry(parent, victim, &entry);
43111 + if (result == 0) {
43112 + /*
43113 + * if name was removed successfully, we _have_ to
43114 + * return 0 from this function, because upper level
43115 +			 * caller (vfs_{rmdir,unlink}) expects this.
43116 + *
43117 + * now that directory entry is removed, update
43118 + * stat-data
43119 + */
43120 + reiser4_del_nlink(object, parent, 1);
43121 + /*
43122 + * Upon successful completion, unlink() shall mark for
43123 + * update the st_ctime and st_mtime fields of the
43124 + * parent directory. Also, if the file's link count is
43125 + * not 0, the st_ctime field of the file shall be
43126 + * marked for update. --SUS
43127 + */
43128 + reiser4_update_dir(parent);
43129 + /* add safe-link for this file */
43130 + if (object->i_nlink == 0)
43131 + safe_link_add(object, SAFE_UNLINK);
43132 + }
43133 + }
43134 +
43135 + if (unlikely(result != 0)) {
43136 + if (result != -ENOMEM)
43137 + warning("nikita-3398", "Cannot unlink %llu (%i)",
43138 + (unsigned long long)get_inode_oid(object),
43139 + result);
43140 + /* if operation failed commit pending inode modifications to
43141 + * the stat-data */
43142 + reiser4_update_sd(object);
43143 + reiser4_update_sd(parent);
43144 + }
43145 +
43146 + reiser4_release_reserved(object->i_sb);
43147 +
43148 +	/* @object's i_ctime was updated by the ->rem_link() method. */
43149 +
43150 +	/* @victim may already be removed from the disk by this time. The inode
43151 +	   is then marked so that iput() won't try to remove the stat data. But
43152 + inode itself is still there.
43153 + */
43154 +
43155 + /*
43156 + * we cannot release directory semaphore here, because name has
43157 + * already been deleted, but dentry (@victim) still exists. Prevent
43158 + * balance_dirty_pages() from being called on exiting this context: we
43159 + * don't want to do this under directory i_mutex.
43160 + */
43161 + context_set_commit_async(ctx);
43162 + reiser4_exit_context(ctx);
43163 + return result;
43164 +}
43165 +
43166 +/**
43167 + * reiser4_symlink_common - symlink of inode operations
43168 + * @parent: inode of parent directory
43169 + * @dentry: dentry of object to be created
43170 + * @linkname: string symlink is to contain
43171 + *
43172 + * This is common implementation of vfs's symlink method of struct
43173 + * inode_operations.
43174 + * Creates object using file plugin SYMLINK_FILE_PLUGIN_ID.
43175 + */
43176 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
43177 + const char *linkname)
43178 +{
43179 + reiser4_object_create_data data;
43180 +
43181 + memset(&data, 0, sizeof data);
43182 + data.name = linkname;
43183 + data.id = SYMLINK_FILE_PLUGIN_ID;
43184 + data.mode = S_IFLNK | S_IRWXUGO;
43185 + return create_vfs_object(parent, dentry, &data);
43186 +}
43187 +
43188 +/**
43189 + * reiser4_mkdir_common - mkdir of inode operations
43190 + * @parent: inode of parent directory
43191 + * @dentry: dentry of object to be created
43192 + * @mode: the permissions to use
43193 + *
43194 + * This is common implementation of vfs's mkdir method of struct
43195 + * inode_operations.
43196 + * Creates object using file plugin DIRECTORY_FILE_PLUGIN_ID.
43197 + */
43198 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode)
43199 +{
43200 + reiser4_object_create_data data;
43201 +
43202 + memset(&data, 0, sizeof data);
43203 + data.mode = S_IFDIR | mode;
43204 + data.id = DIRECTORY_FILE_PLUGIN_ID;
43205 + return create_vfs_object(parent, dentry, &data);
43206 +}
43207 +
43208 +/**
43209 + * reiser4_mknod_common - mknod of inode operations
43210 + * @parent: inode of parent directory
43211 + * @dentry: dentry of object to be created
43212 + * @mode: the permissions to use and file type
43213 + * @rdev: minor and major of new device file
43214 + *
43215 + * This is common implementation of vfs's mknod method of struct
43216 + * inode_operations.
43217 + * Creates object using file plugin SPECIAL_FILE_PLUGIN_ID.
43218 + */
43219 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
43220 + int mode, dev_t rdev)
43221 +{
43222 + reiser4_object_create_data data;
43223 +
43224 + memset(&data, 0, sizeof data);
43225 + data.mode = mode;
43226 + data.rdev = rdev;
43227 + data.id = SPECIAL_FILE_PLUGIN_ID;
43228 + return create_vfs_object(parent, dentry, &data);
43229 +}
43230 +
43231 +/*
43232 + * implementation of vfs's rename method of struct inode_operations for typical
43233 + * directory is in inode_ops_rename.c
43234 + */
43235 +
43236 +/**
43237 + * reiser4_follow_link_common - follow_link of inode operations
43238 + * @dentry: dentry of symlink
43239 + * @nd: nameidata to store the link target in
43240 + *
43241 + * This is common implementation of vfs's follow_link method of struct
43242 + * inode_operations.
43243 + * Assumes that inode's i_private points to the content of symbolic link.
43244 + */
43245 +void *reiser4_follow_link_common(struct dentry *dentry, struct nameidata *nd)
43246 +{
43247 + assert("vs-851", S_ISLNK(dentry->d_inode->i_mode));
43248 +
43249 + if (!dentry->d_inode->i_private
43250 + || !reiser4_inode_get_flag(dentry->d_inode,
43251 + REISER4_GENERIC_PTR_USED))
43252 + return ERR_PTR(RETERR(-EINVAL));
43253 + nd_set_link(nd, dentry->d_inode->i_private);
43254 + return NULL;
43255 +}
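+
+/*
+ * Editor's note: the symlink body lives in ->i_private (flagged by
+ * REISER4_GENERIC_PTR_USED), so ->follow_link() only has to publish the
+ * string via nd_set_link(); returning NULL means there is no per-call
+ * cookie for ->put_link() to clean up.
+ */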
43256 +
43257 +/**
43258 + * reiser4_permission_common - permission of inode operations
43259 + * @inode: inode to check permissions for
43260 + * @mask: mode bits to check permissions for
43261 + * @nameidata: unused
43262 + *
43263 + * Uses generic function to check for rwx permissions.
43264 + */
43265 +int reiser4_permission_common(struct inode *inode, int mask,
43266 + struct nameidata *nameidata)
43267 +{
43268 + return generic_permission(inode, mask, NULL);
43269 +}
43270 +
43271 +static int setattr_reserve(reiser4_tree *);
43272 +
43273 +/* this is common implementation of vfs's setattr method of struct
43274 + inode_operations
43275 +*/
43276 +int reiser4_setattr_common(struct dentry *dentry, struct iattr *attr)
43277 +{
43278 + reiser4_context *ctx;
43279 + struct inode *inode;
43280 + int result;
43281 +
43282 + inode = dentry->d_inode;
43283 + result = inode_change_ok(inode, attr);
43284 + if (result)
43285 + return result;
43286 +
43287 + ctx = reiser4_init_context(inode->i_sb);
43288 + if (IS_ERR(ctx))
43289 + return PTR_ERR(ctx);
43290 +
43291 + assert("nikita-3119", !(attr->ia_valid & ATTR_SIZE));
43292 +
43293 + /*
43294 + * grab disk space and call standard inode_setattr().
43295 + */
43296 + result = setattr_reserve(reiser4_tree_by_inode(inode));
43297 + if (!result) {
43298 + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid)
43299 + || (attr->ia_valid & ATTR_GID
43300 + && attr->ia_gid != inode->i_gid)) {
43301 + result = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
43302 + if (result) {
43303 + context_set_commit_async(ctx);
43304 + reiser4_exit_context(ctx);
43305 + return result;
43306 + }
43307 + }
43308 + result = inode_setattr(inode, attr);
43309 + if (!result)
43310 + reiser4_update_sd(inode);
43311 + }
43312 +
43313 + context_set_commit_async(ctx);
43314 + reiser4_exit_context(ctx);
43315 + return result;
43316 +}
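+
+/*
+ * Editor's note: the assert above means size changes never reach this
+ * common helper; presumably ATTR_SIZE is routed through a file-plugin
+ * specific ->setattr() elsewhere in this patch.
+ */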
43317 +
43318 +/* this is common implementation of vfs's getattr method of struct
43319 + inode_operations
43320 +*/
43321 +int reiser4_getattr_common(struct vfsmount *mnt UNUSED_ARG,
43322 + struct dentry *dentry, struct kstat *stat)
43323 +{
43324 + struct inode *obj;
43325 +
43326 + assert("nikita-2298", dentry != NULL);
43327 + assert("nikita-2299", stat != NULL);
43328 + assert("nikita-2300", dentry->d_inode != NULL);
43329 +
43330 + obj = dentry->d_inode;
43331 +
43332 + stat->dev = obj->i_sb->s_dev;
43333 + stat->ino = oid_to_uino(get_inode_oid(obj));
43334 + stat->mode = obj->i_mode;
43335 + /* don't confuse userland with huge nlink. This is not entirely
43336 +	 * correct, because nlink_t is not necessarily a 16 bit signed type. */
43337 + stat->nlink = min(obj->i_nlink, (typeof(obj->i_nlink)) 0x7fff);
43338 + stat->uid = obj->i_uid;
43339 + stat->gid = obj->i_gid;
43340 + stat->rdev = obj->i_rdev;
43341 + stat->atime = obj->i_atime;
43342 + stat->mtime = obj->i_mtime;
43343 + stat->ctime = obj->i_ctime;
43344 + stat->size = obj->i_size;
43345 + stat->blocks =
43346 + (inode_get_bytes(obj) + VFS_BLKSIZE - 1) >> VFS_BLKSIZE_BITS;
43347 + /* "preferred" blocksize for efficient file system I/O */
43348 + stat->blksize = get_super_private(obj->i_sb)->optimal_io_size;
43349 +
43350 + return 0;
43351 +}
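+
+/*
+ * Editor's sketch, not part of the original patch: the st_blocks math
+ * above is a round-up division to VFS block units. Assuming
+ * VFS_BLKSIZE == 512 and VFS_BLKSIZE_BITS == 9 (the conventional
+ * 512-byte st_blocks unit), a 1000-byte file reports two blocks:
+ */
+#if 0
+static void st_blocks_example(void)
+{
+	unsigned long long bytes = 1000;
+	unsigned long long blocks = (bytes + 512 - 1) >> 9;	/* == 2 */
+	(void)blocks;
+}
+#endif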
43352 +
43353 +/* Estimate the maximum amount of nodes which might be allocated or changed on
43354 + typical new object creation. Typical creation consists of calling create
43355 +   method of file plugin, adding directory entry to parent and updating parent
43356 + directory's stat data.
43357 +*/
43358 +static reiser4_block_nr estimate_create_vfs_object(struct inode *parent, /* parent object */
43359 + struct inode *object
43360 + /* object */ )
43361 +{
43362 + assert("vpf-309", parent != NULL);
43363 + assert("vpf-307", object != NULL);
43364 +
43365 + return
43366 + /* object creation estimation */
43367 + inode_file_plugin(object)->estimate.create(object) +
43368 + /* stat data of parent directory estimation */
43369 + inode_file_plugin(parent)->estimate.update(parent) +
43370 + /* adding entry estimation */
43371 + inode_dir_plugin(parent)->estimate.add_entry(parent) +
43372 + /* to undo in the case of failure */
43373 + inode_dir_plugin(parent)->estimate.rem_entry(parent);
43374 +}
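+
+/*
+ * Editor's sketch with made-up numbers: if create costs 3 blocks, the
+ * parent stat-data update 1, add_entry 2 and the rem_entry undo 2, then
+ * do_create_vfs_child() below grabs 3 + 1 + 2 + 2 = 8 blocks before
+ * anything is dirtied.
+ */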
43375 +
43376 +/* Create child in directory.
43377 +
43378 + . get object's plugin
43379 + . get fresh inode
43380 + . initialize inode
43381 + . add object's stat-data
43382 + . initialize object's directory
43383 + . add entry to the parent
43384 + . instantiate dentry
43385 +
43386 +*/
43387 +static int do_create_vfs_child(reiser4_object_create_data * data, /* parameters of new
43388 + object */
43389 + struct inode **retobj)
43390 +{
43391 + int result;
43392 +
43393 +	struct dentry *dentry;	/* new name */
43394 +	struct inode *parent;	/* parent object */
43395 +
43396 + dir_plugin *par_dir; /* directory plugin on the parent */
43397 + dir_plugin *obj_dir; /* directory plugin on the new object */
43398 + file_plugin *obj_plug; /* object plugin on the new object */
43399 + struct inode *object; /* new object */
43400 + reiser4_block_nr reserve;
43401 +
43402 + reiser4_dir_entry_desc entry; /* new directory entry */
43403 +
43404 + assert("nikita-1420", data != NULL);
43405 + parent = data->parent;
43406 + dentry = data->dentry;
43407 +
43408 + assert("nikita-1418", parent != NULL);
43409 + assert("nikita-1419", dentry != NULL);
43410 +
43411 +	/* check that the name is acceptable for the parent */
43412 + par_dir = inode_dir_plugin(parent);
43413 + if (par_dir->is_name_acceptable &&
43414 + !par_dir->is_name_acceptable(parent,
43415 + dentry->d_name.name,
43416 + (int)dentry->d_name.len))
43417 + return RETERR(-ENAMETOOLONG);
43418 +
43419 + result = 0;
43420 + obj_plug = file_plugin_by_id((int)data->id);
43421 + if (obj_plug == NULL) {
43422 + warning("nikita-430", "Cannot find plugin %i", data->id);
43423 + return RETERR(-ENOENT);
43424 + }
43425 + object = new_inode(parent->i_sb);
43426 + if (object == NULL)
43427 + return RETERR(-ENOMEM);
43428 + /* we'll update i_nlink below */
43429 + object->i_nlink = 0;
43430 + /* new_inode() initializes i_ino to "arbitrary" value. Reset it to 0,
43431 + * to simplify error handling: if some error occurs before i_ino is
43432 + * initialized with oid, i_ino should already be set to some
43433 + * distinguished value. */
43434 + object->i_ino = 0;
43435 +
43436 + /* So that on error iput will be called. */
43437 + *retobj = object;
43438 +
43439 + if (DQUOT_ALLOC_INODE(object)) {
43440 + DQUOT_DROP(object);
43441 + object->i_flags |= S_NOQUOTA;
43442 + return RETERR(-EDQUOT);
43443 + }
43444 +
43445 + memset(&entry, 0, sizeof entry);
43446 + entry.obj = object;
43447 +
43448 + set_plugin(&reiser4_inode_data(object)->pset, PSET_FILE,
43449 + file_plugin_to_plugin(obj_plug));
43450 + result = obj_plug->set_plug_in_inode(object, parent, data);
43451 + if (result) {
43452 + warning("nikita-431", "Cannot install plugin %i on %llx",
43453 + data->id, (unsigned long long)get_inode_oid(object));
43454 + DQUOT_FREE_INODE(object);
43455 + object->i_flags |= S_NOQUOTA;
43456 + return result;
43457 + }
43458 +
43459 + /* reget plugin after installation */
43460 + obj_plug = inode_file_plugin(object);
43461 +
43462 + if (obj_plug->create_object == NULL) {
43463 + DQUOT_FREE_INODE(object);
43464 + object->i_flags |= S_NOQUOTA;
43465 + return RETERR(-EPERM);
43466 + }
43467 +
43468 + /* if any of hash, tail, sd or permission plugins for newly created
43469 +	   object are not set yet, set them here, inheriting them from the parent
43470 +	   directory
43471 + */
43472 + assert("nikita-2070", obj_plug->adjust_to_parent != NULL);
43473 + result = obj_plug->adjust_to_parent(object,
43474 + parent,
43475 + object->i_sb->s_root->d_inode);
43476 + if (result == 0)
43477 + result = finish_pset(object);
43478 + if (result != 0) {
43479 + warning("nikita-432", "Cannot inherit from %llx to %llx",
43480 + (unsigned long long)get_inode_oid(parent),
43481 + (unsigned long long)get_inode_oid(object));
43482 + DQUOT_FREE_INODE(object);
43483 + object->i_flags |= S_NOQUOTA;
43484 + return result;
43485 + }
43486 +
43487 + /* setup inode and file-operations for this inode */
43488 + setup_inode_ops(object, data);
43489 +
43490 + /* call file plugin's method to initialize plugin specific part of
43491 + * inode */
43492 + if (obj_plug->init_inode_data)
43493 + obj_plug->init_inode_data(object, data, 1 /*create */ );
43494 +
43495 + /* obtain directory plugin (if any) for new object. */
43496 + obj_dir = inode_dir_plugin(object);
43497 + if (obj_dir != NULL && obj_dir->init == NULL) {
43498 + DQUOT_FREE_INODE(object);
43499 + object->i_flags |= S_NOQUOTA;
43500 + return RETERR(-EPERM);
43501 + }
43502 +
43503 + reiser4_inode_data(object)->locality_id = get_inode_oid(parent);
43504 +
43505 + reserve = estimate_create_vfs_object(parent, object);
43506 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT)) {
43507 + DQUOT_FREE_INODE(object);
43508 + object->i_flags |= S_NOQUOTA;
43509 + return RETERR(-ENOSPC);
43510 + }
43511 +
43512 + /* mark inode `immutable'. We disable changes to the file being
43513 + created until valid directory entry for it is inserted. Otherwise,
43514 +	   if the file were expanded and insertion of the directory entry failed,
43515 +	   we would have to remove the file, but we only allotted enough space in
43516 +	   the transaction to remove an _empty_ file. 3.x code used to remove stat
43517 +	   data in a different transaction, thus possibly leaking disk space on
43518 +	   crash. This all only matters if it's possible to access a file
43519 +	   without a name, for example, by inode number
43520 + */
43521 + reiser4_inode_set_flag(object, REISER4_IMMUTABLE);
43522 +
43523 + /* create empty object, this includes allocation of new objectid. For
43524 + directories this implies creation of dot and dotdot */
43525 + assert("nikita-2265", reiser4_inode_get_flag(object, REISER4_NO_SD));
43526 +
43527 + /* mark inode as `loaded'. From this point onward
43528 + reiser4_delete_inode() will try to remove its stat-data. */
43529 + reiser4_inode_set_flag(object, REISER4_LOADED);
43530 +
43531 + result = obj_plug->create_object(object, parent, data);
43532 + if (result != 0) {
43533 + reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
43534 + if (result != -ENAMETOOLONG && result != -ENOMEM)
43535 + warning("nikita-2219",
43536 + "Failed to create sd for %llu",
43537 + (unsigned long long)get_inode_oid(object));
43538 + DQUOT_FREE_INODE(object);
43539 + object->i_flags |= S_NOQUOTA;
43540 + return result;
43541 + }
43542 +
43543 + if (obj_dir != NULL)
43544 + result = obj_dir->init(object, parent, data);
43545 + if (result == 0) {
43546 + assert("nikita-434", !reiser4_inode_get_flag(object,
43547 + REISER4_NO_SD));
43548 + /* insert inode into VFS hash table */
43549 + insert_inode_hash(object);
43550 + /* create entry */
43551 + result = par_dir->add_entry(parent, dentry, data, &entry);
43552 + if (result == 0) {
43553 + result = reiser4_add_nlink(object, parent, 0);
43554 + /* If O_CREAT is set and the file did not previously
43555 + exist, upon successful completion, open() shall
43556 + mark for update the st_atime, st_ctime, and
43557 + st_mtime fields of the file and the st_ctime and
43558 + st_mtime fields of the parent directory. --SUS
43559 + */
43560 + /* @object times are already updated by
43561 + reiser4_add_nlink() */
43562 + if (result == 0)
43563 + reiser4_update_dir(parent);
43564 + if (result != 0)
43565 + /* cleanup failure to add nlink */
43566 + par_dir->rem_entry(parent, dentry, &entry);
43567 + }
43568 + if (result != 0)
43569 + /* cleanup failure to add entry */
43570 + obj_plug->detach(object, parent);
43571 + } else if (result != -ENOMEM)
43572 + warning("nikita-2219", "Failed to initialize dir for %llu: %i",
43573 + (unsigned long long)get_inode_oid(object), result);
43574 +
43575 + /*
43576 + * update stat-data, committing all pending modifications to the inode
43577 + * fields.
43578 + */
43579 + reiser4_update_sd(object);
43580 + if (result != 0) {
43581 + DQUOT_FREE_INODE(object);
43582 + object->i_flags |= S_NOQUOTA;
43583 + /* if everything was ok (result == 0), parent stat-data is
43584 +		 * already updated above (reiser4_update_dir()) */
43585 + reiser4_update_sd(parent);
43586 + /* failure to create entry, remove object */
43587 + obj_plug->delete_object(object);
43588 + }
43589 +
43590 + /* file has name now, clear immutable flag */
43591 + reiser4_inode_clr_flag(object, REISER4_IMMUTABLE);
43592 +
43593 + /* on error, iput() will call ->delete_inode(). We should keep track
43594 + of the existence of stat-data for this inode and avoid attempt to
43595 + remove it in reiser4_delete_inode(). This is accomplished through
43596 + REISER4_NO_SD bit in inode.u.reiser4_i.plugin.flags
43597 + */
43598 + return result;
43599 +}
43600 +
43601 +/* this is helper for common implementations of reiser4_mkdir, reiser4_create,
43602 + reiser4_mknod and reiser4_symlink
43603 +*/
43604 +static int
43605 +create_vfs_object(struct inode *parent,
43606 + struct dentry *dentry, reiser4_object_create_data * data)
43607 +{
43608 + reiser4_context *ctx;
43609 + int result;
43610 + struct inode *child;
43611 +
43612 + ctx = reiser4_init_context(parent->i_sb);
43613 + if (IS_ERR(ctx))
43614 + return PTR_ERR(ctx);
43615 + context_set_commit_async(ctx);
43616 +
43617 + data->parent = parent;
43618 + data->dentry = dentry;
43619 + child = NULL;
43620 + result = do_create_vfs_child(data, &child);
43621 + if (unlikely(result != 0)) {
43622 + if (child != NULL) {
43623 + reiser4_make_bad_inode(child);
43624 + iput(child);
43625 + }
43626 + } else
43627 + d_instantiate(dentry, child);
43628 +
43629 + reiser4_exit_context(ctx);
43630 + return result;
43631 +}
43632 +
43633 +/* helper for link_common. Estimate disk space necessary to add a link
43634 + from @parent to @object
43635 +*/
43636 +static reiser4_block_nr common_estimate_link(struct inode *parent, /* parent directory */
43637 + struct inode *object
43638 +					     /* object to which new link is being created */
43639 + )
43640 +{
43641 + reiser4_block_nr res = 0;
43642 + file_plugin *fplug;
43643 + dir_plugin *dplug;
43644 +
43645 + assert("vpf-317", object != NULL);
43646 + assert("vpf-318", parent != NULL);
43647 +
43648 + fplug = inode_file_plugin(object);
43649 + dplug = inode_dir_plugin(parent);
43650 + /* VS-FIXME-HANS: why do we do fplug->estimate.update(object) twice instead of multiplying by 2? */
43651 + /* reiser4_add_nlink(object) */
43652 + res += fplug->estimate.update(object);
43653 + /* add_entry(parent) */
43654 + res += dplug->estimate.add_entry(parent);
43655 + /* reiser4_del_nlink(object) */
43656 + res += fplug->estimate.update(object);
43657 + /* update_dir(parent) */
43658 + res += inode_file_plugin(parent)->estimate.update(parent);
43659 + /* safe-link */
43660 + res += estimate_one_item_removal(reiser4_tree_by_inode(object));
43661 +
43662 + return res;
43663 +}
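+
+/*
+ * Editor's note on the FIXME above: the two update(object) terms cover
+ * reiser4_add_nlink() and the reiser4_del_nlink() undo path separately;
+ * spelling them out term by term mirrors the call sequence in
+ * reiser4_link_common() rather than multiplying by 2.
+ */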
43664 +
43665 +/* Estimate disk space necessary to remove a link between @parent and
43666 + @object.
43667 +*/
43668 +static reiser4_block_nr estimate_unlink(struct inode *parent, /* parent directory */
43669 + struct inode *object
43670 +					/* object whose link is being removed */
43671 + )
43672 +{
43673 + reiser4_block_nr res = 0;
43674 + file_plugin *fplug;
43675 + dir_plugin *dplug;
43676 +
43677 + assert("vpf-317", object != NULL);
43678 + assert("vpf-318", parent != NULL);
43679 +
43680 + fplug = inode_file_plugin(object);
43681 + dplug = inode_dir_plugin(parent);
43682 +
43683 + /* rem_entry(parent) */
43684 + res += dplug->estimate.rem_entry(parent);
43685 + /* reiser4_del_nlink(object) */
43686 + res += fplug->estimate.update(object);
43687 + /* update_dir(parent) */
43688 + res += inode_file_plugin(parent)->estimate.update(parent);
43689 + /* fplug->unlink */
43690 + res += fplug->estimate.unlink(object, parent);
43691 + /* safe-link */
43692 + res += estimate_one_insert_item(reiser4_tree_by_inode(object));
43693 +
43694 + return res;
43695 +}
43696 +
43697 +/* helper for reiser4_unlink_common. Estimate and grab space for unlink. */
43698 +static int unlink_check_and_grab(struct inode *parent, struct dentry *victim)
43699 +{
43700 + file_plugin *fplug;
43701 + struct inode *child;
43702 + int result;
43703 +
43704 + result = 0;
43705 + child = victim->d_inode;
43706 + fplug = inode_file_plugin(child);
43707 +
43708 + /* check for race with create_object() */
43709 + if (reiser4_inode_get_flag(child, REISER4_IMMUTABLE))
43710 + return RETERR(-E_REPEAT);
43711 + /* object being deleted should have stat data */
43712 + assert("vs-949", !reiser4_inode_get_flag(child, REISER4_NO_SD));
43713 +
43714 + /* ask object plugin */
43715 + if (fplug->can_rem_link != NULL && !fplug->can_rem_link(child))
43716 + return RETERR(-ENOTEMPTY);
43717 +
43718 + result = (int)estimate_unlink(parent, child);
43719 + if (result < 0)
43720 + return result;
43721 +
43722 + return reiser4_grab_reserved(child->i_sb, result, BA_CAN_COMMIT);
43723 +}
43724 +
43725 +/* helper for reiser4_setattr_common */
43726 +static int setattr_reserve(reiser4_tree * tree)
43727 +{
43728 + assert("vs-1096", is_grab_enabled(get_current_context()));
43729 + return reiser4_grab_space(estimate_one_insert_into_item(tree),
43730 + BA_CAN_COMMIT);
43731 +}
43732 +
43733 +/* helper function. Standards require that for many file-system operations
43734 +   on success the ctime and mtime of the parent directory are to be updated. */
43735 +int reiser4_update_dir(struct inode *dir)
43736 +{
43737 + assert("nikita-2525", dir != NULL);
43738 +
43739 + dir->i_ctime = dir->i_mtime = CURRENT_TIME;
43740 + return reiser4_update_sd(dir);
43741 +}
43742 diff --git a/fs/reiser4/plugin/inode_ops_rename.c b/fs/reiser4/plugin/inode_ops_rename.c
43743 new file mode 100644
43744 index 0000000..a64e777
43745 --- /dev/null
43746 +++ b/fs/reiser4/plugin/inode_ops_rename.c
43747 @@ -0,0 +1,914 @@
43748 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
43749 + * reiser4/README */
43750 +
43751 +#include "../inode.h"
43752 +#include "../safe_link.h"
43753 +
43754 +static const char *possible_leak = "Possible disk space leak.";
43755 +
43756 +/* re-bind existing name at @from_coord in @from_dir to point to @to_inode.
43757 +
43758 + Helper function called from hashed_rename() */
43759 +static int replace_name(struct inode *to_inode,	/* inode that @from_coord is
43760 + * to be re-targeted at */
43761 + struct inode *from_dir, /* directory where @from_coord
43762 + * lives */
43763 + struct inode *from_inode, /* inode @from_coord
43764 +							 * originally points to */
43765 + coord_t * from_coord, /* where directory entry is in
43766 + * the tree */
43767 + lock_handle * from_lh /* lock handle on @from_coord */ )
43768 +{
43769 + item_plugin *from_item;
43770 + int result;
43771 + znode *node;
43772 +
43773 + coord_clear_iplug(from_coord);
43774 + node = from_coord->node;
43775 + result = zload(node);
43776 + if (result != 0)
43777 + return result;
43778 + from_item = item_plugin_by_coord(from_coord);
43779 + if (plugin_of_group(item_plugin_by_coord(from_coord),
43780 + DIR_ENTRY_ITEM_TYPE))
43781 + {
43782 + reiser4_key to_key;
43783 +
43784 + build_sd_key(to_inode, &to_key);
43785 +
43786 + /* everything is found and prepared to change directory entry
43787 + at @from_coord to point to @to_inode.
43788 +
43789 + @to_inode is just about to get new name, so bump its link
43790 + counter.
43791 +
43792 + */
43793 + result = reiser4_add_nlink(to_inode, from_dir, 0);
43794 + if (result != 0) {
43795 + /* Don't issue warning: this may be plain -EMLINK */
43796 + zrelse(node);
43797 + return result;
43798 + }
43799 +
43800 + result =
43801 + from_item->s.dir.update_key(from_coord, &to_key, from_lh);
43802 + if (result != 0) {
43803 + reiser4_del_nlink(to_inode, from_dir, 0);
43804 + zrelse(node);
43805 + return result;
43806 + }
43807 +
43808 + /* @from_inode just lost its name, he-he.
43809 +
43810 + If @from_inode was directory, it contained dotdot pointing
43811 + to @from_dir. @from_dir i_nlink will be decreased when
43812 + iput() will be called on @from_inode.
43813 +
43814 + If file-system is not ADG (hard-links are
43815 + supported on directories), iput(from_inode) will not remove
43816 +	   @from_inode, and thus the above is incorrect, but hard-links on
43817 + directories are problematic in many other respects.
43818 + */
43819 + result = reiser4_del_nlink(from_inode, from_dir, 0);
43820 + if (result != 0) {
43821 + warning("nikita-2330",
43822 + "Cannot remove link from source: %i. %s",
43823 + result, possible_leak);
43824 + }
43825 + /* Has to return success, because entry is already
43826 + * modified. */
43827 + result = 0;
43828 +
43829 +		/* NOTE-NIKITA consider calling plugin method instead of
43830 + accessing inode fields directly. */
43831 + from_dir->i_mtime = CURRENT_TIME;
43832 + } else {
43833 + warning("nikita-2326", "Unexpected item type");
43834 + result = RETERR(-EIO);
43835 + }
43836 + zrelse(node);
43837 + return result;
43838 +}
43839 +
43840 +/* add new entry pointing to @inode into @dir at @coord, locked by @lh
43841 +
43842 + Helper function used by hashed_rename(). */
43843 +static int add_name(struct inode *inode,	/* inode the new entry at @coord
43844 +						 * is to point to */
43845 + struct inode *dir, /* directory where @coord lives */
43846 + struct dentry *name, /* new name */
43847 + coord_t * coord, /* where directory entry is in the tree */
43848 + lock_handle * lh, /* lock handle on @coord */
43849 + int is_dir /* true, if @inode is directory */ )
43850 +{
43851 + int result;
43852 + reiser4_dir_entry_desc entry;
43853 +
43854 + assert("nikita-2333", lh->node == coord->node);
43855 + assert("nikita-2334", is_dir == S_ISDIR(inode->i_mode));
43856 +
43857 + memset(&entry, 0, sizeof entry);
43858 + entry.obj = inode;
43859 + /* build key of directory entry description */
43860 + inode_dir_plugin(dir)->build_entry_key(dir, &name->d_name, &entry.key);
43861 +
43862 + /* ext2 does this in different order: first inserts new entry,
43863 +	   then increases directory nlink. We don't want to do this,
43864 +	   because reiser4_add_nlink() calls the ->add_link() plugin
43865 +	   method, which can fail for whatever reason, leaving us with
43866 + cleanup problems.
43867 + */
43868 + /* @inode is getting new name */
43869 + reiser4_add_nlink(inode, dir, 0);
43870 + /* create @new_name in @new_dir pointing to
43871 + @old_inode */
43872 + result = WITH_COORD(coord,
43873 + inode_dir_item_plugin(dir)->s.dir.add_entry(dir,
43874 + coord,
43875 + lh,
43876 + name,
43877 + &entry));
43878 + if (result != 0) {
43879 + int result2;
43880 + result2 = reiser4_del_nlink(inode, dir, 0);
43881 + if (result2 != 0) {
43882 + warning("nikita-2327",
43883 + "Cannot drop link on %lli %i. %s",
43884 + (unsigned long long)get_inode_oid(inode),
43885 + result2, possible_leak);
43886 + }
43887 + } else
43888 + INODE_INC_FIELD(dir, i_size);
43889 + return result;
43890 +}
43891 +
43892 +static reiser4_block_nr estimate_rename(struct inode *old_dir, /* directory where @old is located */
43893 + struct dentry *old_name, /* old name */
43894 + struct inode *new_dir, /* directory where @new is located */
43895 + struct dentry *new_name /* new name */ )
43896 +{
43897 + reiser4_block_nr res1, res2;
43898 + dir_plugin *p_parent_old, *p_parent_new;
43899 + file_plugin *p_child_old, *p_child_new;
43900 +
43901 + assert("vpf-311", old_dir != NULL);
43902 + assert("vpf-312", new_dir != NULL);
43903 + assert("vpf-313", old_name != NULL);
43904 + assert("vpf-314", new_name != NULL);
43905 +
43906 + p_parent_old = inode_dir_plugin(old_dir);
43907 + p_parent_new = inode_dir_plugin(new_dir);
43908 + p_child_old = inode_file_plugin(old_name->d_inode);
43909 + if (new_name->d_inode)
43910 + p_child_new = inode_file_plugin(new_name->d_inode);
43911 + else
43912 + p_child_new = NULL;
43913 +
43914 + /* find_entry - can insert one leaf. */
43915 + res1 = res2 = 1;
43916 +
43917 + /* replace_name */
43918 + {
43919 + /* reiser4_add_nlink(p_child_old) and reiser4_del_nlink(p_child_old) */
43920 + res1 += 2 * p_child_old->estimate.update(old_name->d_inode);
43921 + /* update key */
43922 + res1 += 1;
43923 + /* reiser4_del_nlink(p_child_new) */
43924 + if (p_child_new)
43925 + res1 += p_child_new->estimate.update(new_name->d_inode);
43926 + }
43927 +
43928 + /* else add_name */
43929 + {
43930 + /* reiser4_add_nlink(p_parent_new) and reiser4_del_nlink(p_parent_new) */
43931 + res2 +=
43932 + 2 * inode_file_plugin(new_dir)->estimate.update(new_dir);
43933 + /* reiser4_add_nlink(p_parent_old) */
43934 + res2 += p_child_old->estimate.update(old_name->d_inode);
43935 + /* add_entry(p_parent_new) */
43936 + res2 += p_parent_new->estimate.add_entry(new_dir);
43937 + /* reiser4_del_nlink(p_parent_old) */
43938 + res2 += p_child_old->estimate.update(old_name->d_inode);
43939 + }
43940 +
43941 + res1 = res1 < res2 ? res2 : res1;
43942 +
43943 + /* reiser4_write_sd(p_parent_new) */
43944 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43945 +
43946 + /* reiser4_write_sd(p_child_new) */
43947 + if (p_child_new)
43948 + res1 += p_child_new->estimate.update(new_name->d_inode);
43949 +
43950 + /* hashed_rem_entry(p_parent_old) */
43951 + res1 += p_parent_old->estimate.rem_entry(old_dir);
43952 +
43953 + /* reiser4_del_nlink(p_child_old) */
43954 + res1 += p_child_old->estimate.update(old_name->d_inode);
43955 +
43956 + /* replace_name */
43957 + {
43958 + /* reiser4_add_nlink(p_parent_dir_new) */
43959 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43960 + /* update_key */
43961 + res1 += 1;
43962 + /* reiser4_del_nlink(p_parent_new) */
43963 + res1 += inode_file_plugin(new_dir)->estimate.update(new_dir);
43964 + /* reiser4_del_nlink(p_parent_old) */
43965 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43966 + }
43967 +
43968 + /* reiser4_write_sd(p_parent_old) */
43969 + res1 += inode_file_plugin(old_dir)->estimate.update(old_dir);
43970 +
43971 + /* reiser4_write_sd(p_child_old) */
43972 + res1 += p_child_old->estimate.update(old_name->d_inode);
43973 +
43974 + return res1;
43975 +}
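+
+/*
+ * Editor's note: "res1 = res1 < res2 ? res2 : res1" above is simply
+ * max(res1, res2): only one of replace_name()/add_name() runs during a
+ * given rename, so the estimate takes the worse of the two before adding
+ * the costs common to both paths.
+ */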
43976 +
43977 +static int hashed_rename_estimate_and_grab(struct inode *old_dir, /* directory where @old is located */
43978 + struct dentry *old_name, /* old name */
43979 + struct inode *new_dir, /* directory where @new is located */
43980 + struct dentry *new_name
43981 + /* new name */ )
43982 +{
43983 + reiser4_block_nr reserve;
43984 +
43985 + reserve = estimate_rename(old_dir, old_name, new_dir, new_name);
43986 +
43987 + if (reiser4_grab_space(reserve, BA_CAN_COMMIT))
43988 + return RETERR(-ENOSPC);
43989 +
43990 + return 0;
43991 +}
43992 +
43993 +/* check whether @old_inode and @new_inode can be moved within file system
43994 + * tree. This singles out attempts to rename pseudo-files, for example. */
43995 +static int can_rename(struct inode *old_dir, struct inode *old_inode,
43996 + struct inode *new_dir, struct inode *new_inode)
43997 +{
43998 + file_plugin *fplug;
43999 + dir_plugin *dplug;
44000 +
44001 + assert("nikita-3370", old_inode != NULL);
44002 +
44003 + dplug = inode_dir_plugin(new_dir);
44004 + fplug = inode_file_plugin(old_inode);
44005 +
44006 + if (dplug == NULL)
44007 + return RETERR(-ENOTDIR);
44008 + else if (new_dir->i_op->create == NULL)
44009 + return RETERR(-EPERM);
44010 + else if (!fplug->can_add_link(old_inode))
44011 + return RETERR(-EMLINK);
44012 + else if (new_inode != NULL) {
44013 + fplug = inode_file_plugin(new_inode);
44014 + if (fplug->can_rem_link != NULL &&
44015 + !fplug->can_rem_link(new_inode))
44016 + return RETERR(-EBUSY);
44017 + }
44018 + return 0;
44019 +}
44020 +
44021 +int reiser4_find_entry(struct inode *, struct dentry *, lock_handle *,
44022 + znode_lock_mode, reiser4_dir_entry_desc *);
44023 +int reiser4_update_dir(struct inode *);
44024 +
44025 +/* this is common implementation of vfs's rename method of struct
44026 +   inode_operations.
44027 +   See comments in the body.
44028 +
44029 +   It is arguable that this function can be made generic so that it
44030 +   is applicable to any kind of directory plugin that deals with
44031 +   directories composed out of directory entries. The only obstacle
44032 +   here is that we don't have any data-type to represent a directory
44033 +   entry. This should be re-considered when more than one different
44034 +   directory plugin is implemented.
44035 +*/
44036 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
44037 + * is located */ ,
44038 + struct dentry *old_name /* old name */ ,
44039 + struct inode *new_dir /* directory where @new
44040 + * is located */ ,
44041 + struct dentry *new_name /* new name */ )
44042 +{
44043 + /* From `The Open Group Base Specifications Issue 6'
44044 +
44045 + If either the old or new argument names a symbolic link, rename()
44046 + shall operate on the symbolic link itself, and shall not resolve
44047 + the last component of the argument. If the old argument and the new
44048 + argument resolve to the same existing file, rename() shall return
44049 + successfully and perform no other action.
44050 +
44051 + [this is done by VFS: vfs_rename()]
44052 +
44053 + If the old argument points to the pathname of a file that is not a
44054 + directory, the new argument shall not point to the pathname of a
44055 + directory.
44056 +
44057 + [checked by VFS: vfs_rename->may_delete()]
44058 +
44059 + If the link named by the new argument exists, it shall
44060 + be removed and old renamed to new. In this case, a link named new
44061 + shall remain visible to other processes throughout the renaming
44062 + operation and refer either to the file referred to by new or old
44063 + before the operation began.
44064 +
44065 + [we should assure this]
44066 +
44067 + Write access permission is required for
44068 + both the directory containing old and the directory containing new.
44069 +
44070 + [checked by VFS: vfs_rename->may_delete(), may_create()]
44071 +
44072 + If the old argument points to the pathname of a directory, the new
44073 + argument shall not point to the pathname of a file that is not a
44074 + directory.
44075 +
44076 + [checked by VFS: vfs_rename->may_delete()]
44077 +
44078 + If the directory named by the new argument exists, it
44079 + shall be removed and old renamed to new. In this case, a link named
44080 + new shall exist throughout the renaming operation and shall refer
44081 + either to the directory referred to by new or old before the
44082 + operation began.
44083 +
44084 + [we should assure this]
44085 +
44086 + If new names an existing directory, it shall be
44087 + required to be an empty directory.
44088 +
44089 + [we should check this]
44090 +
44091 + If the old argument points to a pathname of a symbolic link, the
44092 + symbolic link shall be renamed. If the new argument points to a
44093 + pathname of a symbolic link, the symbolic link shall be removed.
44094 +
44095 + The new pathname shall not contain a path prefix that names
44096 + old. Write access permission is required for the directory
44097 + containing old and the directory containing new. If the old
44098 + argument points to the pathname of a directory, write access
44099 + permission may be required for the directory named by old, and, if
44100 + it exists, the directory named by new.
44101 +
44102 + [checked by VFS: vfs_rename(), vfs_rename_dir()]
44103 +
44104 + If the link named by the new argument exists and the file's link
44105 + count becomes 0 when it is removed and no process has the file
44106 + open, the space occupied by the file shall be freed and the file
44107 + shall no longer be accessible. If one or more processes have the
44108 + file open when the last link is removed, the link shall be removed
44109 + before rename() returns, but the removal of the file contents shall
44110 + be postponed until all references to the file are closed.
44111 +
44112 + [iput() handles this, but we can do this manually, a la
44113 + reiser4_unlink()]
44114 +
44115 + Upon successful completion, rename() shall mark for update the
44116 + st_ctime and st_mtime fields of the parent directory of each file.
44117 +
44118 + [N/A]
44119 +
44120 + */
44121 + reiser4_context *ctx;
44122 + int result;
44123 + int is_dir; /* is @old_name directory */
44124 +
44125 + struct inode *old_inode;
44126 + struct inode *new_inode;
44127 + coord_t *new_coord;
44128 +
44129 + reiser4_dentry_fsdata *new_fsdata;
44130 + dir_plugin *dplug;
44131 + file_plugin *fplug;
44132 +
44133 + reiser4_dir_entry_desc *old_entry, *new_entry, *dotdot_entry;
44134 + lock_handle *new_lh, *dotdot_lh;
44135 + struct dentry *dotdot_name;
44136 + reiser4_dentry_fsdata *dataonstack;
44137 +
44138 + ctx = reiser4_init_context(old_dir->i_sb);
44139 + if (IS_ERR(ctx))
44140 + return PTR_ERR(ctx);
44141 +
44142 + old_entry = kmalloc(3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
44143 + sizeof(*dotdot_name) + sizeof(*dataonstack),
44144 + reiser4_ctx_gfp_mask_get());
44145 + if (old_entry == NULL) {
44146 + context_set_commit_async(ctx);
44147 + reiser4_exit_context(ctx);
44148 + return RETERR(-ENOMEM);
44149 + }
44150 + memset(old_entry, 0, 3 * sizeof(*old_entry) + 2 * sizeof(*new_lh) +
44151 + sizeof(*dotdot_name) + sizeof(*dataonstack));
44152 +
44153 + new_entry = old_entry + 1;
44154 + dotdot_entry = old_entry + 2;
44155 + new_lh = (lock_handle *)(old_entry + 3);
44156 + dotdot_lh = new_lh + 1;
44157 + dotdot_name = (struct dentry *)(new_lh + 2);
44158 + dataonstack = (reiser4_dentry_fsdata *)(dotdot_name + 1);
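+	/*
+	 * Editor's note: a single kmalloc() is carved into three dir-entry
+	 * descriptors, two lock handles, a dentry and a dentry-fsdata to keep
+	 * this function's stack frame small; the pointer casts above must
+	 * match the sizeof arithmetic in the allocation.
+	 */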
44159 +
44160 + assert("nikita-2318", old_dir != NULL);
44161 + assert("nikita-2319", new_dir != NULL);
44162 + assert("nikita-2320", old_name != NULL);
44163 + assert("nikita-2321", new_name != NULL);
44164 +
44165 + old_inode = old_name->d_inode;
44166 + new_inode = new_name->d_inode;
44167 +
44168 + dplug = inode_dir_plugin(old_dir);
44169 + fplug = NULL;
44170 +
44171 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
44172 + if (IS_ERR(new_fsdata)) {
44173 + kfree(old_entry);
44174 + context_set_commit_async(ctx);
44175 + reiser4_exit_context(ctx);
44176 + return PTR_ERR(new_fsdata);
44177 + }
44178 +
44179 + new_coord = &new_fsdata->dec.entry_coord;
44180 + coord_clear_iplug(new_coord);
44181 +
44182 + is_dir = S_ISDIR(old_inode->i_mode);
44183 +
44184 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
44185 +
44186 + /* if target is existing directory and it's not empty---return error.
44187 +
44188 + This check is done specifically, because is_dir_empty() requires
44189 +	   tree traversal and has to be done before locks are taken.
44190 + */
44191 + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0) {
44192 + kfree(old_entry);
44193 + context_set_commit_async(ctx);
44194 + reiser4_exit_context(ctx);
44195 + return RETERR(-ENOTEMPTY);
44196 + }
44197 +
44198 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
44199 + if (result != 0) {
44200 + kfree(old_entry);
44201 + context_set_commit_async(ctx);
44202 + reiser4_exit_context(ctx);
44203 + return result;
44204 + }
44205 +
44206 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
44207 + new_dir, new_name);
44208 + if (result != 0) {
44209 + kfree(old_entry);
44210 + context_set_commit_async(ctx);
44211 + reiser4_exit_context(ctx);
44212 + return result;
44213 + }
44214 +
44215 + init_lh(new_lh);
44216 +
44217 + /* find entry for @new_name */
44218 + result = reiser4_find_entry(new_dir, new_name, new_lh, ZNODE_WRITE_LOCK,
44219 + new_entry);
44220 +
44221 + if (IS_CBKERR(result)) {
44222 + done_lh(new_lh);
44223 + kfree(old_entry);
44224 + context_set_commit_async(ctx);
44225 + reiser4_exit_context(ctx);
44226 + return result;
44227 + }
44228 +
44229 + reiser4_seal_done(&new_fsdata->dec.entry_seal);
44230 +
44231 + /* add or replace name for @old_inode as @new_name */
44232 + if (new_inode != NULL) {
44233 + /* target (@new_name) exists. */
44234 + /* Not clear what to do with objects that are
44235 + both directories and files at the same time. */
44236 + if (result == CBK_COORD_FOUND) {
44237 + result = replace_name(old_inode,
44238 + new_dir,
44239 + new_inode, new_coord, new_lh);
44240 + if (result == 0)
44241 + fplug = inode_file_plugin(new_inode);
44242 + } else if (result == CBK_COORD_NOTFOUND) {
44243 + /* VFS told us that @new_name is bound to existing
44244 + inode, but we failed to find directory entry. */
44245 + warning("nikita-2324", "Target not found");
44246 + result = RETERR(-ENOENT);
44247 + }
44248 + } else {
44249 +		/* target (@new_name) doesn't exist. */
44250 + if (result == CBK_COORD_NOTFOUND)
44251 + result = add_name(old_inode,
44252 + new_dir,
44253 + new_name, new_coord, new_lh, is_dir);
44254 + else if (result == CBK_COORD_FOUND) {
44255 + /* VFS told us that @new_name is "negative" dentry,
44256 + but we found directory entry. */
44257 + warning("nikita-2331", "Target found unexpectedly");
44258 + result = RETERR(-EIO);
44259 + }
44260 + }
44261 +
44262 + assert("nikita-3462", ergo(result == 0,
44263 + old_inode->i_nlink >= 2 + !!is_dir));
44264 +
44265 + /* We are done with all modifications to the @new_dir, release lock on
44266 + node. */
44267 + done_lh(new_lh);
44268 +
44269 + if (fplug != NULL) {
44270 + /* detach @new_inode from name-space */
44271 + result = fplug->detach(new_inode, new_dir);
44272 + if (result != 0)
44273 +			warning("nikita-2330", "Cannot detach %llu: %i. %s",
44274 + (unsigned long long)get_inode_oid(new_inode),
44275 + result, possible_leak);
44276 + }
44277 +
44278 + if (new_inode != NULL)
44279 + reiser4_update_sd(new_inode);
44280 +
44281 + if (result == 0) {
44282 + old_entry->obj = old_inode;
44283 +
44284 + dplug->build_entry_key(old_dir,
44285 + &old_name->d_name, &old_entry->key);
44286 +
44287 + /* At this stage new name was introduced for
44288 + @old_inode. @old_inode, @new_dir, and @new_inode i_nlink
44289 + counters were updated.
44290 +
44291 + We want to remove @old_name now. If @old_inode wasn't
44292 + directory this is simple.
44293 + */
44294 + result = dplug->rem_entry(old_dir, old_name, old_entry);
44295 + if (result != 0 && result != -ENOMEM) {
44296 + warning("nikita-2335",
44297 + "Cannot remove old name: %i", result);
44298 + } else {
44299 + result = reiser4_del_nlink(old_inode, old_dir, 0);
44300 + if (result != 0 && result != -ENOMEM) {
44301 + warning("nikita-2337",
44302 + "Cannot drop link on old: %i", result);
44303 + }
44304 + }
44305 +
44306 + if (result == 0 && is_dir) {
44307 + /* @old_inode is directory. We also have to update
44308 + dotdot entry. */
44309 + coord_t *dotdot_coord;
44310 +
44311 +			memset(dataonstack, 0, sizeof *dataonstack);
44312 +			memset(dotdot_entry, 0, sizeof *dotdot_entry);
44313 +			dotdot_entry->obj = old_dir;
44314 +			memset(dotdot_name, 0, sizeof *dotdot_name);
44315 + dotdot_name->d_name.name = "..";
44316 + dotdot_name->d_name.len = 2;
44317 + /*
44318 + * allocate ->d_fsdata on the stack to avoid using
44319 + * reiser4_get_dentry_fsdata(). Locking is not needed,
44320 + * because dentry is private to the current thread.
44321 + */
44322 + dotdot_name->d_fsdata = dataonstack;
44323 + init_lh(dotdot_lh);
44324 +
44325 + dotdot_coord = &dataonstack->dec.entry_coord;
44326 + coord_clear_iplug(dotdot_coord);
44327 +
44328 + result = reiser4_find_entry(old_inode, dotdot_name,
44329 + dotdot_lh, ZNODE_WRITE_LOCK,
44330 + dotdot_entry);
44331 + if (result == 0) {
44332 + /* replace_name() decreases i_nlink on
44333 + * @old_dir */
44334 + result = replace_name(new_dir,
44335 + old_inode,
44336 + old_dir,
44337 + dotdot_coord, dotdot_lh);
44338 + } else
44339 + result = RETERR(-EIO);
44340 + done_lh(dotdot_lh);
44341 + }
44342 + }
44343 + reiser4_update_dir(new_dir);
44344 + reiser4_update_dir(old_dir);
44345 + reiser4_update_sd(old_inode);
44346 + if (result == 0) {
44347 +		if (new_inode != NULL) {
44348 +			/* add safe-link for target file (in case we removed
44349 +			 * the last reference to the poor fellow) */
44353 + if (new_inode->i_nlink == 0)
44354 + result = safe_link_add(new_inode, SAFE_UNLINK);
44355 + }
44356 + }
44357 + kfree(old_entry);
44358 + context_set_commit_async(ctx);
44359 + reiser4_exit_context(ctx);
44360 + return result;
44361 +}
44362 +
44363 +#if 0
44364 +int reiser4_rename_common(struct inode *old_dir /* directory where @old
44365 + * is located */ ,
44366 + struct dentry *old_name /* old name */ ,
44367 + struct inode *new_dir /* directory where @new
44368 + * is located */ ,
44369 + struct dentry *new_name /* new name */ )
44370 +{
44371 +	/* See the excerpt from `The Open Group Base Specifications Issue 6'
44372 +	   quoted in the live copy of reiser4_rename_common() above. */
44449 + reiser4_context *ctx;
44450 + int result;
44451 + int is_dir; /* is @old_name directory */
44452 + struct inode *old_inode;
44453 + struct inode *new_inode;
44454 + reiser4_dir_entry_desc old_entry;
44455 + reiser4_dir_entry_desc new_entry;
44456 + coord_t *new_coord;
44457 + reiser4_dentry_fsdata *new_fsdata;
44458 + lock_handle new_lh;
44459 + dir_plugin *dplug;
44460 + file_plugin *fplug;
44461 +
44462 + ctx = reiser4_init_context(old_dir->i_sb);
44463 + if (IS_ERR(ctx))
44464 + return PTR_ERR(ctx);
44465 +
44466 + assert("nikita-2318", old_dir != NULL);
44467 + assert("nikita-2319", new_dir != NULL);
44468 + assert("nikita-2320", old_name != NULL);
44469 + assert("nikita-2321", new_name != NULL);
44470 +
44471 + old_inode = old_name->d_inode;
44472 + new_inode = new_name->d_inode;
44473 +
44474 + dplug = inode_dir_plugin(old_dir);
44475 + fplug = NULL;
44476 +
44477 + new_fsdata = reiser4_get_dentry_fsdata(new_name);
44478 + if (IS_ERR(new_fsdata)) {
44479 + result = PTR_ERR(new_fsdata);
44480 + goto exit;
44481 + }
44482 +
44483 + new_coord = &new_fsdata->dec.entry_coord;
44484 + coord_clear_iplug(new_coord);
44485 +
44486 + is_dir = S_ISDIR(old_inode->i_mode);
44487 +
44488 + assert("nikita-3461", old_inode->i_nlink >= 1 + !!is_dir);
44489 +
44490 + /* if the target is an existing, non-empty directory, return an error.
44491 +
44492 + This check is done up front, because is_dir_empty() requires a
44493 + tree traversal and has to be done before locks are taken.
44494 + */
44495 + if (is_dir && new_inode != NULL && is_dir_empty(new_inode) != 0)
44496 + return RETERR(-ENOTEMPTY);
44497 +
44498 + result = can_rename(old_dir, old_inode, new_dir, new_inode);
44499 + if (result != 0)
44500 + goto exit;
44501 +
44502 + result = hashed_rename_estimate_and_grab(old_dir, old_name,
44503 + new_dir, new_name);
44504 + if (result != 0)
44505 + goto exit;
44506 +
44507 + init_lh(&new_lh);
44508 +
44509 + /* find entry for @new_name */
44510 + result = reiser4_find_entry(new_dir, new_name, &new_lh,
44511 + ZNODE_WRITE_LOCK, &new_entry);
44512 +
44513 + if (IS_CBKERR(result)) {
44514 + done_lh(&new_lh);
44515 + goto exit;
44516 + }
44517 +
44518 + reiser4_seal_done(&new_fsdata->dec.entry_seal);
44519 +
44520 + /* add or replace name for @old_inode as @new_name */
44521 + if (new_inode != NULL) {
44522 + /* target (@new_name) exists. */
44523 + /* It is not clear what to do with objects that are
44524 + both directories and files at the same time. */
44525 + if (result == CBK_COORD_FOUND) {
44526 + result = replace_name(old_inode,
44527 + new_dir,
44528 + new_inode, new_coord, &new_lh);
44529 + if (result == 0)
44530 + fplug = inode_file_plugin(new_inode);
44531 + } else if (result == CBK_COORD_NOTFOUND) {
44532 + /* VFS told us that @new_name is bound to an existing
44533 + inode, but we failed to find the directory entry. */
44534 + warning("nikita-2324", "Target not found");
44535 + result = RETERR(-ENOENT);
44536 + }
44537 + } else {
44538 + /* target (@new_name) doesn't exist. */
44539 + if (result == CBK_COORD_NOTFOUND)
44540 + result = add_name(old_inode,
44541 + new_dir,
44542 + new_name, new_coord, &new_lh, is_dir);
44543 + else if (result == CBK_COORD_FOUND) {
44544 + /* VFS told us that @new_name is a "negative" dentry,
44545 + but we found a directory entry. */
44546 + warning("nikita-2331", "Target found unexpectedly");
44547 + result = RETERR(-EIO);
44548 + }
44549 + }
44550 +
44551 + assert("nikita-3462", ergo(result == 0,
44552 + old_inode->i_nlink >= 2 + !!is_dir));
44553 +
44554 + /* We are done with all modifications to the @new_dir, release lock on
44555 + node. */
44556 + done_lh(&new_lh);
44557 +
44558 + if (fplug != NULL) {
44559 + /* detach @new_inode from name-space */
44560 + result = fplug->detach(new_inode, new_dir);
44561 + if (result != 0)
44562 + warning("nikita-2330", "Cannot detach %lli: %i. %s",
44563 + (unsigned long long)get_inode_oid(new_inode),
44564 + result, possible_leak);
44565 + }
44566 +
44567 + if (new_inode != NULL)
44568 + reiser4_update_sd(new_inode);
44569 +
44570 + if (result == 0) {
44571 + memset(&old_entry, 0, sizeof old_entry);
44572 + old_entry.obj = old_inode;
44573 +
44574 + dplug->build_entry_key(old_dir,
44575 + &old_name->d_name, &old_entry.key);
44576 +
44577 + /* At this stage a new name has been introduced for
44578 + @old_inode; the i_nlink counters of @old_inode, @new_dir,
44579 + and @new_inode have been updated.
44580 +
44581 + We want to remove @old_name now. If @old_inode isn't a
44582 + directory, this is simple.
44583 + */
44584 + result = dplug->rem_entry(old_dir, old_name, &old_entry);
44585 + /*result = rem_entry_hashed(old_dir, old_name, &old_entry); */
44586 + if (result != 0 && result != -ENOMEM) {
44587 + warning("nikita-2335",
44588 + "Cannot remove old name: %i", result);
44589 + } else {
44590 + result = reiser4_del_nlink(old_inode, old_dir, 0);
44591 + if (result != 0 && result != -ENOMEM) {
44592 + warning("nikita-2337",
44593 + "Cannot drop link on old: %i", result);
44594 + }
44595 + }
44596 +
44597 + if (result == 0 && is_dir) {
44598 + /* @old_inode is a directory. We also have to update
44599 + the dotdot entry. */
44600 + coord_t *dotdot_coord;
44601 + lock_handle dotdot_lh;
44602 + struct dentry dotdot_name;
44603 + reiser4_dir_entry_desc dotdot_entry;
44604 + reiser4_dentry_fsdata dataonstack;
44605 + reiser4_dentry_fsdata *fsdata;
44606 +
44607 + memset(&dataonstack, 0, sizeof dataonstack);
44608 + memset(&dotdot_entry, 0, sizeof dotdot_entry);
44609 + dotdot_entry.obj = old_dir;
44610 + memset(&dotdot_name, 0, sizeof dotdot_name);
44611 + dotdot_name.d_name.name = "..";
44612 + dotdot_name.d_name.len = 2;
44613 + /*
44614 + * allocate ->d_fsdata on the stack to avoid using
44615 + * reiser4_get_dentry_fsdata(). Locking is not needed,
44616 + * because dentry is private to the current thread.
44617 + */
44618 + dotdot_name.d_fsdata = &dataonstack;
44619 + init_lh(&dotdot_lh);
44620 +
44621 + fsdata = &dataonstack;
44622 + dotdot_coord = &fsdata->dec.entry_coord;
44623 + coord_clear_iplug(dotdot_coord);
44624 +
44625 + result = reiser4_find_entry(old_inode,
44626 + &dotdot_name,
44627 + &dotdot_lh,
44628 + ZNODE_WRITE_LOCK,
44629 + &dotdot_entry);
44630 + if (result == 0) {
44631 + /* replace_name() decreases i_nlink on
44632 + * @old_dir */
44633 + result = replace_name(new_dir,
44634 + old_inode,
44635 + old_dir,
44636 + dotdot_coord, &dotdot_lh);
44637 + } else
44638 + result = RETERR(-EIO);
44639 + done_lh(&dotdot_lh);
44640 + }
44641 + }
44642 + reiser4_update_dir(new_dir);
44643 + reiser4_update_dir(old_dir);
44644 + reiser4_update_sd(old_inode);
44645 + if (result == 0) {
44646 + file_plugin *fplug;
44647 +
44648 + if (new_inode != NULL) {
44649 + /* add safe-link for target file (in case we removed
44650 + * last reference to the poor fellow) */
44651 + fplug = inode_file_plugin(new_inode);
44652 + if (new_inode->i_nlink == 0)
44653 + result = safe_link_add(new_inode, SAFE_UNLINK);
44654 + }
44655 + }
44656 + exit:
44657 + context_set_commit_async(ctx);
44658 + reiser4_exit_context(ctx);
44659 + return result;
44660 +}
44661 +#endif
44662 diff --git a/fs/reiser4/plugin/item/Makefile b/fs/reiser4/plugin/item/Makefile
44663 new file mode 100644
44664 index 0000000..1bae623
44665 --- /dev/null
44666 +++ b/fs/reiser4/plugin/item/Makefile
44667 @@ -0,0 +1,18 @@
44668 +obj-$(CONFIG_REISER4_FS) += item_plugins.o
44669 +
44670 +item_plugins-objs := \
44671 + item.o \
44672 + static_stat.o \
44673 + sde.o \
44674 + cde.o \
44675 + blackbox.o \
44676 + internal.o \
44677 + tail.o \
44678 + ctail.o \
44679 + extent.o \
44680 + extent_item_ops.o \
44681 + extent_file_ops.o \
44682 + extent_flush_ops.o
44683 +
44684 +
44685 +
44686 diff --git a/fs/reiser4/plugin/item/acl.h b/fs/reiser4/plugin/item/acl.h
44687 new file mode 100644
44688 index 0000000..f26762a
44689 --- /dev/null
44690 +++ b/fs/reiser4/plugin/item/acl.h
44691 @@ -0,0 +1,66 @@
44692 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44693 +
44694 +/* Directory entry. */
44695 +
44696 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
44697 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
44698 +
44699 +#include "../../forward.h"
44700 +#include "../../dformat.h"
44701 +#include "../../kassign.h"
44702 +#include "../../key.h"
44703 +
44704 +#include <linux/fs.h>
44705 +#include <linux/dcache.h> /* for struct dentry */
44706 +
44707 +typedef struct directory_entry_format {
44708 + /* key of object stat-data. It's not necessary to store the whole
44709 + key here, because it's always a stat-data key, so the minor
44710 + packing locality and the offset can be omitted. But this
44711 + relies on a particular key allocation scheme for stat-data, so,
44712 + for extensibility's sake, the whole key can be stored here.
44713 +
44714 + We store the key as an array of bytes, because we don't want
44715 + 8-byte alignment of directory entries.
44716 + */
44717 + obj_key_id id;
44718 + /* file name. Null terminated string. */
44719 + d8 name[0];
44720 +} directory_entry_format;
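
Because keys are stored as plain byte arrays, readers must use unaligned accesses; the item code elsewhere in this patch does so with get_unaligned()/le16_to_cpu(). A minimal userspace sketch of the same idea (the helper name below is made up):

#include <stdio.h>
#include <stdint.h>

/* byte-wise read is legal at any alignment, unlike a plain
 * uint16_t load on some architectures (illustrative helper) */
static uint16_t le16_read(const unsigned char *p)
{
        return (uint16_t)(p[0] | (p[1] << 8));
}

int main(void)
{
        /* one leading byte makes the 16-bit field misaligned on purpose */
        unsigned char buf[] = { 0x00, 0x34, 0x12 };

        printf("0x%04x\n", le16_read(buf + 1)); /* prints 0x1234 */
        return 0;
}
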
44721 +
44722 +void print_de(const char *prefix, coord_t * coord);
44723 +int extract_key_de(const coord_t * coord, reiser4_key * key);
44724 +int update_key_de(const coord_t * coord, const reiser4_key * key,
44725 + lock_handle * lh);
44726 +char *extract_name_de(const coord_t * coord, char *buf);
44727 +unsigned extract_file_type_de(const coord_t * coord);
44728 +int add_entry_de(struct inode *dir, coord_t * coord,
44729 + lock_handle * lh, const struct dentry *name,
44730 + reiser4_dir_entry_desc * entry);
44731 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
44732 + lock_handle * lh, reiser4_dir_entry_desc * entry);
44733 +int max_name_len_de(const struct inode *dir);
44734 +
44735 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
44736 +
44737 +char *extract_dent_name(const coord_t * coord,
44738 + directory_entry_format * dent, char *buf);
44739 +
44740 +#if REISER4_LARGE_KEY
44741 +#define DE_NAME_BUF_LEN (24)
44742 +#else
44743 +#define DE_NAME_BUF_LEN (16)
44744 +#endif
44745 +
44746 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
44747 +#endif
44748 +
44749 +/* Make Linus happy.
44750 + Local variables:
44751 + c-indentation-style: "K&R"
44752 + mode-name: "LC"
44753 + c-basic-offset: 8
44754 + tab-width: 8
44755 + fill-column: 120
44756 + End:
44757 +*/
44758 diff --git a/fs/reiser4/plugin/item/blackbox.c b/fs/reiser4/plugin/item/blackbox.c
44759 new file mode 100644
44760 index 0000000..f13ff64
44761 --- /dev/null
44762 +++ b/fs/reiser4/plugin/item/blackbox.c
44763 @@ -0,0 +1,142 @@
44764 +/* Copyright 2003 by Hans Reiser, licensing governed by
44765 + * reiser4/README */
44766 +
44767 +/* Black box item implementation */
44768 +
44769 +#include "../../forward.h"
44770 +#include "../../debug.h"
44771 +#include "../../dformat.h"
44772 +#include "../../kassign.h"
44773 +#include "../../coord.h"
44774 +#include "../../tree.h"
44775 +#include "../../lock.h"
44776 +
44777 +#include "blackbox.h"
44778 +#include "item.h"
44779 +#include "../plugin.h"
44780 +
44781 +int
44782 +store_black_box(reiser4_tree * tree,
44783 + const reiser4_key * key, void *data, int length)
44784 +{
44785 + int result;
44786 + reiser4_item_data idata;
44787 + coord_t coord;
44788 + lock_handle lh;
44789 +
44790 + memset(&idata, 0, sizeof idata);
44791 +
44792 + idata.data = data;
44793 + idata.user = 0;
44794 + idata.length = length;
44795 + idata.iplug = item_plugin_by_id(BLACK_BOX_ID);
44796 +
44797 + init_lh(&lh);
44798 + result = insert_by_key(tree, key,
44799 + &idata, &coord, &lh, LEAF_LEVEL, CBK_UNIQUE);
44800 +
44801 + assert("nikita-3413",
44802 + ergo(result == 0,
44803 + WITH_COORD(&coord,
44804 + item_length_by_coord(&coord) == length)));
44805 +
44806 + done_lh(&lh);
44807 + return result;
44808 +}
44809 +
44810 +int
44811 +load_black_box(reiser4_tree * tree,
44812 + reiser4_key * key, void *data, int length, int exact)
44813 +{
44814 + int result;
44815 + coord_t coord;
44816 + lock_handle lh;
44817 +
44818 + init_lh(&lh);
44819 + result = coord_by_key(tree, key,
44820 + &coord, &lh, ZNODE_READ_LOCK,
44821 + exact ? FIND_EXACT : FIND_MAX_NOT_MORE_THAN,
44822 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44823 +
44824 + if (result == 0) {
44825 + int ilen;
44826 +
44827 + result = zload(coord.node);
44828 + if (result == 0) {
44829 + ilen = item_length_by_coord(&coord);
44830 + if (ilen <= length) {
44831 + memcpy(data, item_body_by_coord(&coord), ilen);
44832 + unit_key_by_coord(&coord, key);
44833 + } else if (exact) {
44834 + /*
44835 + * item is larger than buffer provided by the
44836 + * user. Only issue a warning if @exact is
44837 + * set. If @exact is false, we are iterating
44838 + * over all safe-links and here we are reaching
44839 + * the end of the iteration.
44840 + */
44841 + warning("nikita-3415",
44842 + "Wrong black box length: %i > %i",
44843 + ilen, length);
44844 + result = RETERR(-EIO);
44845 + }
44846 + zrelse(coord.node);
44847 + }
44848 + }
44849 +
44850 + done_lh(&lh);
44851 + return result;
44852 +
44853 +}
44854 +
44855 +int
44856 +update_black_box(reiser4_tree * tree,
44857 + const reiser4_key * key, void *data, int length)
44858 +{
44859 + int result;
44860 + coord_t coord;
44861 + lock_handle lh;
44862 +
44863 + init_lh(&lh);
44864 + result = coord_by_key(tree, key,
44865 + &coord, &lh, ZNODE_READ_LOCK,
44866 + FIND_EXACT,
44867 + LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
44868 + if (result == 0) {
44869 + int ilen;
44870 +
44871 + result = zload(coord.node);
44872 + if (result == 0) {
44873 + ilen = item_length_by_coord(&coord);
44874 + if (length <= ilen) {
44875 + memcpy(item_body_by_coord(&coord), data,
44876 + length);
44877 + } else {
44878 + warning("nikita-3437",
44879 + "Wrong black box length: %i < %i",
44880 + ilen, length);
44881 + result = RETERR(-EIO);
44882 + }
44883 + zrelse(coord.node);
44884 + }
44885 + }
44886 +
44887 + done_lh(&lh);
44888 + return result;
44889 +
44890 +}
44891 +
44892 +int kill_black_box(reiser4_tree * tree, const reiser4_key * key)
44893 +{
44894 + return reiser4_cut_tree(tree, key, key, NULL, 1);
44895 +}
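
Taken together, the four functions above form a small keyed-record API. A hedged sketch of the intended round trip, compilable only inside the reiser4 tree (@tree and @key come from the caller; the payload struct is made up, and error handling is elided):

/* illustrative payload, not a real reiser4 structure */
struct payload { int magic; };

static void black_box_roundtrip(reiser4_tree *tree, reiser4_key *key)
{
        struct payload in = { .magic = 42 };
        struct payload out;

        /* insert an opaque fixed-width record at @key */
        store_black_box(tree, key, &in, sizeof in);
        /* read it back; non-zero @exact requests a FIND_EXACT lookup */
        load_black_box(tree, key, &out, sizeof out, 1);
        /* out.magic == 42 here */
        /* overwrite in place; the new length must not exceed the old */
        in.magic = 43;
        update_black_box(tree, key, &in, sizeof in);
        /* finally cut the record out of the tree */
        kill_black_box(tree, key);
}
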
44896 +
44897 +/* Make Linus happy.
44898 + Local variables:
44899 + c-indentation-style: "K&R"
44900 + mode-name: "LC"
44901 + c-basic-offset: 8
44902 + tab-width: 8
44903 + fill-column: 120
44904 + End:
44905 +*/
44906 diff --git a/fs/reiser4/plugin/item/blackbox.h b/fs/reiser4/plugin/item/blackbox.h
44907 new file mode 100644
44908 index 0000000..f5b7af3
44909 --- /dev/null
44910 +++ b/fs/reiser4/plugin/item/blackbox.h
44911 @@ -0,0 +1,33 @@
44912 +/* Copyright 2003 by Hans Reiser, licensing governed by
44913 + * reiser4/README */
44914 +
44915 +/* "Black box" entry to fixed-width contain user supplied data */
44916 +
44917 +#if !defined( __FS_REISER4_BLACK_BOX_H__ )
44918 +#define __FS_REISER4_BLACK_BOX_H__
44919 +
44920 +#include "../../forward.h"
44921 +#include "../../dformat.h"
44922 +#include "../../kassign.h"
44923 +#include "../../key.h"
44924 +
44925 +extern int store_black_box(reiser4_tree * tree,
44926 + const reiser4_key * key, void *data, int length);
44927 +extern int load_black_box(reiser4_tree * tree,
44928 + reiser4_key * key, void *data, int length, int exact);
44929 +extern int kill_black_box(reiser4_tree * tree, const reiser4_key * key);
44930 +extern int update_black_box(reiser4_tree * tree,
44931 + const reiser4_key * key, void *data, int length);
44932 +
44933 +/* __FS_REISER4_BLACK_BOX_H__ */
44934 +#endif
44935 +
44936 +/* Make Linus happy.
44937 + Local variables:
44938 + c-indentation-style: "K&R"
44939 + mode-name: "LC"
44940 + c-basic-offset: 8
44941 + tab-width: 8
44942 + fill-column: 120
44943 + End:
44944 +*/
44945 diff --git a/fs/reiser4/plugin/item/cde.c b/fs/reiser4/plugin/item/cde.c
44946 new file mode 100644
44947 index 0000000..05374ac
44948 --- /dev/null
44949 +++ b/fs/reiser4/plugin/item/cde.c
44950 @@ -0,0 +1,1008 @@
44951 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
44952 +
44953 +/* Directory entry implementation */
44954 +
44955 +/* DESCRIPTION:
44956 +
44957 + This is "compound" directory item plugin implementation. This directory
44958 + item type is compound (as opposed to the "simple directory item" in
44959 + fs/reiser4/plugin/item/sde.[ch]), because it consists of several directory
44960 + entries.
44961 +
44962 + The reason behind this decision is disk space efficiency: all directory
44963 + entries inside the same directory have identical fragment in their
44964 + keys. This, of course, depends on key assignment policy. In our default key
44965 + assignment policy, all directory entries have the same locality which is
44966 + equal to the object id of their directory.
44967 +
44968 + Composing directory item out of several directory entries for the same
44969 + directory allows us to store said key fragment only once. That is, this is
44970 + some ad hoc form of key compression (stem compression) that is implemented
44971 + here, because general key compression is not supposed to be implemented in
44972 + v4.0.
44973 +
44974 + Another decision that was made regarding all directory item plugins, is
44975 + that they will store entry keys unaligned. This is for that sake of disk
44976 + space efficiency again.
44977 +
44978 + In should be noted, that storing keys unaligned increases CPU consumption,
44979 + at least on some architectures.
44980 +
44981 + Internal on-disk structure of the compound directory item is the following:
44982 +
44983 + HEADER cde_item_format. Here number of entries is stored.
44984 + ENTRY_HEADER_0 cde_unit_header. Here part of entry key and
44985 + ENTRY_HEADER_1 offset of entry body are stored.
44986 + ENTRY_HEADER_2 (basically two last parts of key)
44987 + ...
44988 + ENTRY_HEADER_N
44989 + ENTRY_BODY_0 directory_entry_format. Here part of stat data key and
44990 + ENTRY_BODY_1 NUL-terminated name are stored.
44991 + ENTRY_BODY_2 (part of the stat-data key in the
44992 + sense that since all SDs have
44993 + zero offset, this offset is not
44994 + stored on disk).
44995 + ...
44996 + ENTRY_BODY_N
44997 +
44998 + When it comes to balancing, each directory entry in a compound directory
44999 + item is a unit, that is, something that can be cut from one item and pasted
45000 + into another item of the same type. Handling of unit cut and paste is the
45001 + major reason for the complexity of the code below.
45002 +
45003 +*/
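
The layout above boils down to a 16-bit entry count, an array of fixed-size headers, and offset-addressed variable-size bodies. A userspace toy of the body-lookup arithmetic (field sizes simplified; endianness handling and the key material omitted):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct unit_header { uint16_t offset; };        /* cf. cde_unit_header */
struct item_format {                            /* cf. cde_item_format */
        uint16_t num_of_entries;
        struct unit_header entry[];
};

/* body of entry @i starts at item start + entry[i].offset, which is
 * what offset_of()/entry_at() below compute from the real format */
static char *entry_body(char *item, int i)
{
        return item + ((struct item_format *)item)->entry[i].offset;
}

int main(void)
{
        uint16_t storage[32];   /* aligned backing store for the toy item */
        char *item = (char *)storage;
        struct item_format *f = (struct item_format *)item;
        uint16_t base = sizeof *f + 2 * sizeof(struct unit_header);

        f->num_of_entries = 2;
        f->entry[0].offset = base;      /* body of entry 0: "a" + NUL */
        f->entry[1].offset = base + 2;  /* body of entry 1: "bb" + NUL */
        strcpy(entry_body(item, 0), "a");
        strcpy(entry_body(item, 1), "bb");

        printf("%s %s\n", entry_body(item, 0), entry_body(item, 1));
        return 0;
}
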
45004 +
45005 +#include "../../forward.h"
45006 +#include "../../debug.h"
45007 +#include "../../dformat.h"
45008 +#include "../../kassign.h"
45009 +#include "../../key.h"
45010 +#include "../../coord.h"
45011 +#include "sde.h"
45012 +#include "cde.h"
45013 +#include "item.h"
45014 +#include "../node/node.h"
45015 +#include "../plugin.h"
45016 +#include "../../znode.h"
45017 +#include "../../carry.h"
45018 +#include "../../tree.h"
45019 +#include "../../inode.h"
45020 +
45021 +#include <linux/fs.h> /* for struct inode */
45022 +#include <linux/dcache.h> /* for struct dentry */
45023 +#include <linux/quotaops.h>
45024 +
45025 +#if 0
45026 +#define CHECKME(coord) \
45027 +({ \
45028 + const char *message; \
45029 + coord_t dup; \
45030 + \
45031 + coord_dup_nocheck(&dup, (coord)); \
45032 + dup.unit_pos = 0; \
45033 + assert("nikita-2871", cde_check(&dup, &message) == 0); \
45034 +})
45035 +#else
45036 +#define CHECKME(coord) noop
45037 +#endif
45038 +
45039 +/* return body of compound directory item at @coord */
45040 +static inline cde_item_format *formatted_at(const coord_t * coord)
45041 +{
45042 + assert("nikita-1282", coord != NULL);
45043 + return item_body_by_coord(coord);
45044 +}
45045 +
45046 +/* return entry header at @coord */
45047 +static inline cde_unit_header *header_at(const coord_t *
45048 + coord /* coord of item */ ,
45049 + int idx /* index of unit */ )
45050 +{
45051 + assert("nikita-1283", coord != NULL);
45052 + return &formatted_at(coord)->entry[idx];
45053 +}
45054 +
45055 +/* return number of units in compound directory item at @coord */
45056 +static int units(const coord_t * coord /* coord of item */ )
45057 +{
45058 + return le16_to_cpu(get_unaligned(&formatted_at(coord)->num_of_entries));
45059 +}
45060 +
45061 +/* return offset of the body of @idx-th entry in @coord */
45062 +static unsigned int offset_of(const coord_t * coord /* coord of item */ ,
45063 + int idx /* index of unit */ )
45064 +{
45065 + if (idx < units(coord))
45066 + return le16_to_cpu(get_unaligned(&header_at(coord, idx)->offset));
45067 + else if (idx == units(coord))
45068 + return item_length_by_coord(coord);
45069 + else
45070 + impossible("nikita-1308", "Wrong idx");
45071 + return 0;
45072 +}
45073 +
45074 +/* set offset of the body of @idx-th entry in @coord */
45075 +static void set_offset(const coord_t * coord /* coord of item */ ,
45076 + int idx /* index of unit */ ,
45077 + unsigned int offset /* new offset */ )
45078 +{
45079 + put_unaligned(cpu_to_le16((__u16) offset), &header_at(coord, idx)->offset);
45080 +}
45081 +
45082 +static void adj_offset(const coord_t * coord /* coord of item */ ,
45083 + int idx /* index of unit */ ,
45084 + int delta /* offset change */ )
45085 +{
45086 + d16 *doffset;
45087 + __u16 offset;
45088 +
45089 + doffset = &header_at(coord, idx)->offset;
45090 + offset = le16_to_cpu(get_unaligned(doffset));
45091 + offset += delta;
45092 + put_unaligned(cpu_to_le16((__u16) offset), doffset);
45093 +}
45094 +
45095 +/* return pointer to @offset-th byte from the beginning of @coord */
45096 +static char *address(const coord_t * coord /* coord of item */ ,
45097 + int offset)
45098 +{
45099 + return ((char *)item_body_by_coord(coord)) + offset;
45100 +}
45101 +
45102 +/* return pointer to the body of @idx-th entry in @coord */
45103 +static directory_entry_format *entry_at(const coord_t * coord /* coord of
45104 + * item */ ,
45105 + int idx /* index of unit */ )
45106 +{
45107 + return (directory_entry_format *) address(coord,
45108 + (int)offset_of(coord, idx));
45109 +}
45110 +
45111 +/* return number of unit referenced by @coord */
45112 +static int idx_of(const coord_t * coord /* coord of item */ )
45113 +{
45114 + assert("nikita-1285", coord != NULL);
45115 + return coord->unit_pos;
45116 +}
45117 +
45118 +/* find position where entry with @entry_key would be inserted into @coord */
45119 +static int find(const coord_t * coord /* coord of item */ ,
45120 + const reiser4_key * entry_key /* key to look for */ ,
45121 + cmp_t * last /* result of last comparison */ )
45122 +{
45123 + int entries;
45124 +
45125 + int left;
45126 + int right;
45127 +
45128 + cde_unit_header *header;
45129 +
45130 + assert("nikita-1295", coord != NULL);
45131 + assert("nikita-1296", entry_key != NULL);
45132 + assert("nikita-1297", last != NULL);
45133 +
45134 + entries = units(coord);
45135 + left = 0;
45136 + right = entries - 1;
45137 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
45138 + int median;
45139 +
45140 + median = (left + right) >> 1;
45141 +
45142 + header = header_at(coord, median);
45143 + *last = de_id_key_cmp(&header->hash, entry_key);
45144 + switch (*last) {
45145 + case LESS_THAN:
45146 + left = median;
45147 + break;
45148 + case GREATER_THAN:
45149 + right = median;
45150 + break;
45151 + case EQUAL_TO:{
45152 + do {
45153 + median--;
45154 + header--;
45155 + } while (median >= 0 &&
45156 + de_id_key_cmp(&header->hash,
45157 + entry_key) == EQUAL_TO);
45158 + return median + 1;
45159 + }
45160 + }
45161 + }
45162 + header = header_at(coord, left);
45163 + for (; left < entries; ++left, ++header) {
45164 + prefetch(header + 1);
45165 + *last = de_id_key_cmp(&header->hash, entry_key);
45166 + if (*last != LESS_THAN)
45167 + break;
45168 + }
45169 + if (left < entries)
45170 + return left;
45171 + else
45172 + return RETERR(-ENOENT);
45173 +
45174 +}
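
find() above is a hybrid search: bisection narrows the window until it is smaller than REISER4_SEQ_SEARCH_BREAK, after which a linear scan (helped by the prefetch) is cheaper than further halving. The same shape over plain integers, standalone (the break value here is arbitrary):

#include <stdio.h>

#define SEQ_SEARCH_BREAK 4      /* stand-in for REISER4_SEQ_SEARCH_BREAK */

/* return the first index with keys[i] >= key, or -1 if none */
static int hybrid_find(const int *keys, int n, int key)
{
        int left = 0;
        int right = n - 1;

        while (right - left >= SEQ_SEARCH_BREAK) {
                int median = (left + right) >> 1;

                if (keys[median] < key)
                        left = median;
                else
                        right = median;
        }
        for (; left < n; ++left)
                if (keys[left] >= key)
                        return left;
        return -1;
}

int main(void)
{
        int keys[] = { 2, 4, 6, 8, 10, 12, 14, 16 };

        printf("%d\n", hybrid_find(keys, 8, 9));        /* prints 4 */
        return 0;
}
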
45175 +
45176 +/* expand @coord to accommodate insertion of @no new entries starting
45177 + at @pos, with total body size @size. */
45178 +static int expand_item(const coord_t * coord /* coord of item */ ,
45179 + int pos /* unit position */ , int no /* number of new
45180 + * units*/ ,
45181 + int size /* total size of new units' data */ ,
45182 + unsigned int data_size /* free space already reserved
45183 + * in the item for insertion */ )
45184 +{
45185 + int entries;
45186 + cde_unit_header *header;
45187 + char *dent;
45188 + int i;
45189 +
45190 + assert("nikita-1310", coord != NULL);
45191 + assert("nikita-1311", pos >= 0);
45192 + assert("nikita-1312", no > 0);
45193 + assert("nikita-1313", data_size >= no * sizeof(directory_entry_format));
45194 + assert("nikita-1343",
45195 + item_length_by_coord(coord) >=
45196 + (int)(size + data_size + no * sizeof *header));
45197 +
45198 + entries = units(coord);
45199 +
45200 + if (pos == entries)
45201 + dent = address(coord, size);
45202 + else
45203 + dent = (char *)entry_at(coord, pos);
45204 + /* place where new header will be in */
45205 + header = header_at(coord, pos);
45206 + /* free space for new entry headers */
45207 + memmove(header + no, header,
45208 + (unsigned)(address(coord, size) - (char *)header));
45209 + /* if adding to the end, initialise the first new header */
45210 + if (pos == entries) {
45211 + set_offset(coord, pos, (unsigned)size);
45212 + }
45213 +
45214 + /* adjust entry pointer and size */
45215 + dent = dent + no * sizeof *header;
45216 + size += no * sizeof *header;
45217 + /* free space for new entries */
45218 + memmove(dent + data_size, dent,
45219 + (unsigned)(address(coord, size) - dent));
45220 +
45221 + /* increase counter */
45222 + entries += no;
45223 + put_unaligned(cpu_to_le16((__u16) entries), &formatted_at(coord)->num_of_entries);
45224 +
45225 + /* [ 0 ... pos ] entries were shifted by no * ( sizeof *header )
45226 + bytes. */
45227 + for (i = 0; i <= pos; ++i)
45228 + adj_offset(coord, i, no * sizeof *header);
45229 + /* [ pos + no ... +\infty ) entries were shifted by ( no *
45230 + sizeof *header + data_size ) bytes */
45231 + for (i = pos + no; i < entries; ++i)
45232 + adj_offset(coord, i, no * sizeof *header + data_size);
45233 + return 0;
45234 +}
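
The subtle part of expand_item() is the final offset fix-up: entries that stayed in front of the insertion point moved only by the size of the new headers, while entries behind the inserted range also moved by the size of the new bodies. That two-band adjustment in isolation (index bounds simplified relative to the kernel code; sizes are illustrative):

#include <stdio.h>

static void adjust_offsets(unsigned *off, int nentries, int pos,
                           int no, int hdr_size, int data_size)
{
        int i;

        /* [0 .. pos): shifted by the new headers only */
        for (i = 0; i < pos; ++i)
                off[i] += no * hdr_size;
        /* [pos + no .. nentries): shifted by headers and bodies */
        for (i = pos + no; i < nentries; ++i)
                off[i] += no * hdr_size + data_size;
}

int main(void)
{
        /* three old entries; a new one was inserted at position 1 */
        unsigned off[4] = { 10, 0, 20, 30 };

        /* one 4-byte header and a 6-byte body were added */
        adjust_offsets(off, 4, 1, 1, 4, 6);
        printf("%u %u %u\n", off[0], off[2], off[3]);   /* 14 30 40 */
        return 0;
}
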
45235 +
45236 +/* insert new @entry into item */
45237 +static int expand(const coord_t * coord /* coord of item */ ,
45238 + cde_entry * entry /* entry to insert */ ,
45239 + int len /* length of @entry data */ ,
45240 + int *pos /* position to insert */ ,
45241 + reiser4_dir_entry_desc * dir_entry /* parameters for new
45242 + * entry */ )
45243 +{
45244 + cmp_t cmp_res;
45245 + int datasize;
45246 +
45247 + *pos = find(coord, &dir_entry->key, &cmp_res);
45248 + if (*pos < 0)
45249 + *pos = units(coord);
45250 +
45251 + datasize = sizeof(directory_entry_format);
45252 + if (is_longname(entry->name->name, entry->name->len))
45253 + datasize += entry->name->len + 1;
45254 +
45255 + expand_item(coord, *pos, 1, item_length_by_coord(coord) - len,
45256 + datasize);
45257 + return 0;
45258 +}
45259 +
45260 +/* paste body of @entry into item */
45261 +static int paste_entry(const coord_t * coord /* coord of item */ ,
45262 + cde_entry * entry /* new entry */ ,
45263 + int pos /* position to insert */ ,
45264 + reiser4_dir_entry_desc * dir_entry /* parameters for
45265 + * new entry */ )
45266 +{
45267 + cde_unit_header *header;
45268 + directory_entry_format *dent;
45269 + const char *name;
45270 + int len;
45271 +
45272 + header = header_at(coord, pos);
45273 + dent = entry_at(coord, pos);
45274 +
45275 + build_de_id_by_key(&dir_entry->key, &header->hash);
45276 + build_inode_key_id(entry->obj, &dent->id);
45277 + /* AUDIT unsafe strcpy() operation! It should be replaced with the
45278 + much less CPU-hungry
45279 + memcpy( ( char * ) dent -> name, entry -> name -> name , entry -> name -> len );
45280 +
45281 + Also, more importantly, there should be a way to figure out the
45282 + amount of space in dent -> name and to check that we are
45283 + not going to overwrite more than we are supposed to */
45284 + name = entry->name->name;
45285 + len = entry->name->len;
45286 + if (is_longname(name, len)) {
45287 + strcpy((char *)dent->name, name);
45288 + put_unaligned(0, &dent->name[len]);
45289 + }
45290 + return 0;
45291 +}
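
The AUDIT note in paste_entry() above asks for a bounded copy in place of strcpy(). One possible shape for such a replacement; @room is an assumed input here, since the current format keeps no explicit record of the space reserved for dent->name, which is exactly the audit's second complaint:

#include <string.h>
#include <stddef.h>

/* sketch only: copy @len name bytes plus a terminating NUL,
 * refusing to overflow the @room bytes reserved at @dst */
static int copy_dent_name(char *dst, size_t room,
                          const char *name, size_t len)
{
        if (len + 1 > room)
                return -1;
        memcpy(dst, name, len);
        dst[len] = '\0';
        return 0;
}
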
45292 +
45293 +/* estimate how much space is necessary in item to insert/paste set of entries
45294 + described in @data. */
45295 +int estimate_cde(const coord_t * coord /* coord of item */ ,
45296 + const reiser4_item_data * data /* parameters for new item */ )
45297 +{
45298 + cde_entry_data *e;
45299 + int result;
45300 + int i;
45301 +
45302 + e = (cde_entry_data *) data->data;
45303 +
45304 + assert("nikita-1288", e != NULL);
45305 + assert("nikita-1289", e->num_of_entries >= 0);
45306 +
45307 + if (coord == NULL)
45308 + /* insert */
45309 + result = sizeof(cde_item_format);
45310 + else
45311 + /* paste */
45312 + result = 0;
45313 +
45314 + result += e->num_of_entries *
45315 + (sizeof(cde_unit_header) + sizeof(directory_entry_format));
45316 + for (i = 0; i < e->num_of_entries; ++i) {
45317 + const char *name;
45318 + int len;
45319 +
45320 + name = e->entry[i].name->name;
45321 + len = e->entry[i].name->len;
45322 + assert("nikita-2054", strlen(name) == len);
45323 + if (is_longname(name, len))
45324 + result += len + 1;
45325 + }
45326 + ((reiser4_item_data *) data)->length = result;
45327 + return result;
45328 +}
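
estimate_cde() computes: a fresh item header when inserting (rather than pasting), plus one unit header and one entry body per entry, plus the name and its NUL for names too long to be encoded in the key. The same arithmetic standalone (all sizes below are placeholders, not the real on-disk sizes):

#include <stdio.h>
#include <string.h>

enum {
        ITEM_HDR = 2,           /* placeholder: sizeof(cde_item_format) */
        UNIT_HDR = 10,          /* placeholder: sizeof(cde_unit_header) */
        ENTRY_BODY = 16,        /* placeholder: sizeof(directory_entry_format) */
        SHORT_NAME_MAX = 15     /* placeholder: the is_longname() cutoff */
};

static int estimate(int fresh_item, int n, const char *names[])
{
        int size = fresh_item ? ITEM_HDR : 0;
        int i;

        size += n * (UNIT_HDR + ENTRY_BODY);
        for (i = 0; i < n; ++i) {
                int len = (int)strlen(names[i]);

                if (len > SHORT_NAME_MAX)       /* long names stored inline */
                        size += len + 1;        /* name + NUL */
        }
        return size;
}

int main(void)
{
        const char *names[] = { "a", "a-rather-long-file-name" };

        printf("%d\n", estimate(1, 2, names));  /* 2 + 2*26 + 24 = 78 */
        return 0;
}
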
45329 +
45330 +/* ->nr_units() method for this item plugin. */
45331 +pos_in_node_t nr_units_cde(const coord_t * coord /* coord of item */ )
45332 +{
45333 + return units(coord);
45334 +}
45335 +
45336 +/* ->unit_key() method for this item plugin. */
45337 +reiser4_key *unit_key_cde(const coord_t * coord /* coord of item */ ,
45338 + reiser4_key * key /* resulting key */ )
45339 +{
45340 + assert("nikita-1452", coord != NULL);
45341 + assert("nikita-1345", idx_of(coord) < units(coord));
45342 + assert("nikita-1346", key != NULL);
45343 +
45344 + item_key_by_coord(coord, key);
45345 + extract_key_from_de_id(extract_dir_id_from_key(key),
45346 + &header_at(coord, idx_of(coord))->hash, key);
45347 + return key;
45348 +}
45349 +
45350 +/* mergeable_cde(): implementation of ->mergeable() item method.
45351 +
45352 + Two directory items are mergeable iff they are from the same
45353 + directory. It's that simple.
45354 +
45355 +*/
45356 +int mergeable_cde(const coord_t * p1 /* coord of first item */ ,
45357 + const coord_t * p2 /* coord of second item */ )
45358 +{
45359 + reiser4_key k1;
45360 + reiser4_key k2;
45361 +
45362 + assert("nikita-1339", p1 != NULL);
45363 + assert("nikita-1340", p2 != NULL);
45364 +
45365 + return
45366 + (item_plugin_by_coord(p1) == item_plugin_by_coord(p2)) &&
45367 + (extract_dir_id_from_key(item_key_by_coord(p1, &k1)) ==
45368 + extract_dir_id_from_key(item_key_by_coord(p2, &k2)));
45369 +
45370 +}
45371 +
45372 +/* ->max_key_inside() method for this item plugin. */
45373 +reiser4_key *max_key_inside_cde(const coord_t * coord /* coord of item */ ,
45374 + reiser4_key * result /* resulting key */ )
45375 +{
45376 + assert("nikita-1342", coord != NULL);
45377 +
45378 + item_key_by_coord(coord, result);
45379 + set_key_ordering(result, get_key_ordering(reiser4_max_key()));
45380 + set_key_fulloid(result, get_key_fulloid(reiser4_max_key()));
45381 + set_key_offset(result, get_key_offset(reiser4_max_key()));
45382 + return result;
45383 +}
45384 +
45385 +/* @data contains data which are to be put into tree */
45386 +int can_contain_key_cde(const coord_t * coord /* coord of item */ ,
45387 + const reiser4_key * key /* key to check */ ,
45388 + const reiser4_item_data * data /* parameters of new
45389 + * item/unit being
45390 + * created */ )
45391 +{
45392 + reiser4_key item_key;
45393 +
45394 + /* FIXME-VS: do not rely on anything but iplug field of @data. Only
45395 + data->iplug is initialized */
45396 + assert("vs-457", data && data->iplug);
45397 +/* assert( "vs-553", data -> user == 0 );*/
45398 + item_key_by_coord(coord, &item_key);
45399 +
45400 + return (item_plugin_by_coord(coord) == data->iplug) &&
45401 + (extract_dir_id_from_key(&item_key) ==
45402 + extract_dir_id_from_key(key));
45403 +}
45404 +
45405 +#if REISER4_DEBUG
45406 +/* reiser4_check_cde(): ->check() method for compound directory items.
45407 +
45408 + Used for debugging; every item should implement here the most
45409 + complete consistency check of the item that its author can
45410 + construct.
45411 +*/
45412 +int reiser4_check_cde(const coord_t * coord /* coord of item to check */,
45413 + const char **error /* where to store error message */)
45414 +{
45415 + int i;
45416 + int result;
45417 + char *item_start;
45418 + char *item_end;
45419 + reiser4_key key;
45420 +
45421 + coord_t c;
45422 +
45423 + assert("nikita-1357", coord != NULL);
45424 + assert("nikita-1358", error != NULL);
45425 +
45426 + if (!ergo(coord->item_pos != 0,
45427 + is_dot_key(item_key_by_coord(coord, &key)))) {
45428 + *error = "CDE doesn't start with dot";
45429 + return -1;
45430 + }
45431 + item_start = item_body_by_coord(coord);
45432 + item_end = item_start + item_length_by_coord(coord);
45433 +
45434 + coord_dup(&c, coord);
45435 + result = 0;
45436 + for (i = 0; i < units(coord); ++i) {
45437 + directory_entry_format *entry;
45438 +
45439 + if ((char *)(header_at(coord, i) + 1) >
45440 + item_end - units(coord) * sizeof *entry) {
45441 + *error = "CDE header is out of bounds";
45442 + result = -1;
45443 + break;
45444 + }
45445 + entry = entry_at(coord, i);
45446 + if ((char *)entry < item_start + sizeof(cde_item_format)) {
45447 + *error = "CDE header is too low";
45448 + result = -1;
45449 + break;
45450 + }
45451 + if ((char *)(entry + 1) > item_end) {
45452 + *error = "CDE header is too high";
45453 + result = -1;
45454 + break;
45455 + }
45456 + }
45457 +
45458 + return result;
45459 +}
45460 +#endif
45461 +
45462 +/* ->init() method for this item plugin. */
45463 +int init_cde(coord_t * coord /* coord of item */ ,
45464 + coord_t * from UNUSED_ARG, reiser4_item_data * data /* structure used for insertion */
45465 + UNUSED_ARG)
45466 +{
45467 + put_unaligned(cpu_to_le16(0), &formatted_at(coord)->num_of_entries);
45468 + return 0;
45469 +}
45470 +
45471 +/* ->lookup() method for this item plugin. */
45472 +lookup_result lookup_cde(const reiser4_key * key /* key to search for */ ,
45473 + lookup_bias bias /* search bias */ ,
45474 + coord_t * coord /* coord of item to lookup in */ )
45475 +{
45476 + cmp_t last_comp;
45477 + int pos;
45478 +
45479 + reiser4_key utmost_key;
45480 +
45481 + assert("nikita-1293", coord != NULL);
45482 + assert("nikita-1294", key != NULL);
45483 +
45484 + CHECKME(coord);
45485 +
45486 + if (keygt(item_key_by_coord(coord, &utmost_key), key)) {
45487 + coord->unit_pos = 0;
45488 + coord->between = BEFORE_UNIT;
45489 + return CBK_COORD_NOTFOUND;
45490 + }
45491 + pos = find(coord, key, &last_comp);
45492 + if (pos >= 0) {
45493 + coord->unit_pos = (int)pos;
45494 + switch (last_comp) {
45495 + case EQUAL_TO:
45496 + coord->between = AT_UNIT;
45497 + return CBK_COORD_FOUND;
45498 + case GREATER_THAN:
45499 + coord->between = BEFORE_UNIT;
45500 + return RETERR(-ENOENT);
45501 + case LESS_THAN:
45502 + default:
45503 + impossible("nikita-1298", "Broken find");
45504 + return RETERR(-EIO);
45505 + }
45506 + } else {
45507 + coord->unit_pos = units(coord) - 1;
45508 + coord->between = AFTER_UNIT;
45509 + return (bias ==
45510 + FIND_MAX_NOT_MORE_THAN) ? CBK_COORD_FOUND :
45511 + CBK_COORD_NOTFOUND;
45512 + }
45513 +}
45514 +
45515 +/* ->paste() method for this item plugin. */
45516 +int paste_cde(coord_t * coord /* coord of item */ ,
45517 + reiser4_item_data * data /* parameters of new unit being
45518 + * inserted */ ,
45519 + carry_plugin_info * info UNUSED_ARG /* todo carry queue */ )
45520 +{
45521 + cde_entry_data *e;
45522 + int result;
45523 + int i;
45524 +
45525 + CHECKME(coord);
45526 + e = (cde_entry_data *) data->data;
45527 +
45528 + result = 0;
45529 + for (i = 0; i < e->num_of_entries; ++i) {
45530 + int pos;
45531 + int phantom_size;
45532 +
45533 + phantom_size = data->length;
45534 + if (units(coord) == 0)
45535 + phantom_size -= sizeof(cde_item_format);
45536 +
45537 + result =
45538 + expand(coord, e->entry + i, phantom_size, &pos, data->arg);
45539 + if (result != 0)
45540 + break;
45541 + result = paste_entry(coord, e->entry + i, pos, data->arg);
45542 + if (result != 0)
45543 + break;
45544 + }
45545 + CHECKME(coord);
45546 + return result;
45547 +}
45548 +
45549 +/* amount of space occupied by all entries starting from @idx, both headers
45550 + and bodies. */
45551 +static unsigned int part_size(const coord_t * coord /* coord of item */ ,
45552 + int idx /* index of unit */ )
45553 +{
45554 + assert("nikita-1299", coord != NULL);
45555 + assert("nikita-1300", idx < (int)units(coord));
45556 +
45557 + return sizeof(cde_item_format) +
45558 + (idx + 1) * sizeof(cde_unit_header) + offset_of(coord,
45559 + idx + 1) -
45560 + offset_of(coord, 0);
45561 +}
45562 +
45563 +/* how many units of @source (but not more than @want) can be merged with
45564 + the item in the @target node. If pend == append, we try to append the
45565 + last item of @target with the first units of @source. If pend == prepend,
45566 + we try to prepend the first item in @target with the last units of
45567 + @source. The @target node has @free_space bytes of free space. The total
45568 + size of those units is returned via @size */
45569 +int can_shift_cde(unsigned free_space /* free space in item */ ,
45570 + coord_t * coord /* coord of source item */ ,
45571 + znode * target /* target node */ ,
45572 + shift_direction pend /* shift direction */ ,
45573 + unsigned *size /* resulting number of shifted bytes */ ,
45574 + unsigned want /* maximal number of bytes to shift */ )
45575 +{
45576 + int shift;
45577 +
45578 + CHECKME(coord);
45579 + if (want == 0) {
45580 + *size = 0;
45581 + return 0;
45582 + }
45583 +
45584 + /* pend == SHIFT_LEFT <==> shifting to the left */
45585 + if (pend == SHIFT_LEFT) {
45586 + for (shift = min((int)want - 1, units(coord)); shift >= 0;
45587 + --shift) {
45588 + *size = part_size(coord, shift);
45589 + if (target != NULL)
45590 + *size -= sizeof(cde_item_format);
45591 + if (*size <= free_space)
45592 + break;
45593 + }
45594 + shift = shift + 1;
45595 + } else {
45596 + int total_size;
45597 +
45598 + assert("nikita-1301", pend == SHIFT_RIGHT);
45599 +
45600 + total_size = item_length_by_coord(coord);
45601 + for (shift = units(coord) - want - 1; shift < units(coord) - 1;
45602 + ++shift) {
45603 + *size = total_size - part_size(coord, shift);
45604 + if (target == NULL)
45605 + *size += sizeof(cde_item_format);
45606 + if (*size <= free_space)
45607 + break;
45608 + }
45609 + shift = units(coord) - shift - 1;
45610 + }
45611 + if (shift == 0)
45612 + *size = 0;
45613 + CHECKME(coord);
45614 + return shift;
45615 +}
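
Stripped of the item-header corrections, the SHIFT_LEFT branch above answers one question: how long a prefix of units fits, fully packed, into @free_space? The core loop in isolation (compilable fragment; the unit sizes would come from part_size() in the real code):

/* return how many leading units fit within @free_space;
 * cf. the SHIFT_LEFT loop of can_shift_cde() above */
static int units_that_fit(const unsigned *unit_size, int n,
                          unsigned free_space)
{
        unsigned total = 0;
        int i;

        for (i = 0; i < n; ++i) {
                total += unit_size[i];
                if (total > free_space)
                        break;
        }
        return i;
}
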
45616 +
45617 +/* ->copy_units() method for this item plugin. */
45618 +void copy_units_cde(coord_t * target /* coord of target item */ ,
45619 + coord_t * source /* coord of source item */ ,
45620 + unsigned from /* starting unit */ ,
45621 + unsigned count /* how many units to copy */ ,
45622 + shift_direction where_is_free_space /* shift direction */ ,
45623 + unsigned free_space /* free space in item */ )
45624 +{
45625 + char *header_from;
45626 + char *header_to;
45627 +
45628 + char *entry_from;
45629 + char *entry_to;
45630 +
45631 + int pos_in_target;
45632 + int data_size;
45633 + int data_delta;
45634 + int i;
45635 +
45636 + assert("nikita-1303", target != NULL);
45637 + assert("nikita-1304", source != NULL);
45638 + assert("nikita-1305", (int)from < units(source));
45639 + assert("nikita-1307", (int)(from + count) <= units(source));
45640 +
45641 + if (where_is_free_space == SHIFT_LEFT) {
45642 + assert("nikita-1453", from == 0);
45643 + pos_in_target = units(target);
45644 + } else {
45645 + assert("nikita-1309", (int)(from + count) == units(source));
45646 + pos_in_target = 0;
45647 + memmove(item_body_by_coord(target),
45648 + (char *)item_body_by_coord(target) + free_space,
45649 + item_length_by_coord(target) - free_space);
45650 + }
45651 +
45652 + CHECKME(target);
45653 + CHECKME(source);
45654 +
45655 + /* expand @target */
45656 + data_size =
45657 + offset_of(source, (int)(from + count)) - offset_of(source,
45658 + (int)from);
45659 +
45660 + if (units(target) == 0)
45661 + free_space -= sizeof(cde_item_format);
45662 +
45663 + expand_item(target, pos_in_target, (int)count,
45664 + (int)(item_length_by_coord(target) - free_space),
45665 + (unsigned)data_size);
45666 +
45667 + /* copy first @count units of @source into @target */
45668 + data_delta =
45669 + offset_of(target, pos_in_target) - offset_of(source, (int)from);
45670 +
45671 + /* copy entries */
45672 + entry_from = (char *)entry_at(source, (int)from);
45673 + entry_to = (char *)entry_at(source, (int)(from + count));
45674 + memmove(entry_at(target, pos_in_target), entry_from,
45675 + (unsigned)(entry_to - entry_from));
45676 +
45677 + /* copy headers */
45678 + header_from = (char *)header_at(source, (int)from);
45679 + header_to = (char *)header_at(source, (int)(from + count));
45680 + memmove(header_at(target, pos_in_target), header_from,
45681 + (unsigned)(header_to - header_from));
45682 +
45683 + /* update offsets */
45684 + for (i = pos_in_target; i < (int)(pos_in_target + count); ++i)
45685 + adj_offset(target, i, data_delta);
45686 + CHECKME(target);
45687 + CHECKME(source);
45688 +}
45689 +
45690 +/* ->cut_units() method for this item plugin. */
45691 +int cut_units_cde(coord_t * coord /* coord of item */ ,
45692 + pos_in_node_t from /* start unit pos */ ,
45693 + pos_in_node_t to /* stop unit pos */ ,
45694 + struct carry_cut_data *cdata UNUSED_ARG,
45695 + reiser4_key * smallest_removed, reiser4_key * new_first)
45696 +{
45697 + char *header_from;
45698 + char *header_to;
45699 +
45700 + char *entry_from;
45701 + char *entry_to;
45702 +
45703 + int size;
45704 + int entry_delta;
45705 + int header_delta;
45706 + int i;
45707 +
45708 + unsigned count;
45709 +
45710 + CHECKME(coord);
45711 +
45712 + count = to - from + 1;
45713 +
45714 + assert("nikita-1454", coord != NULL);
45715 + assert("nikita-1455", (int)(from + count) <= units(coord));
45716 +
45717 + if (smallest_removed)
45718 + unit_key_by_coord(coord, smallest_removed);
45719 +
45720 + if (new_first) {
45721 + coord_t next;
45722 +
45723 + /* not everything is cut from item head */
45724 + assert("vs-1527", from == 0);
45725 + assert("vs-1528", to < units(coord) - 1);
45726 +
45727 + coord_dup(&next, coord);
45728 + next.unit_pos++;
45729 + unit_key_by_coord(&next, new_first);
45730 + }
45731 +
45732 + size = item_length_by_coord(coord);
45733 + if (count == (unsigned)units(coord)) {
45734 + return size;
45735 + }
45736 +
45737 + header_from = (char *)header_at(coord, (int)from);
45738 + header_to = (char *)header_at(coord, (int)(from + count));
45739 +
45740 + entry_from = (char *)entry_at(coord, (int)from);
45741 + entry_to = (char *)entry_at(coord, (int)(from + count));
45742 +
45743 + /* move headers */
45744 + memmove(header_from, header_to,
45745 + (unsigned)(address(coord, size) - header_to));
45746 +
45747 + header_delta = header_to - header_from;
45748 +
45749 + entry_from -= header_delta;
45750 + entry_to -= header_delta;
45751 + size -= header_delta;
45752 +
45753 + /* copy entries */
45754 + memmove(entry_from, entry_to,
45755 + (unsigned)(address(coord, size) - entry_to));
45756 +
45757 + entry_delta = entry_to - entry_from;
45758 + size -= entry_delta;
45759 +
45760 + /* update offsets */
45761 +
45762 + for (i = 0; i < (int)from; ++i)
45763 + adj_offset(coord, i, -header_delta);
45764 +
45765 + for (i = from; i < units(coord) - (int)count; ++i)
45766 + adj_offset(coord, i, -header_delta - entry_delta);
45767 +
45768 + put_unaligned(cpu_to_le16((__u16) units(coord) - count),
45769 + &formatted_at(coord)->num_of_entries);
45770 +
45771 + if (from == 0) {
45772 + /* entries were removed from the head - move the remaining ones right */
45773 + memmove((char *)item_body_by_coord(coord) +
45774 + header_delta + entry_delta, item_body_by_coord(coord),
45775 + (unsigned)size);
45776 + if (REISER4_DEBUG)
45777 + memset(item_body_by_coord(coord), 0,
45778 + (unsigned)header_delta + entry_delta);
45779 + } else {
45780 + /* freed space is already at the end of item */
45781 + if (REISER4_DEBUG)
45782 + memset((char *)item_body_by_coord(coord) + size, 0,
45783 + (unsigned)header_delta + entry_delta);
45784 + }
45785 +
45786 + return header_delta + entry_delta;
45787 +}
45788 +
45789 +int kill_units_cde(coord_t * coord /* coord of item */ ,
45790 + pos_in_node_t from /* start unit pos */ ,
45791 + pos_in_node_t to /* stop unit pos */ ,
45792 + struct carry_kill_data *kdata UNUSED_ARG,
45793 + reiser4_key * smallest_removed, reiser4_key * new_first)
45794 +{
45795 + return cut_units_cde(coord, from, to, NULL, smallest_removed, new_first);
45796 +}
45797 +
45798 +/* ->s.dir.extract_key() method for this item plugin. */
45799 +int extract_key_cde(const coord_t * coord /* coord of item */ ,
45800 + reiser4_key * key /* resulting key */ )
45801 +{
45802 + directory_entry_format *dent;
45803 +
45804 + assert("nikita-1155", coord != NULL);
45805 + assert("nikita-1156", key != NULL);
45806 +
45807 + dent = entry_at(coord, idx_of(coord));
45808 + return extract_key_from_id(&dent->id, key);
45809 +}
45810 +
45811 +int
45812 +update_key_cde(const coord_t * coord, const reiser4_key * key,
45813 + lock_handle * lh UNUSED_ARG)
45814 +{
45815 + directory_entry_format *dent;
45816 + obj_key_id obj_id;
45817 + int result;
45818 +
45819 + assert("nikita-2344", coord != NULL);
45820 + assert("nikita-2345", key != NULL);
45821 +
45822 + dent = entry_at(coord, idx_of(coord));
45823 + result = build_obj_key_id(key, &obj_id);
45824 + if (result == 0) {
45825 + dent->id = obj_id;
45826 + znode_make_dirty(coord->node);
45827 + }
45828 + return result;
45829 +}
45830 +
45831 +/* ->s.dir.extract_name() method for this item plugin. */
45832 +char *extract_name_cde(const coord_t * coord /* coord of item */ , char *buf)
45833 +{
45834 + directory_entry_format *dent;
45835 +
45836 + assert("nikita-1157", coord != NULL);
45837 +
45838 + dent = entry_at(coord, idx_of(coord));
45839 + return extract_dent_name(coord, dent, buf);
45840 +}
45841 +
45842 +static int cde_bytes(int pasting, const reiser4_item_data * data)
45843 +{
45844 + int result;
45845 +
45846 + result = data->length;
45847 + if (!pasting)
45848 + result -= sizeof(cde_item_format);
45849 + return result;
45850 +}
45851 +
45852 +/* ->s.dir.add_entry() method for this item plugin */
45853 +int add_entry_cde(struct inode *dir /* directory object */ ,
45854 + coord_t * coord /* coord of item */ ,
45855 + lock_handle * lh /* lock handle for insertion */ ,
45856 + const struct dentry *name /* name to insert */ ,
45857 + reiser4_dir_entry_desc * dir_entry /* parameters of new
45858 + * directory entry */ )
45859 +{
45860 + reiser4_item_data data;
45861 + cde_entry entry;
45862 + cde_entry_data edata;
45863 + int result;
45864 +
45865 + assert("nikita-1656", coord->node == lh->node);
45866 + assert("nikita-1657", znode_is_write_locked(coord->node));
45867 +
45868 + edata.num_of_entries = 1;
45869 + edata.entry = &entry;
45870 +
45871 + entry.dir = dir;
45872 + entry.obj = dir_entry->obj;
45873 + entry.name = &name->d_name;
45874 +
45875 + data.data = (char *)&edata;
45876 + data.user = 0; /* &edata is not user space */
45877 + data.iplug = item_plugin_by_id(COMPOUND_DIR_ID);
45878 + data.arg = dir_entry;
45879 + assert("nikita-1302", data.iplug != NULL);
45880 +
45881 + result = is_dot_key(&dir_entry->key);
45882 + data.length = estimate_cde(result ? coord : NULL, &data);
45883 +
45884 + /* NOTE-NIKITA quota plugin? */
45885 + if (DQUOT_ALLOC_SPACE_NODIRTY(dir, cde_bytes(result, &data)))
45886 + return RETERR(-EDQUOT);
45887 +
45888 + if (result)
45889 + result = insert_by_coord(coord, &data, &dir_entry->key, lh, 0);
45890 + else
45891 + result = reiser4_resize_item(coord, &data, &dir_entry->key,
45892 + lh, 0);
45893 + return result;
45894 +}
45895 +
45896 +/* ->s.dir.rem_entry() */
45897 +int rem_entry_cde(struct inode *dir /* directory of item */ ,
45898 + const struct qstr *name, coord_t * coord /* coord of item */ ,
45899 + lock_handle * lh UNUSED_ARG /* lock handle for
45900 + * removal */ ,
45901 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
45902 + * directory entry
45903 + * being removed */ )
45904 +{
45905 + coord_t shadow;
45906 + int result;
45907 + int length;
45908 + ON_DEBUG(char buf[DE_NAME_BUF_LEN]);
45909 +
45910 + assert("nikita-2870", strlen(name->name) == name->len);
45911 + assert("nikita-2869",
45912 + !strcmp(name->name, extract_name_cde(coord, buf)));
45913 +
45914 + length = sizeof(directory_entry_format) + sizeof(cde_unit_header);
45915 + if (is_longname(name->name, name->len))
45916 + length += name->len + 1;
45917 +
45918 + if (inode_get_bytes(dir) < length) {
45919 + warning("nikita-2628", "Dir is broke: %llu: %llu",
45920 + (unsigned long long)get_inode_oid(dir),
45921 + inode_get_bytes(dir));
45922 +
45923 + return RETERR(-EIO);
45924 + }
45925 +
45926 + /* cut_node() is supposed to take pointers to _different_
45927 + coords, because it will modify them without respect to
45928 + possible aliasing. To work around this, create a temporary copy
45929 + of @coord.
45930 + */
45931 + coord_dup(&shadow, coord);
45932 + result =
45933 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
45934 + if (result == 0) {
45935 + /* NOTE-NIKITA quota plugin? */
45936 + DQUOT_FREE_SPACE_NODIRTY(dir, length);
45937 + }
45938 + return result;
45939 +}
45940 +
45941 +/* ->s.dir.max_name_len() method for this item plugin */
45942 +int max_name_len_cde(const struct inode *dir /* directory */ )
45943 +{
45944 + return
45945 + reiser4_tree_by_inode(dir)->nplug->max_item_size() -
45946 + sizeof(directory_entry_format) - sizeof(cde_item_format) -
45947 + sizeof(cde_unit_header) - 2;
45948 +}
45949 +
45950 +/* Make Linus happy.
45951 + Local variables:
45952 + c-indentation-style: "K&R"
45953 + mode-name: "LC"
45954 + c-basic-offset: 8
45955 + tab-width: 8
45956 + fill-column: 120
45957 + End:
45958 +*/
45959 diff --git a/fs/reiser4/plugin/item/cde.h b/fs/reiser4/plugin/item/cde.h
45960 new file mode 100644
45961 index 0000000..73a30d5
45962 --- /dev/null
45963 +++ b/fs/reiser4/plugin/item/cde.h
45964 @@ -0,0 +1,87 @@
45965 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
45966 +
45967 +/* Compound directory item. See cde.c for description. */
45968 +
45969 +#if !defined( __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ )
45970 +#define __FS_REISER4_PLUGIN_COMPRESSED_DE_H__
45971 +
45972 +#include "../../forward.h"
45973 +#include "../../kassign.h"
45974 +#include "../../dformat.h"
45975 +
45976 +#include <linux/fs.h> /* for struct inode */
45977 +#include <linux/dcache.h> /* for struct dentry, etc */
45978 +
45979 +typedef struct cde_unit_header {
45980 + de_id hash;
45981 + d16 offset;
45982 +} cde_unit_header;
45983 +
45984 +typedef struct cde_item_format {
45985 + d16 num_of_entries;
45986 + cde_unit_header entry[0];
45987 +} cde_item_format;
45988 +
45989 +typedef struct cde_entry {
45990 + const struct inode *dir;
45991 + const struct inode *obj;
45992 + const struct qstr *name;
45993 +} cde_entry;
45994 +
45995 +typedef struct cde_entry_data {
45996 + int num_of_entries;
45997 + cde_entry *entry;
45998 +} cde_entry_data;
45999 +
46000 +/* plugin->item.b.* */
46001 +reiser4_key *max_key_inside_cde(const coord_t * coord, reiser4_key * result);
46002 +int can_contain_key_cde(const coord_t * coord, const reiser4_key * key,
46003 + const reiser4_item_data *);
46004 +int mergeable_cde(const coord_t * p1, const coord_t * p2);
46005 +pos_in_node_t nr_units_cde(const coord_t * coord);
46006 +reiser4_key *unit_key_cde(const coord_t * coord, reiser4_key * key);
46007 +int estimate_cde(const coord_t * coord, const reiser4_item_data * data);
46008 +void print_cde(const char *prefix, coord_t * coord);
46009 +int init_cde(coord_t * coord, coord_t * from, reiser4_item_data * data);
46010 +lookup_result lookup_cde(const reiser4_key * key, lookup_bias bias,
46011 + coord_t * coord);
46012 +int paste_cde(coord_t * coord, reiser4_item_data * data,
46013 + carry_plugin_info * info UNUSED_ARG);
46014 +int can_shift_cde(unsigned free_space, coord_t * coord, znode * target,
46015 + shift_direction pend, unsigned *size, unsigned want);
46016 +void copy_units_cde(coord_t * target, coord_t * source, unsigned from,
46017 + unsigned count, shift_direction where_is_free_space,
46018 + unsigned free_space);
46019 +int cut_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46020 + struct carry_cut_data *, reiser4_key * smallest_removed,
46021 + reiser4_key * new_first);
46022 +int kill_units_cde(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46023 + struct carry_kill_data *, reiser4_key * smallest_removed,
46024 + reiser4_key * new_first);
46026 +int reiser4_check_cde(const coord_t * coord, const char **error);
46027 +
46028 +/* plugin->u.item.s.dir.* */
46029 +int extract_key_cde(const coord_t * coord, reiser4_key * key);
46030 +int update_key_cde(const coord_t * coord, const reiser4_key * key,
46031 + lock_handle * lh);
46032 +char *extract_name_cde(const coord_t * coord, char *buf);
46033 +int add_entry_cde(struct inode *dir, coord_t * coord,
46034 + lock_handle * lh, const struct dentry *name,
46035 + reiser4_dir_entry_desc * entry);
46036 +int rem_entry_cde(struct inode *dir, const struct qstr *name, coord_t * coord,
46037 + lock_handle * lh, reiser4_dir_entry_desc * entry);
46038 +int max_name_len_cde(const struct inode *dir);
46039 +
46040 +/* __FS_REISER4_PLUGIN_COMPRESSED_DE_H__ */
46041 +#endif
46042 +
46043 +/* Make Linus happy.
46044 + Local variables:
46045 + c-indentation-style: "K&R"
46046 + mode-name: "LC"
46047 + c-basic-offset: 8
46048 + tab-width: 8
46049 + fill-column: 120
46050 + End:
46051 +*/
46052 diff --git a/fs/reiser4/plugin/item/ctail.c b/fs/reiser4/plugin/item/ctail.c
46053 new file mode 100644
46054 index 0000000..9cb8eca
46055 --- /dev/null
46056 +++ b/fs/reiser4/plugin/item/ctail.c
46057 @@ -0,0 +1,1638 @@
46058 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
46059 +
46060 +/* ctails (aka "clustered tails") are items for cryptcompress objects */
46061 +
46062 +/* DESCRIPTION:
46063 +
46064 +Each cryptcompress object is stored on disk as a set of clusters sliced
46065 +into ctails.
46066 +
46067 +Internal on-disk structure:
46068 +
46069 + HEADER (1 byte) stores the disk cluster shift
46070 + BODY
46071 +*/
46072 +
46073 +#include "../../forward.h"
46074 +#include "../../debug.h"
46075 +#include "../../dformat.h"
46076 +#include "../../kassign.h"
46077 +#include "../../key.h"
46078 +#include "../../coord.h"
46079 +#include "item.h"
46080 +#include "../node/node.h"
46081 +#include "../plugin.h"
46082 +#include "../object.h"
46083 +#include "../../znode.h"
46084 +#include "../../carry.h"
46085 +#include "../../tree.h"
46086 +#include "../../inode.h"
46087 +#include "../../super.h"
46088 +#include "../../context.h"
46089 +#include "../../page_cache.h"
46090 +#include "../cluster.h"
46091 +#include "../../flush.h"
46092 +#include "../../tree_walk.h"
46093 +
46094 +#include <linux/pagevec.h>
46095 +#include <linux/swap.h>
46096 +#include <linux/fs.h>
46097 +
46098 +/* return body of ctail item at @coord */
46099 +static ctail_item_format *ctail_formatted_at(const coord_t * coord)
46100 +{
46101 + assert("edward-60", coord != NULL);
46102 + return item_body_by_coord(coord);
46103 +}
46104 +
46105 +static int cluster_shift_by_coord(const coord_t * coord)
46106 +{
46107 + return get_unaligned(&ctail_formatted_at(coord)->cluster_shift);
46108 +}
46109 +
46110 +static inline void dclust_set_extension_shift(hint_t * hint)
46111 +{
46112 + assert("edward-1270",
46113 + item_id_by_coord(&hint->ext_coord.coord) == CTAIL_ID);
46114 + hint->ext_coord.extension.ctail.shift =
46115 + cluster_shift_by_coord(&hint->ext_coord.coord);
46116 +}
46117 +
46118 +static loff_t off_by_coord(const coord_t * coord)
46119 +{
46120 + reiser4_key key;
46121 + return get_key_offset(item_key_by_coord(coord, &key));
46122 +}
46123 +
46124 +int coord_is_unprepped_ctail(const coord_t * coord)
46125 +{
46126 + assert("edward-1233", coord != NULL);
46127 + assert("edward-1234", item_id_by_coord(coord) == CTAIL_ID);
46128 + assert("edward-1235",
46129 + ergo((int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT,
46130 + nr_units_ctail(coord) == (pos_in_node_t) UCTAIL_NR_UNITS));
46131 +
46132 + return (int)cluster_shift_by_coord(coord) == (int)UCTAIL_SHIFT;
46133 +}
46134 +
46135 +static cloff_t clust_by_coord(const coord_t * coord, struct inode *inode)
46136 +{
46137 + int shift;
46138 +
46139 + if (inode != NULL) {
46140 + shift = inode_cluster_shift(inode);
46141 + assert("edward-1236",
46142 + ergo(!coord_is_unprepped_ctail(coord),
46143 + shift == cluster_shift_by_coord(coord)));
46144 + } else {
46145 + assert("edward-1237", !coord_is_unprepped_ctail(coord));
46146 + shift = cluster_shift_by_coord(coord);
46147 + }
46148 + return off_by_coord(coord) >> shift;
46149 +}
46150 +
46151 +static int disk_cluster_size(const coord_t * coord)
46152 +{
46153 + assert("edward-1156",
46154 + item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
46155 + /* calculation of disk cluster size
46156 + is meaningless if ctail is unprepped */
46157 + assert("edward-1238", !coord_is_unprepped_ctail(coord));
46158 +
46159 + return 1 << cluster_shift_by_coord(coord);
46160 +}
46161 +
46162 +/* true if the key is of first disk cluster item */
46163 +static int is_disk_cluster_key(const reiser4_key * key, const coord_t * coord)
46164 +{
46165 + assert("edward-1239", item_id_by_coord(coord) == CTAIL_ID);
46166 +
46167 + return coord_is_unprepped_ctail(coord) ||
46168 + ((get_key_offset(key) &
46169 + ((loff_t) disk_cluster_size(coord) - 1)) == 0);
46170 +}
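+
+/* Worked example (illustrative only; the shift value is hypothetical):
+   with cluster_shift == 16 the disk cluster size is 1 << 16 == 64KiB,
+   so only key offsets that are multiples of 64KiB can start a disk
+   cluster. */
+#if 0
+static void example_disk_cluster_key_check(void)
+{
+	const loff_t dc_size = 1 << 16;	/* disk_cluster_size() for shift 16 */
+
+	BUG_ON((0x00000 & (dc_size - 1)) != 0);	/* starts cluster 0 */
+	BUG_ON((0x10000 & (dc_size - 1)) != 0);	/* starts cluster 1 */
+	BUG_ON((0x00800 & (dc_size - 1)) == 0);	/* mid-cluster offset */
+}
+#endif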
46171 +
46172 +static char *first_unit(coord_t * coord)
46173 +{
46174 + /* FIXME: warning: pointer of type `void *' used in arithmetic */
46175 + return (char *)item_body_by_coord(coord) + sizeof(ctail_item_format);
46176 +}
46177 +
46178 +/* plugin->u.item.b.max_key_inside :
46179 + tail_max_key_inside */
46180 +
46181 +/* plugin->u.item.b.can_contain_key */
46182 +int
46183 +can_contain_key_ctail(const coord_t * coord, const reiser4_key * key,
46184 + const reiser4_item_data * data)
46185 +{
46186 + reiser4_key item_key;
46187 +
46188 + if (item_plugin_by_coord(coord) != data->iplug)
46189 + return 0;
46190 +
46191 + item_key_by_coord(coord, &item_key);
46192 + if (get_key_locality(key) != get_key_locality(&item_key) ||
46193 + get_key_objectid(key) != get_key_objectid(&item_key))
46194 + return 0;
46195 + if (get_key_offset(&item_key) + nr_units_ctail(coord) !=
46196 + get_key_offset(key))
46197 + return 0;
46198 + if (is_disk_cluster_key(key, coord))
46199 + return 0;
46200 + return 1;
46201 +}
46202 +
46203 +/* plugin->u.item.b.mergeable
46204 + c-tails of different clusters are not mergeable */
46205 +int mergeable_ctail(const coord_t * p1, const coord_t * p2)
46206 +{
46207 + reiser4_key key1, key2;
46208 +
46209 + assert("edward-62", item_id_by_coord(p1) == CTAIL_ID);
46210 + assert("edward-61", plugin_of_group(item_plugin_by_coord(p1),
46211 + UNIX_FILE_METADATA_ITEM_TYPE));
46212 +
46213 + if (item_id_by_coord(p2) != CTAIL_ID) {
46214 + /* second item is of another type */
46215 + return 0;
46216 + }
46217 +
46218 + item_key_by_coord(p1, &key1);
46219 + item_key_by_coord(p2, &key2);
46220 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
46221 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
46222 + get_key_type(&key1) != get_key_type(&key2)) {
46223 + /* items of different objects */
46224 + return 0;
46225 + }
46226 + if (get_key_offset(&key1) + nr_units_ctail(p1) != get_key_offset(&key2))
46227 + /* not adjacent items */
46228 + return 0;
46229 + if (is_disk_cluster_key(&key2, p2))
46230 + return 0;
46231 + return 1;
46232 +}
46233 +
46234 +/* plugin->u.item.b.nr_units */
46235 +pos_in_node_t nr_units_ctail(const coord_t * coord)
46236 +{
46237 + return (item_length_by_coord(coord) -
46238 + sizeof(ctail_formatted_at(coord)->cluster_shift));
46239 +}
46240 +
46241 +/* plugin->u.item.b.estimate:
46242 + estimate how much space is needed to insert/paste @data->length bytes
46243 + into ctail at @coord */
46244 +int estimate_ctail(const coord_t * coord /* coord of item */ ,
46245 + const reiser4_item_data *
46246 + data /* parameters for new item */ )
46247 +{
46248 + if (coord == NULL)
46249 + /* insert */
46250 + return (sizeof(ctail_item_format) + data->length);
46251 + else
46252 + /* paste */
46253 + return data->length;
46254 +}
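+
+/* A sketch of the arithmetic above (the 100-byte length is hypothetical):
+   inserting a new 100-byte ctail also pays for the one-byte item header,
+   while pasting into an existing item does not. */
+#if 0
+	space = estimate_ctail(NULL, &data);	/* data.length + 1 == 101 */
+	space = estimate_ctail(coord, &data);	/* data.length == 100 */
+#endif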
46255 +
46256 +/* ->init() method for this item plugin. */
46257 +int init_ctail(coord_t * to /* coord of item */ ,
46258 + coord_t * from /* old_item */ ,
46259 + reiser4_item_data * data /* structure used for insertion */ )
46260 +{
46261 + int cluster_shift; /* cpu value to convert */
46262 +
46263 + if (data) {
46264 + assert("edward-463", data->length > sizeof(ctail_item_format));
46265 + cluster_shift = *((int *)(data->arg));
46266 + data->length -= sizeof(ctail_item_format);
46267 + } else {
46268 + assert("edward-464", from != NULL);
46269 + assert("edward-855", ctail_ok(from));
46270 + cluster_shift = (int)(cluster_shift_by_coord(from));
46271 + }
46272 + put_unaligned((d8)cluster_shift, &ctail_formatted_at(to)->cluster_shift);
46273 + assert("edward-856", ctail_ok(to));
46274 + return 0;
46275 +}
46276 +
46277 +/* plugin->u.item.b.lookup:
46278 + NULL: We are looking for item keys only */
46279 +
46280 +#if REISER4_DEBUG
46281 +int ctail_ok(const coord_t * coord)
46282 +{
46283 + return coord_is_unprepped_ctail(coord) ||
46284 + cluster_shift_ok(cluster_shift_by_coord(coord));
46285 +}
46286 +
46287 +/* plugin->u.item.b.check */
46288 +int check_ctail(const coord_t * coord, const char **error)
46289 +{
46290 + if (!ctail_ok(coord)) {
46291 + if (error)
46292 + *error = "bad cluster shift in ctail";
46293 + return 1;
46294 + }
46295 + return 0;
46296 +}
46297 +#endif
46298 +
46299 +/* plugin->u.item.b.paste */
46300 +int
46301 +paste_ctail(coord_t * coord, reiser4_item_data * data,
46302 + carry_plugin_info * info UNUSED_ARG)
46303 +{
46304 + unsigned old_nr_units;
46305 +
46306 + assert("edward-268", data->data != NULL);
46307 + /* copy only from kernel space */
46308 + assert("edward-66", data->user == 0);
46309 +
46310 + old_nr_units =
46311 + item_length_by_coord(coord) - sizeof(ctail_item_format) -
46312 + data->length;
46313 +
46314 + /* ctail items never get pasted in the middle */
46315 +
46316 + if (coord->unit_pos == 0 && coord->between == AT_UNIT) {
46317 +
46318 + /* paste at the beginning when creating a new item */
46319 + assert("edward-450",
46320 + item_length_by_coord(coord) ==
46321 + data->length + sizeof(ctail_item_format));
46322 + assert("edward-451", old_nr_units == 0);
46323 + } else if (coord->unit_pos == old_nr_units - 1
46324 + && coord->between == AFTER_UNIT) {
46325 +
46326 + /* paste at the end */
46327 + coord->unit_pos++;
46328 + } else
46329 + impossible("edward-453", "bad paste position");
46330 +
46331 + memcpy(first_unit(coord) + coord->unit_pos, data->data, data->length);
46332 +
46333 + assert("edward-857", ctail_ok(coord));
46334 +
46335 + return 0;
46336 +}
46337 +
46338 +/* plugin->u.item.b.fast_paste */
46339 +
46340 +/* plugin->u.item.b.can_shift
46341 + number of units is returned via return value, number of bytes via @size. For
46342 + ctail items they coincide */
46343 +int
46344 +can_shift_ctail(unsigned free_space, coord_t * source,
46345 + znode * target, shift_direction direction UNUSED_ARG,
46346 + unsigned *size /* number of bytes */ , unsigned want)
46347 +{
46348 + /* make sure that we do not want to shift more than we have */
46349 + assert("edward-68", want > 0 && want <= nr_units_ctail(source));
46350 +
46351 + *size = min(want, free_space);
46352 +
46353 + if (!target) {
46354 + /* new item will be created */
46355 + if (*size <= sizeof(ctail_item_format)) {
46356 + *size = 0;
46357 + return 0;
46358 + }
46359 + return *size - sizeof(ctail_item_format);
46360 + }
46361 + return *size;
46362 +}
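+
+/* A sketch with hypothetical numbers: shifting into a new item pays the
+   one-byte ctail header out of @free_space, so with 1 byte free no unit
+   fits, while with 5 bytes free 4 units can be shifted. */
+#if 0
+	nr = can_shift_ctail(1, src, NULL, SHIFT_LEFT, &size, 10);	/* 0 */
+	nr = can_shift_ctail(5, src, NULL, SHIFT_LEFT, &size, 10);	/* 4 */
+#endif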
46363 +
46364 +/* plugin->u.item.b.copy_units
46365 + cooperates with ->can_shift() */
46366 +void
46367 +copy_units_ctail(coord_t * target, coord_t * source,
46368 + unsigned from, unsigned count /* units */ ,
46369 + shift_direction where_is_free_space,
46370 + unsigned free_space /* bytes */ )
46371 +{
46372 + /* make sure that item @target is expanded already */
46373 + assert("edward-69", (unsigned)item_length_by_coord(target) >= count);
46374 + assert("edward-70", free_space == count || free_space == count + 1);
46375 +
46376 + assert("edward-858", ctail_ok(source));
46377 +
46378 + if (where_is_free_space == SHIFT_LEFT) {
46379 + /* append item @target with @count first bytes of @source:
46380 + this restriction came from ordinary tails */
46381 + assert("edward-71", from == 0);
46382 + assert("edward-860", ctail_ok(target));
46383 +
46384 + memcpy(first_unit(target) + nr_units_ctail(target) - count,
46385 + first_unit(source), count);
46386 + } else {
46387 + /* target item is moved to right already */
46388 + reiser4_key key;
46389 +
46390 + assert("edward-72", nr_units_ctail(source) == from + count);
46391 +
46392 + if (free_space == count) {
46393 + init_ctail(target, source, NULL);
46394 + } else {
46395 + /* new item has been created */
46396 + assert("edward-862", ctail_ok(target));
46397 + }
46398 + memcpy(first_unit(target), first_unit(source) + from, count);
46399 +
46400 + assert("edward-863", ctail_ok(target));
46401 +
46402 + /* new units are inserted before first unit in an item,
46403 + therefore, we have to update item key */
46404 + item_key_by_coord(source, &key);
46405 + set_key_offset(&key, get_key_offset(&key) + from);
46406 +
46407 + node_plugin_by_node(target->node)->update_item_key(target, &key,
46408 + NULL /*info */);
46409 + }
46410 +}
46411 +
46412 +/* plugin->u.item.b.create_hook */
46413 +int create_hook_ctail(const coord_t * coord, void *arg)
46414 +{
46415 + assert("edward-864", znode_is_loaded(coord->node));
46416 +
46417 + znode_set_convertible(coord->node);
46418 + return 0;
46419 +}
46420 +
46421 +/* plugin->u.item.b.kill_hook */
46422 +int
46423 +kill_hook_ctail(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
46424 + carry_kill_data * kdata)
46425 +{
46426 + struct inode *inode;
46427 +
46428 + assert("edward-1157", item_id_by_coord(coord) == CTAIL_ID);
46429 + assert("edward-291", znode_is_write_locked(coord->node));
46430 +
46431 + inode = kdata->inode;
46432 + if (inode) {
46433 + reiser4_key key;
46434 + item_key_by_coord(coord, &key);
46435 +
46436 + if (from == 0 && is_disk_cluster_key(&key, coord)) {
46437 + /* disk cluster is killed */
46438 + cloff_t start =
46439 + off_to_clust(get_key_offset(&key), inode);
46440 + truncate_page_cluster_cryptcompress(inode, start,
46441 + kdata->params.truncate);
46442 + inode_sub_bytes(inode, inode_cluster_size(inode));
46443 + }
46444 + }
46445 + return 0;
46446 +}
46447 +
46448 +/* for shift_hook_ctail(),
46449 + return true if the first disk cluster item has dirty child
46450 +*/
46451 +static int ctail_convertible(const coord_t * coord)
46452 +{
46453 + int result;
46454 + reiser4_key key;
46455 + jnode *child = NULL;
46456 +
46457 + assert("edward-477", coord != NULL);
46458 + assert("edward-478", item_id_by_coord(coord) == CTAIL_ID);
46459 +
46460 + if (coord_is_unprepped_ctail(coord))
46461 + /* unprepped ctail should be converted */
46462 + return 1;
46463 +
46464 + item_key_by_coord(coord, &key);
46465 + child = jlookup(current_tree,
46466 + get_key_objectid(&key),
46467 + off_to_pg(off_by_coord(coord)));
46468 + if (!child)
46469 + return 0;
46470 + result = JF_ISSET(child, JNODE_DIRTY);
46471 + jput(child);
46472 + return result;
46473 +}
46474 +
46475 +/* FIXME-EDWARD */
46476 +/* plugin->u.item.b.shift_hook */
46477 +int shift_hook_ctail(const coord_t * item /* coord of item */ ,
46478 + unsigned from UNUSED_ARG /* start unit */ ,
46479 + unsigned count UNUSED_ARG /* stop unit */ ,
46480 + znode * old_node /* old parent */ )
46481 +{
46482 + assert("edward-479", item != NULL);
46483 + assert("edward-480", item->node != old_node);
46484 +
46485 + if (!znode_convertible(old_node) || znode_convertible(item->node))
46486 + return 0;
46487 + if (ctail_convertible(item))
46488 + znode_set_convertible(item->node);
46489 + return 0;
46490 +}
46491 +
46492 +static int
46493 +cut_or_kill_ctail_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
46494 + int cut, void *p, reiser4_key * smallest_removed,
46495 + reiser4_key * new_first)
46496 +{
46497 + pos_in_node_t count; /* number of units to cut */
46498 + char *item;
46499 +
46500 + count = to - from + 1;
46501 + item = item_body_by_coord(coord);
46502 +
46503 + assert("edward-74", ergo(from != 0, to == coord_last_unit_pos(coord)));
46504 +
46505 + if (smallest_removed) {
46506 + /* store smallest key removed */
46507 + item_key_by_coord(coord, smallest_removed);
46508 + set_key_offset(smallest_removed,
46509 + get_key_offset(smallest_removed) + from);
46510 + }
46511 +
46512 + if (new_first) {
46513 + assert("vs-1531", from == 0);
46514 +
46515 + item_key_by_coord(coord, new_first);
46516 + set_key_offset(new_first,
46517 + get_key_offset(new_first) + from + count);
46518 + }
46519 +
46520 + if (!cut)
46521 + kill_hook_ctail(coord, from, 0, (struct carry_kill_data *)p);
46522 +
46523 + if (from == 0) {
46524 + if (count != nr_units_ctail(coord)) {
46525 + /* a head of the item is removed: move the item header to the
46526 + new start of the item and update the item key */
46527 + reiser4_key key;
46528 + memcpy(item + to + 1, item, sizeof(ctail_item_format));
46529 + item_key_by_coord(coord, &key);
46530 + set_key_offset(&key, get_key_offset(&key) + count);
46531 + node_plugin_by_node(coord->node)->update_item_key(coord,
46532 + &key,
46533 + NULL);
46534 + } else {
46535 + /* cut_units should not be called to cut everything */
46536 + assert("vs-1532", ergo(cut, 0));
46537 + /* the whole item is cut, so more than the space occupied
46538 + by the units alone is freed */
46539 + count += sizeof(ctail_item_format);
46540 + }
46541 + if (REISER4_DEBUG)
46542 + memset(item, 0, count);
46543 + } else if (REISER4_DEBUG)
46544 + memset(item + sizeof(ctail_item_format) + from, 0, count);
46545 + return count;
46546 +}
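+
+/* A sketch with hypothetical sizes: for a 50-unit item, killing all the
+   units also frees the one-byte item header, so 51 is returned; cutting
+   units 0..9 only returns 10 and the item key moves forward by 10. */
+#if 0
+	freed = cut_or_kill_ctail_units(coord, 0, 49, 0, kd, NULL, NULL);
+	freed = cut_or_kill_ctail_units(coord, 0, 9, 1, NULL, NULL, NULL);
+#endif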
46547 +
46548 +/* plugin->u.item.b.cut_units */
46549 +int
46550 +cut_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
46551 + carry_cut_data * cdata, reiser4_key * smallest_removed,
46552 + reiser4_key * new_first)
46553 +{
46554 + return cut_or_kill_ctail_units(item, from, to, 1, NULL,
46555 + smallest_removed, new_first);
46556 +}
46557 +
46558 +/* plugin->u.item.b.kill_units */
46559 +int
46560 +kill_units_ctail(coord_t * item, pos_in_node_t from, pos_in_node_t to,
46561 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
46562 + reiser4_key * new_first)
46563 +{
46564 + return cut_or_kill_ctail_units(item, from, to, 0, kdata,
46565 + smallest_removed, new_first);
46566 +}
46567 +
46568 +/* plugin->u.item.s.file.read */
46569 +int read_ctail(struct file *file UNUSED_ARG, flow_t * f, hint_t * hint)
46570 +{
46571 + uf_coord_t *uf_coord;
46572 + coord_t *coord;
46573 +
46574 + uf_coord = &hint->ext_coord;
46575 + coord = &uf_coord->coord;
46576 + assert("edward-127", f->user == 0);
46577 + assert("edward-129", coord && coord->node);
46578 + assert("edward-130", coord_is_existing_unit(coord));
46579 + assert("edward-132", znode_is_loaded(coord->node));
46580 +
46581 + /* start read only from the beginning of ctail */
46582 + assert("edward-133", coord->unit_pos == 0);
46583 + /* read only whole ctails */
46584 + assert("edward-135", nr_units_ctail(coord) <= f->length);
46585 +
46586 + assert("edward-136", reiser4_schedulable());
46587 + assert("edward-886", ctail_ok(coord));
46588 +
46589 + if (f->data)
46590 + memcpy(f->data, (char *)first_unit(coord),
46591 + (size_t) nr_units_ctail(coord));
46592 +
46593 + dclust_set_extension_shift(hint);
46594 + mark_page_accessed(znode_page(coord->node));
46595 + move_flow_forward(f, nr_units_ctail(coord));
46596 +
46597 + return 0;
46598 +}
46599 +
46600 +/* Read a disk cluster consisting of ctail items and
46601 + attach a transform stream with the plain text */
46602 +int ctail_read_disk_cluster(reiser4_cluster_t * clust, struct inode *inode,
46603 + znode_lock_mode mode)
46604 +{
46605 + int result;
46606 + assert("edward-1450", mode == ZNODE_READ_LOCK || ZNODE_WRITE_LOCK);
46607 + assert("edward-671", clust->hint != NULL);
46608 + assert("edward-140", clust->dstat == INVAL_DISK_CLUSTER);
46609 + assert("edward-672", cryptcompress_inode_ok(inode));
46610 +
46611 + /* set input stream */
46612 + result = grab_tfm_stream(inode, &clust->tc, INPUT_STREAM);
46613 + if (result)
46614 + return result;
46615 +
46616 + result = find_disk_cluster(clust, inode, 1 /* read items */, mode);
46617 + assert("edward-1340", !result);
46618 + if (result)
46619 + return result;
46620 + if (mode == ZNODE_READ_LOCK)
46621 + /* writers still need the lock to insert unprepped
46622 + items, etc... */
46623 + put_hint_cluster(clust, inode, ZNODE_READ_LOCK);
46624 +
46625 + if (clust->dstat == FAKE_DISK_CLUSTER ||
46626 + clust->dstat == UNPR_DISK_CLUSTER) {
46627 + tfm_cluster_set_uptodate(&clust->tc);
46628 + return 0;
46629 + }
46630 + result = grab_coa(&clust->tc, inode_compression_plugin(inode));
46631 + if (result)
46632 + return result;
46633 + result = reiser4_inflate_cluster(clust, inode);
46634 + if (result)
46635 + return result;
46636 + tfm_cluster_set_uptodate(&clust->tc);
46637 + return 0;
46638 +}
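+
+/* On success the transform-stream state is:
+   FAKE_/UNPR_DISK_CLUSTER: the stream is marked uptodate without
+   inflating anything (pages are filled with zeroes by the caller);
+   PREP_DISK_CLUSTER: the cluster is inflated and the plain text
+   sits in the output stream. */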
46639 +
46640 +/* read one locked page */
46641 +int do_readpage_ctail(struct inode * inode, reiser4_cluster_t * clust,
46642 + struct page *page, znode_lock_mode mode)
46643 +{
46644 + int ret;
46645 + unsigned cloff;
46646 + char *data;
46647 + size_t pgcnt;
46648 + tfm_cluster_t *tc = &clust->tc;
46649 +
46650 + assert("edward-212", PageLocked(page));
46651 +
46652 + if (PageUptodate(page))
46653 + goto exit;
46654 +
46655 + if (!tfm_cluster_is_uptodate(&clust->tc)) {
46656 + clust->index = pg_to_clust(page->index, inode);
46657 + unlock_page(page);
46658 + ret = ctail_read_disk_cluster(clust, inode, mode);
46659 + lock_page(page);
46660 + if (ret)
46661 + return ret;
46662 + }
46663 + if (PageUptodate(page))
46664 + /* races with another read/write */
46665 + goto exit;
46666 +
46667 + /* bytes in the page */
46668 + pgcnt = cnt_to_pgcnt(i_size_read(inode), page->index);
46669 +
46670 + if (pgcnt == 0) {
46671 + assert("edward-1290", 0);
46672 + return RETERR(-EINVAL);
46673 + }
46674 + assert("edward-119", tfm_cluster_is_uptodate(tc));
46675 +
46676 + switch (clust->dstat) {
46677 + case UNPR_DISK_CLUSTER:
46678 + assert("edward-1285", 0);
46679 +#if REISER4_DEBUG
46680 + warning("edward-1168",
46681 + "page %lu is not uptodate and disk cluster %lu (inode %llu) is unprepped\n",
46682 + page->index, clust->index,
46683 + (unsigned long long)get_inode_oid(inode));
46684 +#endif
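+		/* fall through: treat the unprepped cluster as a hole */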
46685 + case FAKE_DISK_CLUSTER:
46686 + /* fill the page with zeroes */
46687 + data = kmap_atomic(page, KM_USER0);
46688 +
46689 + memset(data, 0, PAGE_CACHE_SIZE);
46690 + flush_dcache_page(page);
46691 + kunmap_atomic(data, KM_USER0);
46692 + SetPageUptodate(page);
46693 + break;
46694 + case PREP_DISK_CLUSTER:
46695 + /* fill the page with transformed data */
46696 + assert("edward-1058", !PageUptodate(page));
46697 + assert("edward-120", tc->len <= inode_cluster_size(inode));
46698 +
46699 + /* start page offset in the cluster */
46700 + cloff = pg_to_off_to_cloff(page->index, inode);
46701 +
46702 + data = kmap(page);
46703 + memcpy(data, tfm_stream_data(tc, OUTPUT_STREAM) + cloff, pgcnt);
46704 + memset(data + pgcnt, 0, (size_t) PAGE_CACHE_SIZE - pgcnt);
46705 + flush_dcache_page(page);
46706 + kunmap(page);
46707 + SetPageUptodate(page);
46708 + break;
46709 + default:
46710 + impossible("edward-1169", "bad disk cluster state");
46711 + }
46712 + exit:
46713 + return 0;
46714 +}
46715 +
46716 +/* plugin->u.item.s.file.readpage */
46717 +int readpage_ctail(void *vp, struct page *page)
46718 +{
46719 + int result;
46720 + hint_t *hint;
46721 + reiser4_cluster_t *clust = vp;
46722 +
46723 + assert("edward-114", clust != NULL);
46724 + assert("edward-115", PageLocked(page));
46725 + assert("edward-116", !PageUptodate(page));
46726 + assert("edward-117", !jprivate(page) && !PagePrivate(page));
46727 + assert("edward-118", page->mapping && page->mapping->host);
46728 + assert("edward-867", !tfm_cluster_is_uptodate(&clust->tc));
46729 +
46730 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
46731 + if (hint == NULL) {
46732 + unlock_page(page);
46733 + return RETERR(-ENOMEM);
46734 + }
46735 + clust->hint = hint;
46736 + result = load_file_hint(clust->file, hint);
46737 + if (result) {
46738 + kfree(hint);
46739 + unlock_page(page);
46740 + return result;
46741 + }
46742 + assert("vs-25", hint->ext_coord.lh == &hint->lh);
46743 + result = do_readpage_ctail(page->mapping->host, clust, page,
46744 + ZNODE_READ_LOCK);
46745 +
46746 + assert("edward-213", PageLocked(page));
46747 + assert("edward-1163", ergo(!result, PageUptodate(page)));
46748 + assert("edward-868",
46749 + ergo(!result, tfm_cluster_is_uptodate(&clust->tc)));
46750 +
46751 + unlock_page(page);
46752 + done_lh(&hint->lh);
46753 + hint->ext_coord.valid = 0;
46754 + save_file_hint(clust->file, hint);
46755 + kfree(hint);
46756 + tfm_cluster_clr_uptodate(&clust->tc);
46757 +
46758 + return result;
46759 +}
46760 +
46761 +/* Helper function for ->readpages() */
46762 +static int
46763 +ctail_read_page_cluster(reiser4_cluster_t * clust, struct inode *inode)
46764 +{
46765 + int i;
46766 + int result;
46767 + assert("edward-779", clust != NULL);
46768 + assert("edward-1059", clust->win == NULL);
46769 + assert("edward-780", inode != NULL);
46770 +
46771 + result = prepare_page_cluster(inode, clust, 0 /* do not capture */ );
46772 + if (result)
46773 + return result;
46774 + result = ctail_read_disk_cluster(clust, inode, ZNODE_READ_LOCK);
46775 + if (result)
46776 + goto out;
46777 + /* at this point stream with valid plain text is attached */
46778 + assert("edward-781", tfm_cluster_is_uptodate(&clust->tc));
46779 +
46780 + for (i = 0; i < clust->nr_pages; i++) {
46781 + struct page *page = clust->pages[i];
46782 + lock_page(page);
46783 + result = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
46784 + unlock_page(page);
46785 + if (result)
46786 + break;
46787 + }
46788 + tfm_cluster_clr_uptodate(&clust->tc);
46789 + out:
46790 + reiser4_release_cluster_pages(clust);
46791 + return result;
46792 +}
46793 +
46794 +/* filler for read_cache_pages() */
46795 +static int ctail_readpages_filler(void * data, struct page * page)
46796 +{
46797 + int ret = 0;
46798 + reiser4_cluster_t * clust = data;
46799 + struct inode * inode = clust->file->f_dentry->d_inode;
46800 +
46801 + if (PageUptodate(page)) {
46802 + unlock_page(page);
46803 + return 0;
46804 + }
46805 + unlock_page(page);
46806 + move_cluster_forward(clust, inode, page->index);
46807 + ret = ctail_read_page_cluster(clust, inode);
46808 + if (ret)
46809 + return ret;
46810 + assert("edward-869", !tfm_cluster_is_uptodate(&clust->tc));
46811 +
46812 + lock_page(page);
46813 + ret = do_readpage_ctail(inode, clust, page, ZNODE_READ_LOCK);
46814 + assert("edward-1061", ergo(!ret, PageUptodate(page)));
46815 + unlock_page(page);
46816 +
46817 + return ret;
46818 +}
46819 +
46820 +/* We populate a bit more than the readahead window suggests:
46821 + with each nominated page we read the whole page cluster
46822 + the page belongs to. */
46823 +int readpages_ctail(struct file *file, struct address_space *mapping,
46824 + struct list_head *pages)
46825 +{
46826 + int ret = 0;
46827 + hint_t *hint;
46828 + reiser4_cluster_t clust;
46829 + struct inode *inode = mapping->host;
46830 +
46831 + assert("edward-1521", inode == file->f_dentry->d_inode);
46832 +
46833 + cluster_init_read(&clust, NULL);
46834 + clust.file = file;
46835 + hint = kmalloc(sizeof(*hint), reiser4_ctx_gfp_mask_get());
46836 + if (hint == NULL) {
46837 + warning("vs-28", "failed to allocate hint");
46838 + ret = RETERR(-ENOMEM);
46839 + goto exit1;
46840 + }
46841 + clust.hint = hint;
46842 + ret = load_file_hint(clust.file, hint);
46843 + if (ret) {
46844 + warning("edward-1522", "failed to load hint");
46845 + goto exit2;
46846 + }
46847 + assert("vs-26", hint->ext_coord.lh == &hint->lh);
46848 + ret = alloc_cluster_pgset(&clust, cluster_nrpages(inode));
46849 + if (ret) {
46850 + warning("edward-1523", "failed to alloc pgset");
46851 + goto exit3;
46852 + }
46853 + ret = read_cache_pages(mapping, pages, ctail_readpages_filler, &clust);
46854 +
46855 + assert("edward-870", !tfm_cluster_is_uptodate(&clust.tc));
46856 + exit3:
46857 + done_lh(&hint->lh);
46858 + save_file_hint(file, hint);
46859 + hint->ext_coord.valid = 0;
46860 + exit2:
46861 + kfree(hint);
46862 + exit1:
46863 + put_cluster_handle(&clust);
46864 + return ret;
46865 +}
46866 +
46867 +/*
46868 + plugin->u.item.s.file.append_key
46869 + key of the first item of the next disk cluster
46870 +*/
46871 +reiser4_key *append_key_ctail(const coord_t * coord, reiser4_key * key)
46872 +{
46873 + assert("edward-1241", item_id_by_coord(coord) == CTAIL_ID);
46874 + assert("edward-1242", cluster_shift_ok(cluster_shift_by_coord(coord)));
46875 +
46876 + item_key_by_coord(coord, key);
46877 + set_key_offset(key,
46878 + ((__u64) (clust_by_coord(coord, NULL)) +
46879 + 1) << cluster_shift_by_coord(coord));
46880 + return key;
46881 +}
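+
+/* A sketch with hypothetical values: for an item of disk cluster 2
+   with cluster_shift == 16 the append key offset is (2 + 1) << 16,
+   i.e. the first byte of the next disk cluster. */
+#if 0
+	append_key_ctail(coord, &key);
+	BUG_ON(get_key_offset(&key) != (((__u64)2 + 1) << 16));
+#endif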
46882 +
46883 +static int
46884 +insert_unprepped_ctail(reiser4_cluster_t * clust, struct inode *inode)
46885 +{
46886 + int result;
46887 + char buf[UCTAIL_NR_UNITS];
46888 + reiser4_item_data data;
46889 + reiser4_key key;
46890 + int shift = (int)UCTAIL_SHIFT;
46891 +
46892 + memset(buf, 0, (size_t) UCTAIL_NR_UNITS);
46893 + result = key_by_inode_cryptcompress(inode,
46894 + clust_to_off(clust->index, inode),
46895 + &key);
46896 + if (result)
46897 + return result;
46898 + data.user = 0;
46899 + data.iplug = item_plugin_by_id(CTAIL_ID);
46900 + data.arg = &shift;
46901 + data.length = sizeof(ctail_item_format) + (size_t) UCTAIL_NR_UNITS;
46902 + data.data = buf;
46903 +
46904 + result = insert_by_coord(&clust->hint->ext_coord.coord,
46905 + &data, &key, clust->hint->ext_coord.lh, 0);
46906 + return result;
46907 +}
46908 +
46909 +static int
46910 +insert_cryptcompress_flow(coord_t * coord, lock_handle * lh, flow_t * f,
46911 + struct inode *inode)
46912 +{
46913 + int result;
46914 + carry_pool *pool;
46915 + carry_level *lowest_level;
46916 + reiser4_item_data *data;
46917 + carry_op *op;
46918 + int cluster_shift = inode_cluster_shift(inode);
46919 +
46920 + pool =
46921 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
46922 + sizeof(*data));
46923 + if (IS_ERR(pool))
46924 + return PTR_ERR(pool);
46925 + lowest_level = (carry_level *) (pool + 1);
46926 + init_carry_level(lowest_level, pool);
46927 + data = (reiser4_item_data *) (lowest_level + 3);
46928 +
46929 + assert("edward-466", coord->between == AFTER_ITEM
46930 + || coord->between == AFTER_UNIT || coord->between == BEFORE_ITEM
46931 + || coord->between == EMPTY_NODE
46932 + || coord->between == BEFORE_UNIT);
46933 +
46934 + if (coord->between == AFTER_UNIT) {
46935 + coord->unit_pos = 0;
46936 + coord->between = AFTER_ITEM;
46937 + }
46938 + op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
46939 + 0 /* operate directly on coord -> node */);
46940 + if (IS_ERR(op) || (op == NULL)) {
46941 + done_carry_pool(pool);
46942 + return RETERR(op ? PTR_ERR(op) : -EIO);
46943 + }
46944 + data->user = 0;
46945 + data->iplug = item_plugin_by_id(CTAIL_ID);
46946 + data->arg = &cluster_shift;
46947 +
46948 + data->length = 0;
46949 + data->data = NULL;
46950 +
46951 + op->u.insert_flow.flags = COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT;
46952 + op->u.insert_flow.insert_point = coord;
46953 + op->u.insert_flow.flow = f;
46954 + op->u.insert_flow.data = data;
46955 + op->u.insert_flow.new_nodes = 0;
46956 +
46957 + lowest_level->track_type = CARRY_TRACK_CHANGE;
46958 + lowest_level->tracked = lh;
46959 +
46960 + result = reiser4_carry(lowest_level, NULL);
46961 + done_carry_pool(pool);
46962 +
46963 + return result;
46964 +}
46965 +
46966 +/* Implementation of CRC_APPEND_ITEM mode of ctail conversion */
46967 +static int insert_cryptcompress_flow_in_place(coord_t * coord,
46968 + lock_handle * lh, flow_t * f,
46969 + struct inode *inode)
46970 +{
46971 + int ret;
46972 + coord_t pos;
46973 + lock_handle lock;
46974 +
46975 + assert("edward-674", f->length <= inode_scaled_cluster_size(inode));
46976 + assert("edward-484", coord->between == AT_UNIT
46977 + || coord->between == AFTER_ITEM);
46978 + assert("edward-485", item_id_by_coord(coord) == CTAIL_ID);
46979 +
46980 + coord_dup(&pos, coord);
46981 + pos.unit_pos = 0;
46982 + pos.between = AFTER_ITEM;
46983 +
46984 + init_lh(&lock);
46985 + copy_lh(&lock, lh);
46986 +
46987 + ret = insert_cryptcompress_flow(&pos, &lock, f, inode);
46988 + done_lh(&lock);
46989 + assert("edward-1347", znode_is_write_locked(lh->node));
46990 + assert("edward-1228", !ret);
46991 + return ret;
46992 +}
46993 +
46994 +/* Implementation of CRC_OVERWRITE_ITEM mode of ctail conversion */
46995 +static int overwrite_ctail(coord_t * coord, flow_t * f)
46996 +{
46997 + unsigned count;
46998 +
46999 + assert("edward-269", f->user == 0);
47000 + assert("edward-270", f->data != NULL);
47001 + assert("edward-271", f->length > 0);
47002 + assert("edward-272", coord_is_existing_unit(coord));
47003 + assert("edward-273", coord->unit_pos == 0);
47004 + assert("edward-274", znode_is_write_locked(coord->node));
47005 + assert("edward-275", reiser4_schedulable());
47006 + assert("edward-467", item_id_by_coord(coord) == CTAIL_ID);
47007 + assert("edward-1243", ctail_ok(coord));
47008 +
47009 + count = nr_units_ctail(coord);
47010 +
47011 + if (count > f->length)
47012 + count = f->length;
47013 + memcpy(first_unit(coord), f->data, count);
47014 + move_flow_forward(f, count);
47015 + coord->unit_pos += count;
47016 + return 0;
47017 +}
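+
+/* A sketch with hypothetical lengths: overwriting a 30-unit ctail with
+   a 100-byte flow copies 30 bytes and leaves 70 bytes in the flow for
+   the chained items that follow. */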
47018 +
47019 +/* Implementation of CRC_CUT_ITEM mode of ctail conversion:
47020 + cut ctail (part or whole) starting from next unit position */
47021 +static int cut_ctail(coord_t * coord)
47022 +{
47023 + coord_t stop;
47024 +
47025 + assert("edward-435", coord->between == AT_UNIT &&
47026 + coord->item_pos < coord_num_items(coord) &&
47027 + coord->unit_pos <= coord_num_units(coord));
47028 +
47029 + if (coord->unit_pos == coord_num_units(coord))
47030 + /* nothing to cut */
47031 + return 0;
47032 + coord_dup(&stop, coord);
47033 + stop.unit_pos = coord_last_unit_pos(coord);
47034 +
47035 + return cut_node_content(coord, &stop, NULL, NULL, NULL);
47036 +}
47037 +
47038 +int
47039 +ctail_insert_unprepped_cluster(reiser4_cluster_t * clust, struct inode *inode)
47040 +{
47041 + int result;
47042 + assert("edward-1244", inode != NULL);
47043 + assert("edward-1245", clust->hint != NULL);
47044 + assert("edward-1246", clust->dstat == FAKE_DISK_CLUSTER);
47045 + assert("edward-1247", clust->reserved == 1);
47046 +
47047 + result = get_disk_cluster_locked(clust, inode, ZNODE_WRITE_LOCK);
47048 + if (cbk_errored(result))
47049 + return result;
47050 + assert("edward-1249", result == CBK_COORD_NOTFOUND);
47051 + assert("edward-1250", znode_is_write_locked(clust->hint->lh.node));
47052 +
47053 + assert("edward-1295",
47054 + clust->hint->ext_coord.lh->node ==
47055 + clust->hint->ext_coord.coord.node);
47056 +
47057 + coord_set_between_clusters(&clust->hint->ext_coord.coord);
47058 +
47059 + result = insert_unprepped_ctail(clust, inode);
47060 + all_grabbed2free();
47061 +
47062 + assert("edward-1251", !result);
47063 + assert("edward-1252", cryptcompress_inode_ok(inode));
47064 + assert("edward-1253", znode_is_write_locked(clust->hint->lh.node));
47065 + assert("edward-1254",
47066 + reiser4_clustered_blocks(reiser4_get_current_sb()));
47067 + assert("edward-1255",
47068 + znode_convertible(clust->hint->ext_coord.coord.node));
47069 +
47070 + return result;
47071 +}
47072 +
47073 +static int do_convert_ctail(flush_pos_t * pos, cryptcompress_write_mode_t mode)
47074 +{
47075 + int result = 0;
47076 + convert_item_info_t *info;
47077 +
47078 + assert("edward-468", pos != NULL);
47079 + assert("edward-469", pos->sq != NULL);
47080 + assert("edward-845", item_convert_data(pos) != NULL);
47081 +
47082 + info = item_convert_data(pos);
47083 + assert("edward-679", info->flow.data != NULL);
47084 +
47085 + switch (mode) {
47086 + case CRC_APPEND_ITEM:
47087 + assert("edward-1229", info->flow.length != 0);
47088 + assert("edward-1256",
47089 + cluster_shift_ok(cluster_shift_by_coord(&pos->coord)));
47090 + result =
47091 + insert_cryptcompress_flow_in_place(&pos->coord,
47092 + &pos->lock,
47093 + &info->flow,
47094 + info->inode);
47095 + break;
47096 + case CRC_OVERWRITE_ITEM:
47097 + assert("edward-1230", info->flow.length != 0);
47098 + overwrite_ctail(&pos->coord, &info->flow);
47099 + if (info->flow.length != 0)
47100 + break;
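+		/* fall through: the flow is exhausted, cut the rest */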
47101 + case CRC_CUT_ITEM:
47102 + assert("edward-1231", info->flow.length == 0);
47103 + result = cut_ctail(&pos->coord);
47104 + break;
47105 + default:
47106 + result = RETERR(-EIO);
47107 + impossible("edward-244", "bad convert mode");
47108 + }
47109 + return result;
47110 +}
47111 +
47112 +/* plugin->u.item.f.scan */
47113 +int scan_ctail(flush_scan * scan)
47114 +{
47115 + int result = 0;
47116 + struct page *page;
47117 + struct inode *inode;
47118 + jnode *node = scan->node;
47119 +
47120 + assert("edward-227", scan->node != NULL);
47121 + assert("edward-228", jnode_is_cluster_page(scan->node));
47122 + assert("edward-639", znode_is_write_locked(scan->parent_lock.node));
47123 +
47124 + page = jnode_page(node);
47125 + inode = page->mapping->host;
47126 +
47127 + if (!reiser4_scanning_left(scan))
47128 + return result;
47129 + if (!ZF_ISSET(scan->parent_lock.node, JNODE_DIRTY))
47130 + znode_make_dirty(scan->parent_lock.node);
47131 +
47132 + if (!znode_convertible(scan->parent_lock.node)) {
47133 + if (JF_ISSET(scan->node, JNODE_DIRTY))
47134 + znode_set_convertible(scan->parent_lock.node);
47135 + else {
47136 + warning("edward-681",
47137 + "cluster page is already processed");
47138 + return -EAGAIN;
47139 + }
47140 + }
47141 + return result;
47142 +}
47143 +
47144 +/* Return true and set pos->child if the leftmost child of the first disk cluster item should be attached */
47145 +static int should_attach_convert_idata(flush_pos_t * pos)
47146 +{
47147 + int result;
47148 + assert("edward-431", pos != NULL);
47149 + assert("edward-432", pos->child == NULL);
47150 + assert("edward-619", znode_is_write_locked(pos->coord.node));
47151 + assert("edward-470",
47152 + item_plugin_by_coord(&pos->coord) ==
47153 + item_plugin_by_id(CTAIL_ID));
47154 +
47155 + /* check for leftmost child */
47156 + utmost_child_ctail(&pos->coord, LEFT_SIDE, &pos->child);
47157 +
47158 + if (!pos->child)
47159 + return 0;
47160 + spin_lock_jnode(pos->child);
47161 + result = (JF_ISSET(pos->child, JNODE_DIRTY) &&
47162 + pos->child->atom == ZJNODE(pos->coord.node)->atom);
47163 + spin_unlock_jnode(pos->child);
47164 + if (!result && pos->child) {
47165 + /* the existing child is not to be attached; release it */
47166 + jput(pos->child);
47167 + pos->child = NULL;
47168 + }
47169 + return result;
47170 +}
47171 +
47172 +/* plugin->init_convert_data() */
47173 +static int
47174 +init_convert_data_ctail(convert_item_info_t * idata, struct inode *inode)
47175 +{
47176 + assert("edward-813", idata != NULL);
47177 + assert("edward-814", inode != NULL);
47178 +
47179 + idata->inode = inode;
47180 + idata->d_cur = DC_FIRST_ITEM;
47181 + idata->d_next = DC_INVALID_STATE;
47182 +
47183 + return 0;
47184 +}
47185 +
47186 +static int alloc_item_convert_data(convert_info_t * sq)
47187 +{
47188 + assert("edward-816", sq != NULL);
47189 + assert("edward-817", sq->itm == NULL);
47190 +
47191 + sq->itm = kmalloc(sizeof(*sq->itm), reiser4_ctx_gfp_mask_get());
47192 + if (sq->itm == NULL)
47193 + return RETERR(-ENOMEM);
47194 + return 0;
47195 +}
47196 +
47197 +static void free_item_convert_data(convert_info_t * sq)
47198 +{
47199 + assert("edward-818", sq != NULL);
47200 + assert("edward-819", sq->itm != NULL);
47201 + assert("edward-820", sq->iplug != NULL);
47202 +
47203 + kfree(sq->itm);
47204 + sq->itm = NULL;
47205 + return;
47206 +}
47207 +
47208 +static int alloc_convert_data(flush_pos_t * pos)
47209 +{
47210 + assert("edward-821", pos != NULL);
47211 + assert("edward-822", pos->sq == NULL);
47212 +
47213 + pos->sq = kmalloc(sizeof(*pos->sq), reiser4_ctx_gfp_mask_get());
47214 + if (!pos->sq)
47215 + return RETERR(-ENOMEM);
47216 + memset(pos->sq, 0, sizeof(*pos->sq));
47217 + cluster_init_write(&pos->sq->clust, NULL);
47218 + return 0;
47219 +}
47220 +
47221 +void free_convert_data(flush_pos_t * pos)
47222 +{
47223 + convert_info_t *sq;
47224 +
47225 + assert("edward-823", pos != NULL);
47226 + assert("edward-824", pos->sq != NULL);
47227 +
47228 + sq = pos->sq;
47229 + if (sq->itm)
47230 + free_item_convert_data(sq);
47231 + put_cluster_handle(&sq->clust);
47232 + kfree(pos->sq);
47233 + pos->sq = NULL;
47234 + return;
47235 +}
47236 +
47237 +static int init_item_convert_data(flush_pos_t * pos, struct inode *inode)
47238 +{
47239 + convert_info_t *sq;
47240 +
47241 + assert("edward-825", pos != NULL);
47242 + assert("edward-826", pos->sq != NULL);
47243 + assert("edward-827", item_convert_data(pos) != NULL);
47244 + assert("edward-828", inode != NULL);
47245 +
47246 + sq = pos->sq;
47247 +
47248 + memset(sq->itm, 0, sizeof(*sq->itm));
47249 +
47250 + /* iplug->init_convert_data() */
47251 + return init_convert_data_ctail(sq->itm, inode);
47252 +}
47253 +
47254 +/* create and attach disk cluster info used by 'convert' phase of the flush
47255 + squalloc() */
47256 +static int attach_convert_idata(flush_pos_t * pos, struct inode *inode)
47257 +{
47258 + int ret = 0;
47259 + convert_item_info_t *info;
47260 + reiser4_cluster_t *clust;
47261 + file_plugin *fplug = inode_file_plugin(inode);
47262 + compression_plugin *cplug = inode_compression_plugin(inode);
47263 +
47264 + assert("edward-248", pos != NULL);
47265 + assert("edward-249", pos->child != NULL);
47266 + assert("edward-251", inode != NULL);
47267 + assert("edward-682", cryptcompress_inode_ok(inode));
47268 + assert("edward-252",
47269 + fplug == file_plugin_by_id(CRYPTCOMPRESS_FILE_PLUGIN_ID));
47270 + assert("edward-473",
47271 + item_plugin_by_coord(&pos->coord) ==
47272 + item_plugin_by_id(CTAIL_ID));
47273 +
47274 + if (!pos->sq) {
47275 + ret = alloc_convert_data(pos);
47276 + if (ret)
47277 + return ret;
47278 + }
47279 + clust = &pos->sq->clust;
47280 + ret = grab_coa(&clust->tc, cplug);
47281 + if (ret)
47282 + goto err;
47283 + ret = set_cluster_by_page(clust,
47284 + jnode_page(pos->child),
47285 + MAX_CLUSTER_NRPAGES);
47286 + if (ret)
47287 + goto err;
47288 +
47289 + assert("edward-829", pos->sq != NULL);
47290 + assert("edward-250", item_convert_data(pos) == NULL);
47291 +
47292 + pos->sq->iplug = item_plugin_by_id(CTAIL_ID);
47293 +
47294 + ret = alloc_item_convert_data(pos->sq);
47295 + if (ret)
47296 + goto err;
47297 + ret = init_item_convert_data(pos, inode);
47298 + if (ret)
47299 + goto err;
47300 + info = item_convert_data(pos);
47301 +
47302 + ret = flush_cluster_pages(clust, pos->child, inode);
47303 + if (ret)
47304 + goto err;
47305 +
47306 + reiser4_deflate_cluster(clust, inode);
47307 + inc_item_convert_count(pos);
47308 +
47309 + /* build a flow from the transformed stream */
47310 + fplug->flow_by_inode(info->inode,
47311 + (const char __user *)tfm_stream_data(&clust->tc, OUTPUT_STREAM),
47312 + 0 /* kernel space */ ,
47313 + clust->tc.len,
47314 + clust_to_off(clust->index, inode),
47315 + WRITE_OP, &info->flow);
47316 + jput(pos->child);
47317 +
47318 + assert("edward-683", cryptcompress_inode_ok(inode));
47319 + return 0;
47320 + err:
47321 + jput(pos->child);
47322 + free_convert_data(pos);
47323 + return ret;
47324 +}
47325 +
47326 +/* clear up disk cluster info */
47327 +static void detach_convert_idata(convert_info_t * sq)
47328 +{
47329 + convert_item_info_t *info;
47330 +
47331 + assert("edward-253", sq != NULL);
47332 + assert("edward-840", sq->itm != NULL);
47333 +
47334 + info = sq->itm;
47335 + assert("edward-255", info->inode != NULL);
47336 + assert("edward-1212", info->flow.length == 0);
47337 +
47338 + free_item_convert_data(sq);
47339 + return;
47340 +}
47341 +
47342 +/* plugin->u.item.f.utmost_child */
47343 +
47344 +/* This function sets *child to the leftmost child of the first
47345 + disk cluster item if such a child exists, and to NULL otherwise.
47346 + NOTE-EDWARD: Do not call this for RIGHT_SIDE */
47347 +
47348 +int utmost_child_ctail(const coord_t * coord, sideof side, jnode ** child)
47349 +{
47350 + reiser4_key key;
47351 +
47352 + assert("edward-257", coord != NULL);
47353 + assert("edward-258", child != NULL);
47354 + assert("edward-259", side == LEFT_SIDE);
47355 + assert("edward-260",
47356 + item_plugin_by_coord(coord) == item_plugin_by_id(CTAIL_ID));
47357 +
47358 + item_key_by_coord(coord, &key);
47359 +
47360 + if (!is_disk_cluster_key(&key, coord))
47361 + *child = NULL;
47362 + else
47363 + *child = jlookup(current_tree,
47364 + get_key_objectid(item_key_by_coord
47365 + (coord, &key)),
47366 + off_to_pg(get_key_offset(&key)));
47367 + return 0;
47368 +}
47369 +
47370 +/* Return true if @p2 is the next item after @p1
47371 + in the _same_ disk cluster.
47372 + A disk cluster is a set of items. If ->clustered() != NULL,
47373 + the whole disk cluster should be read/modified along with each item.
47374 +*/
47375 +static int clustered_ctail(const coord_t * p1, const coord_t * p2)
47376 +{
47377 + return mergeable_ctail(p1, p2);
47378 +}
47379 +
47380 +/* Go rightward and check for the next disk cluster item; set
47381 + d_next to DC_CHAINED_ITEM if such an item exists.
47382 + If the current position is the last item, go to the right
47383 + neighbor, skipping empty nodes. Note that right neighbors
47384 + may not be in the slum because of races; if so, make them
47385 + dirty and convertible.
47386 +*/
47387 +static int next_item_dc_stat(flush_pos_t * pos)
47388 +{
47389 + int ret = 0;
47390 + int stop = 0;
47391 + znode *cur;
47392 + coord_t coord;
47393 + lock_handle lh;
47394 + lock_handle right_lock;
47395 +
47396 + assert("edward-1232", !node_is_empty(pos->coord.node));
47397 + assert("edward-1014",
47398 + pos->coord.item_pos < coord_num_items(&pos->coord));
47399 + assert("edward-1015", chaining_data_present(pos));
47400 + assert("edward-1017",
47401 + item_convert_data(pos)->d_next == DC_INVALID_STATE);
47402 +
47403 + item_convert_data(pos)->d_next = DC_AFTER_CLUSTER;
47404 +
47405 + if (item_convert_data(pos)->d_cur == DC_AFTER_CLUSTER)
47406 + return ret;
47407 + if (pos->coord.item_pos < coord_num_items(&pos->coord) - 1)
47408 + return ret;
47409 +
47410 + /* check next slum item */
47411 + init_lh(&right_lock);
47412 + cur = pos->coord.node;
47413 +
47414 + while (!stop) {
47415 + init_lh(&lh);
47416 + ret = reiser4_get_right_neighbor(&lh,
47417 + cur,
47418 + ZNODE_WRITE_LOCK,
47419 + GN_CAN_USE_UPPER_LEVELS);
47420 + if (ret)
47421 + break;
47422 + ret = zload(lh.node);
47423 + if (ret) {
47424 + done_lh(&lh);
47425 + break;
47426 + }
47427 + coord_init_before_first_item(&coord, lh.node);
47428 +
47429 + if (node_is_empty(lh.node)) {
47430 + znode_make_dirty(lh.node);
47431 + znode_set_convertible(lh.node);
47432 + stop = 0;
47433 + } else if (clustered_ctail(&pos->coord, &coord)) {
47434 +
47435 + item_convert_data(pos)->d_next = DC_CHAINED_ITEM;
47436 +
47437 + if (!ZF_ISSET(lh.node, JNODE_DIRTY)) {
47438 + /*
47439 + warning("edward-1024",
47440 + "next slum item mergeable, "
47441 + "but znode %p isn't dirty\n",
47442 + lh.node);
47443 + */
47444 + znode_make_dirty(lh.node);
47445 + }
47446 + if (!znode_convertible(lh.node)) {
47447 + /*
47448 + warning("edward-1272",
47449 + "next slum item mergeable, "
47450 + "but znode %p isn't convertible\n",
47451 + lh.node);
47452 + */
47453 + znode_set_convertible(lh.node);
47454 + }
47455 + stop = 1;
47456 + } else
47457 + stop = 1;
47458 + zrelse(lh.node);
47459 + done_lh(&right_lock);
47460 + copy_lh(&right_lock, &lh);
47461 + done_lh(&lh);
47462 + cur = right_lock.node;
47463 + }
47464 + done_lh(&right_lock);
47465 +
47466 + if (ret == -E_NO_NEIGHBOR)
47467 + ret = 0;
47468 + return ret;
47469 +}
47470 +
47471 +static int
47472 +assign_convert_mode(convert_item_info_t * idata,
47473 + cryptcompress_write_mode_t * mode)
47474 +{
47475 + int result = 0;
47476 +
47477 + assert("edward-1025", idata != NULL);
47478 +
47479 + if (idata->flow.length) {
47480 + /* append or overwrite */
47481 + switch (idata->d_cur) {
47482 + case DC_FIRST_ITEM:
47483 + case DC_CHAINED_ITEM:
47484 + *mode = CRC_OVERWRITE_ITEM;
47485 + break;
47486 + case DC_AFTER_CLUSTER:
47487 + *mode = CRC_APPEND_ITEM;
47488 + break;
47489 + default:
47490 + impossible("edward-1018", "wrong current item state");
47491 + }
47492 + } else {
47493 + /* cut or invalidate */
47494 + switch (idata->d_cur) {
47495 + case DC_FIRST_ITEM:
47496 + case DC_CHAINED_ITEM:
47497 + *mode = CRC_CUT_ITEM;
47498 + break;
47499 + case DC_AFTER_CLUSTER:
47500 + result = 1;
47501 + break;
47502 + default:
47503 + impossible("edward-1019", "wrong current item state");
47504 + }
47505 + }
47506 + return result;
47507 +}
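+
+/* The assignment above, in table form:
+
+   d_cur             flow.length != 0      flow.length == 0
+   DC_FIRST_ITEM     CRC_OVERWRITE_ITEM    CRC_CUT_ITEM
+   DC_CHAINED_ITEM   CRC_OVERWRITE_ITEM    CRC_CUT_ITEM
+   DC_AFTER_CLUSTER  CRC_APPEND_ITEM       nothing to do (returns 1)
+*/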
47508 +
47509 +/* plugin->u.item.f.convert */
47510 +/* write ctail in guessed mode */
47511 +int convert_ctail(flush_pos_t * pos)
47512 +{
47513 + int result;
47514 + int nr_items;
47515 + cryptcompress_write_mode_t mode = CRC_OVERWRITE_ITEM;
47516 +
47517 + assert("edward-1020", pos != NULL);
47518 + assert("edward-1213", coord_num_items(&pos->coord) != 0);
47519 + assert("edward-1257", item_id_by_coord(&pos->coord) == CTAIL_ID);
47520 + assert("edward-1258", ctail_ok(&pos->coord));
47521 + assert("edward-261", pos->coord.node != NULL);
47522 +
47523 + nr_items = coord_num_items(&pos->coord);
47524 + if (!chaining_data_present(pos)) {
47525 + if (should_attach_convert_idata(pos)) {
47526 + /* attach convert item info */
47527 + struct inode *inode;
47528 +
47529 + assert("edward-264", pos->child != NULL);
47530 + assert("edward-265", jnode_page(pos->child) != NULL);
47531 + assert("edward-266",
47532 + jnode_page(pos->child)->mapping != NULL);
47533 +
47534 + inode = jnode_page(pos->child)->mapping->host;
47535 +
47536 + assert("edward-267", inode != NULL);
47537 +
47538 + /* attach item convert info by child and put the last one */
47539 + result = attach_convert_idata(pos, inode);
47540 + pos->child = NULL;
47541 + if (result == -E_REPEAT) {
47542 + /* the jnode became clean, or there are no dirty
47543 + pages (nothing to update in the disk cluster) */
47544 + warning("edward-1021",
47545 + "convert_ctail: nothing to attach");
47546 + return 0;
47547 + }
47548 + if (result != 0)
47549 + return result;
47550 + } else
47551 + /* unconvertible */
47552 + return 0;
47553 + } else {
47554 + /* use old convert info */
47555 +
47556 + convert_item_info_t *idata;
47557 +
47558 + idata = item_convert_data(pos);
47559 +
47560 + result = assign_convert_mode(idata, &mode);
47561 + if (result) {
47562 + /* disk cluster is over,
47563 + nothing to update anymore */
47564 + detach_convert_idata(pos->sq);
47565 + return 0;
47566 + }
47567 + }
47568 +
47569 + assert("edward-433", chaining_data_present(pos));
47570 + assert("edward-1022",
47571 + pos->coord.item_pos < coord_num_items(&pos->coord));
47572 +
47573 + result = next_item_dc_stat(pos);
47574 + if (result) {
47575 + detach_convert_idata(pos->sq);
47576 + return result;
47577 + }
47578 + result = do_convert_ctail(pos, mode);
47579 + if (result) {
47580 + detach_convert_idata(pos->sq);
47581 + return result;
47582 + }
47583 + switch (mode) {
47584 + case CRC_CUT_ITEM:
47585 + assert("edward-1214", item_convert_data(pos)->flow.length == 0);
47586 + assert("edward-1215",
47587 + coord_num_items(&pos->coord) == nr_items ||
47588 + coord_num_items(&pos->coord) == nr_items - 1);
47589 + if (item_convert_data(pos)->d_next == DC_CHAINED_ITEM)
47590 + break;
47591 + if (coord_num_items(&pos->coord) != nr_items) {
47592 + /* the item was killed, no more chained items */
47593 + detach_convert_idata(pos->sq);
47594 + if (!node_is_empty(pos->coord.node))
47595 + /* make sure the next item will be scanned */
47596 + coord_init_before_item(&pos->coord);
47597 + break;
47598 + }
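+		/* fall through: the disk cluster is over, detach convert data */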
47599 + case CRC_APPEND_ITEM:
47600 + assert("edward-434", item_convert_data(pos)->flow.length == 0);
47601 + detach_convert_idata(pos->sq);
47602 + break;
47603 + case CRC_OVERWRITE_ITEM:
47604 + if (coord_is_unprepped_ctail(&pos->coord)) {
47605 + /* convert an unprepped ctail to a prepped one */
47606 + int shift;
47607 + shift =
47608 + inode_cluster_shift(item_convert_data(pos)->inode);
47609 + assert("edward-1259", cluster_shift_ok(shift));
47610 + put_unaligned((d8)shift,
47611 + &ctail_formatted_at(&pos->coord)->
47612 + cluster_shift);
47613 + }
47614 + break;
47615 + }
47616 + return result;
47617 +}
47618 +
47619 +/* Make Linus happy.
47620 + Local variables:
47621 + c-indentation-style: "K&R"
47622 + mode-name: "LC"
47623 + c-basic-offset: 8
47624 + tab-width: 8
47625 + fill-column: 120
47626 + End:
47627 +*/
47628 diff --git a/fs/reiser4/plugin/item/ctail.h b/fs/reiser4/plugin/item/ctail.h
47629 new file mode 100644
47630 index 0000000..ead4418
47631 --- /dev/null
47632 +++ b/fs/reiser4/plugin/item/ctail.h
47633 @@ -0,0 +1,106 @@
47634 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47635 +
47636 +#if !defined( __FS_REISER4_CTAIL_H__ )
47637 +#define __FS_REISER4_CTAIL_H__
47638 +
47639 +/* Disk format of ctail item */
47640 +typedef struct ctail_item_format {
47641 + /* packed shift; size of (prepped) disk cluster
47642 + is calculated as (1 << cluster_shift) */
47643 + d8 cluster_shift;
47644 + /* ctail body */
47645 + d8 body[0];
47646 +} __attribute__ ((packed)) ctail_item_format;
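+
+/* On-disk layout of a ctail item:
+
+   +---------------+--------------------------------------+
+   | cluster_shift | body: compressed bytes, one unit per |
+   |    (1 byte)   | byte (item_length - 1 units total)   |
+   +---------------+--------------------------------------+
+
+   An unprepped item has cluster_shift == UCTAIL_SHIFT (see below). */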
47647 +
47648 +/* Unprepped disk cluster is represented by a single ctail item
47649 + with the following "magic" attributes: */
47650 +/* "magic" cluster_shift */
47651 +#define UCTAIL_SHIFT 0xff
47652 +/* How many units unprepped ctail item has */
47653 +#define UCTAIL_NR_UNITS 1
47654 +
47655 +/* The following is a set of various item states in a disk cluster.
47656 + Disk cluster is a set of items whose keys belong to the interval
47657 + [dc_key , dc_key + disk_cluster_size - 1] */
47658 +typedef enum {
47659 + DC_INVALID_STATE = 0,
47660 + DC_FIRST_ITEM = 1,
47661 + DC_CHAINED_ITEM = 2,
47662 + DC_AFTER_CLUSTER = 3
47663 +} dc_item_stat;
47664 +
47665 +/* ctail-specific extension.
47666 + In particular this describes parameters of disk cluster an item belongs to */
47667 +typedef struct {
47668 + int shift; /* this contains cluster_shift extracted from
47669 + ctail_item_format (above), or UCTAIL_SHIFT
47670 + (the last one is the "magic" of unprepped disk clusters)*/
47671 + int dsize; /* size of a prepped disk cluster */
47672 + int ncount; /* count of nodes occupied by a disk cluster */
47673 +} ctail_coord_extension_t;
47674 +
47675 +struct cut_list;
47676 +
47677 +/* plugin->item.b.* */
47678 +int can_contain_key_ctail(const coord_t *, const reiser4_key *,
47679 + const reiser4_item_data *);
47680 +int mergeable_ctail(const coord_t * p1, const coord_t * p2);
47681 +pos_in_node_t nr_units_ctail(const coord_t * coord);
47682 +int estimate_ctail(const coord_t * coord, const reiser4_item_data * data);
47683 +void print_ctail(const char *prefix, coord_t * coord);
47684 +lookup_result lookup_ctail(const reiser4_key *, lookup_bias, coord_t *);
47685 +
47686 +int paste_ctail(coord_t * coord, reiser4_item_data * data,
47687 + carry_plugin_info * info UNUSED_ARG);
47688 +int init_ctail(coord_t *, coord_t *, reiser4_item_data *);
47689 +int can_shift_ctail(unsigned free_space, coord_t * coord,
47690 + znode * target, shift_direction pend, unsigned *size,
47691 + unsigned want);
47692 +void copy_units_ctail(coord_t * target, coord_t * source, unsigned from,
47693 + unsigned count, shift_direction where_is_free_space,
47694 + unsigned free_space);
47695 +int cut_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47696 + carry_cut_data *, reiser4_key * smallest_removed,
47697 + reiser4_key * new_first);
47698 +int kill_units_ctail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
47699 + carry_kill_data *, reiser4_key * smallest_removed,
47700 + reiser4_key * new_first);
47701 +int ctail_ok(const coord_t * coord);
47702 +int check_ctail(const coord_t * coord, const char **error);
47703 +
47704 +/* plugin->u.item.s.* */
47705 +int read_ctail(struct file *, flow_t *, hint_t *);
47706 +int readpage_ctail(void *, struct page *);
47707 +int readpages_ctail(struct file *, struct address_space *, struct list_head *);
47708 +reiser4_key *append_key_ctail(const coord_t *, reiser4_key *);
47709 +int create_hook_ctail(const coord_t * coord, void *arg);
47710 +int kill_hook_ctail(const coord_t *, pos_in_node_t, pos_in_node_t,
47711 + carry_kill_data *);
47712 +int shift_hook_ctail(const coord_t *, unsigned, unsigned, znode *);
47713 +
47714 +/* plugin->u.item.f */
47715 +int utmost_child_ctail(const coord_t *, sideof, jnode **);
47716 +int scan_ctail(flush_scan *);
47717 +int convert_ctail(flush_pos_t *);
47718 +size_t inode_scaled_cluster_size(struct inode *);
47719 +
47720 +#endif /* __FS_REISER4_CTAIL_H__ */
47721 +
47722 +/* Make Linus happy.
47723 + Local variables:
47724 + c-indentation-style: "K&R"
47725 + mode-name: "LC"
47726 + c-basic-offset: 8
47727 + tab-width: 8
47728 + fill-column: 120
47729 + End:
47730 +*/
47731 diff --git a/fs/reiser4/plugin/item/extent.c b/fs/reiser4/plugin/item/extent.c
47732 new file mode 100644
47733 index 0000000..e35a4d5
47734 --- /dev/null
47735 +++ b/fs/reiser4/plugin/item/extent.c
47736 @@ -0,0 +1,197 @@
47737 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47738 +
47739 +#include "item.h"
47740 +#include "../../key.h"
47741 +#include "../../super.h"
47742 +#include "../../carry.h"
47743 +#include "../../inode.h"
47744 +#include "../../page_cache.h"
47745 +#include "../../flush.h"
47746 +#include "../object.h"
47747 +
47748 +/* prepare structure reiser4_item_data. It is used to put one extent unit into the tree */
47749 +/* Audited by: green(2002.06.13) */
47750 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
47751 + int nr_extents)
47752 +{
47753 + data->data = ext_unit;
47754 + /* data->data is kernel space */
47755 + data->user = 0;
47756 + data->length = sizeof(reiser4_extent) * nr_extents;
47757 + data->arg = NULL;
47758 + data->iplug = item_plugin_by_id(EXTENT_POINTER_ID);
47759 + return data;
47760 +}
47761 +
47762 +/* how many bytes are addressed by the first @nr extents of the extent item */
47763 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr)
47764 +{
47765 + pos_in_node_t i;
47766 + reiser4_block_nr blocks;
47767 + reiser4_extent *ext;
47768 +
47769 + ext = item_body_by_coord(coord);
47770 + assert("vs-263", nr <= nr_units_extent(coord));
47771 +
47772 + blocks = 0;
47773 + for (i = 0; i < nr; i++, ext++) {
47774 + blocks += extent_get_width(ext);
47775 + }
47776 +
47777 + return blocks * current_blocksize;
47778 +}
47779 +
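+/* Worked example (editorial note, not part of the original patch): with
+ * the default 4096-byte blocksize, an extent item holding units of widths
+ * {3, 2, 5} gives reiser4_extent_size(coord, 2) == (3 + 2) * 4096 ==
+ * 20480 bytes; passing nr == nr_units_extent(coord) yields the number of
+ * bytes addressed by the whole item.
+ */
+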
47780 +extent_state state_of_extent(reiser4_extent * ext)
47781 +{
47782 + switch ((int)extent_get_start(ext)) {
47783 + case 0:
47784 + return HOLE_EXTENT;
47785 + case 1:
47786 + return UNALLOCATED_EXTENT;
47787 + default:
47788 + break;
47789 + }
47790 + return ALLOCATED_EXTENT;
47791 +}
47792 +
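+/* Editorial sketch (not in the original patch): the extent state lives
+ * entirely in the start field -- 0 is a hole, 1 an unallocated extent,
+ * anything else a real disk address. Assuming the accessors declared in
+ * extent.h, and 12345 as an arbitrary in-range block number:
+ *
+ *	reiser4_extent ext;
+ *
+ *	reiser4_set_extent(&ext, HOLE_EXTENT_START, 16);
+ *	assert("edit-1", state_of_extent(&ext) == HOLE_EXTENT);
+ *	reiser4_set_extent(&ext, 12345, 16);
+ *	assert("edit-2", state_of_extent(&ext) == ALLOCATED_EXTENT);
+ */
+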
47793 +int extent_is_unallocated(const coord_t * item)
47794 +{
47795 + assert("jmacd-5133", item_is_extent(item));
47796 +
47797 + return state_of_extent(extent_by_coord(item)) == UNALLOCATED_EXTENT;
47798 +}
47799 +
47800 +/* set extent's start and width */
47801 +void reiser4_set_extent(reiser4_extent * ext, reiser4_block_nr start,
47802 + reiser4_block_nr width)
47803 +{
47804 + extent_set_start(ext, start);
47805 + extent_set_width(ext, width);
47806 +}
47807 +
47808 +/**
47809 + * reiser4_replace_extent - overwrite extent and paste 1 or 2 after it
47810 + * @h: replace handle carrying the coordinate of the extent to be
47811 + * overwritten, the lock handle, the replacement extent (@h->overwrite),
47812 + * the new extents to paste (@h->new_extents) and the key to paste them
47813 + * at (@h->paste_key)
47814 + * @return_inserted_position: selects what @h->coord and @h->lh are set
47815 + * to on return
47816 + *
47817 + * Overwrites one extent and pastes 1 or 2 more units after the
47818 + * overwritten one. If @return_inserted_position is 1 - @h->coord and
47819 + * @h->lh are returned set to the first of the newly inserted units, if
47820 + * it is 0 - they are returned set to the extent which was overwritten.
47821 + */
47823 +int reiser4_replace_extent(struct replace_handle *h,
47824 + int return_inserted_position)
47825 +{
47826 + int result;
47827 + znode *orig_znode;
47828 + /*ON_DEBUG(reiser4_extent orig_ext);*/ /* this is for debugging */
47829 +
47830 + assert("vs-990", coord_is_existing_unit(h->coord));
47831 + assert("vs-1375", znode_is_write_locked(h->coord->node));
47832 + assert("vs-1426", extent_get_width(&h->overwrite) != 0);
47833 + assert("vs-1427", extent_get_width(&h->new_extents[0]) != 0);
47834 + assert("vs-1427", ergo(h->nr_new_extents == 2,
47835 + extent_get_width(&h->new_extents[1]) != 0));
47836 +
47837 + /* compose structure for paste */
47838 + init_new_extent(&h->item, &h->new_extents[0], h->nr_new_extents);
47839 +
47840 + coord_dup(&h->coord_after, h->coord);
47841 + init_lh(&h->lh_after);
47842 + copy_lh(&h->lh_after, h->lh);
47843 + reiser4_tap_init(&h->watch, &h->coord_after, &h->lh_after, ZNODE_WRITE_LOCK);
47844 + reiser4_tap_monitor(&h->watch);
47845 +
47846 + ON_DEBUG(h->orig_ext = *extent_by_coord(h->coord));
47847 + orig_znode = h->coord->node;
47848 +
47849 +#if REISER4_DEBUG
47850 + /* make sure that key is set properly */
47851 + unit_key_by_coord(h->coord, &h->tmp);
47852 + set_key_offset(&h->tmp,
47853 + get_key_offset(&h->tmp) +
47854 + extent_get_width(&h->overwrite) * current_blocksize);
47855 + assert("vs-1080", keyeq(&h->tmp, &h->paste_key));
47856 +#endif
47857 +
47858 + /* set insert point after unit to be replaced */
47859 + h->coord->between = AFTER_UNIT;
47860 +
47861 + result = insert_into_item(h->coord, return_inserted_position ? h->lh : NULL,
47862 + &h->paste_key, &h->item, h->flags);
47863 + if (!result) {
47864 + /* now we have to replace the unit after which new units were
47865 + inserted. Its position is tracked by @watch */
47866 + reiser4_extent *ext;
47867 + znode *node;
47868 +
47869 + node = h->coord_after.node;
47870 + if (node != orig_znode) {
47871 + coord_clear_iplug(&h->coord_after);
47872 + result = zload(node);
47873 + }
47874 +
47875 + if (likely(!result)) {
47876 + ext = extent_by_coord(&h->coord_after);
47877 +
47878 + assert("vs-987", znode_is_loaded(node));
47879 + assert("vs-988", !memcmp(ext, &h->orig_ext, sizeof(*ext)));
47880 +
47881 + /* overwrite extent unit */
47882 + memcpy(ext, &h->overwrite, sizeof(reiser4_extent));
47883 + znode_make_dirty(node);
47884 +
47885 + if (node != orig_znode)
47886 + zrelse(node);
47887 +
47888 + if (return_inserted_position == 0) {
47889 + /* coord and lh are to be set to overwritten
47890 + extent */
47891 + assert("vs-1662",
47892 + WITH_DATA(node, !memcmp(&h->overwrite,
47893 + extent_by_coord(
47894 + &h->coord_after),
47895 + sizeof(reiser4_extent))));
47896 +
47897 + *h->coord = h->coord_after;
47898 + done_lh(h->lh);
47899 + copy_lh(h->lh, &h->lh_after);
47900 + } else {
47901 + /* h->coord and h->lh are to be set to first of
47902 + inserted units */
47903 + assert("vs-1663",
47904 + WITH_DATA(h->coord->node,
47905 + !memcmp(&h->new_extents[0],
47906 + extent_by_coord(h->coord),
47907 + sizeof(reiser4_extent))));
47908 + assert("vs-1664", h->lh->node == h->coord->node);
47909 + }
47910 + }
47911 + }
47912 + reiser4_tap_done(&h->watch);
47913 +
47914 + return result;
47915 +}
47916 +
47917 +lock_handle *znode_lh(znode *node)
47918 +{
47919 + assert("vs-1371", znode_is_write_locked(node));
47920 + assert("vs-1372", znode_is_wlocked_once(node));
47921 + return list_entry(node->lock.owners.next, lock_handle, owners_link);
47922 +}
47923 +
47924 +/*
47925 + * Local variables:
47926 + * c-indentation-style: "K&R"
47927 + * mode-name: "LC"
47928 + * c-basic-offset: 8
47929 + * tab-width: 8
47930 + * fill-column: 79
47931 + * scroll-step: 1
47932 + * End:
47933 + */
47934 diff --git a/fs/reiser4/plugin/item/extent.h b/fs/reiser4/plugin/item/extent.h
47935 new file mode 100644
47936 index 0000000..d817d1b
47937 --- /dev/null
47938 +++ b/fs/reiser4/plugin/item/extent.h
47939 @@ -0,0 +1,231 @@
47940 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
47941 +
47942 +#ifndef __REISER4_EXTENT_H__
47943 +#define __REISER4_EXTENT_H__
47944 +
47945 +/* on disk extent */
47946 +typedef struct {
47947 + reiser4_dblock_nr start;
47948 + reiser4_dblock_nr width;
47949 +} reiser4_extent;
47950 +
47951 +typedef struct extent_stat {
47952 + int unallocated_units;
47953 + int unallocated_blocks;
47954 + int allocated_units;
47955 + int allocated_blocks;
47956 + int hole_units;
47957 + int hole_blocks;
47958 +} extent_stat;
47959 +
47960 +/* extents in an extent item can be either holes, or unallocated or allocated
47961 + extents */
47962 +typedef enum {
47963 + HOLE_EXTENT,
47964 + UNALLOCATED_EXTENT,
47965 + ALLOCATED_EXTENT
47966 +} extent_state;
47967 +
47968 +#define HOLE_EXTENT_START 0
47969 +#define UNALLOCATED_EXTENT_START 1
47970 +#define UNALLOCATED_EXTENT_START2 2
47971 +
47972 +typedef struct {
47973 + reiser4_block_nr pos_in_unit;
47974 + reiser4_block_nr width; /* width of current unit */
47975 + pos_in_node_t nr_units; /* number of units */
47976 + int ext_offset; /* offset from the beginning of zdata() */
47977 + unsigned long expected_page;
47978 +#if REISER4_DEBUG
47979 + reiser4_extent extent;
47980 +#endif
47981 +} extent_coord_extension_t;
47982 +
47983 +/* macros to set/get fields of on-disk extent */
47984 +static inline reiser4_block_nr extent_get_start(const reiser4_extent * ext)
47985 +{
47986 + return le64_to_cpu(ext->start);
47987 +}
47988 +
47989 +static inline reiser4_block_nr extent_get_width(const reiser4_extent * ext)
47990 +{
47991 + return le64_to_cpu(ext->width);
47992 +}
47993 +
47994 +extern __u64 reiser4_current_block_count(void);
47995 +
47996 +static inline void
47997 +extent_set_start(reiser4_extent * ext, reiser4_block_nr start)
47998 +{
47999 + cassert(sizeof(ext->start) == 8);
48000 + assert("nikita-2510",
48001 + ergo(start > 1, start < reiser4_current_block_count()));
48002 + put_unaligned(cpu_to_le64(start), &ext->start);
48003 +}
48004 +
48005 +static inline void
48006 +extent_set_width(reiser4_extent * ext, reiser4_block_nr width)
48007 +{
48008 + cassert(sizeof(ext->width) == 8);
48009 + assert("", width > 0);
48010 + put_unaligned(cpu_to_le64(width), &ext->width);
48011 + assert("nikita-2511",
48012 + ergo(extent_get_start(ext) > 1,
48013 + extent_get_start(ext) + width <=
48014 + reiser4_current_block_count()));
48015 +}
48016 +
48017 +#define extent_item(coord) \
48018 +({ \
48019 + assert("nikita-3143", item_is_extent(coord)); \
48020 + ((reiser4_extent *)item_body_by_coord (coord)); \
48021 +})
48022 +
48023 +#define extent_by_coord(coord) \
48024 +({ \
48025 + assert("nikita-3144", item_is_extent(coord)); \
48026 + (extent_item (coord) + (coord)->unit_pos); \
48027 +})
48028 +
48029 +#define width_by_coord(coord) \
48030 +({ \
48031 + assert("nikita-3145", item_is_extent(coord)); \
48032 + extent_get_width (extent_by_coord(coord)); \
48033 +})
48034 +
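+/* Editorial note (not part of the original patch): the on-disk fields are
+ * little-endian 64-bit values, so all access must go through the helpers
+ * and macros above, e.g.:
+ *
+ *	reiser4_extent *ext = extent_by_coord(coord);
+ *	reiser4_block_nr first = extent_get_start(ext);
+ *	reiser4_block_nr blocks = width_by_coord(coord);
+ *
+ * Reading ext->start directly would return a raw little-endian value and
+ * break on big-endian hosts.
+ */
+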
48035 +struct carry_cut_data;
48036 +struct carry_kill_data;
48037 +
48038 +/* plugin->u.item.b.* */
48039 +reiser4_key *max_key_inside_extent(const coord_t *, reiser4_key *);
48040 +int can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
48041 + const reiser4_item_data *);
48042 +int mergeable_extent(const coord_t * p1, const coord_t * p2);
48043 +pos_in_node_t nr_units_extent(const coord_t *);
48044 +lookup_result lookup_extent(const reiser4_key *, lookup_bias, coord_t *);
48045 +void init_coord_extent(coord_t *);
48046 +int init_extent(coord_t *, reiser4_item_data *);
48047 +int paste_extent(coord_t *, reiser4_item_data *, carry_plugin_info *);
48048 +int can_shift_extent(unsigned free_space,
48049 + coord_t * source, znode * target, shift_direction,
48050 + unsigned *size, unsigned want);
48051 +void copy_units_extent(coord_t * target, coord_t * source, unsigned from,
48052 + unsigned count, shift_direction where_is_free_space,
48053 + unsigned free_space);
48054 +int kill_hook_extent(const coord_t *, pos_in_node_t from, pos_in_node_t count,
48055 + struct carry_kill_data *);
48056 +int create_hook_extent(const coord_t * coord, void *arg);
48057 +int cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48058 + struct carry_cut_data *, reiser4_key * smallest_removed,
48059 + reiser4_key * new_first);
48060 +int kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
48061 + struct carry_kill_data *, reiser4_key * smallest_removed,
48062 + reiser4_key * new_first);
48063 +reiser4_key *unit_key_extent(const coord_t *, reiser4_key *);
48064 +reiser4_key *max_unit_key_extent(const coord_t *, reiser4_key *);
48065 +void print_extent(const char *, coord_t *);
48066 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** child);
48067 +int utmost_child_real_block_extent(const coord_t * coord, sideof side,
48068 + reiser4_block_nr * block);
48069 +void item_stat_extent(const coord_t * coord, void *vp);
48070 +int reiser4_check_extent(const coord_t * coord, const char **error);
48071 +
48072 +/* plugin->u.item.s.file.* */
48073 +ssize_t reiser4_write_extent(struct file *, const char __user *,
48074 + size_t, loff_t *);
48075 +int reiser4_read_extent(struct file *, flow_t *, hint_t *);
48076 +int reiser4_readpage_extent(void *, struct page *);
48077 +int reiser4_do_readpage_extent(reiser4_extent*, reiser4_block_nr, struct page*);
48078 +reiser4_key *append_key_extent(const coord_t *, reiser4_key *);
48079 +void init_coord_extension_extent(uf_coord_t *, loff_t offset);
48080 +int get_block_address_extent(const coord_t *, sector_t block,
48081 + sector_t * result);
48082 +
48083 +/* these are used in flush.c
48084 + FIXME-VS: should they be somewhere in item_plugin? */
48085 +int allocate_extent_item_in_place(coord_t *, lock_handle *, flush_pos_t * pos);
48086 +int allocate_and_copy_extent(znode * left, coord_t * right, flush_pos_t * pos,
48087 + reiser4_key * stop_key);
48088 +
48089 +int extent_is_unallocated(const coord_t * item); /* True if this extent is unallocated (i.e., not a hole, not allocated). */
48090 +__u64 extent_unit_index(const coord_t * item); /* Block offset of this unit. */
48091 +__u64 extent_unit_width(const coord_t * item); /* Number of blocks in this unit. */
48092 +
48093 +/* plugin->u.item.f. */
48094 +int reiser4_scan_extent(flush_scan * scan);
48095 +extern int key_by_offset_extent(struct inode *, loff_t, reiser4_key *);
48096 +
48097 +reiser4_item_data *init_new_extent(reiser4_item_data * data, void *ext_unit,
48098 + int nr_extents);
48099 +reiser4_block_nr reiser4_extent_size(const coord_t * coord, pos_in_node_t nr);
48100 +extent_state state_of_extent(reiser4_extent * ext);
48101 +void reiser4_set_extent(reiser4_extent *, reiser4_block_nr start,
48102 + reiser4_block_nr width);
48103 +int reiser4_update_extent(struct inode *, jnode *, loff_t pos,
48104 + int *plugged_hole);
48105 +
48106 +#include "../../coord.h"
48107 +#include "../../lock.h"
48108 +#include "../../tap.h"
48109 +
48110 +struct replace_handle {
48111 + /* these are to be set before calling reiser4_replace_extent */
48112 + coord_t *coord;
48113 + lock_handle *lh;
48114 + reiser4_key key;
48115 + reiser4_key *pkey;
48116 + reiser4_extent overwrite;
48117 + reiser4_extent new_extents[2];
48118 + int nr_new_extents;
48119 + unsigned flags;
48120 +
48121 + /* these are used by reiser4_replace_extent */
48122 + reiser4_item_data item;
48123 + coord_t coord_after;
48124 + lock_handle lh_after;
48125 + tap_t watch;
48126 + reiser4_key paste_key;
48127 +#if REISER4_DEBUG
48128 + reiser4_extent orig_ext;
48129 + reiser4_key tmp;
48130 +#endif
48131 +};
48132 +
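+/* Editorial sketch (not part of the original patch): minimal setup of a
+ * replace_handle, mirroring the last-block case of plug_hole() in
+ * extent_file_ops.c -- shrink the hole unit at @coord by one block and
+ * paste an unallocated unit of width 1 after it:
+ *
+ *	struct replace_handle rh;
+ *
+ *	rh.coord = coord;
+ *	rh.lh = lh;
+ *	rh.flags = 0;
+ *	reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
+ *	reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START, 1);
+ *	rh.nr_new_extents = 1;
+ *	unit_key_by_coord(coord, &rh.paste_key);
+ *	set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
+ *		       extent_get_width(&rh.overwrite) * current_blocksize);
+ *	result = reiser4_replace_extent(&rh, 1);
+ */
+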
48133 +/* this structure is kmalloced before calling make_extent to avoid excessive
48134 + stack consumption on plug_hole->reiser4_replace_extent */
48135 +struct make_extent_handle {
48136 + uf_coord_t *uf_coord;
48137 + reiser4_block_nr blocknr;
48138 + int created;
48139 + struct inode *inode;
48140 + union {
48141 + struct {
48142 + } append;
48143 + struct replace_handle replace;
48144 + } u;
48145 +};
48146 +
48147 +int reiser4_replace_extent(struct replace_handle *,
48148 + int return_inserted_position);
48149 +lock_handle *znode_lh(znode *);
48150 +
48151 +/* the reiser4 repacker support */
48152 +struct repacker_cursor;
48153 +extern int process_extent_backward_for_repacking(tap_t *,
48154 + struct repacker_cursor *);
48155 +extern int mark_extent_for_repacking(tap_t *, int);
48156 +
48157 +#define coord_by_uf_coord(uf_coord) (&((uf_coord)->coord))
48158 +#define ext_coord_by_uf_coord(uf_coord) (&((uf_coord)->extension.extent))
48159 +
48160 +/* __REISER4_EXTENT_H__ */
48161 +#endif
48162 +/*
48163 + Local variables:
48164 + c-indentation-style: "K&R"
48165 + mode-name: "LC"
48166 + c-basic-offset: 8
48167 + tab-width: 8
48168 + fill-column: 120
48169 + End:
48170 +*/
48171 diff --git a/fs/reiser4/plugin/item/extent_file_ops.c b/fs/reiser4/plugin/item/extent_file_ops.c
48172 new file mode 100644
48173 index 0000000..cf337c4
48174 --- /dev/null
48175 +++ b/fs/reiser4/plugin/item/extent_file_ops.c
48176 @@ -0,0 +1,1435 @@
48177 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
48178 +
48179 +#include "item.h"
48180 +#include "../../inode.h"
48181 +#include "../../page_cache.h"
48182 +#include "../object.h"
48183 +
48184 +#include <linux/quotaops.h>
48185 +#include <linux/swap.h>
48186 +#include "../../../../mm/filemap.h"
48187 +
48188 +static inline reiser4_extent *ext_by_offset(const znode *node, int offset)
48189 +{
48190 + reiser4_extent *ext;
48191 +
48192 + ext = (reiser4_extent *) (zdata(node) + offset);
48193 + return ext;
48194 +}
48195 +
48196 +/**
48197 + * check_uf_coord - verify coord extension
48198 + * @uf_coord: extended coordinate to verify
48199 + * @key: key @uf_coord is expected to correspond to, or NULL
48200 + *
48201 + * Makes sure that all fields of @uf_coord are set properly. If @key is
48202 + * specified - check whether @uf_coord is set correspondingly.
48203 + */
48204 +static void check_uf_coord(const uf_coord_t *uf_coord, const reiser4_key *key)
48205 +{
48206 +#if REISER4_DEBUG
48207 + const coord_t *coord;
48208 + const extent_coord_extension_t *ext_coord;
48209 + reiser4_extent *ext;
48210 +
48211 + coord = &uf_coord->coord;
48212 + ext_coord = &uf_coord->extension.extent;
48213 + ext = ext_by_offset(coord->node, uf_coord->extension.extent.ext_offset);
48214 +
48215 + assert("",
48216 + WITH_DATA(coord->node,
48217 + (uf_coord->valid == 1 &&
48218 + coord_is_iplug_set(coord) &&
48219 + item_is_extent(coord) &&
48220 + ext_coord->nr_units == nr_units_extent(coord) &&
48221 + ext == extent_by_coord(coord) &&
48222 + ext_coord->width == extent_get_width(ext) &&
48223 + coord->unit_pos < ext_coord->nr_units &&
48224 + ext_coord->pos_in_unit < ext_coord->width &&
48225 + memcmp(ext, &ext_coord->extent,
48226 + sizeof(reiser4_extent)) == 0)));
48227 + if (key) {
48228 + reiser4_key coord_key;
48229 +
48230 + unit_key_by_coord(&uf_coord->coord, &coord_key);
48231 + set_key_offset(&coord_key,
48232 + get_key_offset(&coord_key) +
48233 + (uf_coord->extension.extent.
48234 + pos_in_unit << PAGE_CACHE_SHIFT));
48235 + assert("", keyeq(key, &coord_key));
48236 + }
48237 +#endif
48238 +}
48239 +
48240 +static inline reiser4_extent *ext_by_ext_coord(const uf_coord_t *uf_coord)
48241 +{
48242 + check_uf_coord(uf_coord, NULL);
48243 +
48244 + return ext_by_offset(uf_coord->coord.node,
48245 + uf_coord->extension.extent.ext_offset);
48246 +}
48247 +
48248 +#if REISER4_DEBUG
48249 +
48250 +/**
48251 + * offset_is_in_unit - check whether an offset falls into an extent unit
48252 + *
48253 + * Returns 1 if offset @off is inside of the extent unit pointed to by
48254 + * @coord, 0 otherwise.
48255 + */
48258 +static int offset_is_in_unit(const coord_t *coord, loff_t off)
48259 +{
48260 + reiser4_key unit_key;
48261 + __u64 unit_off;
48262 + reiser4_extent *ext;
48263 +
48264 + ext = extent_by_coord(coord);
48265 +
48266 + unit_key_extent(coord, &unit_key);
48267 + unit_off = get_key_offset(&unit_key);
48268 + if (off < unit_off)
48269 + return 0;
48270 + if (off >= (unit_off + (current_blocksize * extent_get_width(ext))))
48271 + return 0;
48272 + return 1;
48273 +}
48274 +
48275 +static int
48276 +coord_matches_key_extent(const coord_t * coord, const reiser4_key * key)
48277 +{
48278 + reiser4_key item_key;
48279 +
48280 + assert("vs-771", coord_is_existing_unit(coord));
48281 + assert("vs-1258", keylt(key, append_key_extent(coord, &item_key)));
48282 + assert("vs-1259", keyge(key, item_key_by_coord(coord, &item_key)));
48283 +
48284 + return offset_is_in_unit(coord, get_key_offset(key));
48285 +}
48286 +
48287 +#endif
48288 +
48289 +/**
48290 + * can_append - check whether a key continues the item at a coord
48291 + * @key: key of the first byte to be written
48292 + * @coord: coord of the last existing item of the file
48293 + *
48294 + * Returns 1 if @key is equal to an append key of the item @coord is set to
48295 + */
48296 +static int can_append(const reiser4_key *key, const coord_t *coord)
48297 +{
48298 + reiser4_key append_key;
48299 +
48300 + return keyeq(key, append_key_extent(coord, &append_key));
48301 +}
48302 +
48303 +/**
48304 + * append_hole - append the last file item with a hole extent
48305 + * @coord: coord of the last unit of the last item of the file
48306 + * @lh: lock handle of the twig node @coord is in
48307 + * @key: key up to which the file is to be expanded with a hole
48308 + */
48310 +static int append_hole(coord_t *coord, lock_handle *lh,
48311 + const reiser4_key *key)
48312 +{
48313 + reiser4_key append_key;
48314 + reiser4_block_nr hole_width;
48315 + reiser4_extent *ext, new_ext;
48316 + reiser4_item_data idata;
48317 +
48318 + /* last item of file may have to be appended with hole */
48319 + assert("vs-708", znode_get_level(coord->node) == TWIG_LEVEL);
48320 + assert("vs-714", item_id_by_coord(coord) == EXTENT_POINTER_ID);
48321 +
48322 + /* key of first byte which is not addressed by this extent */
48323 + append_key_extent(coord, &append_key);
48324 +
48325 + assert("", keyle(&append_key, key));
48326 +
48327 + /*
48328 + * extent item has to be appended with hole. Calculate length of that
48329 + * hole
48330 + */
48331 + hole_width = ((get_key_offset(key) - get_key_offset(&append_key) +
48332 + current_blocksize - 1) >> current_blocksize_bits);
48333 + assert("vs-954", hole_width > 0);
48334 +
48335 + /* set coord after last unit */
48336 + coord_init_after_item_end(coord);
48337 +
48338 + /* get last extent in the item */
48339 + ext = extent_by_coord(coord);
48340 + if (state_of_extent(ext) == HOLE_EXTENT) {
48341 + /*
48342 + * last extent of a file is hole extent. Widen that extent by
48343 + * @hole_width blocks. Note that we do not worry about
48344 + * overflowing - extent width is 64 bits
48345 + */
48346 + reiser4_set_extent(ext, HOLE_EXTENT_START,
48347 + extent_get_width(ext) + hole_width);
48348 + znode_make_dirty(coord->node);
48349 + return 0;
48350 + }
48351 +
48352 + /* append last item of the file with hole extent unit */
48353 + assert("vs-713", (state_of_extent(ext) == ALLOCATED_EXTENT ||
48354 + state_of_extent(ext) == UNALLOCATED_EXTENT));
48355 +
48356 + reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
48357 + init_new_extent(&idata, &new_ext, 1);
48358 + return insert_into_item(coord, lh, &append_key, &idata, 0);
48359 +}
48360 +
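+/* Worked example (editorial note): with a 4096-byte blocksize, a file
+ * whose extents currently end at offset 8192 being written at offset
+ * 20480 needs
+ *
+ *	hole_width = (20480 - 8192 + 4095) >> 12 = 3 blocks,
+ *
+ * i.e. the ceiling of the byte gap over the blocksize; the hole is either
+ * merged into a trailing hole unit or pasted as a new unit by the code
+ * above.
+ */
+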
48361 +/**
48362 + * check_jnodes - debugging check of jnode keys against a twig node
48363 + * @twig: longterm locked twig node
48364 + * @key: key of the first of @count consecutive blocks
48365 + * @count: number of blocks whose keys must fit into @twig
48366 + */
48367 +static void check_jnodes(znode *twig, const reiser4_key *key, int count)
48368 +{
48369 +#if REISER4_DEBUG
48370 + coord_t c;
48371 + reiser4_key node_key, jnode_key;
48372 +
48373 + jnode_key = *key;
48374 +
48375 + assert("", twig != NULL);
48376 + assert("", znode_get_level(twig) == TWIG_LEVEL);
48377 + assert("", znode_is_write_locked(twig));
48378 +
48379 + zload(twig);
48380 + /* get the smallest key in twig node */
48381 + coord_init_first_unit(&c, twig);
48382 + unit_key_by_coord(&c, &node_key);
48383 + assert("", keyle(&node_key, &jnode_key));
48384 +
48385 + coord_init_last_unit(&c, twig);
48386 + unit_key_by_coord(&c, &node_key);
48387 + if (item_plugin_by_coord(&c)->s.file.append_key)
48388 + item_plugin_by_coord(&c)->s.file.append_key(&c, &node_key);
48389 + set_key_offset(&jnode_key,
48390 + get_key_offset(&jnode_key) + (loff_t)count * PAGE_CACHE_SIZE - 1);
48391 + assert("", keylt(&jnode_key, &node_key));
48392 + zrelse(twig);
48393 +#endif
48394 +}
48395 +
48396 +/**
48397 + * append_last_extent - append last file item
48398 + * @uf_coord: coord to start insertion from
48399 + * @key: key of the first byte to be written
48400 + * @jnodes: array of jnodes
48401 + * @count: number of jnodes in the array
48402 + *
48403 + * There is already at least one extent item of the file in the tree. Append
48404 + * the last of them with an unallocated extent unit of width @count. Assign
48405 + * fake block numbers to jnodes corresponding to the inserted extent.
48406 + * Returns number of jnodes handled or error code.
48407 + */
48406 +static int append_last_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48407 + jnode **jnodes, int count)
48408 +{
48409 + int result;
48410 + reiser4_extent new_ext;
48411 + reiser4_item_data idata;
48412 + coord_t *coord;
48413 + extent_coord_extension_t *ext_coord;
48414 + reiser4_extent *ext;
48415 + reiser4_block_nr block;
48416 + jnode *node;
48417 + int i;
48418 +
48419 + coord = &uf_coord->coord;
48420 + ext_coord = &uf_coord->extension.extent;
48421 + ext = ext_by_ext_coord(uf_coord);
48422 +
48423 + /* check correctness of position in the item */
48424 + assert("vs-228", coord->unit_pos == coord_last_unit_pos(coord));
48425 + assert("vs-1311", coord->between == AFTER_UNIT);
48426 + assert("vs-1302", ext_coord->pos_in_unit == ext_coord->width - 1);
48427 +
48428 + if (!can_append(key, coord)) {
48429 + /* hole extent has to be inserted */
48430 + result = append_hole(coord, uf_coord->lh, key);
48431 + uf_coord->valid = 0;
48432 + return result;
48433 + }
48434 +
48435 + if (count == 0)
48436 + return 0;
48437 +
48438 + assert("", get_key_offset(key) == (loff_t)index_jnode(jnodes[0]) * PAGE_CACHE_SIZE);
48439 +
48440 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host,
48441 + count);
48442 + BUG_ON(result != 0);
48443 +
48444 + switch (state_of_extent(ext)) {
48445 + case UNALLOCATED_EXTENT:
48446 + /*
48447 + * last extent unit of the file is unallocated one. Increase
48448 + * its width by @count
48449 + */
48450 + reiser4_set_extent(ext, UNALLOCATED_EXTENT_START,
48451 + extent_get_width(ext) + count);
48452 + znode_make_dirty(coord->node);
48453 +
48454 + /* update coord extension */
48455 + ext_coord->width += count;
48456 + ON_DEBUG(extent_set_width
48457 + (&uf_coord->extension.extent.extent,
48458 + ext_coord->width));
48459 + break;
48460 +
48461 + case HOLE_EXTENT:
48462 + case ALLOCATED_EXTENT:
48463 + /*
48464 + * last extent unit of the file is either hole or allocated
48465 + * one. Append one unallocated extent of width @count
48466 + */
48467 + reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
48468 + init_new_extent(&idata, &new_ext, 1);
48469 + result = insert_into_item(coord, uf_coord->lh, key, &idata, 0);
48470 + uf_coord->valid = 0;
48471 + if (result)
48472 + return result;
48473 + break;
48474 +
48475 + default:
48476 + return RETERR(-EIO);
48477 + }
48478 +
48479 + /*
48480 + * make sure that we hold long term locked twig node containing all
48481 + * jnodes we are about to capture
48482 + */
48483 + check_jnodes(uf_coord->lh->node, key, count);
48484 +
48485 + /*
48486 + * assign fake block numbers to all jnodes. FIXME: make sure whether
48487 + * twig node containing inserted extent item is locked
48488 + */
48489 + block = fake_blocknr_unformatted(count);
48490 + for (i = 0; i < count; i ++, block ++) {
48491 + node = jnodes[i];
48492 + spin_lock_jnode(node);
48493 + JF_SET(node, JNODE_CREATED);
48494 + jnode_set_block(node, &block);
48495 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
48496 + BUG_ON(result != 0);
48497 + jnode_make_dirty_locked(node);
48498 + spin_unlock_jnode(node);
48499 + }
48500 + return count;
48501 +}
48502 +
48503 +/**
48504 + * insert_first_hole - insert hole extent into tree
48505 + * @coord: coord to insert at
48506 + * @lh: lock handle
48507 + * @key: key of the write position; the hole spans from file offset 0 up
48508 + * to this key
48509 + */
48511 +static int insert_first_hole(coord_t *coord, lock_handle *lh,
48512 + const reiser4_key *key)
48513 +{
48514 + reiser4_extent new_ext;
48515 + reiser4_item_data idata;
48516 + reiser4_key item_key;
48517 + reiser4_block_nr hole_width;
48518 +
48519 + /* @coord must be set for inserting of new item */
48520 + assert("vs-711", coord_is_between_items(coord));
48521 +
48522 + item_key = *key;
48523 + set_key_offset(&item_key, 0ull);
48524 +
48525 + hole_width = ((get_key_offset(key) + current_blocksize - 1) >>
48526 + current_blocksize_bits);
48527 + assert("vs-710", hole_width > 0);
48528 +
48529 + /* compose body of hole extent and insert item into tree */
48530 + reiser4_set_extent(&new_ext, HOLE_EXTENT_START, hole_width);
48531 + init_new_extent(&idata, &new_ext, 1);
48532 + return insert_extent_by_coord(coord, &idata, &item_key, lh);
48533 +}
48534 +
48535 +
48536 +/**
48537 + * insert_first_extent - insert first file item
48538 + * @uf_coord: coord to start insertion from
48539 + * @key: key of the first byte of the write
48540 + * @jnodes: array of jnodes
48541 + * @count: number of jnodes in the array
48542 + * @inode: inode of file
48543 + *
48544 + * There are no items of file @inode in the tree yet. Insert an unallocated
48545 + * extent of width @count into the tree, or a hole extent if the write does
48546 + * not start at the beginning of the file. Assign fake block numbers to
48547 + * jnodes corresponding to the inserted unallocated extent. Returns number
48548 + * of jnodes handled or error code.
48549 + */
48549 +static int insert_first_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48550 + jnode **jnodes, int count,
48551 + struct inode *inode)
48552 +{
48553 + int result;
48554 + int i;
48555 + reiser4_extent new_ext;
48556 + reiser4_item_data idata;
48557 + reiser4_block_nr block;
48558 + unix_file_info_t *uf_info;
48559 + jnode *node;
48560 +
48561 + /* first extent insertion starts at leaf level */
48562 + assert("vs-719", znode_get_level(uf_coord->coord.node) == LEAF_LEVEL);
48563 + assert("vs-711", coord_is_between_items(&uf_coord->coord));
48564 +
48565 + if (get_key_offset(key) != 0) {
48566 + result = insert_first_hole(&uf_coord->coord, uf_coord->lh, key);
48567 + uf_coord->valid = 0;
48568 + uf_info = unix_file_inode_data(inode);
48569 +
48570 + /*
48571 + * first item insertion is only possible when writing to empty
48572 + * file or performing tail conversion
48573 + */
48574 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
48575 + (reiser4_inode_get_flag(inode,
48576 + REISER4_PART_MIXED) &&
48577 + reiser4_inode_get_flag(inode,
48578 + REISER4_PART_IN_CONV))));
48579 + /* if file was empty - update its state */
48580 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
48581 + uf_info->container = UF_CONTAINER_EXTENTS;
48582 + return result;
48583 + }
48584 +
48585 + if (count == 0)
48586 + return 0;
48587 +
48588 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(jnodes[0])->host, count);
48589 + BUG_ON(result != 0);
48590 +
48591 + /*
48592 + * prepare for tree modification: compose body of item and item data
48593 + * structure needed for insertion
48594 + */
48595 + reiser4_set_extent(&new_ext, UNALLOCATED_EXTENT_START, count);
48596 + init_new_extent(&idata, &new_ext, 1);
48597 +
48598 + /* insert extent item into the tree */
48599 + result = insert_extent_by_coord(&uf_coord->coord, &idata, key,
48600 + uf_coord->lh);
48601 + if (result)
48602 + return result;
48603 +
48604 + /*
48605 + * make sure that we hold long term locked twig node containing all
48606 + * jnodes we are about to capture
48607 + */
48608 + check_jnodes(uf_coord->lh->node, key, count);
48609 + /*
48610 + * assign fake block numbers to all jnodes, capture and mark them dirty
48611 + */
48612 + block = fake_blocknr_unformatted(count);
48613 + for (i = 0; i < count; i ++, block ++) {
48614 + node = jnodes[i];
48615 + spin_lock_jnode(node);
48616 + JF_SET(node, JNODE_CREATED);
48617 + jnode_set_block(node, &block);
48618 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
48619 + BUG_ON(result != 0);
48620 + jnode_make_dirty_locked(node);
48621 + spin_unlock_jnode(node);
48622 + }
48623 +
48624 + /*
48625 + * invalidate coordinate, research must be performed to continue
48626 + * because write will continue on twig level
48627 + */
48628 + uf_coord->valid = 0;
48629 + return count;
48630 +}
48631 +
48632 +/**
48633 + * plug_hole - replace hole extent with unallocated and holes
48634 + * @uf_coord: coordinate of the block to be plugged within a hole extent
48635 + * @key: key of that block
48636 + * @how: returns a code (1-6) identifying which of the cases below was taken
48637 + *
48638 + * Creates an unallocated extent of width 1 within a hole. In the worst case
48639 + * two additional extents can be created.
48640 + */
48642 +static int plug_hole(uf_coord_t *uf_coord, const reiser4_key *key, int *how)
48643 +{
48644 + struct replace_handle rh;
48645 + reiser4_extent *ext;
48646 + reiser4_block_nr width, pos_in_unit;
48647 + coord_t *coord;
48648 + extent_coord_extension_t *ext_coord;
48649 + int return_inserted_position;
48650 +
48651 + check_uf_coord(uf_coord, key);
48652 +
48653 + rh.coord = coord_by_uf_coord(uf_coord);
48654 + rh.lh = uf_coord->lh;
48655 + rh.flags = 0;
48656 +
48657 + coord = coord_by_uf_coord(uf_coord);
48658 + ext_coord = ext_coord_by_uf_coord(uf_coord);
48659 + ext = ext_by_ext_coord(uf_coord);
48660 +
48661 + width = ext_coord->width;
48662 + pos_in_unit = ext_coord->pos_in_unit;
48663 +
48664 + *how = 0;
48665 + if (width == 1) {
48666 + reiser4_set_extent(ext, UNALLOCATED_EXTENT_START, 1);
48667 + znode_make_dirty(coord->node);
48668 + /* update uf_coord */
48669 + ON_DEBUG(ext_coord->extent = *ext);
48670 + *how = 1;
48671 + return 0;
48672 + } else if (pos_in_unit == 0) {
48673 + /* we deal with first element of extent */
48674 + if (coord->unit_pos) {
48675 + /* there is an extent to the left */
48676 + if (state_of_extent(ext - 1) == UNALLOCATED_EXTENT) {
48677 + /*
48678 + * left neighboring unit is an unallocated
48679 + * extent. Increase its width and decrease
48680 + * width of hole
48681 + */
48682 + extent_set_width(ext - 1,
48683 + extent_get_width(ext - 1) + 1);
48684 + extent_set_width(ext, width - 1);
48685 + znode_make_dirty(coord->node);
48686 +
48687 + /* update coord extension */
48688 + coord->unit_pos--;
48689 + ext_coord->width = extent_get_width(ext - 1);
48690 + ext_coord->pos_in_unit = ext_coord->width - 1;
48691 + ext_coord->ext_offset -= sizeof(reiser4_extent);
48692 + ON_DEBUG(ext_coord->extent =
48693 + *extent_by_coord(coord));
48694 + *how = 2;
48695 + return 0;
48696 + }
48697 + }
48698 + /* extent for replace */
48699 + reiser4_set_extent(&rh.overwrite, UNALLOCATED_EXTENT_START, 1);
48700 + /* extent to be inserted */
48701 + reiser4_set_extent(&rh.new_extents[0], HOLE_EXTENT_START,
48702 + width - 1);
48703 + rh.nr_new_extents = 1;
48704 +
48705 + /* have reiser4_replace_extent to return with @coord and
48706 + @uf_coord->lh set to unit which was replaced */
48707 + return_inserted_position = 0;
48708 + *how = 3;
48709 + } else if (pos_in_unit == width - 1) {
48710 + /* we deal with last element of extent */
48711 + if (coord->unit_pos < nr_units_extent(coord) - 1) {
48712 + /* there is an extent unit to the right */
48713 + if (state_of_extent(ext + 1) == UNALLOCATED_EXTENT) {
48714 + /*
48715 + * right neighboring unit is an unallocated
48716 + * extent. Increase its width and decrease
48717 + * width of hole
48718 + */
48719 + extent_set_width(ext + 1,
48720 + extent_get_width(ext + 1) + 1);
48721 + extent_set_width(ext, width - 1);
48722 + znode_make_dirty(coord->node);
48723 +
48724 + /* update coord extension */
48725 + coord->unit_pos++;
48726 + ext_coord->width = extent_get_width(ext + 1);
48727 + ext_coord->pos_in_unit = 0;
48728 + ext_coord->ext_offset += sizeof(reiser4_extent);
48729 + ON_DEBUG(ext_coord->extent =
48730 + *extent_by_coord(coord));
48731 + *how = 4;
48732 + return 0;
48733 + }
48734 + }
48735 + /* extent for replace */
48736 + reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START, width - 1);
48737 + /* extent to be inserted */
48738 + reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
48739 + 1);
48740 + rh.nr_new_extents = 1;
48741 +
48742 + /* have reiser4_replace_extent to return with @coord and
48743 + @uf_coord->lh set to unit which was inserted */
48744 + return_inserted_position = 1;
48745 + *how = 5;
48746 + } else {
48747 + /* extent for replace */
48748 + reiser4_set_extent(&rh.overwrite, HOLE_EXTENT_START,
48749 + pos_in_unit);
48750 + /* extents to be inserted */
48751 + reiser4_set_extent(&rh.new_extents[0], UNALLOCATED_EXTENT_START,
48752 + 1);
48753 + reiser4_set_extent(&rh.new_extents[1], HOLE_EXTENT_START,
48754 + width - pos_in_unit - 1);
48755 + rh.nr_new_extents = 2;
48756 +
48757 + /* have reiser4_replace_extent to return with @coord and
48758 + @uf_coord->lh set to first of units which were inserted */
48759 + return_inserted_position = 1;
48760 + *how = 6;
48761 + }
48762 + unit_key_by_coord(coord, &rh.paste_key);
48763 + set_key_offset(&rh.paste_key, get_key_offset(&rh.paste_key) +
48764 + extent_get_width(&rh.overwrite) * current_blocksize);
48765 +
48766 + uf_coord->valid = 0;
48767 + return reiser4_replace_extent(&rh, return_inserted_position);
48768 +}
48769 +
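+/* Worked example (editorial note): plugging block 2 of a 5-block hole
+ * unit [H,5] (pos_in_unit == 2, the "middle" case, *how == 6) rewrites
+ * the unit as
+ *
+ *	[H,2] [U,1] [H,2]
+ *
+ * -- the unit is overwritten with the 2-block hole preceding the plugged
+ * block, and an unallocated unit of width 1 plus the trailing 2-block
+ * hole are pasted after it.
+ */
+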
48770 +/**
48771 + * overwrite_one_block - assign a block number to one jnode
48772 + * @uf_coord: coordinate of the extent unit the block falls into
48773 + * @key: key of the block
48774 + * @node: jnode of the block
48775 + * @hole_plugged: if not NULL - set to 1 when a hole extent gets plugged
48776 + *
48777 + * If @node corresponds to a hole extent - create an unallocated extent for
48778 + * it and assign a fake block number. If @node corresponds to an allocated
48779 + * extent - assign the block number of the jnode
48780 + */
48780 +static int overwrite_one_block(uf_coord_t *uf_coord, const reiser4_key *key,
48781 + jnode *node, int *hole_plugged)
48782 +{
48783 + int result;
48784 + extent_coord_extension_t *ext_coord;
48785 + reiser4_extent *ext;
48786 + reiser4_block_nr block;
48787 + int how;
48788 +
48789 + assert("vs-1312", uf_coord->coord.between == AT_UNIT);
48790 +
48791 + result = 0;
48792 + ext_coord = ext_coord_by_uf_coord(uf_coord);
48793 + ext = ext_by_ext_coord(uf_coord);
48794 + assert("", state_of_extent(ext) != UNALLOCATED_EXTENT);
48795 +
48796 + switch (state_of_extent(ext)) {
48797 + case ALLOCATED_EXTENT:
48798 + block = extent_get_start(ext) + ext_coord->pos_in_unit;
48799 + break;
48800 +
48801 + case HOLE_EXTENT:
48802 + result = DQUOT_ALLOC_BLOCK_NODIRTY(mapping_jnode(node)->host, 1);
48803 + BUG_ON(result != 0);
48804 + result = plug_hole(uf_coord, key, &how);
48805 + if (result)
48806 + return result;
48807 + block = fake_blocknr_unformatted(1);
48808 + if (hole_plugged)
48809 + *hole_plugged = 1;
48810 + JF_SET(node, JNODE_CREATED);
48811 + break;
48812 +
48813 + default:
48814 + return RETERR(-EIO);
48815 + }
48816 +
48817 + jnode_set_block(node, &block);
48818 + return 0;
48819 +}
48820 +
48821 +/**
48822 + * move_coord - move coordinate forward
48823 + * @uf_coord: coordinate to move
48824 + *
48825 + * Move coordinate one data block pointer forward. Return 1 if the coord
48826 + * is already at the last block pointer or is invalid, 0 otherwise.
48827 + */
48828 +static int move_coord(uf_coord_t *uf_coord)
48829 +{
48830 + extent_coord_extension_t *ext_coord;
48831 +
48832 + if (uf_coord->valid == 0)
48833 + return 1;
48834 + ext_coord = &uf_coord->extension.extent;
48835 + ext_coord->pos_in_unit ++;
48836 + if (ext_coord->pos_in_unit < ext_coord->width)
48837 + /* coordinate moved within the unit */
48838 + return 0;
48839 +
48840 + /* end of unit is reached. Try to move to next unit */
48841 + ext_coord->pos_in_unit = 0;
48842 + uf_coord->coord.unit_pos ++;
48843 + if (uf_coord->coord.unit_pos < ext_coord->nr_units) {
48844 + /* coordinate moved to next unit */
48845 + ext_coord->ext_offset += sizeof(reiser4_extent);
48846 + ext_coord->width =
48847 + extent_get_width(ext_by_offset
48848 + (uf_coord->coord.node,
48849 + ext_coord->ext_offset));
48850 + ON_DEBUG(ext_coord->extent =
48851 + *ext_by_offset(uf_coord->coord.node,
48852 + ext_coord->ext_offset));
48853 + return 0;
48854 + }
48855 + /* end of item is reached */
48856 + uf_coord->valid = 0;
48857 + return 1;
48858 +}
48859 +
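+/* Editorial sketch (not part of the original patch): callers walk block
+ * pointers with a loop of the form
+ *
+ *	do {
+ *		... handle the block at uf_coord ...
+ *	} while (move_coord(uf_coord) == 0);
+ *
+ * stopping once the item is exhausted; a return value of 1 also clears
+ * uf_coord->valid, so a research is required before the walk can resume.
+ */
+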
48860 +/**
48861 + * overwrite_extent - assign block numbers to jnodes over existing units
48862 + * @uf_coord: coordinate of the first block to overwrite
48863 + * @key: key of the first block
48864 + * @jnodes: array of jnodes
48865 + * @count: number of jnodes in the array
48866 + * @plugged_hole: if not NULL - set to 1 when a hole extent gets plugged
48867 + *
48868 + * Returns number of handled jnodes or error code.
48869 + */
48866 +static int overwrite_extent(uf_coord_t *uf_coord, const reiser4_key *key,
48867 + jnode **jnodes, int count, int *plugged_hole)
48868 +{
48869 + int result;
48870 + reiser4_key k;
48871 + int i;
48872 + jnode *node;
48873 +
48874 + k = *key;
48875 + for (i = 0; i < count; i ++) {
48876 + node = jnodes[i];
48877 + if (*jnode_get_block(node) == 0) {
48878 + result = overwrite_one_block(uf_coord, &k, node, plugged_hole);
48879 + if (result)
48880 + return result;
48881 + }
48882 + /*
48883 + * make sure that we hold long term locked twig node containing
48884 + * all jnodes we are about to capture
48885 + */
48886 + check_jnodes(uf_coord->lh->node, &k, 1);
48887 + /*
48888 + * assign fake block numbers to all jnodes, capture and mark
48889 + * them dirty
48890 + */
48891 + spin_lock_jnode(node);
48892 + result = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
48893 + BUG_ON(result != 0);
48894 + jnode_make_dirty_locked(node);
48895 + spin_unlock_jnode(node);
48896 +
48897 + if (uf_coord->valid == 0)
48898 + return i + 1;
48899 +
48900 + check_uf_coord(uf_coord, &k);
48901 +
48902 + if (move_coord(uf_coord)) {
48903 + /*
48904 + * failed to move to the next node pointer. Either end
48905 + * of file or end of twig node is reached. In the latter
48906 + * case we might go to the right neighbor.
48907 + */
48908 + uf_coord->valid = 0;
48909 + return i + 1;
48910 + }
48911 + set_key_offset(&k, get_key_offset(&k) + PAGE_CACHE_SIZE);
48912 + }
48913 +
48914 + return count;
48915 +}
48916 +
48917 +/**
48918 + * reiser4_update_extent
48919 + * @inode: inode of the file
48920 + * @node: jnode the file's extent items are to point at
48921 + * @pos: offset in the file
48922 + * @plugged_hole: if not NULL - set to 1 when a hole extent gets plugged
48923 + */
48925 +int reiser4_update_extent(struct inode *inode, jnode *node, loff_t pos,
48926 + int *plugged_hole)
48927 +{
48928 + int result;
48929 + znode *loaded;
48930 + uf_coord_t uf_coord;
48931 + coord_t *coord;
48932 + lock_handle lh;
48933 + reiser4_key key;
48934 +
48935 + assert("", reiser4_lock_counters()->d_refs == 0);
48936 +
48937 + key_by_inode_and_offset_common(inode, pos, &key);
48938 +
48939 + init_uf_coord(&uf_coord, &lh);
48940 + coord = &uf_coord.coord;
48941 + result = find_file_item_nohint(coord, &lh, &key,
48942 + ZNODE_WRITE_LOCK, inode);
48943 + if (IS_CBKERR(result)) {
48944 + assert("", reiser4_lock_counters()->d_refs == 0);
48945 + return result;
48946 + }
48947 +
48948 + result = zload(coord->node);
48949 + BUG_ON(result != 0);
48950 + loaded = coord->node;
48951 +
48952 + if (coord->between == AFTER_UNIT) {
48953 + /*
48954 + * append existing extent item with unallocated extent of width
48955 + * nr_jnodes
48956 + */
48957 + init_coord_extension_extent(&uf_coord,
48958 + get_key_offset(&key));
48959 + result = append_last_extent(&uf_coord, &key,
48960 + &node, 1);
48961 + } else if (coord->between == AT_UNIT) {
48962 + /*
48963 + * overwrite
48964 + * not optimal yet. Will be optimized if the new write
48965 + * shows a performance win.
48966 + */
48967 + init_coord_extension_extent(&uf_coord,
48968 + get_key_offset(&key));
48969 + result = overwrite_extent(&uf_coord, &key,
48970 + &node, 1, plugged_hole);
48971 + } else {
48972 + /*
48973 + * there are no items of this file in the tree yet. Create
48974 + * first item of the file inserting one unallocated extent of
48975 + * width nr_jnodes
48976 + */
48977 + result = insert_first_extent(&uf_coord, &key, &node, 1, inode);
48978 + }
48979 + assert("", result == 1 || result < 0);
48980 + zrelse(loaded);
48981 + done_lh(&lh);
48982 + assert("", reiser4_lock_counters()->d_refs == 0);
48983 + return (result == 1) ? 0 : result;
48984 +}
48985 +
48986 +/**
48987 + * update_extents
48988 + * @file: file to update extents of
48989 + * @jnodes: array of jnodes
48990 + * @count: number of jnodes in the array
48991 + * @pos: offset in the file (used only when @count is 0, i.e. for
48992 + * expanding truncate)
48993 + */
48994 +static int update_extents(struct file *file, jnode **jnodes, int count, loff_t pos)
48995 +{
48996 + struct inode *inode;
48997 + struct hint hint;
48998 + reiser4_key key;
48999 + int result;
49000 + znode *loaded;
49001 +
49002 + result = load_file_hint(file, &hint);
49003 + BUG_ON(result != 0);
49004 +
49005 + inode = file->f_dentry->d_inode;
49006 + if (count != 0)
49007 + /*
49008 + * count == 0 is special case: expanding truncate
49009 + */
49010 + pos = (loff_t)index_jnode(jnodes[0]) << PAGE_CACHE_SHIFT;
49011 + key_by_inode_and_offset_common(inode, pos, &key);
49012 +
49013 + assert("", reiser4_lock_counters()->d_refs == 0);
49014 +
49015 + do {
49016 + result = find_file_item(&hint, &key, ZNODE_WRITE_LOCK, inode);
49017 + if (IS_CBKERR(result)) {
49018 + assert("", reiser4_lock_counters()->d_refs == 0);
49019 + return result;
49020 + }
49021 +
49022 + result = zload(hint.ext_coord.coord.node);
49023 + BUG_ON(result != 0);
49024 + loaded = hint.ext_coord.coord.node;
49025 +
49026 + if (hint.ext_coord.coord.between == AFTER_UNIT) {
49027 + /*
49028 + * append existing extent item with unallocated extent
49029 + * of width nr_jnodes
49030 + */
49031 + if (hint.ext_coord.valid == 0)
49032 + /* NOTE: get statistics on this */
49033 + init_coord_extension_extent(&hint.ext_coord,
49034 + get_key_offset(&key));
49035 + result = append_last_extent(&hint.ext_coord, &key,
49036 + jnodes, count);
49037 + } else if (hint.ext_coord.coord.between == AT_UNIT) {
49038 + /*
49039 + * overwrite
49040 + * not optimal yet. Will be optimized if the new write
49041 + * shows a performance win.
49042 + */
49043 + if (hint.ext_coord.valid == 0)
49044 + /* NOTE: get statistics on this */
49045 + init_coord_extension_extent(&hint.ext_coord,
49046 + get_key_offset(&key));
49047 + result = overwrite_extent(&hint.ext_coord, &key,
49048 + jnodes, count, NULL);
49049 + } else {
49050 + /*
49051 + * there are no items of this file in the tree
49052 + * yet. Create first item of the file inserting one
49053 + * unallocated extent of width nr_jnodes
49054 + */
49055 + result = insert_first_extent(&hint.ext_coord, &key,
49056 + jnodes, count, inode);
49057 + }
49058 + zrelse(loaded);
49059 + if (result < 0) {
49060 + done_lh(hint.ext_coord.lh);
49061 + break;
49062 + }
49063 +
49064 + jnodes += result;
49065 + count -= result;
49066 + set_key_offset(&key, get_key_offset(&key) + result * PAGE_CACHE_SIZE);
49067 +
49068 + /* seal and unlock znode */
49069 + if (hint.ext_coord.valid)
49070 + reiser4_set_hint(&hint, &key, ZNODE_WRITE_LOCK);
49071 + else
49072 + reiser4_unset_hint(&hint);
49073 +
49074 + } while (count > 0);
49075 +
49076 + save_file_hint(file, &hint);
49077 + assert("", reiser4_lock_counters()->d_refs == 0);
49078 + return result;
49079 +}
49080 +
49081 +/**
49082 + * write_extent_reserve_space - reserve space for extent write operation
49083 + * @inode:
49084 + *
49085 + * Estimates and reserves space which may be required for writing
49086 + * WRITE_GRANULARITY pages of file.
49087 + */
49088 +static int write_extent_reserve_space(struct inode *inode)
49089 +{
49090 + __u64 count;
49091 + reiser4_tree *tree;
49092 +
49093 + /*
49094 + * to write WRITE_GRANULARITY pages to a file by extents we have to
49095 + * reserve disk space for:
49096 +
49097 + * 1. find_file_item may have to insert empty node to the tree (empty
49098 + * leaf node between two extent items). This requires 1 block and
49099 + * number of blocks which are necessary to perform insertion of an
49100 + * internal item into twig level.
49101 +
49102 + * 2. for each of written pages there might be needed 1 block and
49103 + * number of blocks which might be necessary to perform insertion of or
49104 + * paste to an extent item.
49105 +
49106 + * 3. stat data update
49107 + */
49108 + tree = reiser4_tree_by_inode(inode);
49109 + count = estimate_one_insert_item(tree) +
49110 + WRITE_GRANULARITY * (1 + estimate_one_insert_into_item(tree)) +
49111 + estimate_one_insert_item(tree);
49112 + grab_space_enable();
49113 + return reiser4_grab_space(count, 0 /* flags */);
49114 +}
49115 +
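+/* Worked example (editorial note, hypothetical estimates): if
+ * estimate_one_insert_item() were 5 blocks, estimate_one_insert_into_item()
+ * 3 blocks and WRITE_GRANULARITY 32 pages, the reservation above would be
+ *
+ *	5 + 32 * (1 + 3) + 5 = 138 blocks
+ *
+ * per batch of written pages.
+ */
+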
49116 +/**
49117 + * reiser4_write_extent - write method of extent item plugin
49118 + * @file: file to write to
49119 + * @buf: address of user-space buffer
49120 + * @count: number of bytes to write
49121 + * @pos: position in file to write to
49122 + */
49124 +ssize_t reiser4_write_extent(struct file *file, const char __user *buf,
49125 + size_t count, loff_t *pos)
49126 +{
49127 + int have_to_update_extent;
49128 + int nr_pages;
49129 + struct page *page;
49130 + jnode *jnodes[WRITE_GRANULARITY + 1];
49131 + struct inode *inode;
49132 + unsigned long index;
49133 + unsigned long end;
49134 + int i;
49135 + int to_page, page_off;
49136 + size_t left, written;
49137 + int result;
49138 +
49139 + inode = file->f_dentry->d_inode;
49140 + if (write_extent_reserve_space(inode))
49141 + return RETERR(-ENOSPC);
49142 +
49143 + if (count == 0) {
49144 + /* truncate case */
49145 + update_extents(file, jnodes, 0, *pos);
49146 + return 0;
49147 + }
49148 +
49149 + BUG_ON(get_current_context()->trans->atom != NULL);
49150 +
49151 + index = *pos >> PAGE_CACHE_SHIFT;
49152 + /* calculate number of pages which are to be written */
49153 + end = ((*pos + count - 1) >> PAGE_CACHE_SHIFT);
49154 + nr_pages = end - index + 1;
49155 + assert("", nr_pages <= WRITE_GRANULARITY + 1);
49156 +
49157 + /* get pages and jnodes */
49158 + for (i = 0; i < nr_pages; i ++) {
49159 + page = find_or_create_page(inode->i_mapping, index + i,
49160 + reiser4_ctx_gfp_mask_get());
49161 + if (page == NULL) {
49162 + while(i --) {
49163 + unlock_page(jnode_page(jnodes[i]));
49164 + page_cache_release(jnode_page(jnodes[i]));
49165 + }
49166 + return RETERR(-ENOMEM);
49167 + }
49168 +
49169 + jnodes[i] = jnode_of_page(page);
49170 + if (IS_ERR(jnodes[i])) {
49171 + unlock_page(page);
49172 + page_cache_release(page);
49173 + while (i --) {
49174 + jput(jnodes[i]);
49175 + page_cache_release(jnode_page(jnodes[i]));
49176 + }
49177 + return RETERR(-ENOMEM);
49178 + }
49179 + /* prevent jnode and page from disconnecting */
49180 + JF_SET(jnodes[i], JNODE_WRITE_PREPARED);
49181 + unlock_page(page);
49182 + }
49183 +
49184 + BUG_ON(get_current_context()->trans->atom != NULL);
49185 +
49186 + have_to_update_extent = 0;
49187 +
49188 + left = count;
49189 + page_off = (*pos & (PAGE_CACHE_SIZE - 1));
49190 + for (i = 0; i < nr_pages; i ++) {
49191 + to_page = PAGE_CACHE_SIZE - page_off;
49192 + if (to_page > left)
49193 + to_page = left;
49194 + page = jnode_page(jnodes[i]);
49195 + if (page_offset(page) < inode->i_size &&
49196 + !PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
49197 + /*
49198 + * the above is not optimal for a partial write to the
49199 + * last page of the file when the file size is not at a
49200 + * page boundary
49201 + */
49202 + lock_page(page);
49203 + if (!PageUptodate(page)) {
49204 + result = readpage_unix_file(NULL, page);
49205 + BUG_ON(result != 0);
49206 + /* wait for read completion */
49207 + lock_page(page);
49208 + BUG_ON(!PageUptodate(page));
49209 + } else
49210 + result = 0;
49211 + unlock_page(page);
49212 + }
49213 +
49214 + BUG_ON(get_current_context()->trans->atom != NULL);
49215 + fault_in_pages_readable(buf, to_page);
49216 + BUG_ON(get_current_context()->trans->atom != NULL);
49217 +
49218 + lock_page(page);
49219 + if (!PageUptodate(page) && to_page != PAGE_CACHE_SIZE) {
49220 + void *kaddr;
49221 +
49222 + kaddr = kmap_atomic(page, KM_USER0);
49223 + memset(kaddr, 0, page_off);
49224 + memset(kaddr + page_off + to_page, 0,
49225 + PAGE_CACHE_SIZE - (page_off + to_page));
49226 + flush_dcache_page(page);
49227 + kunmap_atomic(kaddr, KM_USER0);
49228 + }
49229 +
49230 + written = filemap_copy_from_user(page, page_off, buf, to_page);
49231 + flush_dcache_page(page);
49232 + reiser4_set_page_dirty_internal(page);
49233 + unlock_page(page);
49234 + mark_page_accessed(page);
49235 + SetPageUptodate(page);
49236 + page_cache_release(page);
49237 +
49238 + if (jnodes[i]->blocknr == 0)
49239 + have_to_update_extent ++;
49240 +
49241 + page_off = 0;
49242 + buf += to_page;
49243 + left -= to_page;
49244 + BUG_ON(get_current_context()->trans->atom != NULL);
49245 + }
49246 +
49247 + if (have_to_update_extent) {
49248 + update_extents(file, jnodes, nr_pages, *pos);
49249 + } else {
49250 + for (i = 0; i < nr_pages; i ++) {
49251 + spin_lock_jnode(jnodes[i]);
49252 + result = reiser4_try_capture(jnodes[i],
49253 + ZNODE_WRITE_LOCK, 0);
49254 + BUG_ON(result != 0);
49255 + jnode_make_dirty_locked(jnodes[i]);
49256 + spin_unlock_jnode(jnodes[i]);
49257 + }
49258 + }
49259 +
49260 + for (i = 0; i < nr_pages; i ++) {
49261 + JF_CLR(jnodes[i], JNODE_WRITE_PREPARED);
49262 + jput(jnodes[i]);
49263 + }
49264 +
49265 + /* the only error handled so far is EFAULT on copy_from_user */
49266 + return (count - left) ? (count - left) : -EFAULT;
49267 +}
49268 +
49269 +static inline void zero_page(struct page *page)
49270 +{
49271 + char *kaddr = kmap_atomic(page, KM_USER0);
49272 +
49273 + memset(kaddr, 0, PAGE_CACHE_SIZE);
49274 + flush_dcache_page(page);
49275 + kunmap_atomic(kaddr, KM_USER0);
49276 + SetPageUptodate(page);
49277 + unlock_page(page);
49278 +}
49279 +
49280 +int reiser4_do_readpage_extent(reiser4_extent * ext, reiser4_block_nr pos,
49281 + struct page *page)
49282 +{
49283 + jnode *j;
49284 + struct address_space *mapping;
49285 + unsigned long index;
49286 + oid_t oid;
49287 + reiser4_block_nr block;
49288 +
49289 + mapping = page->mapping;
49290 + oid = get_inode_oid(mapping->host);
49291 + index = page->index;
49292 +
49293 + switch (state_of_extent(ext)) {
49294 + case HOLE_EXTENT:
49295 + /*
49296 + * it is possible to have a hole page with a jnode, if the
49297 + * page was eflushed previously.
49298 + */
49299 + j = jfind(mapping, index);
49300 + if (j == NULL) {
49301 + zero_page(page);
49302 + return 0;
49303 + }
49304 + spin_lock_jnode(j);
49305 + if (!jnode_page(j)) {
49306 + jnode_attach_page(j, page);
49307 + } else {
49308 + BUG_ON(jnode_page(j) != page);
49309 + assert("vs-1504", jnode_page(j) == page);
49310 + }
49311 + block = *jnode_get_io_block(j);
49312 + spin_unlock_jnode(j);
49313 + if (block == 0) {
49314 + zero_page(page);
49315 + jput(j);
49316 + return 0;
49317 + }
49318 + break;
49319 +
49320 + case ALLOCATED_EXTENT:
49321 + j = jnode_of_page(page);
49322 + if (IS_ERR(j))
49323 + return PTR_ERR(j);
49324 + if (*jnode_get_block(j) == 0) {
49325 + reiser4_block_nr blocknr;
49326 +
49327 + blocknr = extent_get_start(ext) + pos;
49328 + jnode_set_block(j, &blocknr);
49329 + } else
49330 + assert("vs-1403",
49331 + j->blocknr == extent_get_start(ext) + pos);
49332 + break;
49333 +
49334 + case UNALLOCATED_EXTENT:
49335 + j = jfind(mapping, index);
49336 + assert("nikita-2688", j);
49337 + assert("vs-1426", jnode_page(j) == NULL);
49338 +
49339 + spin_lock_jnode(j);
49340 + jnode_attach_page(j, page);
49341 + spin_unlock_jnode(j);
49342 + break;
49343 +
49344 + default:
49345 + warning("vs-957", "wrong extent\n");
49346 + return RETERR(-EIO);
49347 + }
49348 +
49349 + BUG_ON(j == 0);
49350 + reiser4_page_io(page, j, READ, reiser4_ctx_gfp_mask_get());
49351 + jput(j);
49352 + return 0;
49353 +}
49354 +
49355 +/* Implements plugin->u.item.s.file.read operation for extent items. */
49356 +int reiser4_read_extent(struct file *file, flow_t *flow, hint_t *hint)
49357 +{
49358 + int result;
49359 + struct page *page;
49360 + unsigned long cur_page, next_page;
49361 + unsigned long page_off, count;
49362 + struct address_space *mapping;
49363 + loff_t file_off;
49364 + uf_coord_t *uf_coord;
49365 + coord_t *coord;
49366 + extent_coord_extension_t *ext_coord;
49367 + unsigned long nr_pages;
49368 + char *kaddr;
49369 +
49370 + assert("vs-1353", current_blocksize == PAGE_CACHE_SIZE);
49371 + assert("vs-572", flow->user == 1);
49372 + assert("vs-1351", flow->length > 0);
49373 +
49374 + uf_coord = &hint->ext_coord;
49375 +
49376 + check_uf_coord(uf_coord, NULL);
49377 + assert("vs-33", uf_coord->lh == &hint->lh);
49378 +
49379 + coord = &uf_coord->coord;
49380 + assert("vs-1119", znode_is_rlocked(coord->node));
49381 + assert("vs-1120", znode_is_loaded(coord->node));
49382 + assert("vs-1256", coord_matches_key_extent(coord, &flow->key));
49383 +
49384 + mapping = file->f_dentry->d_inode->i_mapping;
49385 + ext_coord = &uf_coord->extension.extent;
49386 +
49387 + /* offset in a file to start read from */
49388 + file_off = get_key_offset(&flow->key);
49389 + /* offset within the page to start read from */
49390 + page_off = (unsigned long)(file_off & (PAGE_CACHE_SIZE - 1));
49391 + /* bytes which can be read from the page which contains file_off */
49392 + count = PAGE_CACHE_SIZE - page_off;
49393 +
49394 + /* index of page containing offset read is to start from */
49395 + cur_page = (unsigned long)(file_off >> PAGE_CACHE_SHIFT);
49396 + next_page = cur_page;
49397 + /* number of pages flow spans over */
49398 + nr_pages =
49399 + ((file_off + flow->length + PAGE_CACHE_SIZE -
49400 + 1) >> PAGE_CACHE_SHIFT) - cur_page;
49401 +
49402 + /* we start with the twig node read locked. However, we do not want
49403 + to keep that lock for the whole time readahead works. So, set a
49404 + seal and release the twig node. */
49405 + reiser4_set_hint(hint, &flow->key, ZNODE_READ_LOCK);
49406 + /* &hint->lh is done-ed */
49407 +
49408 + do {
49409 + reiser4_txn_restart_current();
49410 + page = read_mapping_page(mapping, cur_page, file);
49411 + if (IS_ERR(page))
49412 + return PTR_ERR(page);
49413 + lock_page(page);
49414 + if (!PageUptodate(page)) {
49415 + unlock_page(page);
49416 + page_cache_release(page);
49417 + warning("jmacd-97178", "extent_read: page is not up to date");
49418 + return RETERR(-EIO);
49419 + }
49420 + mark_page_accessed(page);
49421 + unlock_page(page);
49422 +
49423 + /* If users can be writing to this page using arbitrary virtual
49424 + addresses, take care about potential aliasing before reading
49425 + the page on the kernel side.
49426 + */
49427 + if (mapping_writably_mapped(mapping))
49428 + flush_dcache_page(page);
49429 +
49430 + assert("nikita-3034", reiser4_schedulable());
49431 +
49432 + /* number of bytes which are to be read from the page */
49433 + if (count > flow->length)
49434 + count = flow->length;
49435 +
49436 + result = fault_in_pages_writeable(flow->data, count);
49437 + if (result) {
49438 + page_cache_release(page);
49439 + return RETERR(-EFAULT);
49440 + }
49441 +
49442 + kaddr = kmap_atomic(page, KM_USER0);
49443 + result = __copy_to_user_inatomic(flow->data,
49444 + kaddr + page_off, count);
49445 + kunmap_atomic(kaddr, KM_USER0);
49446 + if (result != 0) {
49447 + kaddr = kmap(page);
49448 + result = __copy_to_user(flow->data, kaddr + page_off, count);
49449 + kunmap(page);
49450 + if (unlikely(result))
49451 + return RETERR(-EFAULT);
49452 + }
49453 +
49454 + page_cache_release(page);
49455 +
49456 + /* increase key (flow->key), update user area pointer (flow->data) */
49457 + move_flow_forward(flow, count);
49458 +
49459 + page_off = 0;
49460 +		cur_page++;
49461 + count = PAGE_CACHE_SIZE;
49462 + nr_pages--;
49463 + } while (flow->length);
49464 +
49465 + return 0;
49466 +}
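
A minimal stand-alone sketch of the page-span arithmetic used by reiser4_read_extent() above. PAGE_SHIFT and PAGE_SIZE stand in for the kernel's PAGE_CACHE_SHIFT/PAGE_CACHE_SIZE, and the offsets are hypothetical:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long long file_off = 5000;	/* byte offset the read starts at */
	unsigned long length = 10000;		/* bytes requested */

	/* offset within the first page and bytes readable from that page */
	unsigned long page_off = (unsigned long)(file_off & (PAGE_SIZE - 1));
	unsigned long count = PAGE_SIZE - page_off;

	/* index of the first page and number of pages the flow spans */
	unsigned long cur_page = (unsigned long)(file_off >> PAGE_SHIFT);
	unsigned long nr_pages =
		((file_off + length + PAGE_SIZE - 1) >> PAGE_SHIFT) - cur_page;

	if (count > length)
		count = length;
	printf("page %lu, offset %lu, first copy %lu bytes, %lu pages\n",
	       cur_page, page_off, count, nr_pages);
	return 0;
}

With 4K pages this prints page 1, offset 904, first copy 3192 bytes, 3 pages, matching the loop's per-page bookkeeping.
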
49467 +
49468 +/*
49469 + plugin->s.file.readpage
49470 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->extent_readpage
49471 + or
49472 + filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_extent
49473 +
49474 + At the beginning: coord->node is read locked, zloaded, page is
49475 + locked, coord is set to an existing unit inside the extent item (coord does not necessarily match page->index)
49476 +*/
49477 +int reiser4_readpage_extent(void *vp, struct page *page)
49478 +{
49479 + uf_coord_t *uf_coord = vp;
49480 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
49481 + ON_DEBUG(reiser4_key key);
49482 +
49483 + assert("vs-1040", PageLocked(page));
49484 + assert("vs-1050", !PageUptodate(page));
49485 + assert("vs-1039", page->mapping && page->mapping->host);
49486 +
49487 + assert("vs-1044", znode_is_loaded(coord->node));
49488 + assert("vs-758", item_is_extent(coord));
49489 + assert("vs-1046", coord_is_existing_unit(coord));
49490 + assert("vs-1045", znode_is_rlocked(coord->node));
49491 + assert("vs-1047",
49492 + page->mapping->host->i_ino ==
49493 + get_key_objectid(item_key_by_coord(coord, &key)));
49494 + check_uf_coord(uf_coord, NULL);
49495 +
49496 + return reiser4_do_readpage_extent(
49497 + ext_by_ext_coord(uf_coord),
49498 + uf_coord->extension.extent.pos_in_unit, page);
49499 +}
49500 +
49501 +/**
49502 + * get_block_address_extent
49503 + * @coord: coordinate set to an existing extent unit
49504 + * @block: file-relative block number to map
49505 + * @result: where the matching disk block number is stored
49506 + *
49507 + * For non-allocated extents, 0 is stored in @result.
49508 + */
49509 +int get_block_address_extent(const coord_t *coord, sector_t block,
49510 + sector_t *result)
49511 +{
49512 + reiser4_extent *ext;
49513 +
49514 + if (!coord_is_existing_unit(coord))
49515 + return RETERR(-EINVAL);
49516 +
49517 + ext = extent_by_coord(coord);
49518 +
49519 + if (state_of_extent(ext) != ALLOCATED_EXTENT)
49520 +		/* FIXME: bad things may happen if it is an unallocated extent */
49521 + *result = 0;
49522 + else {
49523 + reiser4_key key;
49524 +
49525 + unit_key_by_coord(coord, &key);
49526 + assert("vs-1645",
49527 + block >= get_key_offset(&key) >> current_blocksize_bits);
49528 + assert("vs-1646",
49529 + block <
49530 + (get_key_offset(&key) >> current_blocksize_bits) +
49531 + extent_get_width(ext));
49532 + *result =
49533 + extent_get_start(ext) + (block -
49534 + (get_key_offset(&key) >>
49535 + current_blocksize_bits));
49536 + }
49537 + return 0;
49538 +}
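
For illustration, the linear mapping get_block_address_extent() performs inside an allocated extent, with made-up block numbers:

#include <stdio.h>

int main(void)
{
	/* hypothetical allocated extent: file blocks 64..71 live in
	   disk blocks 1000..1007 */
	unsigned long long unit_first_block = 64;  /* key offset >> blocksize bits */
	unsigned long long extent_start = 1000;    /* extent_get_start() */
	unsigned long long block = 70;             /* file-relative block asked about */

	unsigned long long result = extent_start + (block - unit_first_block);
	printf("file block %llu -> disk block %llu\n", block, result); /* 1006 */
	return 0;
}
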
49539 +
49540 +/*
49541 + plugin->u.item.s.file.append_key
49542 + key of the first byte right after the last byte addressed by this extent
49543 +*/
49544 +reiser4_key *append_key_extent(const coord_t * coord, reiser4_key * key)
49545 +{
49546 + item_key_by_coord(coord, key);
49547 + set_key_offset(key,
49548 + get_key_offset(key) + reiser4_extent_size(coord,
49549 + nr_units_extent
49550 + (coord)));
49551 +
49552 + assert("vs-610", get_key_offset(key)
49553 + && (get_key_offset(key) & (current_blocksize - 1)) == 0);
49554 + return key;
49555 +}
49556 +
49557 +/* plugin->u.item.s.file.init_coord_extension */
49558 +void init_coord_extension_extent(uf_coord_t * uf_coord, loff_t lookuped)
49559 +{
49560 + coord_t *coord;
49561 + extent_coord_extension_t *ext_coord;
49562 + reiser4_key key;
49563 + loff_t offset;
49564 +
49565 + assert("vs-1295", uf_coord->valid == 0);
49566 +
49567 + coord = &uf_coord->coord;
49568 + assert("vs-1288", coord_is_iplug_set(coord));
49569 + assert("vs-1327", znode_is_loaded(coord->node));
49570 +
49571 + if (coord->between != AFTER_UNIT && coord->between != AT_UNIT)
49572 + return;
49573 +
49574 + ext_coord = &uf_coord->extension.extent;
49575 + ext_coord->nr_units = nr_units_extent(coord);
49576 + ext_coord->ext_offset =
49577 + (char *)extent_by_coord(coord) - zdata(coord->node);
49578 + ext_coord->width = extent_get_width(extent_by_coord(coord));
49579 + ON_DEBUG(ext_coord->extent = *extent_by_coord(coord));
49580 + uf_coord->valid = 1;
49581 +
49582 + /* pos_in_unit is the only uninitialized field in extended coord */
49583 + if (coord->between == AFTER_UNIT) {
49584 + assert("vs-1330",
49585 + coord->unit_pos == nr_units_extent(coord) - 1);
49586 +
49587 + ext_coord->pos_in_unit = ext_coord->width - 1;
49588 + } else {
49589 + /* AT_UNIT */
49590 + unit_key_by_coord(coord, &key);
49591 + offset = get_key_offset(&key);
49592 +
49593 + assert("vs-1328", offset <= lookuped);
49594 + assert("vs-1329",
49595 + lookuped <
49596 + offset + ext_coord->width * current_blocksize);
49597 + ext_coord->pos_in_unit =
49598 + ((lookuped - offset) >> current_blocksize_bits);
49599 + }
49600 +}
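
A sketch, with hypothetical numbers, of how the AT_UNIT branch above derives pos_in_unit from the byte offset the lookup targeted:

#include <stdio.h>

int main(void)
{
	unsigned blocksize_bits = 12;		/* 4K blocks */
	unsigned long long offset = 262144;	/* byte offset keyed to the unit */
	unsigned long long lookuped = 270000;	/* byte offset looked up */

	/* index of the block inside the unit that contains byte @lookuped */
	unsigned long long pos_in_unit = (lookuped - offset) >> blocksize_bits;
	printf("pos_in_unit = %llu\n", pos_in_unit);	/* (270000-262144)>>12 == 1 */
	return 0;
}
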
49601 +
49602 +/*
49603 + * Local variables:
49604 + * c-indentation-style: "K&R"
49605 + * mode-name: "LC"
49606 + * c-basic-offset: 8
49607 + * tab-width: 8
49608 + * fill-column: 79
49609 + * scroll-step: 1
49610 + * End:
49611 + */
49612 diff --git a/fs/reiser4/plugin/item/extent_flush_ops.c b/fs/reiser4/plugin/item/extent_flush_ops.c
49613 new file mode 100644
49614 index 0000000..02dda3e
49615 --- /dev/null
49616 +++ b/fs/reiser4/plugin/item/extent_flush_ops.c
49617 @@ -0,0 +1,1028 @@
49618 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
49619 +
49620 +#include "item.h"
49621 +#include "../../tree.h"
49622 +#include "../../jnode.h"
49623 +#include "../../super.h"
49624 +#include "../../flush.h"
49625 +#include "../../carry.h"
49626 +#include "../object.h"
49627 +
49628 +#include <linux/pagemap.h>
49629 +
49630 +static reiser4_block_nr extent_unit_start(const coord_t * item);
49631 +
49632 +/* Return either first or last extent (depending on @side) of the item
49633 + @coord is set to. Set @pos_in_unit either to first or to last block
49634 + of extent. */
49635 +static reiser4_extent *extent_utmost_ext(const coord_t * coord, sideof side,
49636 + reiser4_block_nr * pos_in_unit)
49637 +{
49638 + reiser4_extent *ext;
49639 +
49640 + if (side == LEFT_SIDE) {
49641 + /* get first extent of item */
49642 + ext = extent_item(coord);
49643 + *pos_in_unit = 0;
49644 + } else {
49645 + /* get last extent of item and last position within it */
49646 + assert("vs-363", side == RIGHT_SIDE);
49647 + ext = extent_item(coord) + coord_last_unit_pos(coord);
49648 + *pos_in_unit = extent_get_width(ext) - 1;
49649 + }
49650 +
49651 + return ext;
49652 +}
49653 +
49654 +/* item_plugin->f.utmost_child */
49655 +/* Return the child. Coord is set to an extent item. Find the jnode corresponding
49656 +   either to the first or to the last unformatted node pointed to by the item */
49657 +int utmost_child_extent(const coord_t * coord, sideof side, jnode ** childp)
49658 +{
49659 + reiser4_extent *ext;
49660 + reiser4_block_nr pos_in_unit;
49661 +
49662 + ext = extent_utmost_ext(coord, side, &pos_in_unit);
49663 +
49664 + switch (state_of_extent(ext)) {
49665 + case HOLE_EXTENT:
49666 + *childp = NULL;
49667 + return 0;
49668 + case ALLOCATED_EXTENT:
49669 + case UNALLOCATED_EXTENT:
49670 + break;
49671 + default:
49672 + /* this should never happen */
49673 + assert("vs-1417", 0);
49674 + }
49675 +
49676 + {
49677 + reiser4_key key;
49678 + reiser4_tree *tree;
49679 + unsigned long index;
49680 +
49681 + if (side == LEFT_SIDE) {
49682 + /* get key of first byte addressed by the extent */
49683 + item_key_by_coord(coord, &key);
49684 + } else {
49685 +			/* get key of the byte right after the last byte addressed by the extent */
49686 + append_key_extent(coord, &key);
49687 + }
49688 +
49689 + assert("vs-544",
49690 + (get_key_offset(&key) >> PAGE_CACHE_SHIFT) < ~0ul);
49691 + /* index of first or last (depending on @side) page addressed
49692 + by the extent */
49693 + index =
49694 + (unsigned long)(get_key_offset(&key) >> PAGE_CACHE_SHIFT);
49695 + if (side == RIGHT_SIDE)
49696 + index--;
49697 +
49698 + tree = coord->node->zjnode.tree;
49699 + *childp = jlookup(tree, get_key_objectid(&key), index);
49700 + }
49701 +
49702 + return 0;
49703 +}
49704 +
49705 +/* item_plugin->f.utmost_child_real_block */
49706 +/* Return the child's block, if allocated. */
49707 +int
49708 +utmost_child_real_block_extent(const coord_t * coord, sideof side,
49709 + reiser4_block_nr * block)
49710 +{
49711 + reiser4_extent *ext;
49712 +
49713 + ext = extent_by_coord(coord);
49714 +
49715 + switch (state_of_extent(ext)) {
49716 + case ALLOCATED_EXTENT:
49717 + *block = extent_get_start(ext);
49718 + if (side == RIGHT_SIDE)
49719 + *block += extent_get_width(ext) - 1;
49720 + break;
49721 + case HOLE_EXTENT:
49722 + case UNALLOCATED_EXTENT:
49723 + *block = 0;
49724 + break;
49725 + default:
49726 + /* this should never happen */
49727 + assert("vs-1418", 0);
49728 + }
49729 +
49730 + return 0;
49731 +}
49732 +
49733 +/* item_plugin->f.scan */
49734 +/* Performs leftward scanning starting from an unformatted node and its parent coordinate.
49735 + This scan continues, advancing the parent coordinate, until either it encounters a
49736 + formatted child or it finishes scanning this node.
49737 +
49738 +   If unallocated, the entire extent must be dirty and in the same atom. (Actually, I'm
49739 +   not sure this last property (same atom) is enforced, but it should be the case since
49740 +   one atom must write the parent and the others must read the parent, thus fusing?). In
49741 +   any case, the code below asserts this for unallocated extents. Unallocated
49742 +   extents are thus optimized because we can skip to the endpoint when scanning.
49743 +
49744 +   Control then returns to the caller of reiser4_scan_extent, which handles these
49745 +   terminating conditions, e.g., by loading the next twig.
49746 +*/
49747 +int reiser4_scan_extent(flush_scan * scan)
49748 +{
49749 + coord_t coord;
49750 + jnode *neighbor;
49751 + unsigned long scan_index, unit_index, unit_width, scan_max, scan_dist;
49752 + reiser4_block_nr unit_start;
49753 + __u64 oid;
49754 + reiser4_key key;
49755 + int ret = 0, allocated, incr;
49756 + reiser4_tree *tree;
49757 +
49758 + if (!JF_ISSET(scan->node, JNODE_DIRTY)) {
49759 + scan->stop = 1;
49760 + return 0; /* Race with truncate, this node is already
49761 + * truncated. */
49762 + }
49763 +
49764 + coord_dup(&coord, &scan->parent_coord);
49765 +
49766 + assert("jmacd-1404", !reiser4_scan_finished(scan));
49767 + assert("jmacd-1405", jnode_get_level(scan->node) == LEAF_LEVEL);
49768 + assert("jmacd-1406", jnode_is_unformatted(scan->node));
49769 +
49770 + /* The scan_index variable corresponds to the current page index of the
49771 + unformatted block scan position. */
49772 + scan_index = index_jnode(scan->node);
49773 +
49774 + assert("jmacd-7889", item_is_extent(&coord));
49775 +
49776 + repeat:
49777 + /* objectid of file */
49778 + oid = get_key_objectid(item_key_by_coord(&coord, &key));
49779 +
49780 + allocated = !extent_is_unallocated(&coord);
49781 + /* Get the values of this extent unit: */
49782 + unit_index = extent_unit_index(&coord);
49783 + unit_width = extent_unit_width(&coord);
49784 + unit_start = extent_unit_start(&coord);
49785 +
49786 + assert("jmacd-7187", unit_width > 0);
49787 + assert("jmacd-7188", scan_index >= unit_index);
49788 + assert("jmacd-7189", scan_index <= unit_index + unit_width - 1);
49789 +
49790 + /* Depending on the scan direction, we set different maximum values for scan_index
49791 + (scan_max) and the number of nodes that would be passed if the scan goes the
49792 + entire way (scan_dist). Incr is an integer reflecting the incremental
49793 + direction of scan_index. */
49794 + if (reiser4_scanning_left(scan)) {
49795 + scan_max = unit_index;
49796 + scan_dist = scan_index - unit_index;
49797 + incr = -1;
49798 + } else {
49799 + scan_max = unit_index + unit_width - 1;
49800 + scan_dist = scan_max - unit_index;
49801 + incr = +1;
49802 + }
49803 +
49804 + tree = coord.node->zjnode.tree;
49805 +
49806 + /* If the extent is allocated we have to check each of its blocks. If the extent
49807 + is unallocated we can skip to the scan_max. */
49808 + if (allocated) {
49809 + do {
49810 + neighbor = jlookup(tree, oid, scan_index);
49811 + if (neighbor == NULL)
49812 + goto stop_same_parent;
49813 +
49814 + if (scan->node != neighbor
49815 + && !reiser4_scan_goto(scan, neighbor)) {
49816 + /* @neighbor was jput() by reiser4_scan_goto */
49817 + goto stop_same_parent;
49818 + }
49819 +
49820 + ret = scan_set_current(scan, neighbor, 1, &coord);
49821 + if (ret != 0) {
49822 + goto exit;
49823 + }
49824 +
49825 + /* reference to @neighbor is stored in @scan, no need
49826 + to jput(). */
49827 + scan_index += incr;
49828 +
49829 + } while (incr + scan_max != scan_index);
49830 +
49831 + } else {
49832 + /* Optimized case for unallocated extents, skip to the end. */
49833 + neighbor = jlookup(tree, oid, scan_max /*index */ );
49834 + if (neighbor == NULL) {
49835 + /* Race with truncate */
49836 + scan->stop = 1;
49837 + ret = 0;
49838 + goto exit;
49839 + }
49840 +
49841 + assert("zam-1043",
49842 + reiser4_blocknr_is_fake(jnode_get_block(neighbor)));
49843 +
49844 + ret = scan_set_current(scan, neighbor, scan_dist, &coord);
49845 + if (ret != 0) {
49846 + goto exit;
49847 + }
49848 + }
49849 +
49850 + if (coord_sideof_unit(&coord, scan->direction) == 0
49851 + && item_is_extent(&coord)) {
49852 + /* Continue as long as there are more extent units. */
49853 +
49854 + scan_index =
49855 + extent_unit_index(&coord) +
49856 + (reiser4_scanning_left(scan) ?
49857 + extent_unit_width(&coord) - 1 : 0);
49858 + goto repeat;
49859 + }
49860 +
49861 + if (0) {
49862 + stop_same_parent:
49863 +
49864 + /* If we are scanning left and we stop in the middle of an allocated
49865 +		   extent, we know the preceder immediately. */
49866 + /* middle of extent is (scan_index - unit_index) != 0. */
49867 + if (reiser4_scanning_left(scan) &&
49868 + (scan_index - unit_index) != 0) {
49869 + /* FIXME(B): Someone should step-through and verify that this preceder
49870 + calculation is indeed correct. */
49871 + /* @unit_start is starting block (number) of extent
49872 + unit. Flush stopped at the @scan_index block from
49873 + the beginning of the file, which is (scan_index -
49874 + unit_index) block within extent.
49875 + */
49876 + if (unit_start) {
49877 + /* skip preceder update when we are at hole */
49878 + scan->preceder_blk =
49879 + unit_start + scan_index - unit_index;
49880 + check_preceder(scan->preceder_blk);
49881 + }
49882 + }
49883 +
49884 + /* In this case, we leave coord set to the parent of scan->node. */
49885 + scan->stop = 1;
49886 +
49887 + } else {
49888 + /* In this case, we are still scanning, coord is set to the next item which is
49889 + either off-the-end of the node or not an extent. */
49890 + assert("jmacd-8912", scan->stop == 0);
49891 + assert("jmacd-7812",
49892 + (coord_is_after_sideof_unit(&coord, scan->direction)
49893 + || !item_is_extent(&coord)));
49894 + }
49895 +
49896 + ret = 0;
49897 + exit:
49898 + return ret;
49899 +}
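
The direction-dependent bounds computed at the top of reiser4_scan_extent() can be exercised in isolation. A stand-alone sketch with hypothetical unit geometry, mirroring the code above including its termination test:

#include <stdio.h>

int main(void)
{
	unsigned long unit_index = 100, unit_width = 8;	/* unit covers blocks 100..107 */
	unsigned long scan_index = 103;			/* current scan position */
	int scanning_left = 1;				/* set to 0 for a rightward scan */
	unsigned long scan_max, scan_dist;
	int incr;

	if (scanning_left) {
		scan_max = unit_index;			/* stop at first block: 100 */
		scan_dist = scan_index - unit_index;	/* 3 */
		incr = -1;
	} else {
		scan_max = unit_index + unit_width - 1;	/* last block: 107 */
		scan_dist = scan_max - unit_index;	/* 7 */
		incr = +1;
	}

	/* the allocated-extent loop visits scan_index, scan_index + incr, ...
	   and stops once incr + scan_max == scan_index */
	while (incr + scan_max != scan_index) {
		printf("visit block %lu\n", scan_index);
		scan_index += incr;
	}
	printf("scan_dist = %lu\n", scan_dist);
	return 0;
}

Scanning left from 103 this visits 103, 102, 101, 100 and stops, i.e., it walks back to the first block of the unit.
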
49900 +
49901 +/* ask block allocator for some blocks */
49902 +static void extent_allocate_blocks(reiser4_blocknr_hint *preceder,
49903 + reiser4_block_nr wanted_count,
49904 + reiser4_block_nr *first_allocated,
49905 + reiser4_block_nr *allocated,
49906 + block_stage_t block_stage)
49907 +{
49908 + *allocated = wanted_count;
49909 + preceder->max_dist = 0; /* scan whole disk, if needed */
49910 +
49911 + /* that number of blocks (wanted_count) is either in UNALLOCATED or in GRABBED */
49912 + preceder->block_stage = block_stage;
49913 +
49914 + /* FIXME: we do not handle errors here now */
49915 + check_me("vs-420",
49916 + reiser4_alloc_blocks(preceder, first_allocated, allocated,
49917 + BA_PERMANENT) == 0);
49918 + /* update flush_pos's preceder to last allocated block number */
49919 + preceder->blk = *first_allocated + *allocated - 1;
49920 +}
49921 +
49922 +/* When, at flush time, an unallocated extent is to be replaced with an allocated one, it may happen that one
49923 +   unallocated extent will have to be replaced with a set of allocated extents. In this case insert_into_item will
49924 +   be called, which may have to add new nodes to the tree. Space for that is taken from the inviolable reserve (5%). */
49925 +static reiser4_block_nr reserve_replace(void)
49926 +{
49927 + reiser4_block_nr grabbed, needed;
49928 +
49929 + grabbed = get_current_context()->grabbed_blocks;
49930 + needed = estimate_one_insert_into_item(current_tree);
49931 + check_me("vpf-340", !reiser4_grab_space_force(needed, BA_RESERVED));
49932 + return grabbed;
49933 +}
49934 +
49935 +static void free_replace_reserved(reiser4_block_nr grabbed)
49936 +{
49937 + reiser4_context *ctx;
49938 +
49939 + ctx = get_current_context();
49940 + grabbed2free(ctx, get_super_private(ctx->super),
49941 + ctx->grabbed_blocks - grabbed);
49942 +}
49943 +
49944 +/* Block offset of first block addressed by unit */
49945 +__u64 extent_unit_index(const coord_t * item)
49946 +{
49947 + reiser4_key key;
49948 +
49949 + assert("vs-648", coord_is_existing_unit(item));
49950 + unit_key_by_coord(item, &key);
49951 + return get_key_offset(&key) >> current_blocksize_bits;
49952 +}
49953 +
49954 +/* AUDIT shouldn't return value be of reiser4_block_nr type?
49955 + Josh's answer: who knows? Is a "number of blocks" the same type as "block offset"? */
49956 +__u64 extent_unit_width(const coord_t * item)
49957 +{
49958 + assert("vs-649", coord_is_existing_unit(item));
49959 + return width_by_coord(item);
49960 +}
49961 +
49962 +/* Starting block location of this unit */
49963 +static reiser4_block_nr extent_unit_start(const coord_t * item)
49964 +{
49965 + return extent_get_start(extent_by_coord(item));
49966 +}
49967 +
49968 +/**
49969 + * split_allocated_extent -
49970 + * @coord: coordinate of the allocated extent unit to split
49971 + * @pos_in_unit: block position within the unit where the split happens
49972 + *
49973 + * Replaces one allocated extent with two allocated extents.
49974 + */
49975 +static int split_allocated_extent(coord_t *coord, reiser4_block_nr pos_in_unit)
49976 +{
49977 + int result;
49978 + struct replace_handle *h;
49979 + reiser4_extent *ext;
49980 + reiser4_block_nr grabbed;
49981 +
49982 + ext = extent_by_coord(coord);
49983 + assert("vs-1410", state_of_extent(ext) == ALLOCATED_EXTENT);
49984 + assert("vs-1411", extent_get_width(ext) > pos_in_unit);
49985 +
49986 + h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
49987 + if (h == NULL)
49988 + return RETERR(-ENOMEM);
49989 + h->coord = coord;
49990 + h->lh = znode_lh(coord->node);
49991 + h->pkey = &h->key;
49992 + unit_key_by_coord(coord, h->pkey);
49993 + set_key_offset(h->pkey,
49994 + (get_key_offset(h->pkey) +
49995 + pos_in_unit * current_blocksize));
49996 + reiser4_set_extent(&h->overwrite, extent_get_start(ext),
49997 + pos_in_unit);
49998 + reiser4_set_extent(&h->new_extents[0],
49999 + extent_get_start(ext) + pos_in_unit,
50000 + extent_get_width(ext) - pos_in_unit);
50001 + h->nr_new_extents = 1;
50002 + h->flags = COPI_DONT_SHIFT_LEFT;
50003 + h->paste_key = h->key;
50004 +
50005 + /* reserve space for extent unit paste, @grabbed is reserved before */
50006 + grabbed = reserve_replace();
50007 + result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
50008 + extent */);
50009 + /* restore reserved */
50010 + free_replace_reserved(grabbed);
50011 + kfree(h);
50012 + return result;
50013 +}
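
Numerically, the replacement performed above turns one allocated unit into two contiguous ones. A hypothetical example:

#include <stdio.h>

int main(void)
{
	/* allocated extent (start=200, width=10) split at pos_in_unit=4 */
	unsigned long long start = 200, width = 10, pos_in_unit = 4;

	/* the overwrite unit keeps the head of the range ... */
	unsigned long long head_start = start, head_width = pos_in_unit;
	/* ... and the pasted unit gets the tail */
	unsigned long long tail_start = start + pos_in_unit;
	unsigned long long tail_width = width - pos_in_unit;

	printf("(%llu,%llu) -> (%llu,%llu) + (%llu,%llu)\n",
	       start, width, head_start, head_width, tail_start, tail_width);
	return 0;
}

This prints (200,10) -> (200,4) + (204,6): the disk blocks are unchanged, only the unit boundaries move.
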
50014 +
50015 +/* Replace extent @ext by extent @replace. Try to merge @replace with the previous extent of the item (if there
50016 +   is one). Return 1 if that succeeded, 0 otherwise. */
50017 +static int try_to_merge_with_left(coord_t *coord, reiser4_extent *ext,
50018 + reiser4_extent *replace)
50019 +{
50020 + assert("vs-1415", extent_by_coord(coord) == ext);
50021 +
50022 + if (coord->unit_pos == 0
50023 + || state_of_extent(ext - 1) != ALLOCATED_EXTENT)
50024 +		/* the previous extent either does not exist or is not an allocated extent */
50025 + return 0;
50026 + if (extent_get_start(ext - 1) + extent_get_width(ext - 1) !=
50027 + extent_get_start(replace))
50028 + return 0;
50029 +
50030 + /* we can glue, widen previous unit */
50031 + extent_set_width(ext - 1,
50032 + extent_get_width(ext - 1) + extent_get_width(replace));
50033 +
50034 + if (extent_get_width(ext) != extent_get_width(replace)) {
50035 + /* make current extent narrower */
50036 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
50037 + extent_set_start(ext,
50038 + extent_get_start(ext) +
50039 + extent_get_width(replace));
50040 + extent_set_width(ext,
50041 + extent_get_width(ext) -
50042 + extent_get_width(replace));
50043 + } else {
50044 + /* current extent completely glued with its left neighbor, remove it */
50045 + coord_t from, to;
50046 +
50047 + coord_dup(&from, coord);
50048 + from.unit_pos = nr_units_extent(coord) - 1;
50049 + coord_dup(&to, &from);
50050 +
50051 +		/* currently a cut from an extent item can happen either at the beginning or at the
50052 +		   end; move the unit being removed to the end of the item and cut there */
50053 + memmove(ext, ext + 1,
50054 + (from.unit_pos -
50055 + coord->unit_pos) * sizeof(reiser4_extent));
50056 + /* wipe part of item which is going to be cut, so that node_check will not be confused */
50057 + cut_node_content(&from, &to, NULL, NULL, NULL);
50058 + }
50059 + znode_make_dirty(coord->node);
50060 + /* move coord back */
50061 + coord->unit_pos--;
50062 + return 1;
50063 +}
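
The merge test above is pure arithmetic: the left neighbor must be an allocated extent that ends exactly where @replace begins. A hypothetical illustration:

#include <stdio.h>

int main(void)
{
	/* left neighbor occupies disk blocks 100..104; replacement starts at 105 */
	unsigned long long prev_start = 100, prev_width = 5;
	unsigned long long repl_start = 105, repl_width = 3;

	if (prev_start + prev_width == repl_start) {
		prev_width += repl_width;	/* widen the left unit to (100,8) */
		printf("merged: (%llu,%llu)\n", prev_start, prev_width);
	} else {
		printf("not mergeable\n");
	}
	return 0;
}
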
50064 +
50065 +/**
50066 + * conv_extent - replace one extent with up to two extents
50067 + * @coord: coordinate of extent to be replaced
50068 + * @replace: extent to overwrite the one @coord is set to
50069 + *
50070 + * Overwrites the extent @coord is set to, and pastes one extent unit after
50071 + * the overwritten one if @replace is shorter than the initial extent.
50072 + */
50073 +static int conv_extent(coord_t *coord, reiser4_extent *replace)
50074 +{
50075 + int result;
50076 + struct replace_handle *h;
50077 + reiser4_extent *ext;
50078 + reiser4_block_nr start, width, new_width;
50079 + reiser4_block_nr grabbed;
50080 + extent_state state;
50081 +
50082 + ext = extent_by_coord(coord);
50083 + state = state_of_extent(ext);
50084 + start = extent_get_start(ext);
50085 + width = extent_get_width(ext);
50086 + new_width = extent_get_width(replace);
50087 +
50088 + assert("vs-1458", (state == UNALLOCATED_EXTENT ||
50089 + state == ALLOCATED_EXTENT));
50090 + assert("vs-1459", width >= new_width);
50091 +
50092 + if (try_to_merge_with_left(coord, ext, replace)) {
50093 + /* merged @replace with left neighbor. Current unit is either
50094 + removed or narrowed */
50095 + return 0;
50096 + }
50097 +
50098 + if (width == new_width) {
50099 + /* replace current extent with @replace */
50100 + *ext = *replace;
50101 + znode_make_dirty(coord->node);
50102 + return 0;
50103 + }
50104 +
50105 + h = kmalloc(sizeof(*h), reiser4_ctx_gfp_mask_get());
50106 + if (h == NULL)
50107 + return RETERR(-ENOMEM);
50108 + h->coord = coord;
50109 + h->lh = znode_lh(coord->node);
50110 + h->pkey = &h->key;
50111 + unit_key_by_coord(coord, h->pkey);
50112 + set_key_offset(h->pkey,
50113 + (get_key_offset(h->pkey) + new_width * current_blocksize));
50114 + h->overwrite = *replace;
50115 +
50116 + /* replace @ext with @replace and padding extent */
50117 + reiser4_set_extent(&h->new_extents[0],
50118 + (state == ALLOCATED_EXTENT) ?
50119 + (start + new_width) :
50120 + UNALLOCATED_EXTENT_START,
50121 + width - new_width);
50122 + h->nr_new_extents = 1;
50123 + h->flags = COPI_DONT_SHIFT_LEFT;
50124 + h->paste_key = h->key;
50125 +
50126 + /* reserve space for extent unit paste, @grabbed is reserved before */
50127 + grabbed = reserve_replace();
50128 + result = reiser4_replace_extent(h, 0 /* leave @coord set to overwritten
50129 + extent */);
50130 +
50131 + /* restore reserved */
50132 + free_replace_reserved(grabbed);
50133 + kfree(h);
50134 + return result;
50135 +}
50136 +
50137 +/**
50138 + * assign_real_blocknrs
50139 + * @flush_pos: flush position
50140 + * @oid: objectid of the file the jnodes being assigned block numbers belong to
50141 + * @index: index of the first jnode in the range
50142 + * @count: number of jnodes to assign block numbers to
50143 + * @first: start of the allocated block range
50144 + *
50145 + * Assigns block numbers to each of @count jnodes. The index of the first
50146 + * jnode is @index. Jnodes are looked up with jlookup.
50147 + */
50148 +static void assign_real_blocknrs(flush_pos_t *flush_pos, oid_t oid,
50149 + unsigned long index, reiser4_block_nr count,
50150 + reiser4_block_nr first)
50151 +{
50152 + unsigned long i;
50153 + reiser4_tree *tree;
50154 + txn_atom *atom;
50155 + int nr;
50156 +
50157 + atom = atom_locked_by_fq(flush_pos->fq);
50158 + assert("vs-1468", atom);
50159 + BUG_ON(atom == NULL);
50160 +
50161 + nr = 0;
50162 + tree = current_tree;
50163 + for (i = 0; i < count; ++i, ++index) {
50164 + jnode *node;
50165 +
50166 + node = jlookup(tree, oid, index);
50167 + assert("", node != NULL);
50168 + BUG_ON(node == NULL);
50169 +
50170 + spin_lock_jnode(node);
50171 + assert("", !jnode_is_flushprepped(node));
50172 + assert("vs-1475", node->atom == atom);
50173 + assert("vs-1476", atomic_read(&node->x_count) > 0);
50174 +
50175 + JF_CLR(node, JNODE_FLUSH_RESERVED);
50176 + jnode_set_block(node, &first);
50177 + unformatted_make_reloc(node, flush_pos->fq);
50178 + ON_DEBUG(count_jnode(node->atom, node, NODE_LIST(node),
50179 + FQ_LIST, 0));
50180 + spin_unlock_jnode(node);
50181 + first++;
50182 +
50183 + atomic_dec(&node->x_count);
50184 +		nr++;
50185 + }
50186 +
50187 + spin_unlock_atom(atom);
50188 + return;
50189 +}
50190 +
50191 +/**
50192 + * make_node_ovrwr - assign node to overwrite set
50193 + * @jnodes: overwrite set list head
50194 + * @node: jnode to add to the overwrite set
50195 + *
50196 + * Sets the OVRWR jnode state bit and puts @node at the end of the list
50197 + * @jnodes, which accumulates nodes before they are moved to the atom's
50198 + * overwrite set list.
50199 + */
50200 +static void make_node_ovrwr(struct list_head *jnodes, jnode *node)
50201 +{
50202 + spin_lock_jnode(node);
50203 +
50204 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
50205 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
50206 +
50207 + JF_SET(node, JNODE_OVRWR);
50208 + list_move_tail(&node->capture_link, jnodes);
50209 + ON_DEBUG(count_jnode(node->atom, node, DIRTY_LIST, OVRWR_LIST, 0));
50210 +
50211 + spin_unlock_jnode(node);
50212 +}
50213 +
50214 +/**
50215 + * mark_jnodes_overwrite - put bunch of jnodes to overwrite set
50216 + * @flush_pos: flush position
50217 + * @oid: objectid of file jnodes belong to
50218 + * @index: starting index
50219 + * @width: extent width
50220 + *
50221 + * Puts the nodes of one extent (file objectid @oid, extent width @width) into
50222 + * the atom's overwrite set, starting from the one with index @index. If the
50223 + * end of the slum is detected (a node is not found or is flushprepped), stop
50224 + * iterating and set the flush position's state to POS_INVALID.
50225 + */
50226 +static void mark_jnodes_overwrite(flush_pos_t *flush_pos, oid_t oid,
50227 + unsigned long index, reiser4_block_nr width)
50228 +{
50229 + unsigned long i;
50230 + reiser4_tree *tree;
50231 + jnode *node;
50232 + txn_atom *atom;
50233 + LIST_HEAD(jnodes);
50234 +
50235 + tree = current_tree;
50236 +
50237 + atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
50238 + assert("vs-1478", atom);
50239 +
50240 + for (i = flush_pos->pos_in_unit; i < width; i++, index++) {
50241 + node = jlookup(tree, oid, index);
50242 + if (!node) {
50243 + flush_pos->state = POS_INVALID;
50244 + break;
50245 + }
50246 + if (jnode_check_flushprepped(node)) {
50247 + flush_pos->state = POS_INVALID;
50248 + atomic_dec(&node->x_count);
50249 + break;
50250 + }
50251 + if (node->atom != atom) {
50252 + flush_pos->state = POS_INVALID;
50253 + atomic_dec(&node->x_count);
50254 + break;
50255 + }
50256 + make_node_ovrwr(&jnodes, node);
50257 + atomic_dec(&node->x_count);
50258 + }
50259 +
50260 + list_splice_init(&jnodes, ATOM_OVRWR_LIST(atom)->prev);
50261 + spin_unlock_atom(atom);
50262 +}
50263 +
50264 +/**
50265 + * allocated_extent_slum_size
50266 + * @flush_pos: flush position
50267 + * @oid: objectid of the file
50268 + * @index: index of the first jnode to check
50269 + * @count: maximal number of jnodes to check
50270 + *
50271 + * Returns the number of leading not-flushprepped same-atom jnodes at @index.
50272 + */
50273 +static int allocated_extent_slum_size(flush_pos_t *flush_pos, oid_t oid,
50274 + unsigned long index, unsigned long count)
50275 +{
50276 + unsigned long i;
50277 + reiser4_tree *tree;
50278 + txn_atom *atom;
50279 + int nr;
50280 +
50281 + atom = atom_locked_by_fq(reiser4_pos_fq(flush_pos));
50282 + assert("vs-1468", atom);
50283 +
50284 + nr = 0;
50285 + tree = current_tree;
50286 + for (i = 0; i < count; ++i, ++index) {
50287 + jnode *node;
50288 +
50289 + node = jlookup(tree, oid, index);
50290 + if (!node)
50291 + break;
50292 +
50293 + if (jnode_check_flushprepped(node)) {
50294 + atomic_dec(&node->x_count);
50295 + break;
50296 + }
50297 +
50298 + if (node->atom != atom) {
50299 + /*
50300 + * this is possible on overwrite: extent_write may
50301 + * capture several unformatted nodes without capturing
50302 + * any formatted nodes.
50303 + */
50304 + atomic_dec(&node->x_count);
50305 + break;
50306 + }
50307 +
50308 + assert("vs-1476", atomic_read(&node->x_count) > 1);
50309 + atomic_dec(&node->x_count);
50310 +		nr++;
50311 + }
50312 +
50313 + spin_unlock_atom(atom);
50314 + return nr;
50315 +}
50316 +
50317 +/**
50318 + * reiser4_alloc_extent
50319 + * @flush_pos: flush position
50320 + *
50321 + *
50322 + * This is called by handle_pos_on_twig to process the extent unit
50323 + * flush_pos->coord is set to. It prepares for flushing a sequence of
50324 + * not-flushprepped nodes (a slum). It assumes the slum starts at position
50325 + * flush_pos->pos_in_unit within the extent. The slum goes to the relocate
50326 + * set if flush_pos->leaf_relocate is set to 1, and to the overwrite set otherwise.
50327 + */
50328 +int reiser4_alloc_extent(flush_pos_t *flush_pos)
50329 +{
50330 + coord_t *coord;
50331 + reiser4_extent *ext;
50332 + reiser4_extent replace_ext;
50333 + oid_t oid;
50334 + reiser4_block_nr protected;
50335 + reiser4_block_nr start;
50336 + __u64 index;
50337 + __u64 width;
50338 + extent_state state;
50339 + int result;
50340 + reiser4_block_nr first_allocated;
50341 + __u64 allocated;
50342 + reiser4_key key;
50343 + block_stage_t block_stage;
50344 +
50345 + assert("vs-1468", flush_pos->state == POS_ON_EPOINT);
50346 + assert("vs-1469", coord_is_existing_unit(&flush_pos->coord)
50347 + && item_is_extent(&flush_pos->coord));
50348 +
50349 + coord = &flush_pos->coord;
50350 +
50351 + ext = extent_by_coord(coord);
50352 + state = state_of_extent(ext);
50353 + if (state == HOLE_EXTENT) {
50354 + flush_pos->state = POS_INVALID;
50355 + return 0;
50356 + }
50357 +
50358 + item_key_by_coord(coord, &key);
50359 + oid = get_key_objectid(&key);
50360 + index = extent_unit_index(coord) + flush_pos->pos_in_unit;
50361 + start = extent_get_start(ext);
50362 + width = extent_get_width(ext);
50363 +
50364 + assert("vs-1457", width > flush_pos->pos_in_unit);
50365 +
50366 + if (flush_pos->leaf_relocate || state == UNALLOCATED_EXTENT) {
50367 + /* relocate */
50368 + if (flush_pos->pos_in_unit) {
50369 + /* split extent unit into two */
50370 + result =
50371 + split_allocated_extent(coord,
50372 + flush_pos->pos_in_unit);
50373 + flush_pos->pos_in_unit = 0;
50374 + return result;
50375 + }
50376 +
50377 + /* limit number of nodes to allocate */
50378 + if (flush_pos->nr_to_write < width)
50379 + width = flush_pos->nr_to_write;
50380 +
50381 + if (state == ALLOCATED_EXTENT) {
50382 + /*
50383 + * all protected nodes are not flushprepped, therefore
50384 + * they are counted as flush_reserved
50385 + */
50386 + block_stage = BLOCK_FLUSH_RESERVED;
50387 + protected = allocated_extent_slum_size(flush_pos, oid,
50388 + index, width);
50389 + if (protected == 0) {
50390 + flush_pos->state = POS_INVALID;
50391 + flush_pos->pos_in_unit = 0;
50392 + return 0;
50393 + }
50394 + } else {
50395 + block_stage = BLOCK_UNALLOCATED;
50396 + protected = width;
50397 + }
50398 +
50399 + /*
50400 + * look at previous unit if possible. If it is allocated, make
50401 + * preceder more precise
50402 + */
50403 + if (coord->unit_pos &&
50404 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
50405 + reiser4_pos_hint(flush_pos)->blk =
50406 + extent_get_start(ext - 1) +
50407 + extent_get_width(ext - 1);
50408 +
50409 + /* allocate new block numbers for protected nodes */
50410 + extent_allocate_blocks(reiser4_pos_hint(flush_pos),
50411 + protected,
50412 + &first_allocated, &allocated,
50413 + block_stage);
50414 +
50415 + if (state == ALLOCATED_EXTENT)
50416 + /*
50417 + * on relocating - free nodes which are going to be
50418 + * relocated
50419 + */
50420 + reiser4_dealloc_blocks(&start, &allocated,
50421 + BLOCK_ALLOCATED, BA_DEFER);
50422 +
50423 + /* assign new block numbers to protected nodes */
50424 + assign_real_blocknrs(flush_pos, oid, index, allocated, first_allocated);
50425 +
50426 + /* prepare extent which will replace current one */
50427 + reiser4_set_extent(&replace_ext, first_allocated, allocated);
50428 +
50429 + /* adjust extent item */
50430 + result = conv_extent(coord, &replace_ext);
50431 + if (result != 0 && result != -ENOMEM) {
50432 + warning("vs-1461",
50433 + "Failed to allocate extent. Should not happen\n");
50434 + return result;
50435 + }
50436 +
50437 + /*
50438 + * break flush: we prepared for flushing as many blocks as we
50439 + * were asked for
50440 + */
50441 + if (flush_pos->nr_to_write == allocated)
50442 + flush_pos->state = POS_INVALID;
50443 + } else {
50444 + /* overwrite */
50445 + mark_jnodes_overwrite(flush_pos, oid, index, width);
50446 + }
50447 + flush_pos->pos_in_unit = 0;
50448 + return 0;
50449 +}
50450 +
50451 +/* return 0 if @key is glueable to the item @coord is set to, nonzero if a new item must be inserted */
50452 +static int must_insert(const coord_t *coord, const reiser4_key *key)
50453 +{
50454 + reiser4_key last;
50455 +
50456 + if (item_id_by_coord(coord) == EXTENT_POINTER_ID
50457 + && keyeq(append_key_extent(coord, &last), key))
50458 + return 0;
50459 + return 1;
50460 +}
50461 +
50462 +/* Copy extent @copy_ext to the end of @node. This may have to insert a new item after the last one, append
50463 +   to the last item, or widen the last unit of the last item. */
50464 +static int put_unit_to_end(znode *node, const reiser4_key *key,
50465 + reiser4_extent *copy_ext)
50466 +{
50467 + int result;
50468 + coord_t coord;
50469 + cop_insert_flag flags;
50470 + reiser4_extent *last_ext;
50471 + reiser4_item_data data;
50472 +
50473 + /* set coord after last unit in an item */
50474 + coord_init_last_unit(&coord, node);
50475 + coord.between = AFTER_UNIT;
50476 +
50477 + flags =
50478 + COPI_DONT_SHIFT_LEFT | COPI_DONT_SHIFT_RIGHT | COPI_DONT_ALLOCATE;
50479 + if (must_insert(&coord, key)) {
50480 + result =
50481 + insert_by_coord(&coord, init_new_extent(&data, copy_ext, 1),
50482 + key, NULL /*lh */ , flags);
50483 +
50484 + } else {
50485 + /* try to glue with last unit */
50486 + last_ext = extent_by_coord(&coord);
50487 + if (state_of_extent(last_ext) &&
50488 + extent_get_start(last_ext) + extent_get_width(last_ext) ==
50489 + extent_get_start(copy_ext)) {
50490 + /* widen last unit of node */
50491 + extent_set_width(last_ext,
50492 + extent_get_width(last_ext) +
50493 + extent_get_width(copy_ext));
50494 + znode_make_dirty(node);
50495 + return 0;
50496 + }
50497 +
50498 + /* FIXME: put an assertion here that we can not merge last unit in @node and new unit */
50499 + result =
50500 + insert_into_item(&coord, NULL /*lh */ , key,
50501 + init_new_extent(&data, copy_ext, 1),
50502 + flags);
50503 + }
50504 +
50505 + assert("vs-438", result == 0 || result == -E_NODE_FULL);
50506 + return result;
50507 +}
50508 +
50509 +/* @coord is set to extent unit */
50510 +squeeze_result squalloc_extent(znode *left, const coord_t *coord,
50511 + flush_pos_t *flush_pos,
50512 + reiser4_key *stop_key)
50513 +{
50514 + reiser4_extent *ext;
50515 + __u64 index;
50516 + __u64 width;
50517 + reiser4_block_nr start;
50518 + extent_state state;
50519 + oid_t oid;
50520 + reiser4_block_nr first_allocated;
50521 + __u64 allocated;
50522 + __u64 protected;
50523 + reiser4_extent copy_extent;
50524 + reiser4_key key;
50525 + int result;
50526 + block_stage_t block_stage;
50527 +
50528 + assert("vs-1457", flush_pos->pos_in_unit == 0);
50529 + assert("vs-1467", coord_is_leftmost_unit(coord));
50530 + assert("vs-1467", item_is_extent(coord));
50531 +
50532 + ext = extent_by_coord(coord);
50533 + index = extent_unit_index(coord);
50534 + start = extent_get_start(ext);
50535 + width = extent_get_width(ext);
50536 + state = state_of_extent(ext);
50537 + unit_key_by_coord(coord, &key);
50538 + oid = get_key_objectid(&key);
50539 +
50540 + if ((flush_pos->leaf_relocate && state == ALLOCATED_EXTENT) ||
50541 + (state == UNALLOCATED_EXTENT)) {
50542 + /* relocate */
50543 + if (state == ALLOCATED_EXTENT) {
50544 + /* all protected nodes are not flushprepped, therefore
50545 + * they are counted as flush_reserved */
50546 + block_stage = BLOCK_FLUSH_RESERVED;
50547 + protected = allocated_extent_slum_size(flush_pos, oid,
50548 + index, width);
50549 + if (protected == 0) {
50550 + flush_pos->state = POS_INVALID;
50551 + flush_pos->pos_in_unit = 0;
50552 + return 0;
50553 + }
50554 + } else {
50555 + block_stage = BLOCK_UNALLOCATED;
50556 + protected = width;
50557 + }
50558 +
50559 + /*
50560 + * look at previous unit if possible. If it is allocated, make
50561 + * preceder more precise
50562 + */
50563 + if (coord->unit_pos &&
50564 + (state_of_extent(ext - 1) == ALLOCATED_EXTENT))
50565 + reiser4_pos_hint(flush_pos)->blk =
50566 + extent_get_start(ext - 1) +
50567 + extent_get_width(ext - 1);
50568 +
50569 + /* allocate new block numbers for protected nodes */
50570 + extent_allocate_blocks(reiser4_pos_hint(flush_pos),
50571 + protected,
50572 + &first_allocated, &allocated,
50573 + block_stage);
50574 +
50575 + /* prepare extent which will be copied to left */
50576 + reiser4_set_extent(&copy_extent, first_allocated, allocated);
50577 +
50578 + result = put_unit_to_end(left, &key, &copy_extent);
50579 + if (result == -E_NODE_FULL) {
50580 + int target_block_stage;
50581 +
50582 + /* free blocks which were just allocated */
50583 + target_block_stage =
50584 + (state ==
50585 + ALLOCATED_EXTENT) ? BLOCK_FLUSH_RESERVED :
50586 + BLOCK_UNALLOCATED;
50587 + reiser4_dealloc_blocks(&first_allocated, &allocated,
50588 + target_block_stage,
50589 + BA_PERMANENT);
50590 +
50591 + /* rewind the preceder. */
50592 + flush_pos->preceder.blk = first_allocated;
50593 + check_preceder(flush_pos->preceder.blk);
50594 +
50595 + return SQUEEZE_TARGET_FULL;
50596 + }
50597 +
50598 + if (state == ALLOCATED_EXTENT) {
50599 + /* free nodes which were relocated */
50600 + reiser4_dealloc_blocks(&start, &allocated,
50601 + BLOCK_ALLOCATED, BA_DEFER);
50602 + }
50603 +
50604 + /* assign new block numbers to protected nodes */
50605 + assign_real_blocknrs(flush_pos, oid, index, allocated,
50606 + first_allocated);
50607 +
50608 + set_key_offset(&key,
50609 + get_key_offset(&key) +
50610 + (allocated << current_blocksize_bits));
50611 + } else {
50612 + /*
50613 + * overwrite: try to copy unit as it is to left neighbor and
50614 + * make all first not flushprepped nodes overwrite nodes
50615 + */
50616 + reiser4_set_extent(&copy_extent, start, width);
50617 + result = put_unit_to_end(left, &key, &copy_extent);
50618 + if (result == -E_NODE_FULL)
50619 + return SQUEEZE_TARGET_FULL;
50620 +
50621 + if (state != HOLE_EXTENT)
50622 + mark_jnodes_overwrite(flush_pos, oid, index, width);
50623 + set_key_offset(&key,
50624 + get_key_offset(&key) +
50625 + (width << current_blocksize_bits));
50626 + }
50627 + *stop_key = key;
50628 + return SQUEEZE_CONTINUE;
50629 +}
50630 +
50631 +int key_by_offset_extent(struct inode *inode, loff_t off, reiser4_key * key)
50632 +{
50633 + return key_by_inode_and_offset_common(inode, off, key);
50634 +}
50635 +
50636 +/*
50637 + * Local variables:
50638 + * c-indentation-style: "K&R"
50639 + * mode-name: "LC"
50640 + * c-basic-offset: 8
50641 + * tab-width: 8
50642 + * fill-column: 79
50643 + * scroll-step: 1
50644 + * End:
50645 + */
50646 diff --git a/fs/reiser4/plugin/item/extent_item_ops.c b/fs/reiser4/plugin/item/extent_item_ops.c
50647 new file mode 100644
50648 index 0000000..53ba8e7
50649 --- /dev/null
50650 +++ b/fs/reiser4/plugin/item/extent_item_ops.c
50651 @@ -0,0 +1,889 @@
50652 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
50653 +
50654 +#include "item.h"
50655 +#include "../../inode.h"
50656 +#include "../../tree_walk.h" /* check_sibling_list() */
50657 +#include "../../page_cache.h"
50658 +#include "../../carry.h"
50659 +
50660 +#include <linux/quotaops.h>
50661 +
50662 +/* item_plugin->b.max_key_inside */
50663 +reiser4_key *max_key_inside_extent(const coord_t * coord, reiser4_key * key)
50664 +{
50665 + item_key_by_coord(coord, key);
50666 + set_key_offset(key, get_key_offset(reiser4_max_key()));
50667 + return key;
50668 +}
50669 +
50670 +/* item_plugin->b.can_contain_key
50671 +   this checks whether @key of @data matches the position set by @coord */
50672 +int
50673 +can_contain_key_extent(const coord_t * coord, const reiser4_key * key,
50674 + const reiser4_item_data * data)
50675 +{
50676 + reiser4_key item_key;
50677 +
50678 + if (item_plugin_by_coord(coord) != data->iplug)
50679 + return 0;
50680 +
50681 + item_key_by_coord(coord, &item_key);
50682 + if (get_key_locality(key) != get_key_locality(&item_key) ||
50683 + get_key_objectid(key) != get_key_objectid(&item_key) ||
50684 + get_key_ordering(key) != get_key_ordering(&item_key))
50685 + return 0;
50686 +
50687 + return 1;
50688 +}
50689 +
50690 +/* item_plugin->b.mergeable
50691 + first item is of extent type */
50692 +/* Audited by: green(2002.06.13) */
50693 +int mergeable_extent(const coord_t * p1, const coord_t * p2)
50694 +{
50695 + reiser4_key key1, key2;
50696 +
50697 + assert("vs-299", item_id_by_coord(p1) == EXTENT_POINTER_ID);
50698 + /* FIXME-VS: Which is it? Assert or return 0 */
50699 + if (item_id_by_coord(p2) != EXTENT_POINTER_ID) {
50700 + return 0;
50701 + }
50702 +
50703 + item_key_by_coord(p1, &key1);
50704 + item_key_by_coord(p2, &key2);
50705 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
50706 + get_key_objectid(&key1) != get_key_objectid(&key2) ||
50707 + get_key_ordering(&key1) != get_key_ordering(&key2) ||
50708 + get_key_type(&key1) != get_key_type(&key2))
50709 + return 0;
50710 + if (get_key_offset(&key1) +
50711 + reiser4_extent_size(p1, nr_units_extent(p1)) !=
50712 + get_key_offset(&key2))
50713 + return 0;
50714 + return 1;
50715 +}
50716 +
50717 +/* item_plugin->b.nr_units */
50718 +pos_in_node_t nr_units_extent(const coord_t * coord)
50719 +{
50720 + /* length of extent item has to be multiple of extent size */
50721 + assert("vs-1424",
50722 + (item_length_by_coord(coord) % sizeof(reiser4_extent)) == 0);
50723 + return item_length_by_coord(coord) / sizeof(reiser4_extent);
50724 +}
50725 +
50726 +/* item_plugin->b.lookup */
50727 +lookup_result
50728 +lookup_extent(const reiser4_key * key, lookup_bias bias UNUSED_ARG,
50729 + coord_t * coord)
50730 +{ /* znode and item_pos are
50731 + set to an extent item to
50732 + look through */
50733 + reiser4_key item_key;
50734 + reiser4_block_nr lookuped, offset;
50735 + unsigned i, nr_units;
50736 + reiser4_extent *ext;
50737 + unsigned blocksize;
50738 + unsigned char blocksize_bits;
50739 +
50740 + item_key_by_coord(coord, &item_key);
50741 + offset = get_key_offset(&item_key);
50742 +
50743 + /* key we are looking for must be greater than key of item @coord */
50744 + assert("vs-414", keygt(key, &item_key));
50745 +
50746 + assert("umka-99945",
50747 + !keygt(key, max_key_inside_extent(coord, &item_key)));
50748 +
50749 + ext = extent_item(coord);
50750 + assert("vs-1350", (char *)ext == (zdata(coord->node) + coord->offset));
50751 +
50752 + blocksize = current_blocksize;
50753 + blocksize_bits = current_blocksize_bits;
50754 +
50755 + /* offset we are looking for */
50756 + lookuped = get_key_offset(key);
50757 +
50758 + nr_units = nr_units_extent(coord);
50759 +	/* go through all extents until the one which addresses the given offset */
50760 + for (i = 0; i < nr_units; i++, ext++) {
50761 + offset += (extent_get_width(ext) << blocksize_bits);
50762 + if (offset > lookuped) {
50763 + /* desired byte is somewhere in this extent */
50764 + coord->unit_pos = i;
50765 + coord->between = AT_UNIT;
50766 + return CBK_COORD_FOUND;
50767 + }
50768 + }
50769 +
50770 + /* set coord after last unit */
50771 + coord->unit_pos = nr_units - 1;
50772 + coord->between = AFTER_UNIT;
50773 + return CBK_COORD_FOUND;
50774 +}
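
lookup_extent() above is a linear walk that accumulates unit widths until the running byte offset passes the searched key. A stand-alone sketch with hypothetical unit widths:

#include <stdio.h>

int main(void)
{
	unsigned char blocksize_bits = 12;		/* 4K blocks */
	unsigned long long widths[] = { 4, 16, 8 };	/* unit widths, in blocks */
	unsigned nr_units = 3;
	unsigned long long offset = 0;			/* item key offset, bytes */
	unsigned long long lookuped = 30000;		/* byte offset searched for */
	unsigned i;

	for (i = 0; i < nr_units; i++) {
		offset += widths[i] << blocksize_bits;
		if (offset > lookuped) {
			printf("byte %llu lies in unit %u\n", lookuped, i);
			return 0;	/* AT_UNIT */
		}
	}
	printf("past the last unit (AFTER_UNIT)\n");
	return 0;
}

Byte 30000 falls past the first unit (4 blocks = 16384 bytes) and inside the second, so this prints unit 1.
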
50775 +
50776 +/* item_plugin->b.paste
50777 +   the item @coord is set to has been appended with @data->length bytes of free
50778 +   space. data->data contains the data to be pasted into the item at position
50779 + @coord->in_item.unit_pos. It must fit into that free space.
50780 + @coord must be set between units.
50781 +*/
50782 +int
50783 +paste_extent(coord_t * coord, reiser4_item_data * data,
50784 + carry_plugin_info * info UNUSED_ARG)
50785 +{
50786 + unsigned old_nr_units;
50787 + reiser4_extent *ext;
50788 + int item_length;
50789 +
50790 + ext = extent_item(coord);
50791 + item_length = item_length_by_coord(coord);
50792 + old_nr_units = (item_length - data->length) / sizeof(reiser4_extent);
50793 +
50794 + /* this is also used to copy extent into newly created item, so
50795 + old_nr_units could be 0 */
50796 + assert("vs-260", item_length >= data->length);
50797 +
50798 + /* make sure that coord is set properly */
50799 + assert("vs-35",
50800 + ((!coord_is_existing_unit(coord))
50801 + || (!old_nr_units && !coord->unit_pos)));
50802 +
50803 + /* first unit to be moved */
50804 + switch (coord->between) {
50805 + case AFTER_UNIT:
50806 +		coord->unit_pos++; /* fall through */
50807 + case BEFORE_UNIT:
50808 + coord->between = AT_UNIT;
50809 + break;
50810 + case AT_UNIT:
50811 + assert("vs-331", !old_nr_units && !coord->unit_pos);
50812 + break;
50813 + default:
50814 + impossible("vs-330", "coord is set improperly");
50815 + }
50816 +
50817 + /* prepare space for new units */
50818 + memmove(ext + coord->unit_pos + data->length / sizeof(reiser4_extent),
50819 + ext + coord->unit_pos,
50820 + (old_nr_units - coord->unit_pos) * sizeof(reiser4_extent));
50821 +
50822 + /* copy new data from kernel space */
50823 + assert("vs-556", data->user == 0);
50824 + memcpy(ext + coord->unit_pos, data->data, (unsigned)data->length);
50825 +
50826 + /* after paste @coord is set to first of pasted units */
50827 + assert("vs-332", coord_is_existing_unit(coord));
50828 + assert("vs-333",
50829 + !memcmp(data->data, extent_by_coord(coord),
50830 + (unsigned)data->length));
50831 + return 0;
50832 +}
50833 +
50834 +/* item_plugin->b.can_shift */
50835 +int
50836 +can_shift_extent(unsigned free_space, coord_t * source,
50837 + znode * target UNUSED_ARG, shift_direction pend UNUSED_ARG,
50838 + unsigned *size, unsigned want)
50839 +{
50840 + *size = item_length_by_coord(source);
50841 + if (*size > free_space)
50842 + /* never split a unit of extent item */
50843 + *size = free_space - free_space % sizeof(reiser4_extent);
50844 +
50845 +	/* we can shift *size bytes; calculate how many we actually want to shift */
50846 + if (*size > want * sizeof(reiser4_extent))
50847 + *size = want * sizeof(reiser4_extent);
50848 +
50849 + if (*size % sizeof(reiser4_extent) != 0)
50850 + impossible("vs-119", "Wrong extent size: %i %zd", *size,
50851 + sizeof(reiser4_extent));
50852 + return *size / sizeof(reiser4_extent);
50853 +
50854 +}
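
The rounding in can_shift_extent() guarantees that only whole extent units move. A quick numeric check with hypothetical sizes (16 bytes per on-disk extent is an assumption of this sketch, not taken from the patch):

#include <stdio.h>

int main(void)
{
	unsigned ext_size = 16;		/* assumed on-disk size of one extent */
	unsigned free_space = 100;	/* bytes available in the target node */
	unsigned item_len = 160;	/* item holds 10 units */
	unsigned want = 4;		/* units the caller wants shifted */

	unsigned size = item_len;
	if (size > free_space)
		size = free_space - free_space % ext_size;	/* 96: whole units only */
	if (size > want * ext_size)
		size = want * ext_size;				/* cap at 64 */

	printf("shift %u bytes = %u units\n", size, size / ext_size);
	return 0;
}
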
50855 +
50856 +/* item_plugin->b.copy_units */
50857 +void
50858 +copy_units_extent(coord_t * target, coord_t * source,
50859 + unsigned from, unsigned count,
50860 + shift_direction where_is_free_space, unsigned free_space)
50861 +{
50862 + char *from_ext, *to_ext;
50863 +
50864 + assert("vs-217", free_space == count * sizeof(reiser4_extent));
50865 +
50866 + from_ext = item_body_by_coord(source);
50867 + to_ext = item_body_by_coord(target);
50868 +
50869 + if (where_is_free_space == SHIFT_LEFT) {
50870 + assert("vs-215", from == 0);
50871 +
50872 + /* At this moment, item length was already updated in the item
50873 + header by shifting code, hence nr_units_extent() will
50874 + return "new" number of units---one we obtain after copying
50875 + units.
50876 + */
50877 + to_ext +=
50878 + (nr_units_extent(target) - count) * sizeof(reiser4_extent);
50879 + } else {
50880 + reiser4_key key;
50881 + coord_t coord;
50882 +
50883 + assert("vs-216",
50884 + from + count == coord_last_unit_pos(source) + 1);
50885 +
50886 + from_ext += item_length_by_coord(source) - free_space;
50887 +
50888 + /* new units are inserted before first unit in an item,
50889 + therefore, we have to update item key */
50890 + coord = *source;
50891 + coord.unit_pos = from;
50892 + unit_key_extent(&coord, &key);
50893 +
50894 + node_plugin_by_node(target->node)->update_item_key(target, &key,
50895 + NULL /*info */);
50896 + }
50897 +
50898 + memcpy(to_ext, from_ext, free_space);
50899 +}
50900 +
50901 +/* item_plugin->b.create_hook
50902 + @arg is znode of leaf node for which we need to update right delimiting key */
50903 +int create_hook_extent(const coord_t * coord, void *arg)
50904 +{
50905 + coord_t *child_coord;
50906 + znode *node;
50907 + reiser4_key key;
50908 + reiser4_tree *tree;
50909 +
50910 + if (!arg)
50911 + return 0;
50912 +
50913 + child_coord = arg;
50914 + tree = znode_get_tree(coord->node);
50915 +
50916 + assert("nikita-3246", znode_get_level(child_coord->node) == LEAF_LEVEL);
50917 +
50918 + write_lock_tree(tree);
50919 + write_lock_dk(tree);
50920 + /* find a node on the left level for which right delimiting key has to
50921 + be updated */
50922 + if (coord_wrt(child_coord) == COORD_ON_THE_LEFT) {
50923 + assert("vs-411", znode_is_left_connected(child_coord->node));
50924 + node = child_coord->node->left;
50925 + } else {
50926 + assert("vs-412", coord_wrt(child_coord) == COORD_ON_THE_RIGHT);
50927 + node = child_coord->node;
50928 + assert("nikita-3314", node != NULL);
50929 + }
50930 +
50931 + if (node != NULL) {
50932 + znode_set_rd_key(node, item_key_by_coord(coord, &key));
50933 +
50934 + assert("nikita-3282", check_sibling_list(node));
50935 + /* break sibling links */
50936 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && node->right) {
50937 + ON_DEBUG(node->right->left_version =
50938 + atomic_inc_return(&delim_key_version);
50939 + node->right_version =
50940 + atomic_inc_return(&delim_key_version););
50941 +
50942 + node->right->left = NULL;
50943 + node->right = NULL;
50944 + }
50945 + }
50946 + write_unlock_dk(tree);
50947 + write_unlock_tree(tree);
50948 + return 0;
50949 +}
50950 +
50951 +#define ITEM_TAIL_KILLED 0
50952 +#define ITEM_HEAD_KILLED 1
50953 +#define ITEM_KILLED 2
50954 +
50955 +/* item_plugin->b.kill_hook
50956 +   this is called when @count units, starting from the @from-th one, are going to be removed
50957 + */
50958 +int
50959 +kill_hook_extent(const coord_t * coord, pos_in_node_t from, pos_in_node_t count,
50960 + struct carry_kill_data *kdata)
50961 +{
50962 + reiser4_extent *ext;
50963 + reiser4_block_nr start, length;
50964 + const reiser4_key *pfrom_key, *pto_key;
50965 + struct inode *inode;
50966 + reiser4_tree *tree;
50967 + pgoff_t from_off, to_off, offset, skip;
50968 + int retval;
50969 +
50970 + /* these are located in memory kmalloc-ed by kill_node_content */
50971 + reiser4_key *min_item_key, *max_item_key, *from_key, *to_key, *key;
50972 + coord_t *dup, *next;
50973 +
50974 + assert("zam-811", znode_is_write_locked(coord->node));
50975 + assert("nikita-3315", kdata != NULL);
50976 + assert("vs-34", kdata->buf != NULL);
50977 +
50978 + /* map structures to kdata->buf */
50979 + min_item_key = (reiser4_key *) (kdata->buf);
50980 + max_item_key = min_item_key + 1;
50981 + from_key = max_item_key + 1;
50982 + to_key = from_key + 1;
50983 + key = to_key + 1;
50984 + dup = (coord_t *) (key + 1);
50985 + next = dup + 1;
50986 +
50987 + item_key_by_coord(coord, min_item_key);
50988 + max_item_key_by_coord(coord, max_item_key);
50989 +
50990 + if (kdata->params.from_key) {
50991 + pfrom_key = kdata->params.from_key;
50992 + pto_key = kdata->params.to_key;
50993 + } else {
50994 + assert("vs-1549", from == coord->unit_pos);
50995 + unit_key_by_coord(coord, from_key);
50996 + pfrom_key = from_key;
50997 +
50998 + coord_dup(dup, coord);
50999 + dup->unit_pos = from + count - 1;
51000 + max_unit_key_by_coord(dup, to_key);
51001 + pto_key = to_key;
51002 + }
51003 +
51004 + if (!keylt(pto_key, max_item_key)) {
51005 + if (!keygt(pfrom_key, min_item_key)) {
51006 + znode *left, *right;
51007 +
51008 + /* item is to be removed completely */
51009 + assert("nikita-3316", kdata->left != NULL
51010 + && kdata->right != NULL);
51011 +
51012 + left = kdata->left->node;
51013 + right = kdata->right->node;
51014 +
51015 + tree = current_tree;
51016 + /* we have to do two things:
51017 + *
51018 + * 1. link left and right formatted neighbors of
51019 + * extent being removed, and
51020 + *
51021 + * 2. update their delimiting keys.
51022 + *
51023 + * atomicity of these operations is protected by
51024 + * taking dk-lock and tree-lock.
51025 + */
51026 + /* if neighbors of item being removed are znodes -
51027 + * link them */
51028 + write_lock_tree(tree);
51029 + write_lock_dk(tree);
51030 + link_left_and_right(left, right);
51031 + if (left) {
51032 + /* update right delimiting key of left
51033 + * neighbor of extent item */
51034 + /*coord_t next;
51035 + reiser4_key key; */
51036 +
51037 + coord_dup(next, coord);
51038 +
51039 + if (coord_next_item(next))
51040 + *key = *znode_get_rd_key(coord->node);
51041 + else
51042 + item_key_by_coord(next, key);
51043 + znode_set_rd_key(left, key);
51044 + }
51045 + write_unlock_dk(tree);
51046 + write_unlock_tree(tree);
51047 +
51048 + from_off =
51049 + get_key_offset(min_item_key) >> PAGE_CACHE_SHIFT;
51050 + to_off =
51051 + (get_key_offset(max_item_key) +
51052 + 1) >> PAGE_CACHE_SHIFT;
51053 + retval = ITEM_KILLED;
51054 + } else {
51055 + /* tail of item is to be removed */
51056 + from_off =
51057 + (get_key_offset(pfrom_key) + PAGE_CACHE_SIZE -
51058 + 1) >> PAGE_CACHE_SHIFT;
51059 + to_off =
51060 + (get_key_offset(max_item_key) +
51061 + 1) >> PAGE_CACHE_SHIFT;
51062 + retval = ITEM_TAIL_KILLED;
51063 + }
51064 + } else {
51065 + /* head of item is to be removed */
51066 + assert("vs-1571", keyeq(pfrom_key, min_item_key));
51067 + assert("vs-1572",
51068 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) ==
51069 + 0);
51070 + assert("vs-1573",
51071 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
51072 + 1)) == 0);
51073 +
51074 + if (kdata->left->node) {
51075 + /* update right delimiting key of left neighbor of extent item */
51076 + /*reiser4_key key; */
51077 +
51078 + *key = *pto_key;
51079 + set_key_offset(key, get_key_offset(pto_key) + 1);
51080 +
51081 + write_lock_dk(current_tree);
51082 + znode_set_rd_key(kdata->left->node, key);
51083 + write_unlock_dk(current_tree);
51084 + }
51085 +
51086 + from_off = get_key_offset(pfrom_key) >> PAGE_CACHE_SHIFT;
51087 + to_off = (get_key_offset(pto_key) + 1) >> PAGE_CACHE_SHIFT;
51088 + retval = ITEM_HEAD_KILLED;
51089 + }
51090 +
51091 + inode = kdata->inode;
51092 + assert("vs-1545", inode != NULL);
51093 + if (inode != NULL)
51094 + /* take care of pages and jnodes corresponding to part of item being killed */
51095 + reiser4_invalidate_pages(inode->i_mapping, from_off,
51096 + to_off - from_off,
51097 + kdata->params.truncate);
51098 +
51099 + ext = extent_item(coord) + from;
51100 + offset =
51101 + (get_key_offset(min_item_key) +
51102 + reiser4_extent_size(coord, from)) >> PAGE_CACHE_SHIFT;
51103 +
51104 + assert("vs-1551", from_off >= offset);
51105 + assert("vs-1552", from_off - offset <= extent_get_width(ext));
51106 + skip = from_off - offset;
51107 + offset = from_off;
51108 +
51109 + while (offset < to_off) {
51110 + length = extent_get_width(ext) - skip;
51111 + if (state_of_extent(ext) == HOLE_EXTENT) {
51112 + skip = 0;
51113 + offset += length;
51114 + ext++;
51115 + continue;
51116 + }
51117 +
51118 + if (offset + length > to_off) {
51119 + length = to_off - offset;
51120 + }
51121 +
51122 + DQUOT_FREE_BLOCK_NODIRTY(inode, length);
51123 +
51124 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
51125 + /* free fake-allocated space of jnodes corresponding to this unallocated extent */
51126 + fake_allocated2free(length, 0 /* unformatted */ );
51127 +
51128 + skip = 0;
51129 + offset += length;
51130 + ext++;
51131 + continue;
51132 + }
51133 +
51134 + assert("vs-1218", state_of_extent(ext) == ALLOCATED_EXTENT);
51135 +
51136 + if (length != 0) {
51137 + start = extent_get_start(ext) + skip;
51138 +
51139 + /* the BA_DEFER bit parameter is turned on because blocks which get freed here
51140 + are not safe to be reused immediately */
51141 + reiser4_dealloc_blocks(&start, &length,
51142 + 0 /* not used */ ,
51143 + BA_DEFER
51144 + /* unformatted with defer */ );
51145 + }
51146 + skip = 0;
51147 + offset += length;
51148 + ext++;
51149 + }
51150 + return retval;
51151 +}
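+
+/* A worked example of the page arithmetic above, assuming 4 KiB pages
+ (PAGE_CACHE_SHIFT == 12): killing the tail of an item starting at key
+ offset 10240 (ITEM_TAIL_KILLED) rounds the first freed page up:
+ from_off = (10240 + 4096 - 1) >> 12 = 3, so page 2, which still holds
+ live bytes 8192..10239, is preserved. Killing a head that begins at key
+ offset 0 up to and including offset 12287 (ITEM_HEAD_KILLED) gives
+ to_off = (12287 + 1) >> 12 = 3, freeing pages 0..2 exactly. */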
51152 +
51153 +/* item_plugin->b.kill_units */
51154 +int
51155 +kill_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
51156 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
51157 + reiser4_key * new_first)
51158 +{
51159 + reiser4_extent *ext;
51160 + reiser4_key item_key;
51161 + pos_in_node_t count;
51162 + reiser4_key from_key, to_key;
51163 + const reiser4_key *pfrom_key, *pto_key;
51164 + loff_t off;
51165 + int result;
51166 +
51167 + assert("vs-1541",
51168 + ((kdata->params.from_key == NULL && kdata->params.to_key == NULL)
51169 + || (kdata->params.from_key != NULL
51170 + && kdata->params.to_key != NULL)));
51171 +
51172 + if (kdata->params.from_key) {
51173 + pfrom_key = kdata->params.from_key;
51174 + pto_key = kdata->params.to_key;
51175 + } else {
51176 + coord_t dup;
51177 +
51178 + /* calculate key range of kill */
51179 + assert("vs-1549", from == coord->unit_pos);
51180 + unit_key_by_coord(coord, &from_key);
51181 + pfrom_key = &from_key;
51182 +
51183 + coord_dup(&dup, coord);
51184 + dup.unit_pos = to;
51185 + max_unit_key_by_coord(&dup, &to_key);
51186 + pto_key = &to_key;
51187 + }
51188 +
51189 + item_key_by_coord(coord, &item_key);
51190 +
51191 +#if REISER4_DEBUG
51192 + {
51193 + reiser4_key max_item_key;
51194 +
51195 + max_item_key_by_coord(coord, &max_item_key);
51196 +
51197 + if (new_first) {
51198 + /* head of item is to be cut */
51199 + assert("vs-1542", keyeq(pfrom_key, &item_key));
51200 + assert("vs-1538", keylt(pto_key, &max_item_key));
51201 + } else {
51202 + /* tail of item is to be cut */
51203 + assert("vs-1540", keygt(pfrom_key, &item_key));
51204 + assert("vs-1543", !keylt(pto_key, &max_item_key));
51205 + }
51206 + }
51207 +#endif
51208 +
51209 + if (smallest_removed)
51210 + *smallest_removed = *pfrom_key;
51211 +
51212 + if (new_first) {
51213 + /* item head is cut. Item key will change. This new key is calculated here */
51214 + assert("vs-1556",
51215 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
51216 + (PAGE_CACHE_SIZE - 1));
51217 + *new_first = *pto_key;
51218 + set_key_offset(new_first, get_key_offset(new_first) + 1);
51219 + }
51220 +
51221 + count = to - from + 1;
51222 + result = kill_hook_extent(coord, from, count, kdata);
51223 + if (result == ITEM_TAIL_KILLED) {
51224 + assert("vs-1553",
51225 + get_key_offset(pfrom_key) >=
51226 + get_key_offset(&item_key) +
51227 + reiser4_extent_size(coord, from));
51228 + off =
51229 + get_key_offset(pfrom_key) -
51230 + (get_key_offset(&item_key) +
51231 + reiser4_extent_size(coord, from));
51232 + if (off) {
51233 + /* unit @from is to be cut partially. Its width decreases */
51234 + ext = extent_item(coord) + from;
51235 + extent_set_width(ext,
51236 + (off + PAGE_CACHE_SIZE -
51237 + 1) >> PAGE_CACHE_SHIFT);
51238 + count--;
51239 + }
51240 + } else {
51241 + __u64 max_to_offset;
51242 + __u64 rest;
51243 +
51244 + assert("vs-1575", result == ITEM_HEAD_KILLED);
51245 + assert("", from == 0);
51246 + assert("",
51247 + ((get_key_offset(pto_key) + 1) & (PAGE_CACHE_SIZE -
51248 + 1)) == 0);
51249 + assert("",
51250 + get_key_offset(pto_key) + 1 >
51251 + get_key_offset(&item_key) +
51252 + reiser4_extent_size(coord, to));
51253 + max_to_offset =
51254 + get_key_offset(&item_key) +
51255 + reiser4_extent_size(coord, to + 1) - 1;
51256 + assert("", get_key_offset(pto_key) <= max_to_offset);
51257 +
51258 + rest =
51259 + (max_to_offset -
51260 + get_key_offset(pto_key)) >> PAGE_CACHE_SHIFT;
51261 + if (rest) {
51262 + /* unit @to is to be cut partially */
51263 + ext = extent_item(coord) + to;
51264 +
51265 + assert("", extent_get_width(ext) > rest);
51266 +
51267 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
51268 + extent_set_start(ext,
51269 + extent_get_start(ext) +
51270 + (extent_get_width(ext) -
51271 + rest));
51272 +
51273 + extent_set_width(ext, rest);
51274 + count--;
51275 + }
51276 + }
51277 + return count * sizeof(reiser4_extent);
51278 +}
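+
+/* For illustration, assuming 4 KiB pages: if unit @to is an allocated
+ extent of width 5 starting at block 100 and rest = 2 pages of it survive
+ the head kill, the unit becomes start = 103, width = 2 - the surviving
+ tail keeps its original on-disk blocks - and count is decremented so the
+ unit itself is not cut from the item. */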
51279 +
51280 +/* item_plugin->b.cut_units
51281 + this is too similar to kill_units_extent */
51282 +int
51283 +cut_units_extent(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
51284 + struct carry_cut_data *cdata, reiser4_key * smallest_removed,
51285 + reiser4_key * new_first)
51286 +{
51287 + reiser4_extent *ext;
51288 + reiser4_key item_key;
51289 + pos_in_node_t count;
51290 + reiser4_key from_key, to_key;
51291 + const reiser4_key *pfrom_key, *pto_key;
51292 + loff_t off;
51293 +
51294 + assert("vs-1541",
51295 + ((cdata->params.from_key == NULL && cdata->params.to_key == NULL)
51296 + || (cdata->params.from_key != NULL
51297 + && cdata->params.to_key != NULL)));
51298 +
51299 + if (cdata->params.from_key) {
51300 + pfrom_key = cdata->params.from_key;
51301 + pto_key = cdata->params.to_key;
51302 + } else {
51303 + coord_t dup;
51304 +
51305 + /* calculate key range of cut */
51306 + coord_dup(&dup, coord);
51307 + dup.unit_pos = from;
51308 + unit_key_by_coord(&dup, &from_key);
51309 +
51310 + dup.unit_pos = to;
51311 + max_unit_key_by_coord(&dup, &to_key);
51312 +
51313 + pfrom_key = &from_key;
51314 + pto_key = &to_key;
51315 + }
51316 +
51317 + assert("vs-1555",
51318 + (get_key_offset(pfrom_key) & (PAGE_CACHE_SIZE - 1)) == 0);
51319 + assert("vs-1556",
51320 + (get_key_offset(pto_key) & (PAGE_CACHE_SIZE - 1)) ==
51321 + (PAGE_CACHE_SIZE - 1));
51322 +
51323 + item_key_by_coord(coord, &item_key);
51324 +
51325 +#if REISER4_DEBUG
51326 + {
51327 + reiser4_key max_item_key;
51328 +
51329 + assert("vs-1584",
51330 + get_key_locality(pfrom_key) ==
51331 + get_key_locality(&item_key));
51332 + assert("vs-1585",
51333 + get_key_type(pfrom_key) == get_key_type(&item_key));
51334 + assert("vs-1586",
51335 + get_key_objectid(pfrom_key) ==
51336 + get_key_objectid(&item_key));
51337 + assert("vs-1587",
51338 + get_key_ordering(pfrom_key) ==
51339 + get_key_ordering(&item_key));
51340 +
51341 + max_item_key_by_coord(coord, &max_item_key);
51342 +
51343 + if (new_first != NULL) {
51344 + /* head of item is to be cut */
51345 + assert("vs-1542", keyeq(pfrom_key, &item_key));
51346 + assert("vs-1538", keylt(pto_key, &max_item_key));
51347 + } else {
51348 + /* tail of item is to be cut */
51349 + assert("vs-1540", keygt(pfrom_key, &item_key));
51350 + assert("vs-1543", keyeq(pto_key, &max_item_key));
51351 + }
51352 + }
51353 +#endif
51354 +
51355 + if (smallest_removed)
51356 + *smallest_removed = *pfrom_key;
51357 +
51358 + if (new_first) {
51359 + /* item head is cut. Item key will change. This new key is calculated here */
51360 + *new_first = *pto_key;
51361 + set_key_offset(new_first, get_key_offset(new_first) + 1);
51362 + }
51363 +
51364 + count = to - from + 1;
51365 +
51366 + assert("vs-1553",
51367 + get_key_offset(pfrom_key) >=
51368 + get_key_offset(&item_key) + reiser4_extent_size(coord, from));
51369 + off =
51370 + get_key_offset(pfrom_key) - (get_key_offset(&item_key) +
51371 + reiser4_extent_size(coord, from));
51372 + if (off) {
51373 + /* tail of unit @from is to be cut partially. Its width decreases */
51374 + assert("vs-1582", new_first == NULL);
51375 + ext = extent_item(coord) + from;
51376 + extent_set_width(ext, off >> PAGE_CACHE_SHIFT);
51377 + count--;
51378 + }
51379 +
51380 + assert("vs-1554",
51381 + get_key_offset(pto_key) <=
51382 + get_key_offset(&item_key) +
51383 + reiser4_extent_size(coord, to + 1) - 1);
51384 + off =
51385 + (get_key_offset(&item_key) +
51386 + reiser4_extent_size(coord, to + 1) - 1) -
51387 + get_key_offset(pto_key);
51388 + if (off) {
51389 + /* @to_key is smaller than the max key of unit @to. Unit @to will not be removed; its start is
51390 + increased and its width decreased. */
51391 + assert("vs-1583", (off & (PAGE_CACHE_SIZE - 1)) == 0);
51392 + ext = extent_item(coord) + to;
51393 + if (state_of_extent(ext) == ALLOCATED_EXTENT)
51394 + extent_set_start(ext,
51395 + extent_get_start(ext) +
51396 + (extent_get_width(ext) -
51397 + (off >> PAGE_CACHE_SHIFT)));
51398 +
51399 + extent_set_width(ext, (off >> PAGE_CACHE_SHIFT));
51400 + count--;
51401 + }
51402 + return count * sizeof(reiser4_extent);
51403 +}
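+
+/* The two partial-trim cases above follow one pattern; a minimal sketch,
+ disabled here, using only the extent accessors of this file. @keep_pages,
+ naming how many pages of the unit survive, is a hypothetical parameter: */
+#if 0
+static void trim_extent_unit(reiser4_extent * ext, reiser4_block_nr keep_pages,
+ int keep_tail /* 1: keep last pages, 0: keep first */ )
+{
+ reiser4_block_nr width = extent_get_width(ext);
+
+ assert("vs-sketch-1", width > keep_pages);
+ if (keep_tail && state_of_extent(ext) == ALLOCATED_EXTENT)
+ /* the surviving tail keeps its original on-disk blocks */
+ extent_set_start(ext,
+ extent_get_start(ext) + (width - keep_pages));
+ extent_set_width(ext, keep_pages);
+}
+#endif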
51404 +
51405 +/* item_plugin->b.unit_key */
51406 +reiser4_key *unit_key_extent(const coord_t * coord, reiser4_key * key)
51407 +{
51408 + assert("vs-300", coord_is_existing_unit(coord));
51409 +
51410 + item_key_by_coord(coord, key);
51411 + set_key_offset(key,
51412 + (get_key_offset(key) +
51413 + reiser4_extent_size(coord, coord->unit_pos)));
51414 +
51415 + return key;
51416 +}
51417 +
51418 +/* item_plugin->b.max_unit_key */
51419 +reiser4_key *max_unit_key_extent(const coord_t * coord, reiser4_key * key)
51420 +{
51421 + assert("vs-300", coord_is_existing_unit(coord));
51422 +
51423 + item_key_by_coord(coord, key);
51424 + set_key_offset(key,
51425 + (get_key_offset(key) +
51426 + reiser4_extent_size(coord, coord->unit_pos + 1) - 1));
51427 + return key;
51428 +}
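+
+/* A worked example, assuming 4 KiB pages: for an item whose units have
+ widths {2, 3}, reiser4_extent_size(coord, 1) is 2 pages = 8192 bytes, so
+ unit_key_extent() for unit 1 is the item key offset plus 8192, and
+ max_unit_key_extent() for unit 1 is the item key offset plus
+ (2 + 3) * 4096 - 1 = 20479, the last byte addressed by the unit. */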
51429 +
51430 +/* item_plugin->b.estimate
51431 + item_plugin->b.item_data_by_flow */
51432 +
51433 +#if REISER4_DEBUG
51434 +
51435 +/* item_plugin->b.check
51436 + used for debugging; every item should have here the most complete
51437 + possible check of the consistency of the item that the inventor can
51438 + construct
51439 +*/
51440 +int reiser4_check_extent(const coord_t * coord /* coord of item to check */,
51441 + const char **error /* where to store error message */)
51442 +{
51443 + reiser4_extent *ext, *first;
51444 + unsigned i, j;
51445 + reiser4_block_nr start, width, blk_cnt;
51446 + unsigned num_units;
51447 + reiser4_tree *tree;
51448 + oid_t oid;
51449 + reiser4_key key;
51450 + coord_t scan;
51451 +
51452 + assert("vs-933", REISER4_DEBUG);
51453 +
51454 + if (znode_get_level(coord->node) != TWIG_LEVEL) {
51455 + *error = "Extent on the wrong level";
51456 + return -1;
51457 + }
51458 + if (item_length_by_coord(coord) % sizeof(reiser4_extent) != 0) {
51459 + *error = "Wrong item size";
51460 + return -1;
51461 + }
51462 + ext = first = extent_item(coord);
51463 + blk_cnt = reiser4_block_count(reiser4_get_current_sb());
51464 + num_units = coord_num_units(coord);
51465 + tree = znode_get_tree(coord->node);
51466 + item_key_by_coord(coord, &key);
51467 + oid = get_key_objectid(&key);
51468 + coord_dup(&scan, coord);
51469 +
51470 + for (i = 0; i < num_units; ++i, ++ext) {
51471 + __u64 index;
51472 +
51473 + scan.unit_pos = i;
51474 + index = extent_unit_index(&scan);
51475 +
51476 +#if 0
51477 + /* check that all jnodes are present for the unallocated
51478 + * extent */
51479 + if (state_of_extent(ext) == UNALLOCATED_EXTENT) {
51480 + for (j = 0; j < extent_get_width(ext); j++) {
51481 + jnode *node;
51482 +
51483 + node = jlookup(tree, oid, index + j);
51484 + if (node == NULL) {
51485 + print_coord("scan", &scan, 0);
51486 + *error = "Jnode missing";
51487 + return -1;
51488 + }
51489 + jput(node);
51490 + }
51491 + }
51492 +#endif
51493 +
51494 + start = extent_get_start(ext);
51495 + if (start < 2)
51496 + continue;
51497 + /* extent is an allocated one */
51498 + width = extent_get_width(ext);
51499 + if (start >= blk_cnt) {
51500 + *error = "Start too large";
51501 + return -1;
51502 + }
51503 + if (start + width > blk_cnt) {
51504 + *error = "End too large";
51505 + return -1;
51506 + }
51507 + /* make sure that this extent does not overlap with other
51508 + allocated extents */
51509 + for (j = 0; j < i; j++) {
51510 + if (state_of_extent(first + j) != ALLOCATED_EXTENT)
51511 + continue;
51512 + if (!
51513 + ((extent_get_start(ext) >=
51514 + extent_get_start(first + j) +
51515 + extent_get_width(first + j))
51516 + || (extent_get_start(ext) +
51517 + extent_get_width(ext) <=
51518 + extent_get_start(first + j)))) {
51519 + *error = "Extent overlaps with others";
51520 + return -1;
51521 + }
51522 + }
51523 +
51524 + }
51525 +
51526 + return 0;
51527 +}
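+
+/* The overlap test above, stated positively as a sketch (disabled): the
+ half-open ranges [s1, s1 + w1) and [s2, s2 + w2) are disjoint iff one
+ ends before the other begins; the loop rejects the item when neither
+ condition holds. */
+#if 0
+static int extents_disjoint(reiser4_block_nr s1, reiser4_block_nr w1,
+ reiser4_block_nr s2, reiser4_block_nr w2)
+{
+ return s2 >= s1 + w1 || s2 + w2 <= s1;
+}
+#endif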
51528 +
51529 +#endif /* REISER4_DEBUG */
51530 +
51531 +/*
51532 + Local variables:
51533 + c-indentation-style: "K&R"
51534 + mode-name: "LC"
51535 + c-basic-offset: 8
51536 + tab-width: 8
51537 + fill-column: 120
51538 + scroll-step: 1
51539 + End:
51540 +*/
51541 diff --git a/fs/reiser4/plugin/item/internal.c b/fs/reiser4/plugin/item/internal.c
51542 new file mode 100644
51543 index 0000000..eb79388
51544 --- /dev/null
51545 +++ b/fs/reiser4/plugin/item/internal.c
51546 @@ -0,0 +1,396 @@
51547 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51548 +
51549 +/* Implementation of internal-item plugin methods. */
51550 +
51551 +#include "../../forward.h"
51552 +#include "../../debug.h"
51553 +#include "../../dformat.h"
51554 +#include "../../key.h"
51555 +#include "../../coord.h"
51556 +#include "internal.h"
51557 +#include "item.h"
51558 +#include "../node/node.h"
51559 +#include "../plugin.h"
51560 +#include "../../jnode.h"
51561 +#include "../../znode.h"
51562 +#include "../../tree_walk.h"
51563 +#include "../../tree_mod.h"
51564 +#include "../../tree.h"
51565 +#include "../../super.h"
51566 +#include "../../block_alloc.h"
51567 +
51568 +/* see internal.h for explanation */
51569 +
51570 +/* plugin->u.item.b.mergeable */
51571 +int mergeable_internal(const coord_t * p1 UNUSED_ARG /* first item */ ,
51572 + const coord_t * p2 UNUSED_ARG /* second item */ )
51573 +{
51574 + /* internal items are not mergeable */
51575 + return 0;
51576 +}
51577 +
51578 +/* ->lookup() method for internal items */
51579 +lookup_result lookup_internal(const reiser4_key * key /* key to look up */ ,
51580 + lookup_bias bias UNUSED_ARG /* lookup bias */ ,
51581 + coord_t * coord /* coord of item */ )
51582 +{
51583 + reiser4_key ukey;
51584 +
51585 + switch (keycmp(unit_key_by_coord(coord, &ukey), key)) {
51586 + default:
51587 + impossible("", "keycmp()?!");
51588 + case LESS_THAN:
51589 + /* FIXME-VS: AFTER_ITEM used to be here. But with the new coord
51590 + layout the item plugin cannot be obtained from a coord set this way */
51591 + assert("vs-681", coord->unit_pos == 0);
51592 + coord->between = AFTER_UNIT;
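+ /* fall through: a coord set to AFTER_UNIT still identifies
+ this internal item, so report it as found */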
51593 + case EQUAL_TO:
51594 + return CBK_COORD_FOUND;
51595 + case GREATER_THAN:
51596 + return CBK_COORD_NOTFOUND;
51597 + }
51598 +}
51599 +
51600 +/* return body of internal item at @coord */
51601 +static internal_item_layout *internal_at(const coord_t * coord /* coord of
51602 + * item */ )
51603 +{
51604 + assert("nikita-607", coord != NULL);
51605 + assert("nikita-1650",
51606 + item_plugin_by_coord(coord) ==
51607 + item_plugin_by_id(NODE_POINTER_ID));
51608 + return (internal_item_layout *) item_body_by_coord(coord);
51609 +}
51610 +
51611 +void reiser4_update_internal(const coord_t * coord,
51612 + const reiser4_block_nr * blocknr)
51613 +{
51614 + internal_item_layout *item = internal_at(coord);
51615 + assert("nikita-2959", reiser4_blocknr_is_sane(blocknr));
51616 +
51617 + put_unaligned(cpu_to_le64(*blocknr), &item->pointer);
51618 +}
51619 +
51620 +/* return child block number stored in the internal item at @coord */
51621 +static reiser4_block_nr pointer_at(const coord_t * coord /* coord of item */ )
51622 +{
51623 + assert("nikita-608", coord != NULL);
51624 + return le64_to_cpu(get_unaligned(&internal_at(coord)->pointer));
51625 +}
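+
+/* The pointer is kept little-endian on disk and converted on every access,
+ so reiser4_update_internal() followed by pointer_at() round-trips a block
+ number on both little- and big-endian hosts:
+ put_unaligned(cpu_to_le64(blk), &item->pointer), then
+ le64_to_cpu(get_unaligned(&item->pointer)) yields blk again. */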
51626 +
51627 +/* get znode pointed to by internal @item */
51628 +static znode *znode_at(const coord_t * item /* coord of item */ ,
51629 + znode * parent /* parent node */ )
51630 +{
51631 + return child_znode(item, parent, 1, 0);
51632 +}
51633 +
51634 +/* store pointer from internal item into "block". Implementation of
51635 + ->down_link() method */
51636 +void down_link_internal(const coord_t * coord /* coord of item */ ,
51637 + const reiser4_key * key UNUSED_ARG /* key to get
51638 + * pointer for */ ,
51639 + reiser4_block_nr * block /* resulting block number */ )
51640 +{
51641 + ON_DEBUG(reiser4_key item_key);
51642 +
51643 + assert("nikita-609", coord != NULL);
51644 + assert("nikita-611", block != NULL);
51645 + assert("nikita-612", (key == NULL) ||
51646 + /* twig horrors */
51647 + (znode_get_level(coord->node) == TWIG_LEVEL)
51648 + || keyle(item_key_by_coord(coord, &item_key), key));
51649 +
51650 + *block = pointer_at(coord);
51651 + assert("nikita-2960", reiser4_blocknr_is_sane(block));
51652 +}
51653 +
51654 +/* Get the child's block number, or 0 if the block is unallocated. */
51655 +int
51656 +utmost_child_real_block_internal(const coord_t * coord, sideof side UNUSED_ARG,
51657 + reiser4_block_nr * block)
51658 +{
51659 + assert("jmacd-2059", coord != NULL);
51660 +
51661 + *block = pointer_at(coord);
51662 + assert("nikita-2961", reiser4_blocknr_is_sane(block));
51663 +
51664 + if (reiser4_blocknr_is_fake(block)) {
51665 + *block = 0;
51666 + }
51667 +
51668 + return 0;
51669 +}
51670 +
51671 +/* Return the child. */
51672 +int
51673 +utmost_child_internal(const coord_t * coord, sideof side UNUSED_ARG,
51674 + jnode ** childp)
51675 +{
51676 + reiser4_block_nr block = pointer_at(coord);
51677 + znode *child;
51678 +
51679 + assert("jmacd-2059", childp != NULL);
51680 + assert("nikita-2962", reiser4_blocknr_is_sane(&block));
51681 +
51682 + child = zlook(znode_get_tree(coord->node), &block);
51683 +
51684 + if (IS_ERR(child)) {
51685 + return PTR_ERR(child);
51686 + }
51687 +
51688 + *childp = ZJNODE(child);
51689 +
51690 + return 0;
51691 +}
51692 +
51693 +#if REISER4_DEBUG
51694 +
51695 +static void check_link(znode * left, znode * right)
51696 +{
51697 + znode *scan;
51698 +
51699 + for (scan = left; scan != right; scan = scan->right) {
51700 + if (ZF_ISSET(scan, JNODE_RIP))
51701 + break;
51702 + if (znode_is_right_connected(scan) && scan->right != NULL) {
51703 + if (ZF_ISSET(scan->right, JNODE_RIP))
51704 + break;
51705 + assert("nikita-3285",
51706 + znode_is_left_connected(scan->right));
51707 + assert("nikita-3265",
51708 + ergo(scan != left,
51709 + ZF_ISSET(scan, JNODE_HEARD_BANSHEE)));
51710 + assert("nikita-3284", scan->right->left == scan);
51711 + } else
51712 + break;
51713 + }
51714 +}
51715 +
51716 +int check__internal(const coord_t * coord, const char **error)
51717 +{
51718 + reiser4_block_nr blk;
51719 + znode *child;
51720 + coord_t cpy;
51721 +
51722 + blk = pointer_at(coord);
51723 + if (!reiser4_blocknr_is_sane(&blk)) {
51724 + *error = "Invalid pointer";
51725 + return -1;
51726 + }
51727 + coord_dup(&cpy, coord);
51728 + child = znode_at(&cpy, cpy.node);
51729 + if (child != NULL) {
51730 + znode *left_child;
51731 + znode *right_child;
51732 +
51733 + left_child = right_child = NULL;
51734 +
51735 + assert("nikita-3256", znode_invariant(child));
51736 + if (coord_prev_item(&cpy) == 0 && item_is_internal(&cpy)) {
51737 + left_child = znode_at(&cpy, cpy.node);
51738 + if (left_child != NULL) {
51739 + read_lock_tree(znode_get_tree(child));
51740 + check_link(left_child, child);
51741 + read_unlock_tree(znode_get_tree(child));
51742 + zput(left_child);
51743 + }
51744 + }
51745 + coord_dup(&cpy, coord);
51746 + if (coord_next_item(&cpy) == 0 && item_is_internal(&cpy)) {
51747 + right_child = znode_at(&cpy, cpy.node);
51748 + if (right_child != NULL) {
51749 + read_lock_tree(znode_get_tree(child));
51750 + check_link(child, right_child);
51751 + read_unlock_tree(znode_get_tree(child));
51752 + zput(right_child);
51753 + }
51754 + }
51755 + zput(child);
51756 + }
51757 + return 0;
51758 +}
51759 +
51760 +#endif /* REISER4_DEBUG */
51761 +
51762 +/* return true only if this item really points to "block" */
51763 +/* Audited by: green(2002.06.14) */
51764 +int has_pointer_to_internal(const coord_t * coord /* coord of item */ ,
51765 + const reiser4_block_nr * block /* block number to
51766 + * check */ )
51767 +{
51768 + assert("nikita-613", coord != NULL);
51769 + assert("nikita-614", block != NULL);
51770 +
51771 + return pointer_at(coord) == *block;
51772 +}
51773 +
51774 +/* hook called by ->create_item() method of node plugin after new internal
51775 + item was just created.
51776 +
51777 + This is the point where the pointer to the new node is inserted into the tree.
51778 + Initialize the parent pointer in the child znode, insert the child into the sibling list and slum.
51779 +
51780 +*/
51781 +int create_hook_internal(const coord_t * item /* coord of item */ ,
51782 + void *arg /* child's left neighbor, if any */ )
51783 +{
51784 + znode *child;
51785 + __u64 child_ptr;
51786 +
51787 + assert("nikita-1252", item != NULL);
51788 + assert("nikita-1253", item->node != NULL);
51789 + assert("nikita-1181", znode_get_level(item->node) > LEAF_LEVEL);
51790 + assert("nikita-1450", item->unit_pos == 0);
51791 +
51792 + /*
51793 + * In preparation for item insertion, build_child_ptr_data sets the
51794 + * pointer to the data to be inserted to the jnode's blocknr, which is in
51795 + * CPU byte order. The node's create_item simply copied those data. As a
51796 + * result we have the child pointer in CPU byte order. Convert the content
51797 + * of the internal item to little-endian byte order.
51798 + */
51799 + child_ptr = get_unaligned((__u64 *)item_body_by_coord(item));
51800 + reiser4_update_internal(item, &child_ptr);
51801 +
51802 + child = znode_at(item, item->node);
51803 + if (child != NULL && !IS_ERR(child)) {
51804 + znode *left;
51805 + int result = 0;
51806 + reiser4_tree *tree;
51807 +
51808 + left = arg;
51809 + tree = znode_get_tree(item->node);
51810 + write_lock_tree(tree);
51811 + write_lock_dk(tree);
51812 + assert("nikita-1400", (child->in_parent.node == NULL)
51813 + || (znode_above_root(child->in_parent.node)));
51814 + ++item->node->c_count;
51815 + coord_to_parent_coord(item, &child->in_parent);
51816 + sibling_list_insert_nolock(child, left);
51817 +
51818 + assert("nikita-3297", ZF_ISSET(child, JNODE_ORPHAN));
51819 + ZF_CLR(child, JNODE_ORPHAN);
51820 +
51821 + if ((left != NULL) && !keyeq(znode_get_rd_key(left),
51822 + znode_get_rd_key(child))) {
51823 + znode_set_rd_key(child, znode_get_rd_key(left));
51824 + }
51825 + write_unlock_dk(tree);
51826 + write_unlock_tree(tree);
51827 + zput(child);
51828 + return result;
51829 + } else {
51830 + if (child == NULL)
51831 + child = ERR_PTR(-EIO);
51832 + return PTR_ERR(child);
51833 + }
51834 +}
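+
+/* Note the lock ordering above: the tree lock is taken before the dk lock
+ (write_lock_tree(), then write_lock_dk()), the same order used by
+ kill_hook_extent(); keeping that order consistent is what lets the paired
+ link and delimiting-key updates stay atomic without deadlocking. */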
51835 +
51836 +/* hook called by ->cut_and_kill() method of node plugin just before internal
51837 + item is removed.
51838 +
51839 + This is the point where an empty node is removed from the tree. Clear the
51840 + parent pointer in the child, and mark the node for pending deletion.
51841 +
51842 + The node will actually be deleted later, in several stages:
51843 +
51844 + . when the last lock on this node is released, the node is removed from
51845 + the sibling list and its lock is invalidated
51846 +
51847 + . when the last reference to this node is dropped, the bitmap is updated
51848 + and the node is actually removed from memory.
51849 +
51850 +*/
51851 +int kill_hook_internal(const coord_t * item /* coord of item */ ,
51852 + pos_in_node_t from UNUSED_ARG /* start unit */ ,
51853 + pos_in_node_t count UNUSED_ARG /* stop unit */ ,
51854 + struct carry_kill_data *p UNUSED_ARG)
51855 +{
51856 + znode *child;
51857 +
51858 + assert("nikita-1222", item != NULL);
51859 + assert("nikita-1224", from == 0);
51860 + assert("nikita-1225", count == 1);
51861 +
51862 + child = znode_at(item, item->node);
51863 + if (IS_ERR(child))
51864 + return PTR_ERR(child);
51865 + else if (node_is_empty(child)) {
51866 + reiser4_tree *tree;
51867 +
51868 + assert("nikita-1397", znode_is_write_locked(child));
51869 + assert("nikita-1398", child->c_count == 0);
51870 + assert("nikita-2546", ZF_ISSET(child, JNODE_HEARD_BANSHEE));
51871 +
51872 + tree = znode_get_tree(item->node);
51873 + write_lock_tree(tree);
51874 + init_parent_coord(&child->in_parent, NULL);
51875 + --item->node->c_count;
51876 + write_unlock_tree(tree);
51877 + zput(child);
51878 + return 0;
51879 + } else {
51880 + warning("nikita-1223",
51881 + "Cowardly refuse to remove link to non-empty node");
51882 + zput(child);
51883 + return RETERR(-EIO);
51884 + }
51885 +}
51886 +
51887 +/* hook called by ->shift() node plugin method when an internal item has just
51888 + been moved from one node to another.
51889 +
51890 + Update parent pointer in child and c_counts in old and new parent
51891 +
51892 +*/
51893 +int shift_hook_internal(const coord_t * item /* coord of item */ ,
51894 + unsigned from UNUSED_ARG /* start unit */ ,
51895 + unsigned count UNUSED_ARG /* stop unit */ ,
51896 + znode * old_node /* old parent */ )
51897 +{
51898 + znode *child;
51899 + znode *new_node;
51900 + reiser4_tree *tree;
51901 +
51902 + assert("nikita-1276", item != NULL);
51903 + assert("nikita-1277", from == 0);
51904 + assert("nikita-1278", count == 1);
51905 + assert("nikita-1451", item->unit_pos == 0);
51906 +
51907 + new_node = item->node;
51908 + assert("nikita-2132", new_node != old_node);
51909 + tree = znode_get_tree(item->node);
51910 + child = child_znode(item, old_node, 1, 0);
51911 + if (child == NULL)
51912 + return 0;
51913 + if (!IS_ERR(child)) {
51914 + write_lock_tree(tree);
51915 + ++new_node->c_count;
51916 + assert("nikita-1395", znode_parent(child) == old_node);
51917 + assert("nikita-1396", old_node->c_count > 0);
51918 + coord_to_parent_coord(item, &child->in_parent);
51919 + assert("nikita-1781", znode_parent(child) == new_node);
51920 + assert("nikita-1782",
51921 + check_tree_pointer(item, child) == NS_FOUND);
51922 + --old_node->c_count;
51923 + write_unlock_tree(tree);
51924 + zput(child);
51925 + return 0;
51926 + } else
51927 + return PTR_ERR(child);
51928 +}
51929 +
51930 +/* plugin->u.item.b.max_key_inside - not defined */
51931 +
51932 +/* plugin->u.item.b.nr_units - item.c:single_unit */
51933 +
51934 +/* Make Linus happy.
51935 + Local variables:
51936 + c-indentation-style: "K&R"
51937 + mode-name: "LC"
51938 + c-basic-offset: 8
51939 + tab-width: 8
51940 + fill-column: 120
51941 + End:
51942 +*/
51943 diff --git a/fs/reiser4/plugin/item/internal.h b/fs/reiser4/plugin/item/internal.h
51944 new file mode 100644
51945 index 0000000..27aa27d
51946 --- /dev/null
51947 +++ b/fs/reiser4/plugin/item/internal.h
51948 @@ -0,0 +1,57 @@
51949 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
51950 +/* An internal item contains a down-link to the child of an internal/twig
51951 + node in the tree. It is internal items that are actually used during
51952 + tree traversal. */
51953 +
51954 +#if !defined( __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ )
51955 +#define __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__
51956 +
51957 +#include "../../forward.h"
51958 +#include "../../dformat.h"
51959 +
51960 +/* on-disk layout of internal item */
51961 +typedef struct internal_item_layout {
51962 + /* 0 */ reiser4_dblock_nr pointer;
51963 + /* 8 */
51964 +} internal_item_layout;
51965 +
51966 +struct cut_list;
51967 +
51968 +int mergeable_internal(const coord_t * p1, const coord_t * p2);
51969 +lookup_result lookup_internal(const reiser4_key * key, lookup_bias bias,
51970 + coord_t * coord);
51971 +/* store pointer from internal item into "block". Implementation of
51972 + ->down_link() method */
51973 +extern void down_link_internal(const coord_t * coord, const reiser4_key * key,
51974 + reiser4_block_nr * block);
51975 +extern int has_pointer_to_internal(const coord_t * coord,
51976 + const reiser4_block_nr * block);
51977 +extern int create_hook_internal(const coord_t * item, void *arg);
51978 +extern int kill_hook_internal(const coord_t * item, pos_in_node_t from,
51979 + pos_in_node_t count, struct carry_kill_data *);
51980 +extern int shift_hook_internal(const coord_t * item, unsigned from,
51981 + unsigned count, znode * old_node);
51982 +extern void reiser4_print_internal(const char *prefix, coord_t * coord);
51983 +
51984 +extern int utmost_child_internal(const coord_t * coord, sideof side,
51985 + jnode ** child);
51986 +int utmost_child_real_block_internal(const coord_t * coord, sideof side,
51987 + reiser4_block_nr * block);
51988 +
51989 +extern void reiser4_update_internal(const coord_t * coord,
51990 + const reiser4_block_nr * blocknr);
51991 +/* FIXME: reiserfs has check_internal */
51992 +extern int check__internal(const coord_t * coord, const char **error);
51993 +
51994 +/* __FS_REISER4_PLUGIN_ITEM_INTERNAL_H__ */
51995 +#endif
51996 +
51997 +/* Make Linus happy.
51998 + Local variables:
51999 + c-indentation-style: "K&R"
52000 + mode-name: "LC"
52001 + c-basic-offset: 8
52002 + tab-width: 8
52003 + fill-column: 120
52004 + End:
52005 +*/
52006 diff --git a/fs/reiser4/plugin/item/item.c b/fs/reiser4/plugin/item/item.c
52007 new file mode 100644
52008 index 0000000..e226f04
52009 --- /dev/null
52010 +++ b/fs/reiser4/plugin/item/item.c
52011 @@ -0,0 +1,719 @@
52012 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52013 +
52014 +/* definition of item plugins. */
52015 +
52016 +#include "../../forward.h"
52017 +#include "../../debug.h"
52018 +#include "../../key.h"
52019 +#include "../../coord.h"
52020 +#include "../plugin_header.h"
52021 +#include "sde.h"
52022 +#include "internal.h"
52023 +#include "item.h"
52024 +#include "static_stat.h"
52025 +#include "../plugin.h"
52026 +#include "../../znode.h"
52027 +#include "../../tree.h"
52028 +#include "../../context.h"
52029 +#include "ctail.h"
52030 +
52031 +/* return pointer to item body */
52032 +void item_body_by_coord_hard(coord_t * coord /* coord to query */ )
52033 +{
52034 + assert("nikita-324", coord != NULL);
52035 + assert("nikita-325", coord->node != NULL);
52036 + assert("nikita-326", znode_is_loaded(coord->node));
52037 + assert("nikita-3200", coord->offset == INVALID_OFFSET);
52038 +
52039 + coord->offset =
52040 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
52041 + zdata(coord->node);
52042 + ON_DEBUG(coord->body_v = coord->node->times_locked);
52043 +}
52044 +
52045 +void *item_body_by_coord_easy(const coord_t * coord /* coord to query */ )
52046 +{
52047 + return zdata(coord->node) + coord->offset;
52048 +}
52049 +
52050 +#if REISER4_DEBUG
52051 +
52052 +int item_body_is_valid(const coord_t * coord)
52053 +{
52054 + return
52055 + coord->offset ==
52056 + node_plugin_by_node(coord->node)->item_by_coord(coord) -
52057 + zdata(coord->node);
52058 +}
52059 +
52060 +#endif
52061 +
52062 +/* return length of item at @coord */
52063 +pos_in_node_t item_length_by_coord(const coord_t * coord /* coord to query */ )
52064 +{
52065 + int len;
52066 +
52067 + assert("nikita-327", coord != NULL);
52068 + assert("nikita-328", coord->node != NULL);
52069 + assert("nikita-329", znode_is_loaded(coord->node));
52070 +
52071 + len = node_plugin_by_node(coord->node)->length_by_coord(coord);
52072 + return len;
52073 +}
52074 +
52075 +void obtain_item_plugin(const coord_t * coord)
52076 +{
52077 + assert("nikita-330", coord != NULL);
52078 + assert("nikita-331", coord->node != NULL);
52079 + assert("nikita-332", znode_is_loaded(coord->node));
52080 +
52081 + coord_set_iplug((coord_t *) coord,
52082 + node_plugin_by_node(coord->node)->
52083 + plugin_by_coord(coord));
52084 + assert("nikita-2479",
52085 + coord_iplug(coord) ==
52086 + node_plugin_by_node(coord->node)->plugin_by_coord(coord));
52087 +}
52088 +
52089 +/* return id of item */
52090 +/* Audited by: green(2002.06.15) */
52091 +item_id item_id_by_coord(const coord_t * coord /* coord to query */ )
52092 +{
52093 + assert("vs-539", coord != NULL);
52094 + assert("vs-538", coord->node != NULL);
52095 + assert("vs-537", znode_is_loaded(coord->node));
52096 + assert("vs-536", item_plugin_by_coord(coord) != NULL);
52097 + assert("vs-540",
52098 + item_id_by_plugin(item_plugin_by_coord(coord)) < LAST_ITEM_ID);
52099 +
52100 + return item_id_by_plugin(item_plugin_by_coord(coord));
52101 +}
52102 +
52103 +/* return key of item at @coord */
52104 +/* Audited by: green(2002.06.15) */
52105 +reiser4_key *item_key_by_coord(const coord_t * coord /* coord to query */ ,
52106 + reiser4_key * key /* result */ )
52107 +{
52108 + assert("nikita-338", coord != NULL);
52109 + assert("nikita-339", coord->node != NULL);
52110 + assert("nikita-340", znode_is_loaded(coord->node));
52111 +
52112 + return node_plugin_by_node(coord->node)->key_at(coord, key);
52113 +}
52114 +
52115 +/* this returns max key in the item */
52116 +reiser4_key *max_item_key_by_coord(const coord_t * coord /* coord to query */ ,
52117 + reiser4_key * key /* result */ )
52118 +{
52119 + coord_t last;
52120 +
52121 + assert("nikita-338", coord != NULL);
52122 + assert("nikita-339", coord->node != NULL);
52123 + assert("nikita-340", znode_is_loaded(coord->node));
52124 +
52125 + /* make coord pointing to last item's unit */
52126 + coord_dup(&last, coord);
52127 + last.unit_pos = coord_num_units(&last) - 1;
52128 + assert("vs-1560", coord_is_existing_unit(&last));
52129 +
52130 + max_unit_key_by_coord(&last, key);
52131 + return key;
52132 +}
52133 +
52134 +/* return key of unit at @coord */
52135 +reiser4_key *unit_key_by_coord(const coord_t * coord /* coord to query */ ,
52136 + reiser4_key * key /* result */ )
52137 +{
52138 + assert("nikita-772", coord != NULL);
52139 + assert("nikita-774", coord->node != NULL);
52140 + assert("nikita-775", znode_is_loaded(coord->node));
52141 +
52142 + if (item_plugin_by_coord(coord)->b.unit_key != NULL)
52143 + return item_plugin_by_coord(coord)->b.unit_key(coord, key);
52144 + else
52145 + return item_key_by_coord(coord, key);
52146 +}
52147 +
52148 +/* return the biggest key contained in the unit @coord */
52149 +reiser4_key *max_unit_key_by_coord(const coord_t * coord /* coord to query */ ,
52150 + reiser4_key * key /* result */ )
52151 +{
52152 + assert("nikita-772", coord != NULL);
52153 + assert("nikita-774", coord->node != NULL);
52154 + assert("nikita-775", znode_is_loaded(coord->node));
52155 +
52156 + if (item_plugin_by_coord(coord)->b.max_unit_key != NULL)
52157 + return item_plugin_by_coord(coord)->b.max_unit_key(coord, key);
52158 + else
52159 + return unit_key_by_coord(coord, key);
52160 +}
52161 +
52162 +/* ->max_key_inside() method for items consisting of exactly one key (like
52163 + stat-data) */
52164 +static reiser4_key *max_key_inside_single_key(const coord_t *
52165 + coord /* coord of item */ ,
52166 + reiser4_key *
52167 + result /* resulting key */ )
52168 +{
52169 + assert("nikita-604", coord != NULL);
52170 +
52171 + /* coord -> key is starting key of this item and it has to be already
52172 + filled in */
52173 + return unit_key_by_coord(coord, result);
52174 +}
52175 +
52176 +/* ->nr_units() method for items consisting of exactly one unit always */
52177 +pos_in_node_t
52178 +nr_units_single_unit(const coord_t * coord UNUSED_ARG /* coord of item */ )
52179 +{
52180 + return 1;
52181 +}
52182 +
52183 +static int
52184 +paste_no_paste(coord_t * coord UNUSED_ARG,
52185 + reiser4_item_data * data UNUSED_ARG,
52186 + carry_plugin_info * info UNUSED_ARG)
52187 +{
52188 + return 0;
52189 +}
52190 +
52191 +/* default ->fast_paste() method */
52192 +static int
52193 +agree_to_fast_op(const coord_t * coord UNUSED_ARG /* coord of item */ )
52194 +{
52195 + return 1;
52196 +}
52197 +
52198 +int item_can_contain_key(const coord_t * item /* coord of item */ ,
52199 + const reiser4_key * key /* key to check */ ,
52200 + const reiser4_item_data * data /* parameters of item
52201 + * being created */ )
52202 +{
52203 + item_plugin *iplug;
52204 + reiser4_key min_key_in_item;
52205 + reiser4_key max_key_in_item;
52206 +
52207 + assert("nikita-1658", item != NULL);
52208 + assert("nikita-1659", key != NULL);
52209 +
52210 + iplug = item_plugin_by_coord(item);
52211 + if (iplug->b.can_contain_key != NULL)
52212 + return iplug->b.can_contain_key(item, key, data);
52213 + else {
52214 + assert("nikita-1681", iplug->b.max_key_inside != NULL);
52215 + item_key_by_coord(item, &min_key_in_item);
52216 + iplug->b.max_key_inside(item, &max_key_in_item);
52217 +
52218 + /* can contain key if
52219 + min_key_in_item <= key &&
52220 + key <= max_key_in_item
52221 + */
52222 + return keyle(&min_key_in_item, key)
52223 + && keyle(key, &max_key_in_item);
52224 + }
52225 +}
52226 +
52227 +/* mergeable method for non-mergeable items */
52228 +static int
52229 +not_mergeable(const coord_t * i1 UNUSED_ARG, const coord_t * i2 UNUSED_ARG)
52230 +{
52231 + return 0;
52232 +}
52233 +
52234 +/* return 0 if @item1 and @item2 are not mergeable, !0 - otherwise */
52235 +int are_items_mergeable(const coord_t * i1 /* coord of first item */ ,
52236 + const coord_t * i2 /* coord of second item */ )
52237 +{
52238 + item_plugin *iplug;
52239 + reiser4_key k1;
52240 + reiser4_key k2;
52241 +
52242 + assert("nikita-1336", i1 != NULL);
52243 + assert("nikita-1337", i2 != NULL);
52244 +
52245 + iplug = item_plugin_by_coord(i1);
52246 + assert("nikita-1338", iplug != NULL);
52247 +
52248 + /* NOTE-NIKITA are_items_mergeable() is also called by assertions in
52249 + shifting code when nodes are in "suspended" state. */
52250 + assert("nikita-1663",
52251 + keyle(item_key_by_coord(i1, &k1), item_key_by_coord(i2, &k2)));
52252 +
52253 + if (iplug->b.mergeable != NULL) {
52254 + return iplug->b.mergeable(i1, i2);
52255 + } else if (iplug->b.max_key_inside != NULL) {
52256 + iplug->b.max_key_inside(i1, &k1);
52257 + item_key_by_coord(i2, &k2);
52258 +
52259 + /* mergeable if ->max_key_inside() >= key of i2; */
52260 + return keyge(&k1, &k2);
52262 + } else {
52263 + item_key_by_coord(i1, &k1);
52264 + item_key_by_coord(i2, &k2);
52265 +
52266 + return
52267 + (get_key_locality(&k1) == get_key_locality(&k2)) &&
52268 + (get_key_objectid(&k1) == get_key_objectid(&k2))
52269 + && (iplug == item_plugin_by_coord(i2));
52270 + }
52271 +}
52272 +
52273 +int item_is_extent(const coord_t * item)
52274 +{
52275 + assert("vs-482", coord_is_existing_item(item));
52276 + return item_id_by_coord(item) == EXTENT_POINTER_ID;
52277 +}
52278 +
52279 +int item_is_tail(const coord_t * item)
52280 +{
52281 + assert("vs-482", coord_is_existing_item(item));
52282 + return item_id_by_coord(item) == FORMATTING_ID;
52283 +}
52284 +
52285 +#if REISER4_DEBUG
52286 +
52287 +int item_is_statdata(const coord_t * item)
52288 +{
52289 + assert("vs-516", coord_is_existing_item(item));
52290 + return plugin_of_group(item_plugin_by_coord(item), STAT_DATA_ITEM_TYPE);
52291 +}
52292 +
52293 +int item_is_ctail(const coord_t * item)
52294 +{
52295 + assert("edward-xx", coord_is_existing_item(item));
52296 + return item_id_by_coord(item) == CTAIL_ID;
52297 +}
52298 +
52299 +#endif /* REISER4_DEBUG */
52300 +
52301 +static int change_item(struct inode *inode,
52302 + reiser4_plugin * plugin,
52303 + pset_member memb)
52304 +{
52305 + /* cannot change constituent item (sd, or dir_item) */
52306 + return RETERR(-EINVAL);
52307 +}
52308 +
52309 +static reiser4_plugin_ops item_plugin_ops = {
52310 + .init = NULL,
52311 + .load = NULL,
52312 + .save_len = NULL,
52313 + .save = NULL,
52314 + .change = change_item
52315 +};
52316 +
52317 +item_plugin item_plugins[LAST_ITEM_ID] = {
52318 + [STATIC_STAT_DATA_ID] = {
52319 + .h = {
52320 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52321 + .id = STATIC_STAT_DATA_ID,
52322 + .groups = (1 << STAT_DATA_ITEM_TYPE),
52323 + .pops = &item_plugin_ops,
52324 + .label = "sd",
52325 + .desc = "stat-data",
52326 + .linkage = {NULL, NULL}
52327 + },
52328 + .b = {
52329 + .max_key_inside = max_key_inside_single_key,
52330 + .can_contain_key = NULL,
52331 + .mergeable = not_mergeable,
52332 + .nr_units = nr_units_single_unit,
52333 + .lookup = NULL,
52334 + .init = NULL,
52335 + .paste = paste_no_paste,
52336 + .fast_paste = NULL,
52337 + .can_shift = NULL,
52338 + .copy_units = NULL,
52339 + .create_hook = NULL,
52340 + .kill_hook = NULL,
52341 + .shift_hook = NULL,
52342 + .cut_units = NULL,
52343 + .kill_units = NULL,
52344 + .unit_key = NULL,
52345 + .max_unit_key = NULL,
52346 + .estimate = NULL,
52347 + .item_data_by_flow = NULL,
52348 +#if REISER4_DEBUG
52349 + .check = NULL
52350 +#endif
52351 + },
52352 + .f = {
52353 + .utmost_child = NULL,
52354 + .utmost_child_real_block = NULL,
52355 + .update = NULL,
52356 + .scan = NULL,
52357 + .convert = NULL
52358 + },
52359 + .s = {
52360 + .sd = {
52361 + .init_inode = init_inode_static_sd,
52362 + .save_len = save_len_static_sd,
52363 + .save = save_static_sd
52364 + }
52365 + }
52366 + },
52367 + [SIMPLE_DIR_ENTRY_ID] = {
52368 + .h = {
52369 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52370 + .id = SIMPLE_DIR_ENTRY_ID,
52371 + .groups = (1 << DIR_ENTRY_ITEM_TYPE),
52372 + .pops = &item_plugin_ops,
52373 + .label = "de",
52374 + .desc = "directory entry",
52375 + .linkage = {NULL, NULL}
52376 + },
52377 + .b = {
52378 + .max_key_inside = max_key_inside_single_key,
52379 + .can_contain_key = NULL,
52380 + .mergeable = NULL,
52381 + .nr_units = nr_units_single_unit,
52382 + .lookup = NULL,
52383 + .init = NULL,
52384 + .paste = NULL,
52385 + .fast_paste = NULL,
52386 + .can_shift = NULL,
52387 + .copy_units = NULL,
52388 + .create_hook = NULL,
52389 + .kill_hook = NULL,
52390 + .shift_hook = NULL,
52391 + .cut_units = NULL,
52392 + .kill_units = NULL,
52393 + .unit_key = NULL,
52394 + .max_unit_key = NULL,
52395 + .estimate = NULL,
52396 + .item_data_by_flow = NULL,
52397 +#if REISER4_DEBUG
52398 + .check = NULL
52399 +#endif
52400 + },
52401 + .f = {
52402 + .utmost_child = NULL,
52403 + .utmost_child_real_block = NULL,
52404 + .update = NULL,
52405 + .scan = NULL,
52406 + .convert = NULL
52407 + },
52408 + .s = {
52409 + .dir = {
52410 + .extract_key = extract_key_de,
52411 + .update_key = update_key_de,
52412 + .extract_name = extract_name_de,
52413 + .extract_file_type = extract_file_type_de,
52414 + .add_entry = add_entry_de,
52415 + .rem_entry = rem_entry_de,
52416 + .max_name_len = max_name_len_de
52417 + }
52418 + }
52419 + },
52420 + [COMPOUND_DIR_ID] = {
52421 + .h = {
52422 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52423 + .id = COMPOUND_DIR_ID,
52424 + .groups = (1 << DIR_ENTRY_ITEM_TYPE),
52425 + .pops = &item_plugin_ops,
52426 + .label = "cde",
52427 + .desc = "compressed directory entry",
52428 + .linkage = {NULL, NULL}
52429 + },
52430 + .b = {
52431 + .max_key_inside = max_key_inside_cde,
52432 + .can_contain_key = can_contain_key_cde,
52433 + .mergeable = mergeable_cde,
52434 + .nr_units = nr_units_cde,
52435 + .lookup = lookup_cde,
52436 + .init = init_cde,
52437 + .paste = paste_cde,
52438 + .fast_paste = agree_to_fast_op,
52439 + .can_shift = can_shift_cde,
52440 + .copy_units = copy_units_cde,
52441 + .create_hook = NULL,
52442 + .kill_hook = NULL,
52443 + .shift_hook = NULL,
52444 + .cut_units = cut_units_cde,
52445 + .kill_units = kill_units_cde,
52446 + .unit_key = unit_key_cde,
52447 + .max_unit_key = unit_key_cde,
52448 + .estimate = estimate_cde,
52449 + .item_data_by_flow = NULL,
52450 +#if REISER4_DEBUG
52451 + .check = reiser4_check_cde
52452 +#endif
52453 + },
52454 + .f = {
52455 + .utmost_child = NULL,
52456 + .utmost_child_real_block = NULL,
52457 + .update = NULL,
52458 + .scan = NULL,
52459 + .convert = NULL
52460 + },
52461 + .s = {
52462 + .dir = {
52463 + .extract_key = extract_key_cde,
52464 + .update_key = update_key_cde,
52465 + .extract_name = extract_name_cde,
52466 + .extract_file_type = extract_file_type_de,
52467 + .add_entry = add_entry_cde,
52468 + .rem_entry = rem_entry_cde,
52469 + .max_name_len = max_name_len_cde
52470 + }
52471 + }
52472 + },
52473 + [NODE_POINTER_ID] = {
52474 + .h = {
52475 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52476 + .id = NODE_POINTER_ID,
52477 + .groups = (1 << INTERNAL_ITEM_TYPE),
52478 + .pops = NULL,
52479 + .label = "internal",
52480 + .desc = "internal item",
52481 + .linkage = {NULL, NULL}
52482 + },
52483 + .b = {
52484 + .max_key_inside = NULL,
52485 + .can_contain_key = NULL,
52486 + .mergeable = mergeable_internal,
52487 + .nr_units = nr_units_single_unit,
52488 + .lookup = lookup_internal,
52489 + .init = NULL,
52490 + .paste = NULL,
52491 + .fast_paste = NULL,
52492 + .can_shift = NULL,
52493 + .copy_units = NULL,
52494 + .create_hook = create_hook_internal,
52495 + .kill_hook = kill_hook_internal,
52496 + .shift_hook = shift_hook_internal,
52497 + .cut_units = NULL,
52498 + .kill_units = NULL,
52499 + .unit_key = NULL,
52500 + .max_unit_key = NULL,
52501 + .estimate = NULL,
52502 + .item_data_by_flow = NULL,
52503 +#if REISER4_DEBUG
52504 + .check = check__internal
52505 +#endif
52506 + },
52507 + .f = {
52508 + .utmost_child = utmost_child_internal,
52509 + .utmost_child_real_block =
52510 + utmost_child_real_block_internal,
52511 + .update = reiser4_update_internal,
52512 + .scan = NULL,
52513 + .convert = NULL
52514 + },
52515 + .s = {
52516 + .internal = {
52517 + .down_link = down_link_internal,
52518 + .has_pointer_to = has_pointer_to_internal
52519 + }
52520 + }
52521 + },
52522 + [EXTENT_POINTER_ID] = {
52523 + .h = {
52524 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52525 + .id = EXTENT_POINTER_ID,
52526 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
52527 + .pops = NULL,
52528 + .label = "extent",
52529 + .desc = "extent item",
52530 + .linkage = {NULL, NULL}
52531 + },
52532 + .b = {
52533 + .max_key_inside = max_key_inside_extent,
52534 + .can_contain_key = can_contain_key_extent,
52535 + .mergeable = mergeable_extent,
52536 + .nr_units = nr_units_extent,
52537 + .lookup = lookup_extent,
52538 + .init = NULL,
52539 + .paste = paste_extent,
52540 + .fast_paste = agree_to_fast_op,
52541 + .can_shift = can_shift_extent,
52542 + .create_hook = create_hook_extent,
52543 + .copy_units = copy_units_extent,
52544 + .kill_hook = kill_hook_extent,
52545 + .shift_hook = NULL,
52546 + .cut_units = cut_units_extent,
52547 + .kill_units = kill_units_extent,
52548 + .unit_key = unit_key_extent,
52549 + .max_unit_key = max_unit_key_extent,
52550 + .estimate = NULL,
52551 + .item_data_by_flow = NULL,
52552 +#if REISER4_DEBUG
52553 + .check = reiser4_check_extent
52554 +#endif
52555 + },
52556 + .f = {
52557 + .utmost_child = utmost_child_extent,
52558 + .utmost_child_real_block =
52559 + utmost_child_real_block_extent,
52560 + .update = NULL,
52561 + .scan = reiser4_scan_extent,
52562 + .convert = NULL,
52563 + .key_by_offset = key_by_offset_extent
52564 + },
52565 + .s = {
52566 + .file = {
52567 + .write = reiser4_write_extent,
52568 + .read = reiser4_read_extent,
52569 + .readpage = reiser4_readpage_extent,
52570 + .get_block = get_block_address_extent,
52571 + .append_key = append_key_extent,
52572 + .init_coord_extension =
52573 + init_coord_extension_extent
52574 + }
52575 + }
52576 + },
52577 + [FORMATTING_ID] = {
52578 + .h = {
52579 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52580 + .id = FORMATTING_ID,
52581 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
52582 + .pops = NULL,
52583 + .label = "body",
52584 + .desc = "body (or tail?) item",
52585 + .linkage = {NULL, NULL}
52586 + },
52587 + .b = {
52588 + .max_key_inside = max_key_inside_tail,
52589 + .can_contain_key = can_contain_key_tail,
52590 + .mergeable = mergeable_tail,
52591 + .nr_units = nr_units_tail,
52592 + .lookup = lookup_tail,
52593 + .init = NULL,
52594 + .paste = paste_tail,
52595 + .fast_paste = agree_to_fast_op,
52596 + .can_shift = can_shift_tail,
52597 + .create_hook = NULL,
52598 + .copy_units = copy_units_tail,
52599 + .kill_hook = kill_hook_tail,
52600 + .shift_hook = NULL,
52601 + .cut_units = cut_units_tail,
52602 + .kill_units = kill_units_tail,
52603 + .unit_key = unit_key_tail,
52604 + .max_unit_key = unit_key_tail,
52605 + .estimate = NULL,
52606 + .item_data_by_flow = NULL,
52607 +#if REISER4_DEBUG
52608 + .check = NULL
52609 +#endif
52610 + },
52611 + .f = {
52612 + .utmost_child = NULL,
52613 + .utmost_child_real_block = NULL,
52614 + .update = NULL,
52615 + .scan = NULL,
52616 + .convert = NULL
52617 + },
52618 + .s = {
52619 + .file = {
52620 + .write = reiser4_write_tail,
52621 + .read = reiser4_read_tail,
52622 + .readpage = readpage_tail,
52623 + .get_block = get_block_address_tail,
52624 + .append_key = append_key_tail,
52625 + .init_coord_extension =
52626 + init_coord_extension_tail
52627 + }
52628 + }
52629 + },
52630 + [CTAIL_ID] = {
52631 + .h = {
52632 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52633 + .id = CTAIL_ID,
52634 + .groups = (1 << UNIX_FILE_METADATA_ITEM_TYPE),
52635 + .pops = NULL,
52636 + .label = "ctail",
52637 + .desc = "cryptcompress tail item",
52638 + .linkage = {NULL, NULL}
52639 + },
52640 + .b = {
52641 + .max_key_inside = max_key_inside_tail,
52642 + .can_contain_key = can_contain_key_ctail,
52643 + .mergeable = mergeable_ctail,
52644 + .nr_units = nr_units_ctail,
52645 + .lookup = NULL,
52646 + .init = init_ctail,
52647 + .paste = paste_ctail,
52648 + .fast_paste = agree_to_fast_op,
52649 + .can_shift = can_shift_ctail,
52650 + .create_hook = create_hook_ctail,
52651 + .copy_units = copy_units_ctail,
52652 + .kill_hook = kill_hook_ctail,
52653 + .shift_hook = shift_hook_ctail,
52654 + .cut_units = cut_units_ctail,
52655 + .kill_units = kill_units_ctail,
52656 + .unit_key = unit_key_tail,
52657 + .max_unit_key = unit_key_tail,
52658 + .estimate = estimate_ctail,
52659 + .item_data_by_flow = NULL,
52660 +#if REISER4_DEBUG
52661 + .check = check_ctail
52662 +#endif
52663 + },
52664 + .f = {
52665 + .utmost_child = utmost_child_ctail,
52666 + /* FIXME-EDWARD: write this */
52667 + .utmost_child_real_block = NULL,
52668 + .update = NULL,
52669 + .scan = scan_ctail,
52670 + .convert = convert_ctail
52671 + },
52672 + .s = {
52673 + .file = {
52674 + .write = NULL,
52675 + .read = read_ctail,
52676 + .readpage = readpage_ctail,
52677 + .get_block = get_block_address_tail,
52678 + .append_key = append_key_ctail,
52679 + .init_coord_extension =
52680 + init_coord_extension_tail
52681 + }
52682 + }
52683 + },
52684 + [BLACK_BOX_ID] = {
52685 + .h = {
52686 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
52687 + .id = BLACK_BOX_ID,
52688 + .groups = (1 << OTHER_ITEM_TYPE),
52689 + .pops = NULL,
52690 + .label = "blackbox",
52691 + .desc = "black box item",
52692 + .linkage = {NULL, NULL}
52693 + },
52694 + .b = {
52695 + .max_key_inside = NULL,
52696 + .can_contain_key = NULL,
52697 + .mergeable = not_mergeable,
52698 + .nr_units = nr_units_single_unit,
52699 + /* no need for ->lookup method */
52700 + .lookup = NULL,
52701 + .init = NULL,
52702 + .paste = NULL,
52703 + .fast_paste = NULL,
52704 + .can_shift = NULL,
52705 + .copy_units = NULL,
52706 + .create_hook = NULL,
52707 + .kill_hook = NULL,
52708 + .shift_hook = NULL,
52709 + .cut_units = NULL,
52710 + .kill_units = NULL,
52711 + .unit_key = NULL,
52712 + .max_unit_key = NULL,
52713 + .estimate = NULL,
52714 + .item_data_by_flow = NULL,
52715 +#if REISER4_DEBUG
52716 + .check = NULL
52717 +#endif
52718 + }
52719 + }
52720 +};
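+
+/* How this table is consumed: callers do not index item_plugins[] directly
+ for item operations; they fetch the plugin for a coord and dispatch
+ through its balancing (.b) part, falling back when an optional method is
+ NULL, as unit_key_by_coord() above does. A minimal sketch (disabled): */
+#if 0
+static pos_in_node_t nr_units_of(const coord_t * coord)
+{
+ item_plugin *iplug = item_plugin_by_coord(coord);
+
+ /* ->nr_units is set for every plugin in this table, so no NULL check */
+ return iplug->b.nr_units(coord);
+}
+#endif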
52721 +
52722 +/* Make Linus happy.
52723 + Local variables:
52724 + c-indentation-style: "K&R"
52725 + mode-name: "LC"
52726 + c-basic-offset: 8
52727 + tab-width: 8
52728 + fill-column: 120
52729 + End:
52730 +*/
52731 diff --git a/fs/reiser4/plugin/item/item.h b/fs/reiser4/plugin/item/item.h
52732 new file mode 100644
52733 index 0000000..0822296
52734 --- /dev/null
52735 +++ b/fs/reiser4/plugin/item/item.h
52736 @@ -0,0 +1,400 @@
52737 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
52738 +
52739 +/* first read balance.c comments before reading this */
52740 +
52741 +/* An item_plugin implements all of the operations required for
52742 + balancing that are item specific. */
52743 +
52744 +/* an item plugin also implements other operations that are specific to that
52745 + item. These go into the item specific operations portion of the item
52746 + handler, and all of the item specific portions of the item handler are put
52747 + into a union. */
52748 +
52749 +#if !defined( __REISER4_ITEM_H__ )
52750 +#define __REISER4_ITEM_H__
52751 +
52752 +#include "../../forward.h"
52753 +#include "../plugin_header.h"
52754 +#include "../../dformat.h"
52755 +#include "../../seal.h"
52756 +#include "../../plugin/file/file.h"
52757 +
52758 +#include <linux/fs.h> /* for struct file, struct inode */
52759 +#include <linux/mm.h> /* for struct page */
52760 +#include <linux/dcache.h> /* for struct dentry */
52761 +
52762 +typedef enum {
52763 + STAT_DATA_ITEM_TYPE,
52764 + DIR_ENTRY_ITEM_TYPE,
52765 + INTERNAL_ITEM_TYPE,
52766 + UNIX_FILE_METADATA_ITEM_TYPE,
52767 + OTHER_ITEM_TYPE
52768 +} item_type_id;
52769 +
52770 +/* this is the part of each item plugin that all items are expected to
52771 + support or at least explicitly fail to support by setting the
52772 + pointer to null. */
52773 +typedef struct {
52774 + /* operations called by balancing
52775 +
52776 + It is interesting to consider that some of these item
52777 + operations could be given sources or targets that are not
52778 + really items in nodes. This could be ok/useful.
52779 +
52780 + */
52781 + /* maximal key that can _possibly_ be occupied by this item
52782 +
52783 + When inserting, and node ->lookup() method (called by
52784 + coord_by_key()) reaches an item after binary search,
52785 + the ->max_key_inside() item plugin method is used to determine
52786 + whether new item should pasted into existing item
52787 + (new_key<=max_key_inside()) or new item has to be created
52788 + (new_key>max_key_inside()).
52789 +
52790 + For items that occupy exactly one key (like stat-data)
52791 + this method should return this key. For items that can
52792 + grow indefinitely (extent, directory item) this should
52793 + return reiser4_max_key().
52794 +
52795 + For example extent with the key
52796 +
52797 + (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52798 +
52799 + ->max_key_inside is (LOCALITY,4,OBJID,0xffffffffffffffff).
52800 + */
52801 + reiser4_key *(*max_key_inside) (const coord_t *, reiser4_key *);
52802 +
52803 + /* true if item @coord can merge data at @key. */
52804 + int (*can_contain_key) (const coord_t *, const reiser4_key *,
52805 + const reiser4_item_data *);
52806 + /* mergeable() - check items for mergeability
52807 +
52808 + Optional method. Returns true if two items can be merged.
52809 +
52810 + */
52811 + int (*mergeable) (const coord_t *, const coord_t *);
52812 +
52813 + /* number of atomic things in an item.
52814 + NOTE FOR CONTRIBUTORS: use a generic method
52815 + nr_units_single_unit() for solid (atomic) items, as
52816 + tree operations use it as a criterion of solidness
52817 + (see is_solid_item macro) */
52818 + pos_in_node_t(*nr_units) (const coord_t *);
52819 +
52820 + /* search within the item for a unit, and return a pointer to
52821 + it. This can be used to calculate how many bytes to shrink an
52822 + item by: use pointer arithmetic and compare to the start of the
52823 + item body, provided the item's data are contiguous in the node.
52824 + If the item's data are not contiguous in the node, all sorts of
52825 + other things are probably going to break as well. */
52827 + lookup_result(*lookup) (const reiser4_key *, lookup_bias, coord_t *);
52828 + /* method called by node_plugin->create_item() to initialise a new
52829 + item */
52830 + int (*init) (coord_t * target, coord_t * from,
52831 + reiser4_item_data * data);
52832 + /* method called (e.g., by reiser4_resize_item()) to place new data
52833 + into item when it grows */
52834 + int (*paste) (coord_t *, reiser4_item_data *, carry_plugin_info *);
52835 + /* return true if paste into @coord is allowed to skip
52836 + carry. That is, if such a paste would not require any changes
52837 + at the parent level
52838 + */
52839 + int (*fast_paste) (const coord_t *);
52840 + /* how many units of @source, but not more than @want, can be
52841 + shifted into @target node. If pend == append - we try to
52842 + append the last item of @target with the first units of
52843 + @source. If pend == prepend - we try to "prepend" the first
52844 + item in @target with the last units of @source. @target node
52845 + has @free_space bytes of free space. The total size of those
52846 + units is returned via @size.
52847 +
52848 + @target is not NULL if shifting to the mergeable item and
52849 + NULL is new item will be created during shifting.
52850 + */
52851 + int (*can_shift) (unsigned free_space, coord_t *,
52852 + znode *, shift_direction, unsigned *size,
52853 + unsigned want);
52854 +
52855 + /* starting from the @from-th unit of item @source, append or
52856 + prepend @count units to @target. @target has already been
52857 + expanded by @free_space bytes, which must be exactly what is
52858 + needed for those units in @target. If @where_is_free_space
52859 + == SHIFT_LEFT, the free space is at the end of the @target
52860 + item; otherwise it is at the beginning of it. */
52861 + void (*copy_units) (coord_t *, coord_t *,
52862 + unsigned from, unsigned count,
52863 + shift_direction where_is_free_space,
52864 + unsigned free_space);
52865 +
52866 + int (*create_hook) (const coord_t *, void *);
52867 + /* do whatever is necessary to do when @count units starting
52868 + from @from-th one are removed from the tree */
52869 + /* FIXME-VS: this used to be here for, in particular,
52870 + extents and items of internal type, to free the blocks they
52871 + point to at the same time as removing the items from the
52872 + tree. Problems start, however, when dealloc_block fails for
52873 + some reason. The item gets removed, but the blocks it
52874 + pointed to are not freed. It is not clear how to fix this
52875 + for items of internal type, because the need to remove an
52876 + internal item may appear in the middle of balancing, and
52877 + there is no way to undo the changes made. OTOH, if the space
52878 + allocator involves balancing to perform dealloc_block, this
52879 + will probably break balancing due to deadlock issues
52880 + */
52881 + int (*kill_hook) (const coord_t *, pos_in_node_t from,
52882 + pos_in_node_t count, struct carry_kill_data *);
52883 + int (*shift_hook) (const coord_t *, unsigned from, unsigned count,
52884 + znode * _node);
52885 +
52886 + /* unit @*from contains @from_key, unit @*to contains @to_key. Cut all keys between @from_key and @to_key,
52887 + including the boundaries. When units are cut from the item beginning, move the freed space to the head of the
52888 + item. When units are cut from the item end, move the freed space to the item end. When units are cut from the
52889 + middle of the item, move the freed space to the item head. Return the amount of space that was freed. Save the
52890 + smallest removed key in @smallest_removed if it is not NULL, and the new first item key in @new_first_key if it is not NULL
52891 + */
52892 + int (*cut_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52893 + struct carry_cut_data *,
52894 + reiser4_key * smallest_removed,
52895 + reiser4_key * new_first_key);
52896 +
52897 + /* like cut_units, except that these units are removed from the
52898 + tree, not only from a node */
52899 + int (*kill_units) (coord_t *, pos_in_node_t from, pos_in_node_t to,
52900 + struct carry_kill_data *,
52901 + reiser4_key * smallest_removed,
52902 + reiser4_key * new_first);
52903 +
52904 + /* if @key_of_coord == 1, the key of @coord is returned;
52905 + otherwise the key of the unit is returned. If @coord is not
52906 + set to a certain unit, ERR_PTR(-ENOENT) is returned */
52907 + reiser4_key *(*unit_key) (const coord_t *, reiser4_key *);
52908 + reiser4_key *(*max_unit_key) (const coord_t *, reiser4_key *);
52909 + /* estimate how much space is needed to paste @data into the item
52910 + at @coord. If @coord==0, estimate insertion; otherwise estimate
52911 + pasting
52912 + */
52913 + int (*estimate) (const coord_t *, const reiser4_item_data *);
52914 +
52915 + /* converts flow @f to item data. @coord == 0 on insert */
52916 + int (*item_data_by_flow) (const coord_t *, const flow_t *,
52917 + reiser4_item_data *);
52918 +
52919 + /*void (*show) (struct seq_file *, coord_t *); */
52920 +
52921 +#if REISER4_DEBUG
52922 + /* used for debugging; every item should provide here the most
52923 + complete consistency check of the item that its author can
52924 + construct */
52925 + int (*check) (const coord_t *, const char **error);
52926 +#endif
52927 +
52928 +} balance_ops;
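/*
 * A minimal standalone sketch (not reiser4 code) of the paste-vs-insert
 * decision that ->max_key_inside() supports, as described above.  Keys
 * are simplified to a single 64-bit integer; demo_item, demo_item_ops
 * and both callbacks are hypothetical stand-ins for the plugin machinery.
 */
#include <stdint.h>

struct demo_item;

struct demo_item_ops {
	/* largest key that could possibly live inside this item */
	uint64_t (*max_key_inside)(const struct demo_item *item);
};

struct demo_item {
	uint64_t first_key;		/* key of the first unit */
	const struct demo_item_ops *ops;
};

/* stat-data-like item: occupies exactly one key */
static uint64_t single_key_max(const struct demo_item *item)
{
	return item->first_key;
}

/* extent-like item: can grow indefinitely within its object */
static uint64_t unbounded_max(const struct demo_item *item)
{
	(void)item;
	return UINT64_MAX;	/* analogue of reiser4_max_key() */
}

static const struct demo_item_ops demo_sd_ops = { single_key_max };
static const struct demo_item_ops demo_extent_ops = { unbounded_max };

/* after lookup lands on @item: paste into it, or create a new item? */
static int demo_should_paste(const struct demo_item *item, uint64_t new_key)
{
	return new_key <= item->ops->max_key_inside(item);
}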
52929 +
52930 +typedef struct {
52931 + /* return the right or left child of @coord, only if it is in memory */
52932 + int (*utmost_child) (const coord_t *, sideof side, jnode ** child);
52933 +
52934 + /* return whether the right or left child of @coord has a non-fake
52935 + block number. */
52936 + int (*utmost_child_real_block) (const coord_t *, sideof side,
52937 + reiser4_block_nr *);
52938 + /* relocate child at @coord to the @block */
52939 + void (*update) (const coord_t *, const reiser4_block_nr *);
52940 + /* count unformatted nodes per item for the leaf relocation policy, etc. */
52941 + int (*scan) (flush_scan * scan);
52942 + /* convert item by flush */
52943 + int (*convert) (flush_pos_t * pos);
52944 + /* backward mapping from jnode offset to a key. */
52945 + int (*key_by_offset) (struct inode *, loff_t, reiser4_key *);
52946 +} flush_ops;
52947 +
52948 +/* operations specific to the directory item */
52949 +typedef struct {
52950 + /* extract stat-data key from directory entry at @coord and place it
52951 + into @key. */
52952 + int (*extract_key) (const coord_t *, reiser4_key * key);
52953 + /* update object key in item. */
52954 + int (*update_key) (const coord_t *, const reiser4_key *, lock_handle *);
52955 + /* extract name from directory entry at @coord and return it */
52956 + char *(*extract_name) (const coord_t *, char *buf);
52957 + /* extract file type (DT_* stuff) from directory entry at @coord and
52958 + return it */
52959 + unsigned (*extract_file_type) (const coord_t *);
52960 + int (*add_entry) (struct inode * dir,
52961 + coord_t *, lock_handle *,
52962 + const struct dentry * name,
52963 + reiser4_dir_entry_desc * entry);
52964 + int (*rem_entry) (struct inode * dir, const struct qstr * name,
52965 + coord_t *, lock_handle *,
52966 + reiser4_dir_entry_desc * entry);
52967 + int (*max_name_len) (const struct inode * dir);
52968 +} dir_entry_ops;
52969 +
52970 +/* operations specific to the items that regular (unix) file metadata are built of */
52971 +typedef struct {
52972 + int (*write) (struct file *, const char __user *, size_t, loff_t *pos);
52973 + int (*read) (struct file *, flow_t *, hint_t *);
52974 + int (*readpage) (void *, struct page *);
52975 + int (*get_block) (const coord_t *, sector_t, sector_t *);
52976 + /*
52977 + * key of the first byte that is not addressed by the item @coord is
52978 + * set to.
52979 + * For example, for extent item with the key
52980 + *
52981 + * (LOCALITY,4,OBJID,STARTING-OFFSET), and length BLK blocks,
52982 + *
52983 + * ->append_key is
52984 + *
52985 + * (LOCALITY,4,OBJID,STARTING-OFFSET + BLK * block_size)
52986 + */
52987 + reiser4_key *(*append_key) (const coord_t *, reiser4_key *);
52988 +
52989 + void (*init_coord_extension) (uf_coord_t *, loff_t);
52990 +} file_ops;
52991 +
52992 +/* operations specific to items of stat data type */
52993 +typedef struct {
52994 + int (*init_inode) (struct inode * inode, char *sd, int len);
52995 + int (*save_len) (struct inode * inode);
52996 + int (*save) (struct inode * inode, char **area);
52997 +} sd_ops;
52998 +
52999 +/* operations specific to internal item */
53000 +typedef struct {
53001 + /* all a tree traversal wants to know from an internal item is
53002 + where to go next. */
53003 + void (*down_link) (const coord_t * coord,
53004 + const reiser4_key * key, reiser4_block_nr * block);
53005 + /* check that given internal item contains given pointer. */
53006 + int (*has_pointer_to) (const coord_t * coord,
53007 + const reiser4_block_nr * block);
53008 +} internal_item_ops;
53009 +
53010 +struct item_plugin {
53011 + /* generic fields */
53012 + plugin_header h;
53013 +
53014 + /* methods common for all item types */
53015 + balance_ops b;
53016 + /* methods used during flush */
53017 + flush_ops f;
53018 +
53019 + /* methods specific to particular type of item */
53020 + union {
53021 + dir_entry_ops dir;
53022 + file_ops file;
53023 + sd_ops sd;
53024 + internal_item_ops internal;
53025 + } s;
53026 +
53027 +};
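/*
 * A minimal standalone sketch (not reiser4 code) of the shape of
 * struct item_plugin above: common ops shared by every item type plus
 * a union of type-specific ops selected by a type tag.  All names here
 * are hypothetical.
 */
#include <stdio.h>

enum demo_item_type { DEMO_DIR_ENTRY, DEMO_FILE_BODY };

struct demo_common_ops {
	int (*nr_units)(void);
};

struct demo_dir_ops {
	const char *(*extract_name)(void);
};

struct demo_file_ops {
	long (*append_offset)(void);
};

struct demo_plugin {
	enum demo_item_type type;
	struct demo_common_ops b;	/* like item_plugin.b */
	union {				/* like item_plugin.s */
		struct demo_dir_ops dir;
		struct demo_file_ops file;
	} s;
};

static void demo_use(const struct demo_plugin *p)
{
	/* common methods may always be called... */
	printf("units: %d\n", p->b.nr_units());
	/* ...union members only after checking the type tag */
	if (p->type == DEMO_DIR_ENTRY)
		printf("name: %s\n", p->s.dir.extract_name());
}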
53028 +
53029 +#define is_solid_item(iplug) ((iplug)->b.nr_units == nr_units_single_unit)
53030 +
53031 +static inline item_id item_id_by_plugin(item_plugin * plugin)
53032 +{
53033 + return plugin->h.id;
53034 +}
53035 +
53036 +static inline char get_iplugid(item_plugin * iplug)
53037 +{
53038 + assert("nikita-2838", iplug != NULL);
53039 + assert("nikita-2839", iplug->h.id < 0xff);
53040 + return (char)item_id_by_plugin(iplug);
53041 +}
53042 +
53043 +extern unsigned long znode_times_locked(const znode * z);
53044 +
53045 +static inline void coord_set_iplug(coord_t * coord, item_plugin * iplug)
53046 +{
53047 + assert("nikita-2837", coord != NULL);
53048 + assert("nikita-2838", iplug != NULL);
53049 + coord->iplugid = get_iplugid(iplug);
53050 + ON_DEBUG(coord->plug_v = znode_times_locked(coord->node));
53051 +}
53052 +
53053 +static inline item_plugin *coord_iplug(const coord_t * coord)
53054 +{
53055 + assert("nikita-2833", coord != NULL);
53056 + assert("nikita-2834", coord->iplugid != INVALID_PLUGID);
53057 + assert("nikita-3549", coord->plug_v == znode_times_locked(coord->node));
53058 + return (item_plugin *) plugin_by_id(REISER4_ITEM_PLUGIN_TYPE,
53059 + coord->iplugid);
53060 +}
53061 +
53062 +extern int item_can_contain_key(const coord_t * item, const reiser4_key * key,
53063 + const reiser4_item_data *);
53064 +extern int are_items_mergeable(const coord_t * i1, const coord_t * i2);
53065 +extern int item_is_extent(const coord_t *);
53066 +extern int item_is_tail(const coord_t *);
53067 +extern int item_is_statdata(const coord_t * item);
53068 +extern int item_is_ctail(const coord_t *);
53069 +
53070 +extern pos_in_node_t item_length_by_coord(const coord_t * coord);
53071 +extern pos_in_node_t nr_units_single_unit(const coord_t * coord);
53072 +extern item_id item_id_by_coord(const coord_t * coord /* coord to query */ );
53073 +extern reiser4_key *item_key_by_coord(const coord_t * coord, reiser4_key * key);
53074 +extern reiser4_key *max_item_key_by_coord(const coord_t *, reiser4_key *);
53075 +extern reiser4_key *unit_key_by_coord(const coord_t * coord, reiser4_key * key);
53076 +extern reiser4_key *max_unit_key_by_coord(const coord_t * coord,
53077 + reiser4_key * key);
53078 +extern void obtain_item_plugin(const coord_t * coord);
53079 +
53080 +#if defined(REISER4_DEBUG)
53081 +extern int znode_is_loaded(const znode * node);
53082 +#endif
53083 +
53084 +/* return plugin of item at @coord */
53085 +static inline item_plugin *item_plugin_by_coord(const coord_t *
53086 + coord /* coord to query */ )
53087 +{
53088 + assert("nikita-330", coord != NULL);
53089 + assert("nikita-331", coord->node != NULL);
53090 + assert("nikita-332", znode_is_loaded(coord->node));
53091 +
53092 + if (unlikely(!coord_is_iplug_set(coord)))
53093 + obtain_item_plugin(coord);
53094 + return coord_iplug(coord);
53095 +}
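/*
 * Typical call pattern for the accessor above (a sketch, assuming a
 * coord that already points into a loaded and locked znode):
 *
 *	item_plugin *iplug = item_plugin_by_coord(coord);
 *	pos_in_node_t n = iplug->b.nr_units(coord);
 *
 * The plugin id is cached in coord->iplugid on first use, so repeated
 * lookups through the same coord are cheap.
 */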
53096 +
53097 +/* this returns true if item is of internal type */
53098 +static inline int item_is_internal(const coord_t * item)
53099 +{
53100 + assert("vs-483", coord_is_existing_item(item));
53101 + return plugin_of_group(item_plugin_by_coord(item), INTERNAL_ITEM_TYPE);
53102 +}
53103 +
53104 +extern void item_body_by_coord_hard(coord_t * coord);
53105 +extern void *item_body_by_coord_easy(const coord_t * coord);
53106 +#if REISER4_DEBUG
53107 +extern int item_body_is_valid(const coord_t * coord);
53108 +#endif
53109 +
53110 +/* return pointer to item body */
53111 +static inline void *item_body_by_coord(const coord_t *
53112 + coord /* coord to query */ )
53113 +{
53114 + assert("nikita-324", coord != NULL);
53115 + assert("nikita-325", coord->node != NULL);
53116 + assert("nikita-326", znode_is_loaded(coord->node));
53117 +
53118 + if (coord->offset == INVALID_OFFSET)
53119 + item_body_by_coord_hard((coord_t *) coord);
53120 + assert("nikita-3201", item_body_is_valid(coord));
53121 + assert("nikita-3550", coord->body_v == znode_times_locked(coord->node));
53122 + return item_body_by_coord_easy(coord);
53123 +}
53124 +
53125 +/* __REISER4_ITEM_H__ */
53126 +#endif
53127 +/* Make Linus happy.
53128 + Local variables:
53129 + c-indentation-style: "K&R"
53130 + mode-name: "LC"
53131 + c-basic-offset: 8
53132 + tab-width: 8
53133 + fill-column: 120
53134 + scroll-step: 1
53135 + End:
53136 +*/
53137 diff --git a/fs/reiser4/plugin/item/sde.c b/fs/reiser4/plugin/item/sde.c
53138 new file mode 100644
53139 index 0000000..27f2400
53140 --- /dev/null
53141 +++ b/fs/reiser4/plugin/item/sde.c
53142 @@ -0,0 +1,190 @@
53143 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53144 +
53145 +/* Directory entry implementation */
53146 +#include "../../forward.h"
53147 +#include "../../debug.h"
53148 +#include "../../dformat.h"
53149 +#include "../../kassign.h"
53150 +#include "../../coord.h"
53151 +#include "sde.h"
53152 +#include "item.h"
53153 +#include "../plugin.h"
53154 +#include "../../znode.h"
53155 +#include "../../carry.h"
53156 +#include "../../tree.h"
53157 +#include "../../inode.h"
53158 +
53159 +#include <linux/fs.h> /* for struct inode */
53160 +#include <linux/dcache.h> /* for struct dentry */
53161 +#include <linux/quotaops.h>
53162 +
53163 +/* ->extract_key() method of simple directory item plugin. */
53164 +int extract_key_de(const coord_t * coord /* coord of item */ ,
53165 + reiser4_key * key /* resulting key */ )
53166 +{
53167 + directory_entry_format *dent;
53168 +
53169 + assert("nikita-1458", coord != NULL);
53170 + assert("nikita-1459", key != NULL);
53171 +
53172 + dent = (directory_entry_format *) item_body_by_coord(coord);
53173 + assert("nikita-1158", item_length_by_coord(coord) >= (int)sizeof *dent);
53174 + return extract_key_from_id(&dent->id, key);
53175 +}
53176 +
53177 +int
53178 +update_key_de(const coord_t * coord, const reiser4_key * key,
53179 + lock_handle * lh UNUSED_ARG)
53180 +{
53181 + directory_entry_format *dent;
53182 + obj_key_id obj_id;
53183 + int result;
53184 +
53185 + assert("nikita-2342", coord != NULL);
53186 + assert("nikita-2343", key != NULL);
53187 +
53188 + dent = (directory_entry_format *) item_body_by_coord(coord);
53189 + result = build_obj_key_id(key, &obj_id);
53190 + if (result == 0) {
53191 + dent->id = obj_id;
53192 + znode_make_dirty(coord->node);
53193 + }
53194 + return 0;
53195 +}
53196 +
53197 +char *extract_dent_name(const coord_t * coord, directory_entry_format * dent,
53198 + char *buf)
53199 +{
53200 + reiser4_key key;
53201 +
53202 + unit_key_by_coord(coord, &key);
53203 + if (get_key_type(&key) != KEY_FILE_NAME_MINOR)
53204 + reiser4_print_address("oops", znode_get_block(coord->node));
53205 + if (!is_longname_key(&key)) {
53206 + if (is_dot_key(&key))
53207 + return (char *)".";
53208 + else
53209 + return extract_name_from_key(&key, buf);
53210 + } else
53211 + return (char *)dent->name;
53212 +}
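/*
 * A standalone sketch (not reiser4 code) of the naming scheme behind
 * extract_dent_name() above: "." and names short enough to be packed
 * into the key are reconstructed from the key; longer names are read
 * from the entry body.  The 15-byte limit and the layout here are
 * hypothetical simplifications.
 */
#include <string.h>

#define DEMO_INLINE_LEN 15

struct demo_dent {
	char key_name[DEMO_INLINE_LEN + 1];	/* stand-in for key bytes */
	const char *body_name;			/* stand-in for dent->name */
	int is_dot;
};

static const char *demo_extract_name(const struct demo_dent *d, char *buf)
{
	if (d->is_dot)
		return ".";
	if (d->key_name[0] != '\0') {	/* short name lives in the key */
		strcpy(buf, d->key_name);
		return buf;
	}
	return d->body_name;		/* long name lives in the body */
}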
53213 +
53214 +/* ->extract_name() method of simple directory item plugin. */
53215 +char *extract_name_de(const coord_t * coord /* coord of item */ , char *buf)
53216 +{
53217 + directory_entry_format *dent;
53218 +
53219 + assert("nikita-1460", coord != NULL);
53220 +
53221 + dent = (directory_entry_format *) item_body_by_coord(coord);
53222 + return extract_dent_name(coord, dent, buf);
53223 +}
53224 +
53225 +/* ->extract_file_type() method of simple directory item plugin. */
53226 +unsigned extract_file_type_de(const coord_t * coord UNUSED_ARG /* coord of
53227 + * item */ )
53228 +{
53229 + assert("nikita-1764", coord != NULL);
53230 + /* we don't store file type in the directory entry yet.
53231 +
53232 + But see comments at kassign.h:obj_key_id
53233 + */
53234 + return DT_UNKNOWN;
53235 +}
53236 +
53237 +int add_entry_de(struct inode *dir /* directory of item */ ,
53238 + coord_t * coord /* coord of item */ ,
53239 + lock_handle * lh /* insertion lock handle */ ,
53240 + const struct dentry *de /* name to add */ ,
53241 + reiser4_dir_entry_desc * entry /* parameters of new directory
53242 + * entry */ )
53243 +{
53244 + reiser4_item_data data;
53245 + directory_entry_format *dent;
53246 + int result;
53247 + const char *name;
53248 + int len;
53249 + int longname;
53250 +
53251 + name = de->d_name.name;
53252 + len = de->d_name.len;
53253 + assert("nikita-1163", strlen(name) == len);
53254 +
53255 + longname = is_longname(name, len);
53256 +
53257 + data.length = sizeof *dent;
53258 + if (longname)
53259 + data.length += len + 1;
53260 + data.data = NULL;
53261 + data.user = 0;
53262 + data.iplug = item_plugin_by_id(SIMPLE_DIR_ENTRY_ID);
53263 +
53264 + /* NOTE-NIKITA quota plugin */
53265 + if (DQUOT_ALLOC_SPACE_NODIRTY(dir, data.length))
53266 + return -EDQUOT;
53267 +
53268 + result = insert_by_coord(coord, &data, &entry->key, lh, 0 /*flags */ );
53269 + if (result != 0)
53270 + return result;
53271 +
53272 + dent = (directory_entry_format *) item_body_by_coord(coord);
53273 + build_inode_key_id(entry->obj, &dent->id);
53274 + if (longname) {
53275 + memcpy(dent->name, name, len);
53276 + put_unaligned(0, &dent->name[len]);
53277 + }
53278 + return 0;
53279 +}
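/*
 * A sketch of the length computation in add_entry_de() above.  Short
 * names are stored in the key, so the entry body holds only the fixed
 * header; long names append the name plus a terminating NUL.  The
 * 16-byte header used in the example is an assumed value, not the real
 * sizeof(directory_entry_format).
 */
#include <stddef.h>

static size_t demo_entry_length(size_t header, size_t name_len, int longname)
{
	return longname ? header + name_len + 1 : header;
}

/* e.g. header 16: name "a" -> 16 bytes; a 40-char long name -> 57 bytes */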
53280 +
53281 +int rem_entry_de(struct inode *dir /* directory of item */ ,
53282 + const struct qstr *name UNUSED_ARG,
53283 + coord_t * coord /* coord of item */ ,
53284 + lock_handle * lh UNUSED_ARG /* lock handle for
53285 + * removal */ ,
53286 + reiser4_dir_entry_desc * entry UNUSED_ARG /* parameters of
53287 + * directory entry
53288 + * being removed */ )
53289 +{
53290 + coord_t shadow;
53291 + int result;
53292 + int length;
53293 +
53294 + length = item_length_by_coord(coord);
53295 + if (inode_get_bytes(dir) < length) {
53296 + warning("nikita-2627", "Dir is broke: %llu: %llu",
53297 + (unsigned long long)get_inode_oid(dir),
53298 + inode_get_bytes(dir));
53299 +
53300 + return RETERR(-EIO);
53301 + }
53302 +
53303 + /* cut_node() is supposed to take pointers to _different_
53304 + coords, because it will modify them without regard to
53305 + possible aliasing. To work around this, create a temporary
53306 + copy of @coord.
53307 + */
53308 + coord_dup(&shadow, coord);
53309 + result =
53310 + kill_node_content(coord, &shadow, NULL, NULL, NULL, NULL, NULL, 0);
53311 + if (result == 0) {
53312 + /* NOTE-NIKITA quota plugin */
53313 + DQUOT_FREE_SPACE_NODIRTY(dir, length);
53314 + }
53315 + return result;
53316 +}
53317 +
53318 +int max_name_len_de(const struct inode *dir)
53319 +{
53320 + return reiser4_tree_by_inode(dir)->nplug->max_item_size() -
53321 + sizeof(directory_entry_format) - 2;
53322 +}
53323 +
53324 +/* Make Linus happy.
53325 + Local variables:
53326 + c-indentation-style: "K&R"
53327 + mode-name: "LC"
53328 + c-basic-offset: 8
53329 + tab-width: 8
53330 + fill-column: 120
53331 + End:
53332 +*/
53333 diff --git a/fs/reiser4/plugin/item/sde.h b/fs/reiser4/plugin/item/sde.h
53334 new file mode 100644
53335 index 0000000..f26762a
53336 --- /dev/null
53337 +++ b/fs/reiser4/plugin/item/sde.h
53338 @@ -0,0 +1,66 @@
53339 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53340 +
53341 +/* Directory entry. */
53342 +
53343 +#if !defined( __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ )
53344 +#define __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__
53345 +
53346 +#include "../../forward.h"
53347 +#include "../../dformat.h"
53348 +#include "../../kassign.h"
53349 +#include "../../key.h"
53350 +
53351 +#include <linux/fs.h>
53352 +#include <linux/dcache.h> /* for struct dentry */
53353 +
53354 +typedef struct directory_entry_format {
53355 + /* key of the object's stat-data. It's not necessary to store the
53356 + whole key here, because it's always a stat-data key, so the minor
53357 + packing locality and the offset can be omitted. But this
53358 + relies on a particular key allocation scheme for stat-data, so,
53359 + for extensibility's sake, the whole key can be stored here.
53360 +
53361 + We store the key as an array of bytes, because we don't want
53362 + 8-byte alignment of dir entries.
53363 + */
53364 + obj_key_id id;
53365 + /* file name. Null terminated string. */
53366 + d8 name[0];
53367 +} directory_entry_format;
53368 +
53369 +void print_de(const char *prefix, coord_t * coord);
53370 +int extract_key_de(const coord_t * coord, reiser4_key * key);
53371 +int update_key_de(const coord_t * coord, const reiser4_key * key,
53372 + lock_handle * lh);
53373 +char *extract_name_de(const coord_t * coord, char *buf);
53374 +unsigned extract_file_type_de(const coord_t * coord);
53375 +int add_entry_de(struct inode *dir, coord_t * coord,
53376 + lock_handle * lh, const struct dentry *name,
53377 + reiser4_dir_entry_desc * entry);
53378 +int rem_entry_de(struct inode *dir, const struct qstr *name, coord_t * coord,
53379 + lock_handle * lh, reiser4_dir_entry_desc * entry);
53380 +int max_name_len_de(const struct inode *dir);
53381 +
53382 +int de_rem_and_shrink(struct inode *dir, coord_t * coord, int length);
53383 +
53384 +char *extract_dent_name(const coord_t * coord,
53385 + directory_entry_format * dent, char *buf);
53386 +
53387 +#if REISER4_LARGE_KEY
53388 +#define DE_NAME_BUF_LEN (24)
53389 +#else
53390 +#define DE_NAME_BUF_LEN (16)
53391 +#endif
53392 +
53393 +/* __FS_REISER4_PLUGIN_DIRECTORY_ENTRY_H__ */
53394 +#endif
53395 +
53396 +/* Make Linus happy.
53397 + Local variables:
53398 + c-indentation-style: "K&R"
53399 + mode-name: "LC"
53400 + c-basic-offset: 8
53401 + tab-width: 8
53402 + fill-column: 120
53403 + End:
53404 +*/
53405 diff --git a/fs/reiser4/plugin/item/static_stat.c b/fs/reiser4/plugin/item/static_stat.c
53406 new file mode 100644
53407 index 0000000..c38e44a
53408 --- /dev/null
53409 +++ b/fs/reiser4/plugin/item/static_stat.c
53410 @@ -0,0 +1,1106 @@
53411 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
53412 +
53413 +/* stat data manipulation. */
53414 +
53415 +#include "../../forward.h"
53416 +#include "../../super.h"
53417 +#include "../../vfs_ops.h"
53418 +#include "../../inode.h"
53419 +#include "../../debug.h"
53420 +#include "../../dformat.h"
53421 +#include "../object.h"
53422 +#include "../plugin.h"
53423 +#include "../plugin_header.h"
53424 +#include "static_stat.h"
53425 +#include "item.h"
53426 +
53427 +#include <linux/types.h>
53428 +#include <linux/fs.h>
53429 +
53430 +/* see static_stat.h for explanation */
53431 +
53432 +/* helper function used while we are dumping/loading inode/plugin state
53433 + to/from the stat-data. */
53434 +
53435 +static void move_on(int *length /* space remaining in stat-data */ ,
53436 + char **area /* current coord in stat data */ ,
53437 + int size_of /* how many bytes to move forward */ )
53438 +{
53439 + assert("nikita-615", length != NULL);
53440 + assert("nikita-616", area != NULL);
53441 +
53442 + *length -= size_of;
53443 + *area += size_of;
53444 +
53445 + assert("nikita-617", *length >= 0);
53446 +}
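/*
 * A standalone sketch (not reiser4 code) of how move_on() is used: a
 * (position, bytes-left) cursor walks the packed stat-data body, each
 * extension consuming exactly its own bytes.  Types are hypothetical.
 */
#include <assert.h>

struct demo_cursor {
	char *pos;	/* current position in the buffer */
	int left;	/* bytes remaining */
};

static void demo_move_on(struct demo_cursor *c, int nbytes)
{
	c->left -= nbytes;
	c->pos += nbytes;
	assert(c->left >= 0);	/* callers must check sizes first */
}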
53447 +
53448 +/* helper function used while loading inode/plugin state from stat-data.
53449 + Complain if there is less space in stat-data than was expected.
53450 + Can only happen on disk corruption. */
53451 +static int not_enough_space(struct inode *inode /* object being processed */ ,
53452 + const char *where /* error message */ )
53453 +{
53454 + assert("nikita-618", inode != NULL);
53455 +
53456 + warning("nikita-619", "Not enough space in %llu while loading %s",
53457 + (unsigned long long)get_inode_oid(inode), where);
53458 +
53459 + return RETERR(-EINVAL);
53460 +}
53461 +
53462 +/* helper function used while loading inode/plugin state from
53463 + stat-data. Call it if invalid plugin id was found. */
53464 +static int unknown_plugin(reiser4_plugin_id id /* invalid id */ ,
53465 + struct inode *inode /* object being processed */ )
53466 +{
53467 + warning("nikita-620", "Unknown plugin %i in %llu",
53468 + id, (unsigned long long)get_inode_oid(inode));
53469 +
53470 + return RETERR(-EINVAL);
53471 +}
53472 +
53473 +/* this is installed as ->init_inode() method of
53474 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c).
53475 + Copies data from on-disk stat-data format into inode.
53476 + Handles stat-data extensions. */
53477 +/* was sd_load */
53478 +int init_inode_static_sd(struct inode *inode /* object being processed */ ,
53479 + char *sd /* stat-data body */ ,
53480 + int len /* length of stat-data */ )
53481 +{
53482 + int result;
53483 + int bit;
53484 + int chunk;
53485 + __u16 mask;
53486 + __u64 bigmask;
53487 + reiser4_stat_data_base *sd_base;
53488 + reiser4_inode *state;
53489 +
53490 + assert("nikita-625", inode != NULL);
53491 + assert("nikita-626", sd != NULL);
53492 +
53493 + result = 0;
53494 + sd_base = (reiser4_stat_data_base *) sd;
53495 + state = reiser4_inode_data(inode);
53496 + mask = le16_to_cpu(get_unaligned(&sd_base->extmask));
53497 + bigmask = mask;
53498 + reiser4_inode_set_flag(inode, REISER4_SDLEN_KNOWN);
53499 +
53500 + move_on(&len, &sd, sizeof *sd_base);
53501 + for (bit = 0, chunk = 0;
53502 + mask != 0 || bit <= LAST_IMPORTANT_SD_EXTENSION;
53503 + ++bit, mask >>= 1) {
53504 + if (((bit + 1) % 16) != 0) {
53505 + /* handle extension */
53506 + sd_ext_plugin *sdplug;
53507 +
53508 + if (bit >= LAST_SD_EXTENSION) {
53509 + warning("vpf-1904",
53510 + "No such extension %i in inode %llu",
53511 + bit,
53512 + (unsigned long long)
53513 + get_inode_oid(inode));
53514 +
53515 + result = RETERR(-EINVAL);
53516 + break;
53517 + }
53518 +
53519 + sdplug = sd_ext_plugin_by_id(bit);
53520 + if (sdplug == NULL) {
53521 + warning("nikita-627",
53522 + "No such extension %i in inode %llu",
53523 + bit,
53524 + (unsigned long long)
53525 + get_inode_oid(inode));
53526 +
53527 + result = RETERR(-EINVAL);
53528 + break;
53529 + }
53530 + if (mask & 1) {
53531 + assert("nikita-628", sdplug->present);
53532 + /* alignment is not supported in node layout
53533 + plugin yet.
53534 + result = align( inode, &len, &sd,
53535 + sdplug -> alignment );
53536 + if( result != 0 )
53537 + return result; */
53538 + result = sdplug->present(inode, &sd, &len);
53539 + } else if (sdplug->absent != NULL)
53540 + result = sdplug->absent(inode);
53541 + if (result)
53542 + break;
53543 + /* else, we are looking at the last bit of a 16-bit
53544 + portion of the bitmask */
53545 + } else if (mask & 1) {
53546 + /* next portion of bitmask */
53547 + if (len < (int)sizeof(d16)) {
53548 + warning("nikita-629",
53549 + "No space for bitmap in inode %llu",
53550 + (unsigned long long)
53551 + get_inode_oid(inode));
53552 +
53553 + result = RETERR(-EINVAL);
53554 + break;
53555 + }
53556 + mask = le16_to_cpu(get_unaligned((d16 *)sd));
53557 + bigmask <<= 16;
53558 + bigmask |= mask;
53559 + move_on(&len, &sd, sizeof(d16));
53560 + ++chunk;
53561 + if (chunk == 3) {
53562 + if (!(mask & 0x8000)) {
53563 + /* clear last bit */
53564 + mask &= ~0x8000;
53565 + continue;
53566 + }
53567 + /* too much */
53568 + warning("nikita-630",
53569 + "Too many extensions in %llu",
53570 + (unsigned long long)
53571 + get_inode_oid(inode));
53572 +
53573 + result = RETERR(-EINVAL);
53574 + break;
53575 + }
53576 + } else
53577 + /* bitmask exhausted */
53578 + break;
53579 + }
53580 + state->extmask = bigmask;
53581 + if (len - (bit / 16 * sizeof(d16)) > 0) {
53582 + /* alignment in save_len_static_sd() is taken into account
53583 + -edward */
53584 + warning("nikita-631", "unused space in inode %llu",
53585 + (unsigned long long)get_inode_oid(inode));
53586 + }
53587 +
53588 + return result;
53589 +}
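/*
 * A standalone sketch (not reiser4 code) of the chained-mask scheme
 * walked by init_inode_static_sd() above: extension presence is
 * described by 16-bit words; the top bit of each word says that one
 * more word follows in the stat-data body.  Word order and the
 * duplicated continuation bit are simplified away here, and at most
 * four words fit in the 64-bit accumulator.
 */
#include <stdint.h>

#define DEMO_CONT 0x8000u

/* returns the number of words consumed, or -1 on a truncated chain */
static int demo_read_mask_chain(const uint16_t *words, int nwords,
				uint64_t *out)
{
	uint64_t acc = 0;
	int i = 0;

	do {
		if (i >= nwords)
			return -1;	/* corrupt: chain ran off the end */
		acc = (acc << 16) | words[i];
	} while (words[i++] & DEMO_CONT);

	*out = acc;
	return i;
}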
53590 +
53591 +/* estimates size of stat-data required to store inode.
53592 + Installed as ->save_len() method of
53593 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53594 +/* was sd_len */
53595 +int save_len_static_sd(struct inode *inode /* object being processed */ )
53596 +{
53597 + unsigned int result;
53598 + __u64 mask;
53599 + int bit;
53600 +
53601 + assert("nikita-632", inode != NULL);
53602 +
53603 + result = sizeof(reiser4_stat_data_base);
53604 + mask = reiser4_inode_data(inode)->extmask;
53605 + for (bit = 0; mask != 0; ++bit, mask >>= 1) {
53606 + if (mask & 1) {
53607 + sd_ext_plugin *sdplug;
53608 +
53609 + sdplug = sd_ext_plugin_by_id(bit);
53610 + assert("nikita-633", sdplug != NULL);
53611 + /* no alignment support
53612 + result +=
53613 + round_up( result, sdplug -> alignment ) - result; */
53614 + result += sdplug->save_len(inode);
53615 + }
53616 + }
53617 + result += bit / 16 * sizeof(d16);
53618 + return result;
53619 +}
53620 +
53621 +/* saves inode into stat-data.
53622 + Installed as ->save() method of
53623 + item_plugins[ STATIC_STAT_DATA_IT ] (fs/reiser4/plugin/item/item.c). */
53624 +/* was sd_save */
53625 +int save_static_sd(struct inode *inode /* object being processed */ ,
53626 + char **area /* where to save stat-data */ )
53627 +{
53628 + int result;
53629 + __u64 emask;
53630 + int bit;
53631 + unsigned int len;
53632 + reiser4_stat_data_base *sd_base;
53633 +
53634 + assert("nikita-634", inode != NULL);
53635 + assert("nikita-635", area != NULL);
53636 +
53637 + result = 0;
53638 + emask = reiser4_inode_data(inode)->extmask;
53639 + sd_base = (reiser4_stat_data_base *) * area;
53640 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)), &sd_base->extmask);
53641 + /*cputod16((unsigned)(emask & 0xffff), &sd_base->extmask);*/
53642 +
53643 + *area += sizeof *sd_base;
53644 + len = 0xffffffffu;
53645 + for (bit = 0; emask != 0; ++bit, emask >>= 1) {
53646 + if (emask & 1) {
53647 + if ((bit + 1) % 16 != 0) {
53648 + sd_ext_plugin *sdplug;
53649 + sdplug = sd_ext_plugin_by_id(bit);
53650 + assert("nikita-636", sdplug != NULL);
53651 + /* no alignment support yet
53652 + align( inode, &len, area,
53653 + sdplug -> alignment ); */
53654 + result = sdplug->save(inode, area);
53655 + if (result)
53656 + break;
53657 + } else {
53658 + put_unaligned(cpu_to_le16((__u16)(emask & 0xffff)),
53659 + (d16 *)(*area));
53660 + /*cputod16((unsigned)(emask & 0xffff),
53661 + (d16 *) * area);*/
53662 + *area += sizeof(d16);
53663 + }
53664 + }
53665 + }
53666 + return result;
53667 +}
53668 +
53669 +/* stat-data extension handling functions. */
53670 +
53671 +static int present_lw_sd(struct inode *inode /* object being processed */ ,
53672 + char **area /* position in stat-data */ ,
53673 + int *len /* remaining length */ )
53674 +{
53675 + if (*len >= (int)sizeof(reiser4_light_weight_stat)) {
53676 + reiser4_light_weight_stat *sd_lw;
53677 +
53678 + sd_lw = (reiser4_light_weight_stat *) * area;
53679 +
53680 + inode->i_mode = le16_to_cpu(get_unaligned(&sd_lw->mode));
53681 + inode->i_nlink = le32_to_cpu(get_unaligned(&sd_lw->nlink));
53682 + inode->i_size = le64_to_cpu(get_unaligned(&sd_lw->size));
53683 + if ((inode->i_mode & S_IFMT) == (S_IFREG | S_IFIFO)) {
53684 + inode->i_mode &= ~S_IFIFO;
53685 + warning("", "partially converted file is encountered");
53686 + reiser4_inode_set_flag(inode, REISER4_PART_MIXED);
53687 + }
53688 + move_on(len, area, sizeof *sd_lw);
53689 + return 0;
53690 + } else
53691 + return not_enough_space(inode, "lw sd");
53692 +}
53693 +
53694 +static int save_len_lw_sd(struct inode *inode UNUSED_ARG /* object being
53695 + * processed */ )
53696 +{
53697 + return sizeof(reiser4_light_weight_stat);
53698 +}
53699 +
53700 +static int save_lw_sd(struct inode *inode /* object being processed */ ,
53701 + char **area /* position in stat-data */ )
53702 +{
53703 + reiser4_light_weight_stat *sd;
53704 + mode_t delta;
53705 +
53706 + assert("nikita-2705", inode != NULL);
53707 + assert("nikita-2706", area != NULL);
53708 + assert("nikita-2707", *area != NULL);
53709 +
53710 + sd = (reiser4_light_weight_stat *) * area;
53711 +
53712 + delta = (reiser4_inode_get_flag(inode,
53713 + REISER4_PART_MIXED) ? S_IFIFO : 0);
53714 + put_unaligned(cpu_to_le16(inode->i_mode | delta), &sd->mode);
53715 + put_unaligned(cpu_to_le32(inode->i_nlink), &sd->nlink);
53716 + put_unaligned(cpu_to_le64((__u64) inode->i_size), &sd->size);
53717 + *area += sizeof *sd;
53718 + return 0;
53719 +}
53720 +
53721 +static int present_unix_sd(struct inode *inode /* object being processed */ ,
53722 + char **area /* position in stat-data */ ,
53723 + int *len /* remaining length */ )
53724 +{
53725 + assert("nikita-637", inode != NULL);
53726 + assert("nikita-638", area != NULL);
53727 + assert("nikita-639", *area != NULL);
53728 + assert("nikita-640", len != NULL);
53729 + assert("nikita-641", *len > 0);
53730 +
53731 + if (*len >= (int)sizeof(reiser4_unix_stat)) {
53732 + reiser4_unix_stat *sd;
53733 +
53734 + sd = (reiser4_unix_stat *) * area;
53735 +
53736 + inode->i_uid = le32_to_cpu(get_unaligned(&sd->uid));
53737 + inode->i_gid = le32_to_cpu(get_unaligned(&sd->gid));
53738 + inode->i_atime.tv_sec = le32_to_cpu(get_unaligned(&sd->atime));
53739 + inode->i_mtime.tv_sec = le32_to_cpu(get_unaligned(&sd->mtime));
53740 + inode->i_ctime.tv_sec = le32_to_cpu(get_unaligned(&sd->ctime));
53741 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53742 + inode->i_rdev = le64_to_cpu(get_unaligned(&sd->u.rdev));
53743 + else
53744 + inode_set_bytes(inode, (loff_t) le64_to_cpu(get_unaligned(&sd->u.bytes)));
53745 + move_on(len, area, sizeof *sd);
53746 + return 0;
53747 + } else
53748 + return not_enough_space(inode, "unix sd");
53749 +}
53750 +
53751 +static int absent_unix_sd(struct inode *inode /* object being processed */ )
53752 +{
53753 + inode->i_uid = get_super_private(inode->i_sb)->default_uid;
53754 + inode->i_gid = get_super_private(inode->i_sb)->default_gid;
53755 + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
53756 + inode_set_bytes(inode, inode->i_size);
53757 + /* mark inode as lightweight, so that caller (lookup_common) will
53758 + complete initialisation by copying [ug]id from a parent. */
53759 + reiser4_inode_set_flag(inode, REISER4_LIGHT_WEIGHT);
53760 + return 0;
53761 +}
53762 +
53763 +/* Audited by: green(2002.06.14) */
53764 +static int save_len_unix_sd(struct inode *inode UNUSED_ARG /* object being
53765 + * processed */ )
53766 +{
53767 + return sizeof(reiser4_unix_stat);
53768 +}
53769 +
53770 +static int save_unix_sd(struct inode *inode /* object being processed */ ,
53771 + char **area /* position in stat-data */ )
53772 +{
53773 + reiser4_unix_stat *sd;
53774 +
53775 + assert("nikita-642", inode != NULL);
53776 + assert("nikita-643", area != NULL);
53777 + assert("nikita-644", *area != NULL);
53778 +
53779 + sd = (reiser4_unix_stat *) * area;
53780 + put_unaligned(cpu_to_le32(inode->i_uid), &sd->uid);
53781 + put_unaligned(cpu_to_le32(inode->i_gid), &sd->gid);
53782 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_sec), &sd->atime);
53783 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_sec), &sd->ctime);
53784 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_sec), &sd->mtime);
53785 + if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
53786 + put_unaligned(cpu_to_le64(inode->i_rdev), &sd->u.rdev);
53787 + else
53788 + put_unaligned(cpu_to_le64((__u64) inode_get_bytes(inode)), &sd->u.bytes);
53789 + *area += sizeof *sd;
53790 + return 0;
53791 +}
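/*
 * A standalone sketch of the serialization idiom used throughout the
 * save_*_sd() helpers above: on-disk fields are little-endian and may
 * be unaligned, so stores go byte by byte rather than through a typed
 * pointer.  The kernel wraps this in cpu_to_leNN() + put_unaligned().
 */
#include <stdint.h>

static void demo_put_le32(uint32_t v, unsigned char *p)
{
	p[0] = (unsigned char)(v & 0xff);
	p[1] = (unsigned char)((v >> 8) & 0xff);
	p[2] = (unsigned char)((v >> 16) & 0xff);
	p[3] = (unsigned char)((v >> 24) & 0xff);
}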
53792 +
53793 +static int
53794 +present_large_times_sd(struct inode *inode /* object being processed */ ,
53795 + char **area /* position in stat-data */ ,
53796 + int *len /* remaining length */ )
53797 +{
53798 + if (*len >= (int)sizeof(reiser4_large_times_stat)) {
53799 + reiser4_large_times_stat *sd_lt;
53800 +
53801 + sd_lt = (reiser4_large_times_stat *) * area;
53802 +
53803 + inode->i_atime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->atime));
53804 + inode->i_mtime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->mtime));
53805 + inode->i_ctime.tv_nsec = le32_to_cpu(get_unaligned(&sd_lt->ctime));
53806 +
53807 + move_on(len, area, sizeof *sd_lt);
53808 + return 0;
53809 + } else
53810 + return not_enough_space(inode, "large times sd");
53811 +}
53812 +
53813 +static int
53814 +save_len_large_times_sd(struct inode *inode UNUSED_ARG
53815 + /* object being processed */ )
53816 +{
53817 + return sizeof(reiser4_large_times_stat);
53818 +}
53819 +
53820 +static int
53821 +save_large_times_sd(struct inode *inode /* object being processed */ ,
53822 + char **area /* position in stat-data */ )
53823 +{
53824 + reiser4_large_times_stat *sd;
53825 +
53826 + assert("nikita-2817", inode != NULL);
53827 + assert("nikita-2818", area != NULL);
53828 + assert("nikita-2819", *area != NULL);
53829 +
53830 + sd = (reiser4_large_times_stat *) * area;
53831 +
53832 + put_unaligned(cpu_to_le32((__u32) inode->i_atime.tv_nsec), &sd->atime);
53833 + put_unaligned(cpu_to_le32((__u32) inode->i_ctime.tv_nsec), &sd->ctime);
53834 + put_unaligned(cpu_to_le32((__u32) inode->i_mtime.tv_nsec), &sd->mtime);
53835 +
53836 + *area += sizeof *sd;
53837 + return 0;
53838 +}
53839 +
53840 +/* symlink stat data extension */
53841 +
53842 +/* allocate memory for symlink target and attach it to inode->i_private */
53843 +static int
53844 +symlink_target_to_inode(struct inode *inode, const char *target, int len)
53845 +{
53846 + assert("vs-845", inode->i_private == NULL);
53847 + assert("vs-846", !reiser4_inode_get_flag(inode,
53848 + REISER4_GENERIC_PTR_USED));
53849 + /* FIXME-VS: this is prone to deadlock. Not more than other similar
53850 + places, though */
53851 + inode->i_private = kmalloc((size_t) len + 1,
53852 + reiser4_ctx_gfp_mask_get());
53853 + if (!inode->i_private)
53854 + return RETERR(-ENOMEM);
53855 +
53856 + memcpy((char *)(inode->i_private), target, (size_t) len);
53857 + ((char *)(inode->i_private))[len] = 0;
53858 + reiser4_inode_set_flag(inode, REISER4_GENERIC_PTR_USED);
53859 + return 0;
53860 +}
53861 +
53862 +/* this is called on read_inode. There is nothing to do, actually, except
53863 + some sanity checks */
53864 +static int present_symlink_sd(struct inode *inode, char **area, int *len)
53865 +{
53866 + int result;
53867 + int length;
53868 + reiser4_symlink_stat *sd;
53869 +
53870 + length = (int)inode->i_size;
53871 + /*
53872 + * *len is the number of bytes in the stat data item from *area to the
53873 + * end of the item. It must be no less than the symlink size + 1 for the terminating 0
53874 + */
53875 + if (length > *len)
53876 + return not_enough_space(inode, "symlink");
53877 +
53878 + if (*(*area + length) != 0) {
53879 + warning("vs-840", "Symlink is not zero terminated");
53880 + return RETERR(-EIO);
53881 + }
53882 +
53883 + sd = (reiser4_symlink_stat *) * area;
53884 + result = symlink_target_to_inode(inode, sd->body, length);
53885 +
53886 + move_on(len, area, length + 1);
53887 + return result;
53888 +}
53889 +
53890 +static int save_len_symlink_sd(struct inode *inode)
53891 +{
53892 + return inode->i_size + 1;
53893 +}
53894 +
53895 +/* this is called on create and on stat-data update. On update, do nothing
53896 + but advance @area */
53897 +static int save_symlink_sd(struct inode *inode, char **area)
53898 +{
53899 + int result;
53900 + int length;
53901 + reiser4_symlink_stat *sd;
53902 +
53903 + length = (int)inode->i_size;
53904 + /* inode->i_size must be set already */
53905 + assert("vs-841", length);
53906 +
53907 + result = 0;
53908 + sd = (reiser4_symlink_stat *) * area;
53909 + if (!reiser4_inode_get_flag(inode, REISER4_GENERIC_PTR_USED)) {
53910 + const char *target;
53911 +
53912 + target = (const char *)(inode->i_private);
53913 + inode->i_private = NULL;
53914 +
53915 + result = symlink_target_to_inode(inode, target, length);
53916 +
53917 + /* copy symlink to stat data */
53918 + memcpy(sd->body, target, (size_t) length);
53919 + (*area)[length] = 0;
53920 + } else {
53921 + /* there is nothing to do in update but move area */
53922 + assert("vs-844",
53923 + !memcmp(inode->i_private, sd->body,
53924 + (size_t) length + 1));
53925 + }
53926 +
53927 + *area += (length + 1);
53928 + return result;
53929 +}
53930 +
53931 +static int present_flags_sd(struct inode *inode /* object being processed */ ,
53932 + char **area /* position in stat-data */ ,
53933 + int *len /* remaining length */ )
53934 +{
53935 + assert("nikita-645", inode != NULL);
53936 + assert("nikita-646", area != NULL);
53937 + assert("nikita-647", *area != NULL);
53938 + assert("nikita-648", len != NULL);
53939 + assert("nikita-649", *len > 0);
53940 +
53941 + if (*len >= (int)sizeof(reiser4_flags_stat)) {
53942 + reiser4_flags_stat *sd;
53943 +
53944 + sd = (reiser4_flags_stat *) * area;
53945 + inode->i_flags = le32_to_cpu(get_unaligned(&sd->flags));
53946 + move_on(len, area, sizeof *sd);
53947 + return 0;
53948 + } else
53949 + return not_enough_space(inode, "generation and attrs");
53950 +}
53951 +
53952 +/* Audited by: green(2002.06.14) */
53953 +static int save_len_flags_sd(struct inode *inode UNUSED_ARG /* object being
53954 + * processed */ )
53955 +{
53956 + return sizeof(reiser4_flags_stat);
53957 +}
53958 +
53959 +static int save_flags_sd(struct inode *inode /* object being processed */ ,
53960 + char **area /* position in stat-data */ )
53961 +{
53962 + reiser4_flags_stat *sd;
53963 +
53964 + assert("nikita-650", inode != NULL);
53965 + assert("nikita-651", area != NULL);
53966 + assert("nikita-652", *area != NULL);
53967 +
53968 + sd = (reiser4_flags_stat *) * area;
53969 + put_unaligned(cpu_to_le32(inode->i_flags), &sd->flags);
53970 + *area += sizeof *sd;
53971 + return 0;
53972 +}
53973 +
53974 +static int absent_plugin_sd(struct inode *inode);
53975 +static int present_plugin_sd(struct inode *inode /* object being processed */ ,
53976 + char **area /* position in stat-data */ ,
53977 + int *len /* remaining length */,
53978 + int is_pset /* 1 if plugin set, 0 if heir set. */)
53979 +{
53980 + reiser4_plugin_stat *sd;
53981 + reiser4_plugin *plugin;
53982 + reiser4_inode *info;
53983 + int i;
53984 + __u16 mask;
53985 + int result;
53986 + int num_of_plugins;
53987 +
53988 + assert("nikita-653", inode != NULL);
53989 + assert("nikita-654", area != NULL);
53990 + assert("nikita-655", *area != NULL);
53991 + assert("nikita-656", len != NULL);
53992 + assert("nikita-657", *len > 0);
53993 +
53994 + if (*len < (int)sizeof(reiser4_plugin_stat))
53995 + return not_enough_space(inode, "plugin");
53996 +
53997 + sd = (reiser4_plugin_stat *) * area;
53998 + info = reiser4_inode_data(inode);
53999 +
54000 + mask = 0;
54001 + num_of_plugins = le16_to_cpu(get_unaligned(&sd->plugins_no));
54002 + move_on(len, area, sizeof *sd);
54003 + result = 0;
54004 + for (i = 0; i < num_of_plugins; ++i) {
54005 + reiser4_plugin_slot *slot;
54006 + reiser4_plugin_type type;
54007 + pset_member memb;
54008 +
54009 + slot = (reiser4_plugin_slot *) * area;
54010 + if (*len < (int)sizeof *slot)
54011 + return not_enough_space(inode, "additional plugin");
54012 +
54013 + memb = le16_to_cpu(get_unaligned(&slot->pset_memb));
54014 + type = aset_member_to_type_unsafe(memb);
54015 +
54016 + if (type == REISER4_PLUGIN_TYPES) {
54017 + warning("nikita-3502",
54018 + "wrong %s member (%i) for %llu", is_pset ?
54019 + "pset" : "hset", memb,
54020 + (unsigned long long)get_inode_oid(inode));
54021 + return RETERR(-EINVAL);
54022 + }
54023 + plugin = plugin_by_disk_id(reiser4_tree_by_inode(inode),
54024 + type, &slot->id);
54025 + if (plugin == NULL)
54026 + return unknown_plugin(le16_to_cpu(get_unaligned(&slot->id)), inode);
54027 +
54028 + /* the plugin is loaded into the inode; mark this in the
54029 + inode's bitmask of loaded non-standard plugins */
54030 + if (!(mask & (1 << memb))) {
54031 + mask |= (1 << memb);
54032 + } else {
54033 + warning("nikita-658", "duplicate plugin for %llu",
54034 + (unsigned long long)get_inode_oid(inode));
54035 + return RETERR(-EINVAL);
54036 + }
54037 + move_on(len, area, sizeof *slot);
54038 + /* load plugin data, if any */
54039 + if (plugin->h.pops != NULL && plugin->h.pops->load)
54040 + result = plugin->h.pops->load(inode, plugin, area, len);
54041 + else
54042 + result = aset_set_unsafe(is_pset ? &info->pset :
54043 + &info->hset, memb, plugin);
54044 + if (result)
54045 + return result;
54046 + }
54047 + if (is_pset) {
54048 + /* if object plugin wasn't loaded from stat-data, guess it by
54049 + mode bits */
54050 + plugin = file_plugin_to_plugin(inode_file_plugin(inode));
54051 + if (plugin == NULL)
54052 + result = absent_plugin_sd(inode);
54053 + info->plugin_mask = mask;
54054 + } else
54055 + info->heir_mask = mask;
54056 +
54057 + return result;
54058 +}
54059 +
54060 +static int present_pset_sd(struct inode *inode, char **area, int *len) {
54061 + return present_plugin_sd(inode, area, len, 1 /* pset */);
54062 +}
54063 +
54064 +/* Determine the object plugin for @inode based on i_mode.
54065 +
54066 + Many objects in the reiser4 file system are controlled by standard object
54067 + plugins that emulate traditional unix objects: unix file, directory, symlink, fifo, and so on.
54068 +
54069 + For such files we don't explicitly store the plugin id in the object's stat
54070 + data. Rather, the required plugin is guessed from the mode bits, where the file "type"
54071 + is encoded (see stat(2)).
54072 +*/
54073 +static int
54074 +guess_plugin_by_mode(struct inode *inode /* object to guess plugins for */ )
54075 +{
54076 + int fplug_id;
54077 + int dplug_id;
54078 + reiser4_inode *info;
54079 +
54080 + assert("nikita-736", inode != NULL);
54081 +
54082 + dplug_id = fplug_id = -1;
54083 +
54084 + switch (inode->i_mode & S_IFMT) {
54085 + case S_IFSOCK:
54086 + case S_IFBLK:
54087 + case S_IFCHR:
54088 + case S_IFIFO:
54089 + fplug_id = SPECIAL_FILE_PLUGIN_ID;
54090 + break;
54091 + case S_IFLNK:
54092 + fplug_id = SYMLINK_FILE_PLUGIN_ID;
54093 + break;
54094 + case S_IFDIR:
54095 + fplug_id = DIRECTORY_FILE_PLUGIN_ID;
54096 + dplug_id = HASHED_DIR_PLUGIN_ID;
54097 + break;
54098 + default:
54099 + warning("nikita-737", "wrong file mode: %o", inode->i_mode);
54100 + return RETERR(-EIO);
54101 + case S_IFREG:
54102 + fplug_id = UNIX_FILE_PLUGIN_ID;
54103 + break;
54104 + }
54105 + info = reiser4_inode_data(inode);
54106 + set_plugin(&info->pset, PSET_FILE, (fplug_id >= 0) ?
54107 + plugin_by_id(REISER4_FILE_PLUGIN_TYPE, fplug_id) : NULL);
54108 + set_plugin(&info->pset, PSET_DIR, (dplug_id >= 0) ?
54109 + plugin_by_id(REISER4_DIR_PLUGIN_TYPE, dplug_id) : NULL);
54110 + return 0;
54111 +}
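/*
 * A trivial standalone sketch of the mode-to-plugin mapping above,
 * using the standard S_IS*() predicates instead of switching on the
 * S_IFMT bits directly.  The returned plugin ids are hypothetical.
 */
#include <sys/stat.h>

static int demo_guess_file_plugin(mode_t mode)
{
	if (S_ISREG(mode))
		return 0;	/* "unix file" */
	if (S_ISDIR(mode))
		return 1;	/* "directory" */
	if (S_ISLNK(mode))
		return 2;	/* "symlink" */
	if (S_ISFIFO(mode) || S_ISSOCK(mode) ||
	    S_ISBLK(mode) || S_ISCHR(mode))
		return 3;	/* "special file" */
	return -1;		/* corrupt mode */
}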
54112 +
54113 +/* Audited by: green(2002.06.14) */
54114 +static int absent_plugin_sd(struct inode *inode /* object being processed */ )
54115 +{
54116 + int result;
54117 +
54118 + assert("nikita-659", inode != NULL);
54119 +
54120 + result = guess_plugin_by_mode(inode);
54121 + /* if the mode was wrong, guess_plugin_by_mode() returns "regular file",
54122 + but setup_inode_ops() will call make_bad_inode().
54123 + Another, more logical but a bit more complex solution is to add a
54124 + "bad-file plugin". */
54125 + /* FIXME-VS: activate was called here */
54126 + return result;
54127 +}
54128 +
54129 +/* helper function for save_len_plugin_sd(): calculate how much space is
54130 + required to save the state of the given plugin */
54131 +/* Audited by: green(2002.06.14) */
54132 +static int len_for(reiser4_plugin * plugin /* plugin to save */ ,
54133 + struct inode *inode /* object being processed */ ,
54134 + pset_member memb,
54135 + int len, int is_pset)
54136 +{
54137 + reiser4_inode *info;
54138 + assert("nikita-661", inode != NULL);
54139 +
54140 + if (plugin == NULL)
54141 + return len;
54142 +
54143 + info = reiser4_inode_data(inode);
54144 + if (is_pset ?
54145 + info->plugin_mask & (1 << memb) :
54146 + info->heir_mask & (1 << memb)) {
54147 + len += sizeof(reiser4_plugin_slot);
54148 + if (plugin->h.pops && plugin->h.pops->save_len != NULL) {
54149 + /* non-standard plugin, call method */
54150 + /* commented as it is incompatible with alignment
54151 + * policy in save_plug() -edward */
54152 + /* len = round_up(len, plugin->h.pops->alignment); */
54153 + len += plugin->h.pops->save_len(inode, plugin);
54154 + }
54155 + }
54156 + return len;
54157 +}
54158 +
54159 +/* calculate how much space is required to save the state of all plugins
54160 + associated with an inode */
54161 +static int save_len_plugin_sd(struct inode *inode /* object being processed */,
54162 + int is_pset)
54163 +{
54164 + int len;
54165 + int last;
54166 + reiser4_inode *state;
54167 + pset_member memb;
54168 +
54169 + assert("nikita-663", inode != NULL);
54170 +
54171 + state = reiser4_inode_data(inode);
54172 +
54173 + /* common case: no non-standard plugins */
54174 + if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
54175 + return 0;
54176 + len = sizeof(reiser4_plugin_stat);
54177 + last = PSET_LAST;
54178 +
54179 + for (memb = 0; memb < last; ++memb) {
54180 + len = len_for(aset_get(is_pset ? state->pset : state->hset, memb),
54181 + inode, memb, len, is_pset);
54182 + }
54183 + assert("nikita-664", len > (int)sizeof(reiser4_plugin_stat));
54184 + return len;
54185 +}
54186 +
54187 +static int save_len_pset_sd(struct inode *inode) {
54188 + return save_len_plugin_sd(inode, 1 /* pset */);
54189 +}
54190 +
54191 +/* helper function for save_plugin_sd(): save a plugin associated with
54192 + an inode. */
54193 +static int save_plug(reiser4_plugin * plugin /* plugin to save */ ,
54194 + struct inode *inode /* object being processed */ ,
54195 + int memb /* what element of pset is saved */ ,
54196 + char **area /* position in stat-data */ ,
54197 + int *count /* incremented if the plugin was actually saved */,
54198 + int is_pset /* 1 for plugin set, 0 for heir set */)
54199 +{
54200 + reiser4_plugin_slot *slot;
54201 + int fake_len;
54202 + int result;
54203 +
54204 + assert("nikita-665", inode != NULL);
54205 + assert("nikita-666", area != NULL);
54206 + assert("nikita-667", *area != NULL);
54207 +
54208 + if (plugin == NULL)
54209 + return 0;
54210 +
54211 + if (is_pset ?
54212 + !(reiser4_inode_data(inode)->plugin_mask & (1 << memb)) :
54213 + !(reiser4_inode_data(inode)->heir_mask & (1 << memb)))
54214 + return 0;
54215 + slot = (reiser4_plugin_slot *) * area;
54216 + put_unaligned(cpu_to_le16(memb), &slot->pset_memb);
54217 + put_unaligned(cpu_to_le16(plugin->h.id), &slot->id);
54218 + fake_len = (int)0xffff;
54219 + move_on(&fake_len, area, sizeof *slot);
54220 + ++*count;
54221 + result = 0;
54222 + if (plugin->h.pops != NULL) {
54223 + if (plugin->h.pops->save != NULL)
54224 + result = plugin->h.pops->save(inode, plugin, area);
54225 + }
54226 + return result;
54227 +}
54228 +
54229 +/* save state of all non-standard plugins associated with inode */
54230 +static int save_plugin_sd(struct inode *inode /* object being processed */ ,
54231 + char **area /* position in stat-data */,
54232 + int is_pset /* 1 for pset, 0 for hset */)
54233 +{
54234 + int fake_len;
54235 + int result = 0;
54236 + int num_of_plugins;
54237 + reiser4_plugin_stat *sd;
54238 + reiser4_inode *state;
54239 + pset_member memb;
54240 +
54241 + assert("nikita-669", inode != NULL);
54242 + assert("nikita-670", area != NULL);
54243 + assert("nikita-671", *area != NULL);
54244 +
54245 + state = reiser4_inode_data(inode);
54246 + if (is_pset ? state->plugin_mask == 0 : state->heir_mask == 0)
54247 + return 0;
54248 + sd = (reiser4_plugin_stat *) * area;
54249 + fake_len = (int)0xffff;
54250 + move_on(&fake_len, area, sizeof *sd);
54251 +
54252 + num_of_plugins = 0;
54253 + for (memb = 0; memb < PSET_LAST; ++memb) {
54254 + result = save_plug(aset_get(is_pset ? state->pset : state->hset,
54255 + memb),
54256 + inode, memb, area, &num_of_plugins, is_pset);
54257 + if (result != 0)
54258 + break;
54259 + }
54260 +
54261 + put_unaligned(cpu_to_le16((__u16)num_of_plugins), &sd->plugins_no);
54262 + return result;
54263 +}
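/*
 * A standalone sketch of the pattern in save_plugin_sd() above: the
 * number of slots is known only after the walk, so the header's count
 * field is backpatched last, after all slots have been written past
 * it.  Types and the slot format are hypothetical.
 */
#include <stdint.h>
#include <string.h>

static char *demo_save_slots(char *area, const int *ids, int n)
{
	char *pos = area + sizeof(uint16_t);	/* leave room for the count */
	uint16_t count = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (ids[i] < 0)
			continue;		/* nothing to save here */
		memcpy(pos, &ids[i], sizeof(int));
		pos += sizeof(int);
		++count;
	}
	memcpy(area, &count, sizeof(count));	/* backpatch the header */
	return pos;
}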
54264 +
54265 +static int save_pset_sd(struct inode *inode, char **area) {
54266 + return save_plugin_sd(inode, area, 1 /* pset */);
54267 +}
54268 +
54269 +static int present_hset_sd(struct inode *inode, char **area, int *len) {
54270 + return present_plugin_sd(inode, area, len, 0 /* hset */);
54271 +}
54272 +
54273 +static int save_len_hset_sd(struct inode *inode) {
54274 + return save_len_plugin_sd(inode, 0 /* hset */);
54275 +}
54276 +
54277 +static int save_hset_sd(struct inode *inode, char **area) {
54278 + return save_plugin_sd(inode, area, 0 /* hset */);
54279 +}
54280 +
54281 +/* helper function for present_crypto_sd() and save_crypto_sd().
54282 + Allocates memory for the crypto stat and keyid and attaches it to the inode */
54283 +static int extract_crypto_stat (struct inode * inode,
54284 + reiser4_crypto_stat * sd)
54285 +{
54286 + crypto_stat_t * info;
54287 + assert("edward-11", !inode_crypto_stat(inode));
54288 + assert("edward-1413",
54289 + !reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED));
54290 + /* create and attach a crypto-stat without secret key loaded */
54291 + info = reiser4_alloc_crypto_stat(inode);
54292 + if (IS_ERR(info))
54293 + return PTR_ERR(info);
54294 + info->keysize = le16_to_cpu(get_unaligned(&sd->keysize));
54295 + memcpy(info->keyid, sd->keyid, inode_digest_plugin(inode)->fipsize);
54296 + reiser4_attach_crypto_stat(inode, info);
54297 + reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
54298 + return 0;
54299 +}
54300 +
54301 +/* crypto stat-data extension */
54302 +
54303 +static int present_crypto_sd(struct inode *inode, char **area, int *len)
54304 +{
54305 + int result;
54306 + reiser4_crypto_stat *sd;
54307 + digest_plugin *dplug = inode_digest_plugin(inode);
54308 +
54309 + assert("edward-06", dplug != NULL);
54310 + assert("edward-684", dplug->fipsize);
54311 + assert("edward-07", area != NULL);
54312 + assert("edward-08", *area != NULL);
54313 + assert("edward-09", len != NULL);
54314 + assert("edward-10", *len > 0);
54315 +
54316 + if (*len < (int)sizeof(reiser4_crypto_stat)) {
54317 + return not_enough_space(inode, "crypto-sd");
54318 + }
54319 + /* *len is the number of bytes in the stat data item from *area to the
54320 + end of the item. It must be no less than the size of this extension */
54321 + assert("edward-75", sizeof(*sd) + dplug->fipsize <= *len);
54322 +
54323 + sd = (reiser4_crypto_stat *) * area;
54324 + result = extract_crypto_stat(inode, sd);
54325 + move_on(len, area, sizeof(*sd) + dplug->fipsize);
54326 +
54327 + return result;
54328 +}
54329 +
54330 +static int save_len_crypto_sd(struct inode *inode)
54331 +{
54332 + return sizeof(reiser4_crypto_stat) +
54333 + inode_digest_plugin(inode)->fipsize;
54334 +}
54335 +
54336 +static int save_crypto_sd(struct inode *inode, char **area)
54337 +{
54338 + int result = 0;
54339 + reiser4_crypto_stat *sd;
54340 + crypto_stat_t * info = inode_crypto_stat(inode);
54341 + digest_plugin *dplug = inode_digest_plugin(inode);
54342 +
54343 + assert("edward-12", dplug != NULL);
54344 + assert("edward-13", area != NULL);
54345 + assert("edward-14", *area != NULL);
54346 + assert("edward-15", info != NULL);
54347 + assert("edward-1414", info->keyid != NULL);
54348 + assert("edward-1415", info->keysize != 0);
54349 + assert("edward-76", reiser4_inode_data(inode) != NULL);
54350 +
54351 + if (!reiser4_inode_get_flag(inode, REISER4_CRYPTO_STAT_LOADED)) {
54352 + /* file is just created */
54353 + sd = (reiser4_crypto_stat *) *area;
54354 + /* copy everything but private key to the disk stat-data */
54355 + put_unaligned(cpu_to_le16(info->keysize), &sd->keysize);
54356 + memcpy(sd->keyid, info->keyid, (size_t) dplug->fipsize);
54357 + reiser4_inode_set_flag(inode, REISER4_CRYPTO_STAT_LOADED);
54358 + }
54359 + *area += (sizeof(*sd) + dplug->fipsize);
54360 + return result;
54361 +}
54362 +
54363 +static int eio(struct inode *inode, char **area, int *len)
54364 +{
54365 + return RETERR(-EIO);
54366 +}
54367 +
54368 +sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION] = {
54369 + [LIGHT_WEIGHT_STAT] = {
54370 + .h = {
54371 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54372 + .id = LIGHT_WEIGHT_STAT,
54373 + .pops = NULL,
54374 + .label = "light-weight sd",
54375 + .desc = "sd for light-weight files",
54376 + .linkage = {NULL,NULL}
54377 + },
54378 + .present = present_lw_sd,
54379 + .absent = NULL,
54380 + .save_len = save_len_lw_sd,
54381 + .save = save_lw_sd,
54382 + .alignment = 8
54383 + },
54384 + [UNIX_STAT] = {
54385 + .h = {
54386 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54387 + .id = UNIX_STAT,
54388 + .pops = NULL,
54389 + .label = "unix-sd",
54390 + .desc = "unix stat-data fields",
54391 + .linkage = {NULL,NULL}
54392 + },
54393 + .present = present_unix_sd,
54394 + .absent = absent_unix_sd,
54395 + .save_len = save_len_unix_sd,
54396 + .save = save_unix_sd,
54397 + .alignment = 8
54398 + },
54399 + [LARGE_TIMES_STAT] = {
54400 + .h = {
54401 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54402 + .id = LARGE_TIMES_STAT,
54403 + .pops = NULL,
54404 + .label = "64time-sd",
54405 + .desc = "nanosecond resolution for times",
54406 + .linkage = {NULL,NULL}
54407 + },
54408 + .present = present_large_times_sd,
54409 + .absent = NULL,
54410 + .save_len = save_len_large_times_sd,
54411 + .save = save_large_times_sd,
54412 + .alignment = 8
54413 + },
54414 + [SYMLINK_STAT] = {
54415 + /* stat data of symlink has this extension */
54416 + .h = {
54417 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54418 + .id = SYMLINK_STAT,
54419 + .pops = NULL,
54420 + .label = "symlink-sd",
54421 + .desc =
54422 + "stat data is appended with symlink name",
54423 + .linkage = {NULL,NULL}
54424 + },
54425 + .present = present_symlink_sd,
54426 + .absent = NULL,
54427 + .save_len = save_len_symlink_sd,
54428 + .save = save_symlink_sd,
54429 + .alignment = 8
54430 + },
54431 + [PLUGIN_STAT] = {
54432 + .h = {
54433 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54434 + .id = PLUGIN_STAT,
54435 + .pops = NULL,
54436 + .label = "plugin-sd",
54437 + .desc = "plugin stat-data fields",
54438 + .linkage = {NULL,NULL}
54439 + },
54440 + .present = present_pset_sd,
54441 + .absent = absent_plugin_sd,
54442 + .save_len = save_len_pset_sd,
54443 + .save = save_pset_sd,
54444 + .alignment = 8
54445 + },
54446 + [HEIR_STAT] = {
54447 + .h = {
54448 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54449 + .id = HEIR_STAT,
54450 + .pops = NULL,
54451 + .label = "heir-plugin-sd",
54452 + .desc = "heir plugin stat-data fields",
54453 + .linkage = {NULL,NULL}
54454 + },
54455 + .present = present_hset_sd,
54456 + .absent = NULL,
54457 + .save_len = save_len_hset_sd,
54458 + .save = save_hset_sd,
54459 + .alignment = 8
54460 + },
54461 + [FLAGS_STAT] = {
54462 + .h = {
54463 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54464 + .id = FLAGS_STAT,
54465 + .pops = NULL,
54466 + .label = "flags-sd",
54467 + .desc = "inode bit flags",
54468 + .linkage = {NULL, NULL}
54469 + },
54470 + .present = present_flags_sd,
54471 + .absent = NULL,
54472 + .save_len = save_len_flags_sd,
54473 + .save = save_flags_sd,
54474 + .alignment = 8
54475 + },
54476 + [CAPABILITIES_STAT] = {
54477 + .h = {
54478 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54479 + .id = CAPABILITIES_STAT,
54480 + .pops = NULL,
54481 + .label = "capabilities-sd",
54482 + .desc = "capabilities",
54483 + .linkage = {NULL, NULL}
54484 + },
54485 + .present = eio,
54486 + .absent = NULL,
54487 + .save_len = save_len_flags_sd,
54488 + .save = save_flags_sd,
54489 + .alignment = 8
54490 + },
54491 + [CRYPTO_STAT] = {
54492 + .h = {
54493 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
54494 + .id = CRYPTO_STAT,
54495 + .pops = NULL,
54496 + .label = "crypto-sd",
54497 + .desc = "secret key size and id",
54498 + .linkage = {NULL, NULL}
54499 + },
54500 + .present = present_crypto_sd,
54501 + .absent = NULL,
54502 + .save_len = save_len_crypto_sd,
54503 + .save = save_crypto_sd,
54504 + .alignment = 8
54505 + }
54506 +};
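sd_len() and sd_save() drive this table from the inode's extension bitmask: each set bit selects one slot, whose ->save_len()/->save() methods size and serialize that extension. A standalone model of the save_len pass, with a hypothetical three-entry table (not the reiser4 API):

	#include <stdio.h>

	enum { LW_EXT, UNIX_EXT, FLAGS_EXT, LAST_EXT };

	/* hypothetical per-extension vtable entry */
	struct sd_ext { const char *label; int (*save_len)(void); };

	static int lw_len(void)    { return 14; }	/* mode+nlink+size */
	static int unix_len(void)  { return 28; }
	static int flags_len(void) { return 4; }

	static const struct sd_ext table[LAST_EXT] = {
		[LW_EXT]    = { "light-weight sd", lw_len },
		[UNIX_EXT]  = { "unix-sd",         unix_len },
		[FLAGS_EXT] = { "flags-sd",        flags_len },
	};

	/* total on-disk stat-data length for a given extension mask */
	static int sd_len(unsigned mask)
	{
		int i, total = (int)sizeof(unsigned short); /* the extmask itself */

		for (i = 0; i < LAST_EXT && mask != 0; i++, mask >>= 1)
			if (mask & 1)
				total += table[i].save_len();
		return total;
	}

	int main(void)
	{
		printf("len = %d\n", sd_len((1 << UNIX_EXT) | (1 << FLAGS_EXT)));
		return 0;
	}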
54507 +
54508 +/* Make Linus happy.
54509 + Local variables:
54510 + c-indentation-style: "K&R"
54511 + mode-name: "LC"
54512 + c-basic-offset: 8
54513 + tab-width: 8
54514 + fill-column: 120
54515 + End:
54516 +*/
54517 diff --git a/fs/reiser4/plugin/item/static_stat.h b/fs/reiser4/plugin/item/static_stat.h
54518 new file mode 100644
54519 index 0000000..dd20eb3
54520 --- /dev/null
54521 +++ b/fs/reiser4/plugin/item/static_stat.h
54522 @@ -0,0 +1,224 @@
54523 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54524 +
54525 +/* This describes the static_stat item, used to hold all information needed by the stat() syscall.
54526 +
54527 +In the case where each file has at least the fields needed by the
54528 +stat() syscall, it is more compact to store those fields in this
54529 +struct.
54530 +
54531 +If this item does not exist, then all stats are dynamically resolved.
54532 +At the moment, we either resolve all stats dynamically or all of them
54533 +statically. If you think this is not fully optimal, and the rest of
54534 +reiser4 is working, then fix it...:-)
54535 +
54536 +*/
54537 +
54538 +#if !defined( __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ )
54539 +#define __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__
54540 +
54541 +#include "../../forward.h"
54542 +#include "../../dformat.h"
54543 +
54544 +#include <linux/fs.h> /* for struct inode */
54545 +
54546 +/* Stat data layout: goals and implementation.
54547 +
54548 + We want to be able to have lightweight files which have complete flexibility in what semantic metadata is attached to
54549 + them, including not having semantic metadata attached to them.
54550 +
54551 + There is one problem with doing that, which is that if in fact you have exactly the same metadata for most files you
54552 + want to store, then it takes more space to store that metadata in a dynamically sized structure than in a statically
54553 + sized structure because the statically sized structure knows without recording it what the names and lengths of the
54554 + attributes are.
54555 +
54556 + This leads to a natural compromise, which is to special case those files which have simply the standard unix file
54557 + attributes, and only employ the full dynamic stat data mechanism for those files that differ from the standard unix
54558 + file in their use of file attributes.
54559 +
54560 + Yet this compromise deserves to be compromised a little.
54561 +
54562 + We accommodate the case where you have no more than the standard unix file attributes by using an "extension
54563 + bitmask": each bit in it indicates presence or absence of or particular stat data extension (see sd_ext_bits enum).
54564 +
54565 +   If the first bit of the extension bitmask is 0, we have a light-weight file whose attributes are either inherited
54566 +   from the parent directory (such as uid and gid) or initialised to some sane values.
54567 +
54568 + To capitalize on existing code infrastructure, extensions are
54569 + implemented as plugins of type REISER4_SD_EXT_PLUGIN_TYPE.
54570 + Each stat-data extension plugin implements four methods:
54571 +
54572 + ->present() called by sd_load() when this extension is found in stat-data
54573 + ->absent() called by sd_load() when this extension is not found in stat-data
54574 + ->save_len() called by sd_len() to calculate total length of stat-data
54575 + ->save() called by sd_save() to store extension data into stat-data
54576 +
54577 + Implementation is in fs/reiser4/plugin/item/static_stat.c
54578 +*/
54579 +
54580 +/* stat-data extension. Please order this by presumed frequency of use */
54581 +typedef enum {
54582 + /* support for light-weight files */
54583 + LIGHT_WEIGHT_STAT,
54584 + /* data required to implement unix stat(2) call. Layout is in
54585 + reiser4_unix_stat. If this is not present, file is light-weight */
54586 + UNIX_STAT,
54587 +	/* this contains an additional set of 32bit [amc]time fields to implement
54588 +	   nanosecond resolution. Layout is in reiser4_large_times_stat. Usage
54589 +	   of this extension is governed by the 32bittimes mount option. */
54590 + LARGE_TIMES_STAT,
54591 + /* stat data has link name included */
54592 + SYMLINK_STAT,
54593 + /* on-disk slots of non-standard plugins for main plugin table
54594 + (@reiser4_inode->pset), that is, plugins that cannot be deduced
54595 + from file mode bits), for example, aggregation, interpolation etc. */
54596 + PLUGIN_STAT,
54597 + /* this extension contains persistent inode flags. These flags are
54598 +	   single bits: immutable, append only, etc. Layout is in
54599 + reiser4_flags_stat. */
54600 + FLAGS_STAT,
54601 + /* this extension contains capabilities sets, associated with this
54602 + file. Layout is in reiser4_capabilities_stat */
54603 + CAPABILITIES_STAT,
54604 + /* this extension contains size and public id of the secret key.
54605 + Layout is in reiser4_crypto_stat */
54606 + CRYPTO_STAT,
54607 + /* on-disk slots of non-default plugins for inheritance, which
54608 + are extracted to special plugin table (@reiser4_inode->hset).
54609 + By default, children of the object will inherit plugins from
54610 + its main plugin table (pset). */
54611 + HEIR_STAT,
54612 + LAST_SD_EXTENSION,
54613 + /*
54614 +	 * init_inode_static_sd() iterates over the extension mask until all
54615 +	 * non-zero bits are processed. This means that neither the ->present()
54616 +	 * nor the ->absent() method will be called for stat-data extensions
54617 +	 * that go after the last present extension. But for some basic
54618 +	 * extensions we want either the ->absent() or the ->present() method
54619 +	 * to be called, because these extensions set up something in the
54620 +	 * inode even when they are not present. This is what
54621 +	 * LAST_IMPORTANT_SD_EXTENSION is for: for all extensions up to and
54622 +	 * including it, either ->present() or ->absent() will be called,
54623 +	 * independently of what other extensions are present.
54624 + */
54625 + LAST_IMPORTANT_SD_EXTENSION = PLUGIN_STAT
54626 +} sd_ext_bits;
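A compact model of the iteration rule described for LAST_IMPORTANT_SD_EXTENSION above: the walk continues while set bits remain, and additionally through the whole "important" prefix, so those extensions always see either ->present() or ->absent(). Hypothetical three-bit mask, not the reiser4 iterator:

	#include <stdio.h>

	enum { LW, UNIX_E, PLUGIN_E, LAST_IMPORTANT = PLUGIN_E };

	static void walk(unsigned mask)
	{
		int bit = 0;

		/* keep going while set bits remain, or while still inside the
		 * prefix whose ->absent() must run to set up inode defaults */
		while (mask != 0 || bit <= LAST_IMPORTANT) {
			printf(mask & 1 ? "present(%d)\n" : "absent(%d)\n", bit);
			mask >>= 1;
			bit++;
		}
	}

	int main(void)
	{
		walk(1u << LW);	/* light-weight file: bits 1..2 still get absent() */
		return 0;
	}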
54627 +
54628 +/* minimal stat-data. This makes it possible to support light-weight files. */
54629 +typedef struct reiser4_stat_data_base {
54630 + /* 0 */ __le16 extmask;
54631 + /* 2 */
54632 +} PACKED reiser4_stat_data_base;
54633 +
54634 +typedef struct reiser4_light_weight_stat {
54635 + /* 0 */ __le16 mode;
54636 + /* 2 */ __le32 nlink;
54637 + /* 6 */ __le64 size;
54638 + /* size in bytes */
54639 + /* 14 */
54640 +} PACKED reiser4_light_weight_stat;
54641 +
54642 +typedef struct reiser4_unix_stat {
54643 + /* owner id */
54644 + /* 0 */ __le32 uid;
54645 + /* group id */
54646 + /* 4 */ __le32 gid;
54647 + /* access time */
54648 + /* 8 */ __le32 atime;
54649 + /* modification time */
54650 + /* 12 */ __le32 mtime;
54651 + /* change time */
54652 + /* 16 */ __le32 ctime;
54653 + union {
54654 + /* minor:major for device files */
54655 + /* 20 */ __le64 rdev;
54656 + /* bytes used by file */
54657 + /* 20 */ __le64 bytes;
54658 + } u;
54659 + /* 28 */
54660 +} PACKED reiser4_unix_stat;
54661 +
54662 +/* symlink stored as part of inode */
54663 +typedef struct reiser4_symlink_stat {
54664 + char body[0];
54665 +} PACKED reiser4_symlink_stat;
54666 +
54667 +typedef struct reiser4_plugin_slot {
54668 + /* 0 */ __le16 pset_memb;
54669 + /* 2 */ __le16 id;
54670 + /* 4 *//* here plugin stores its persistent state */
54671 +} PACKED reiser4_plugin_slot;
54672 +
54673 +/* stat-data extension for files with non-standard plugin. */
54674 +typedef struct reiser4_plugin_stat {
54675 + /* number of additional plugins, associated with this object */
54676 + /* 0 */ __le16 plugins_no;
54677 + /* 2 */ reiser4_plugin_slot slot[0];
54678 + /* 2 */
54679 +} PACKED reiser4_plugin_stat;
54680 +
54681 +/* stat-data extension for inode flags. Currently it is just a fixed-width
54682 + * 32 bit mask. If the need arises, this can be replaced with a
54683 + * variable-width bitmask. */
54684 +typedef struct reiser4_flags_stat {
54685 + /* 0 */ __le32 flags;
54686 + /* 4 */
54687 +} PACKED reiser4_flags_stat;
54688 +
54689 +typedef struct reiser4_capabilities_stat {
54690 + /* 0 */ __le32 effective;
54691 +	/* 4 */ __le32 permitted;
54692 +	/* 8 */
54693 +} PACKED reiser4_capabilities_stat;
54694 +
54695 +typedef struct reiser4_cluster_stat {
54696 +/* this defines cluster size (an attribute of cryptcompress objects) as PAGE_SIZE << cluster_shift */
54697 + /* 0 */ d8 cluster_shift;
54698 + /* 1 */
54699 +} PACKED reiser4_cluster_stat;
54700 +
54701 +typedef struct reiser4_crypto_stat {
54702 + /* secret key size, bits */
54703 + /* 0 */ d16 keysize;
54704 + /* secret key id */
54705 + /* 2 */ d8 keyid[0];
54706 + /* 2 */
54707 +} PACKED reiser4_crypto_stat;
54708 +
54709 +typedef struct reiser4_large_times_stat {
54710 + /* access time */
54711 + /* 0 */ d32 atime;
54712 + /* modification time */
54713 + /* 4 */ d32 mtime;
54714 + /* change time */
54715 + /* 8 */ d32 ctime;
54716 + /* 12 */
54717 +} PACKED reiser4_large_times_stat;
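The byte offsets in the comments above are the on-disk contract, and PACKED is what makes them hold regardless of the host ABI. The same idea checked standalone with offsetof() on a hypothetical packed record shaped like reiser4_unix_stat:

	#include <stdio.h>
	#include <stddef.h>
	#include <stdint.h>

	/* hypothetical packed on-disk record mirroring reiser4_unix_stat's shape */
	struct unix_stat_like {
		uint32_t uid;    /* offset 0 */
		uint32_t gid;    /* offset 4 */
		uint32_t atime;  /* offset 8 */
		uint32_t mtime;  /* offset 12 */
		uint32_t ctime;  /* offset 16 */
		uint64_t bytes;  /* offset 20 */
	} __attribute__((packed));

	int main(void)
	{
		/* without packing, 'bytes' would be aligned up to offset 24 */
		printf("bytes at %zu, total %zu\n",
		       offsetof(struct unix_stat_like, bytes),
		       sizeof(struct unix_stat_like));	/* 20, 28 */
		return 0;
	}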
54718 +
54719 +/* this structure is filled by sd_item_stat */
54720 +typedef struct sd_stat {
54721 + int dirs;
54722 + int files;
54723 + int others;
54724 +} sd_stat;
54725 +
54726 +/* plugin->item.common.* */
54727 +extern void print_sd(const char *prefix, coord_t * coord);
54728 +extern void item_stat_static_sd(const coord_t * coord, void *vp);
54729 +
54730 +/* plugin->item.s.sd.* */
54731 +extern int init_inode_static_sd(struct inode *inode, char *sd, int len);
54732 +extern int save_len_static_sd(struct inode *inode);
54733 +extern int save_static_sd(struct inode *inode, char **area);
54734 +
54735 +/* __FS_REISER4_PLUGIN_ITEM_STATIC_STAT_H__ */
54736 +#endif
54737 +
54738 +/* Make Linus happy.
54739 + Local variables:
54740 + c-indentation-style: "K&R"
54741 + mode-name: "LC"
54742 + c-basic-offset: 8
54743 + tab-width: 8
54744 + fill-column: 120
54745 + End:
54746 +*/
54747 diff --git a/fs/reiser4/plugin/item/tail.c b/fs/reiser4/plugin/item/tail.c
54748 new file mode 100644
54749 index 0000000..281dd36
54750 --- /dev/null
54751 +++ b/fs/reiser4/plugin/item/tail.c
54752 @@ -0,0 +1,812 @@
54753 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
54754 +
54755 +#include "item.h"
54756 +#include "../../inode.h"
54757 +#include "../../page_cache.h"
54758 +#include "../../carry.h"
54759 +#include "../../vfs_ops.h"
54760 +
54761 +#include <linux/quotaops.h>
54762 +#include <asm/uaccess.h>
54763 +#include <linux/swap.h>
54764 +#include <linux/writeback.h>
54765 +
54766 +/* plugin->u.item.b.max_key_inside */
54767 +reiser4_key *max_key_inside_tail(const coord_t *coord, reiser4_key *key)
54768 +{
54769 + item_key_by_coord(coord, key);
54770 + set_key_offset(key, get_key_offset(reiser4_max_key()));
54771 + return key;
54772 +}
54773 +
54774 +/* plugin->u.item.b.can_contain_key */
54775 +int can_contain_key_tail(const coord_t *coord, const reiser4_key *key,
54776 + const reiser4_item_data *data)
54777 +{
54778 + reiser4_key item_key;
54779 +
54780 + if (item_plugin_by_coord(coord) != data->iplug)
54781 + return 0;
54782 +
54783 + item_key_by_coord(coord, &item_key);
54784 + if (get_key_locality(key) != get_key_locality(&item_key) ||
54785 + get_key_objectid(key) != get_key_objectid(&item_key))
54786 + return 0;
54787 +
54788 + return 1;
54789 +}
54790 +
54791 +/* plugin->u.item.b.mergeable
54792 + first item is of tail type */
54793 +/* Audited by: green(2002.06.14) */
54794 +int mergeable_tail(const coord_t *p1, const coord_t *p2)
54795 +{
54796 + reiser4_key key1, key2;
54797 +
54798 + assert("vs-535", plugin_of_group(item_plugin_by_coord(p1),
54799 + UNIX_FILE_METADATA_ITEM_TYPE));
54800 + assert("vs-365", item_id_by_coord(p1) == FORMATTING_ID);
54801 +
54802 + if (item_id_by_coord(p2) != FORMATTING_ID) {
54803 + /* second item is of another type */
54804 + return 0;
54805 + }
54806 +
54807 + item_key_by_coord(p1, &key1);
54808 + item_key_by_coord(p2, &key2);
54809 + if (get_key_locality(&key1) != get_key_locality(&key2) ||
54810 + get_key_objectid(&key1) != get_key_objectid(&key2)
54811 + || get_key_type(&key1) != get_key_type(&key2)) {
54812 + /* items of different objects */
54813 + return 0;
54814 + }
54815 + if (get_key_offset(&key1) + nr_units_tail(p1) != get_key_offset(&key2)) {
54816 + /* not adjacent items */
54817 + return 0;
54818 + }
54819 + return 1;
54820 +}
54821 +
54822 +/* plugin->u.item.b.print
54823 + plugin->u.item.b.check */
54824 +
54825 +/* plugin->u.item.b.nr_units */
54826 +pos_in_node_t nr_units_tail(const coord_t * coord)
54827 +{
54828 + return item_length_by_coord(coord);
54829 +}
54830 +
54831 +/* plugin->u.item.b.lookup */
54832 +lookup_result
54833 +lookup_tail(const reiser4_key * key, lookup_bias bias, coord_t * coord)
54834 +{
54835 + reiser4_key item_key;
54836 + __u64 lookuped, offset;
54837 + unsigned nr_units;
54838 +
54839 +	item_key_by_coord(coord, &item_key);
54840 +	offset = get_key_offset(&item_key);
54841 + nr_units = nr_units_tail(coord);
54842 +
54843 + /* key we are looking for must be greater than key of item @coord */
54844 + assert("vs-416", keygt(key, &item_key));
54845 +
54846 + /* offset we are looking for */
54847 + lookuped = get_key_offset(key);
54848 +
54849 + if (lookuped >= offset && lookuped < offset + nr_units) {
54850 + /* byte we are looking for is in this item */
54851 + coord->unit_pos = lookuped - offset;
54852 + coord->between = AT_UNIT;
54853 + return CBK_COORD_FOUND;
54854 + }
54855 +
54856 + /* set coord after last unit */
54857 + coord->unit_pos = nr_units - 1;
54858 + coord->between = AFTER_UNIT;
54859 + return bias ==
54860 + FIND_MAX_NOT_MORE_THAN ? CBK_COORD_FOUND : CBK_COORD_NOTFOUND;
54861 +}
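The lookup above is pure offset arithmetic: a tail item with key offset off and n units covers file bytes [off, off + n), one byte per unit. The same decision as a standalone worked example (plain integers instead of coords and keys):

	#include <stdio.h>

	enum bias { FIND_EXACT, FIND_MAX_NOT_MORE_THAN };

	/* returns unit position, or -1 for "not in this item" under FIND_EXACT */
	static long lookup(unsigned long off, unsigned n, unsigned long want,
			   enum bias bias)
	{
		if (want >= off && want < off + n)
			return (long)(want - off);	/* AT_UNIT */
		/* coord would be left AFTER_UNIT n-1 */
		return bias == FIND_MAX_NOT_MORE_THAN ? (long)(n - 1) : -1;
	}

	int main(void)
	{
		/* item covers bytes [100, 150) */
		printf("%ld\n", lookup(100, 50, 120, FIND_EXACT));		/* 20 */
		printf("%ld\n", lookup(100, 50, 200, FIND_MAX_NOT_MORE_THAN));	/* 49 */
		printf("%ld\n", lookup(100, 50, 200, FIND_EXACT));		/* -1 */
		return 0;
	}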
54862 +
54863 +/* plugin->u.item.b.paste */
54864 +int
54865 +paste_tail(coord_t *coord, reiser4_item_data *data,
54866 + carry_plugin_info *info UNUSED_ARG)
54867 +{
54868 + unsigned old_item_length;
54869 + char *item;
54870 +
54871 +	/* length the item had before the resize was performed */
54872 + old_item_length = item_length_by_coord(coord) - data->length;
54873 +
54874 + /* tail items never get pasted in the middle */
54875 + assert("vs-363",
54876 + (coord->unit_pos == 0 && coord->between == BEFORE_UNIT) ||
54877 + (coord->unit_pos == old_item_length - 1 &&
54878 + coord->between == AFTER_UNIT) ||
54879 + (coord->unit_pos == 0 && old_item_length == 0
54880 + && coord->between == AT_UNIT));
54881 +
54882 + item = item_body_by_coord(coord);
54883 + if (coord->unit_pos == 0)
54884 + /* make space for pasted data when pasting at the beginning of
54885 + the item */
54886 + memmove(item + data->length, item, old_item_length);
54887 +
54888 + if (coord->between == AFTER_UNIT)
54889 + coord->unit_pos++;
54890 +
54891 + if (data->data) {
54892 + assert("vs-554", data->user == 0 || data->user == 1);
54893 + if (data->user) {
54894 + assert("nikita-3035", reiser4_schedulable());
54895 + /* copy from user space */
54896 + if (__copy_from_user(item + coord->unit_pos,
54897 + (const char __user *)data->data,
54898 + (unsigned)data->length))
54899 + return RETERR(-EFAULT);
54900 + } else
54901 + /* copy from kernel space */
54902 + memcpy(item + coord->unit_pos, data->data,
54903 + (unsigned)data->length);
54904 + } else {
54905 + memset(item + coord->unit_pos, 0, (unsigned)data->length);
54906 + }
54907 + return 0;
54908 +}
54909 +
54910 +/* plugin->u.item.b.fast_paste */
54911 +
54912 +/* plugin->u.item.b.can_shift
54913 + number of units is returned via return value, number of bytes via @size. For
54914 + tail items they coincide */
54915 +int
54916 +can_shift_tail(unsigned free_space, coord_t * source UNUSED_ARG,
54917 + znode * target UNUSED_ARG, shift_direction direction UNUSED_ARG,
54918 + unsigned *size, unsigned want)
54919 +{
54920 +	/* make sure that we do not want to shift more than we have */
54921 + assert("vs-364", want > 0
54922 + && want <= (unsigned)item_length_by_coord(source));
54923 +
54924 + *size = min(want, free_space);
54925 + return *size;
54926 +}
54927 +
54928 +/* plugin->u.item.b.copy_units */
54929 +void
54930 +copy_units_tail(coord_t * target, coord_t * source,
54931 + unsigned from, unsigned count,
54932 + shift_direction where_is_free_space,
54933 + unsigned free_space UNUSED_ARG)
54934 +{
54935 + /* make sure that item @target is expanded already */
54936 + assert("vs-366", (unsigned)item_length_by_coord(target) >= count);
54937 + assert("vs-370", free_space >= count);
54938 +
54939 + if (where_is_free_space == SHIFT_LEFT) {
54940 + /* append item @target with @count first bytes of @source */
54941 + assert("vs-365", from == 0);
54942 +
54943 + memcpy((char *)item_body_by_coord(target) +
54944 + item_length_by_coord(target) - count,
54945 + (char *)item_body_by_coord(source), count);
54946 + } else {
54947 + /* target item is moved to right already */
54948 + reiser4_key key;
54949 +
54950 + assert("vs-367",
54951 + (unsigned)item_length_by_coord(source) == from + count);
54952 +
54953 + memcpy((char *)item_body_by_coord(target),
54954 + (char *)item_body_by_coord(source) + from, count);
54955 +
54956 + /* new units are inserted before first unit in an item,
54957 + therefore, we have to update item key */
54958 + item_key_by_coord(source, &key);
54959 + set_key_offset(&key, get_key_offset(&key) + from);
54960 +
54961 + node_plugin_by_node(target->node)->update_item_key(target, &key,
54962 + NULL /*info */);
54963 + }
54964 +}
54965 +
54966 +/* plugin->u.item.b.create_hook */
54967 +
54968 +/* item_plugin->b.kill_hook
54969 + this is called when @count units starting from @from-th one are going to be removed
54970 + */
54971 +int
54972 +kill_hook_tail(const coord_t * coord, pos_in_node_t from,
54973 + pos_in_node_t count, struct carry_kill_data *kdata)
54974 +{
54975 + reiser4_key key;
54976 + loff_t start, end;
54977 +
54978 + assert("vs-1577", kdata);
54979 + assert("vs-1579", kdata->inode);
54980 +
54981 + item_key_by_coord(coord, &key);
54982 + start = get_key_offset(&key) + from;
54983 + end = start + count;
54984 + fake_kill_hook_tail(kdata->inode, start, end, kdata->params.truncate);
54985 + return 0;
54986 +}
54987 +
54988 +/* plugin->u.item.b.shift_hook */
54989 +
54990 +/* helper for kill_units_tail and cut_units_tail */
54991 +static int
54992 +do_cut_or_kill(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
54993 + reiser4_key * smallest_removed, reiser4_key * new_first)
54994 +{
54995 + pos_in_node_t count;
54996 +
54997 + /* this method is only called to remove part of item */
54998 + assert("vs-374", (to - from + 1) < item_length_by_coord(coord));
54999 +	/* tail items are never cut from the middle of an item */
55000 + assert("vs-396", ergo(from != 0, to == coord_last_unit_pos(coord)));
55001 + assert("vs-1558", ergo(from == 0, to < coord_last_unit_pos(coord)));
55002 +
55003 + count = to - from + 1;
55004 +
55005 + if (smallest_removed) {
55006 + /* store smallest key removed */
55007 + item_key_by_coord(coord, smallest_removed);
55008 + set_key_offset(smallest_removed,
55009 + get_key_offset(smallest_removed) + from);
55010 + }
55011 + if (new_first) {
55012 + /* head of item is cut */
55013 + assert("vs-1529", from == 0);
55014 +
55015 + item_key_by_coord(coord, new_first);
55016 + set_key_offset(new_first,
55017 + get_key_offset(new_first) + from + count);
55018 + }
55019 +
55020 + if (REISER4_DEBUG)
55021 + memset((char *)item_body_by_coord(coord) + from, 0, count);
55022 + return count;
55023 +}
55024 +
55025 +/* plugin->u.item.b.cut_units */
55026 +int
55027 +cut_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
55028 + struct carry_cut_data *cdata UNUSED_ARG,
55029 + reiser4_key * smallest_removed, reiser4_key * new_first)
55030 +{
55031 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
55032 +}
55033 +
55034 +/* plugin->u.item.b.kill_units */
55035 +int
55036 +kill_units_tail(coord_t * coord, pos_in_node_t from, pos_in_node_t to,
55037 + struct carry_kill_data *kdata, reiser4_key * smallest_removed,
55038 + reiser4_key * new_first)
55039 +{
55040 + kill_hook_tail(coord, from, to - from + 1, kdata);
55041 + return do_cut_or_kill(coord, from, to, smallest_removed, new_first);
55042 +}
55043 +
55044 +/* plugin->u.item.b.unit_key */
55045 +reiser4_key *unit_key_tail(const coord_t * coord, reiser4_key * key)
55046 +{
55047 + assert("vs-375", coord_is_existing_unit(coord));
55048 +
55049 + item_key_by_coord(coord, key);
55050 + set_key_offset(key, (get_key_offset(key) + coord->unit_pos));
55051 +
55052 + return key;
55053 +}
55054 +
55055 +/* plugin->u.item.b.estimate
55056 + plugin->u.item.b.item_data_by_flow */
55057 +
55058 +/* tail readpage function. It is called from readpage_tail(). */
55059 +static int do_readpage_tail(uf_coord_t *uf_coord, struct page *page)
55060 +{
55061 + tap_t tap;
55062 + int result;
55063 + coord_t coord;
55064 + lock_handle lh;
55065 + int count, mapped;
55066 + struct inode *inode;
55067 + char *pagedata;
55068 +
55069 +	/* save the passed coord so that the tap does not move it */
55070 + init_lh(&lh);
55071 + copy_lh(&lh, uf_coord->lh);
55072 + inode = page->mapping->host;
55073 + coord_dup(&coord, &uf_coord->coord);
55074 +
55075 + reiser4_tap_init(&tap, &coord, &lh, ZNODE_READ_LOCK);
55076 +
55077 + if ((result = reiser4_tap_load(&tap)))
55078 + goto out_tap_done;
55079 +
55080 + /* lookup until page is filled up. */
55081 + for (mapped = 0; mapped < PAGE_CACHE_SIZE; ) {
55082 + /* number of bytes to be copied to page */
55083 + count = item_length_by_coord(&coord) - coord.unit_pos;
55084 + if (count > PAGE_CACHE_SIZE - mapped)
55085 + count = PAGE_CACHE_SIZE - mapped;
55086 +
55087 + /* attach @page to address space and get data address */
55088 + pagedata = kmap_atomic(page, KM_USER0);
55089 +
55090 + /* copy tail item to page */
55091 + memcpy(pagedata + mapped,
55092 + ((char *)item_body_by_coord(&coord) + coord.unit_pos),
55093 + count);
55094 + mapped += count;
55095 +
55096 + flush_dcache_page(page);
55097 +
55098 +		/* detach page from address space */
55099 + kunmap_atomic(pagedata, KM_USER0);
55100 +
55101 + /* Getting next tail item. */
55102 + if (mapped < PAGE_CACHE_SIZE) {
55103 + /*
55104 +			 * unlock page in order to avoid keeping it locked
55105 + * during tree lookup, which takes long term locks
55106 + */
55107 + unlock_page(page);
55108 +
55109 + /* getting right neighbour. */
55110 + result = go_dir_el(&tap, RIGHT_SIDE, 0);
55111 +
55112 + /* lock page back */
55113 + lock_page(page);
55114 + if (PageUptodate(page)) {
55115 + /*
55116 + * another thread read the page, we have
55117 + * nothing to do
55118 + */
55119 + result = 0;
55120 + goto out_unlock_page;
55121 + }
55122 +
55123 + if (result) {
55124 + if (result == -E_NO_NEIGHBOR) {
55125 + /*
55126 +					 * right neighbor is not a formatted
55127 + * node
55128 + */
55129 + result = 0;
55130 + goto done;
55131 + } else {
55132 + goto out_tap_relse;
55133 + }
55134 + } else {
55135 + if (!inode_file_plugin(inode)->
55136 + owns_item(inode, &coord)) {
55137 + /* item of another file is found */
55138 + result = 0;
55139 + goto done;
55140 + }
55141 + }
55142 + }
55143 + }
55144 +
55145 + done:
55146 + if (mapped != PAGE_CACHE_SIZE) {
55147 + pagedata = kmap_atomic(page, KM_USER0);
55148 + memset(pagedata + mapped, 0, PAGE_CACHE_SIZE - mapped);
55149 + flush_dcache_page(page);
55150 + kunmap_atomic(pagedata, KM_USER0);
55151 + }
55152 + SetPageUptodate(page);
55153 + out_unlock_page:
55154 + unlock_page(page);
55155 + out_tap_relse:
55156 + reiser4_tap_relse(&tap);
55157 + out_tap_done:
55158 + reiser4_tap_done(&tap);
55159 + return result;
55160 +}
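do_readpage_tail() is a gather loop: copy from successive variable-length tail items until the page is full, then zero whatever no item covers. The same control flow in a standalone sketch, with byte arrays standing in for znodes and the page:

	#include <stdio.h>
	#include <string.h>

	#define PAGE_SZ 16

	/* copy from a list of variable-length "items" into one page buffer,
	 * zero-filling the remainder -- the shape of do_readpage_tail() */
	static void fill_page(char *page, const char **items, const int *lens,
			      int nitems)
	{
		int mapped = 0, i = 0;

		while (mapped < PAGE_SZ && i < nitems) {
			int count = lens[i];

			if (count > PAGE_SZ - mapped)
				count = PAGE_SZ - mapped;
			memcpy(page + mapped, items[i], count);
			mapped += count;
			i++;			/* "go to right neighbour" */
		}
		if (mapped < PAGE_SZ)		/* hole past the last item */
			memset(page + mapped, 0, PAGE_SZ - mapped);
	}

	int main(void)
	{
		const char *items[] = { "hello ", "tails" };
		const int lens[] = { 6, 5 };
		char page[PAGE_SZ + 1] = { 0 };

		fill_page(page, items, lens, 2);
		printf("\"%s\"\n", page);	/* "hello tails" plus zeroes */
		return 0;
	}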
55161 +
55162 +/*
55163 + plugin->s.file.readpage
55164 + reiser4_read->unix_file_read->page_cache_readahead->reiser4_readpage->unix_file_readpage->readpage_tail
55165 + or
55166 +   filemap_nopage->reiser4_readpage->readpage_unix_file->readpage_tail
55167 +
55168 + At the beginning: coord->node is read locked, zloaded, page is locked, coord is set to existing unit inside of tail
55169 + item. */
55170 +int readpage_tail(void *vp, struct page *page)
55171 +{
55172 + uf_coord_t *uf_coord = vp;
55173 + ON_DEBUG(coord_t * coord = &uf_coord->coord);
55174 + ON_DEBUG(reiser4_key key);
55175 +
55176 + assert("umka-2515", PageLocked(page));
55177 + assert("umka-2516", !PageUptodate(page));
55178 + assert("umka-2517", !jprivate(page) && !PagePrivate(page));
55179 + assert("umka-2518", page->mapping && page->mapping->host);
55180 +
55181 + assert("umka-2519", znode_is_loaded(coord->node));
55182 + assert("umka-2520", item_is_tail(coord));
55183 + assert("umka-2521", coord_is_existing_unit(coord));
55184 + assert("umka-2522", znode_is_rlocked(coord->node));
55185 + assert("umka-2523",
55186 + page->mapping->host->i_ino ==
55187 + get_key_objectid(item_key_by_coord(coord, &key)));
55188 +
55189 + return do_readpage_tail(uf_coord, page);
55190 +}
55191 +
55192 +/**
55193 + * overwrite_tail
55194 + * @flow: flow carrying the user data to write
55195 + * @coord: coord of the existing tail unit to overwrite
55196 + *
55197 + * Overwrites a tail item or a part of it with user data. Returns the
55198 + * number of bytes written or an error code.
55199 + */
55200 +static int overwrite_tail(flow_t *flow, coord_t *coord)
55201 +{
55202 + unsigned count;
55203 +
55204 + assert("vs-570", flow->user == 1);
55205 + assert("vs-946", flow->data);
55206 + assert("vs-947", coord_is_existing_unit(coord));
55207 + assert("vs-948", znode_is_write_locked(coord->node));
55208 + assert("nikita-3036", reiser4_schedulable());
55209 +
55210 + count = item_length_by_coord(coord) - coord->unit_pos;
55211 + if (count > flow->length)
55212 + count = flow->length;
55213 +
55214 + if (__copy_from_user((char *)item_body_by_coord(coord) + coord->unit_pos,
55215 + (const char __user *)flow->data, count))
55216 + return RETERR(-EFAULT);
55217 +
55218 + znode_make_dirty(coord->node);
55219 + return count;
55220 +}
55221 +
55222 +/**
55223 + * insert_first_tail
55224 + * @inode: inode of the file being written to
55225 + * @flow: flow carrying the user data (or describing a hole)
55226 + * @coord: coord to insert the first item at
55227 + * @lh: lock handle of the insertion point
55228 + *
55229 + * Returns number of bytes written or error code.
55230 + */
55231 +static ssize_t insert_first_tail(struct inode *inode, flow_t *flow,
55232 + coord_t *coord, lock_handle *lh)
55233 +{
55234 + int result;
55235 + loff_t to_write;
55236 + unix_file_info_t *uf_info;
55237 +
55238 + if (get_key_offset(&flow->key) != 0) {
55239 + /*
55240 +		 * file is empty and the write does not start at the beginning
55241 +		 * of the file. Create a hole at the beginning of the file. On
55242 +		 * success insert_flow returns 0 as the number of written bytes,
55243 +		 * which is what we have to return when padding a file with holes
55244 + */
55245 + flow->data = NULL;
55246 + flow->length = get_key_offset(&flow->key);
55247 + set_key_offset(&flow->key, 0);
55248 + /*
55249 +		 * holes in files built of tails are stored just as if they
55250 +		 * were real data consisting of all zeroes. Therefore we have
55251 +		 * to allocate quota here as well
55252 + */
55253 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55254 + return RETERR(-EDQUOT);
55255 + result = reiser4_insert_flow(coord, lh, flow);
55256 + if (flow->length)
55257 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55258 +
55259 + uf_info = unix_file_inode_data(inode);
55260 +
55261 + /*
55262 + * first item insertion is only possible when writing to empty
55263 + * file or performing tail conversion
55264 + */
55265 + assert("", (uf_info->container == UF_CONTAINER_EMPTY ||
55266 + (reiser4_inode_get_flag(inode,
55267 + REISER4_PART_MIXED) &&
55268 + reiser4_inode_get_flag(inode,
55269 + REISER4_PART_IN_CONV))));
55270 + /* if file was empty - update its state */
55271 + if (result == 0 && uf_info->container == UF_CONTAINER_EMPTY)
55272 + uf_info->container = UF_CONTAINER_TAILS;
55273 + return result;
55274 + }
55275 +
55276 + /* check quota before appending data */
55277 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55278 + return RETERR(-EDQUOT);
55279 +
55280 + to_write = flow->length;
55281 + result = reiser4_insert_flow(coord, lh, flow);
55282 + if (flow->length)
55283 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55284 + return (to_write - flow->length) ? (to_write - flow->length) : result;
55285 +}
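As the first branch above shows, writing at offset N into an empty tail file first materializes a hole: the flow is retargeted to key offset 0 with data == NULL and length N, so N explicit zero bytes get inserted (which is also why quota is charged for them). That retargeting in isolation, with a hypothetical flow struct:

	#include <stdio.h>
	#include <stddef.h>

	struct flow {
		const char *data;	/* NULL means "insert zeroes" */
		unsigned long length;
		unsigned long key_off;	/* file offset the flow starts at */
	};

	/* turn a write at offset @pos of an empty file into a hole flow,
	 * mirroring insert_first_tail()'s first branch */
	static void make_hole_flow(struct flow *f, unsigned long pos)
	{
		f->data = NULL;
		f->length = pos;	/* pad bytes [0, pos) with zeroes */
		f->key_off = 0;
	}

	int main(void)
	{
		struct flow f;

		make_hole_flow(&f, 4096);
		printf("insert %lu zero bytes at offset %lu\n",
		       f.length, f.key_off);
		return 0;
	}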
55286 +
55287 +/**
55288 + * append_tail
55289 + * @inode: inode of the file being appended to
55290 + * @flow: flow carrying the user data (or describing a hole)
55291 + * @coord: coord positioned at/after the last unit of the file
55292 + * @lh: lock handle of the insertion point
55293 + *
55294 + * Returns number of bytes written or error code.
55295 + */
55296 +static ssize_t append_tail(struct inode *inode,
55297 + flow_t *flow, coord_t *coord, lock_handle *lh)
55298 +{
55299 + int result;
55300 + reiser4_key append_key;
55301 + loff_t to_write;
55302 +
55303 + if (!keyeq(&flow->key, append_key_tail(coord, &append_key))) {
55304 + flow->data = NULL;
55305 + flow->length = get_key_offset(&flow->key) - get_key_offset(&append_key);
55306 + set_key_offset(&flow->key, get_key_offset(&append_key));
55307 + /*
55308 +		 * holes in files built of tails are stored just as if they
55309 +		 * were real data consisting of all zeroes. Therefore we have
55310 +		 * to allocate quota here as well
55311 + */
55312 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55313 + return RETERR(-EDQUOT);
55314 + result = reiser4_insert_flow(coord, lh, flow);
55315 + if (flow->length)
55316 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55317 + return result;
55318 + }
55319 +
55320 + /* check quota before appending data */
55321 + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, flow->length))
55322 + return RETERR(-EDQUOT);
55323 +
55324 + to_write = flow->length;
55325 + result = reiser4_insert_flow(coord, lh, flow);
55326 + if (flow->length)
55327 + DQUOT_FREE_SPACE_NODIRTY(inode, flow->length);
55328 + return (to_write - flow->length) ? (to_write - flow->length) : result;
55329 +}
55330 +
55331 +/**
55332 + * write_extent_reserve_space - reserve space for tail write operation
55333 + * @inode: inode of the file being written to
55334 + *
55335 + * Estimates and reserves space which may be required for writing one flow to a
55336 + * file
55337 + */
55338 +static int write_extent_reserve_space(struct inode *inode)
55339 +{
55340 + __u64 count;
55341 + reiser4_tree *tree;
55342 +
55343 + /*
55344 + * to write one flow to a file by tails we have to reserve disk space for:
55345 +
55346 +	 * 1. find_file_item may have to insert an empty node into the tree (an
55347 +	 * empty leaf node between two extent items). This requires 1 block and the
55348 + * number of blocks which are necessary to perform insertion of an
55349 + * internal item into twig level.
55350 + *
55351 + * 2. flow insertion
55352 + *
55353 + * 3. stat data update
55354 + */
55355 + tree = reiser4_tree_by_inode(inode);
55356 + count = estimate_one_insert_item(tree) +
55357 + estimate_insert_flow(tree->height) +
55358 + estimate_one_insert_item(tree);
55359 + grab_space_enable();
55360 + return reiser4_grab_space(count, 0 /* flags */);
55361 +}
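The reservation is just the sum of the three worst cases enumerated in the comment. A standalone illustration with made-up per-operation estimates; the real estimate_one_insert_item()/estimate_insert_flow() depend on tree height and node fanout:

	#include <stdio.h>

	/* hypothetical stand-ins for reiser4's estimate_* helpers */
	static unsigned long estimate_one_insert_item(unsigned height)
	{
		return 1 + height;	/* new node + internal-item inserts */
	}

	static unsigned long estimate_insert_flow(unsigned height)
	{
		return 4 * (1 + height); /* several leaves may be created */
	}

	int main(void)
	{
		unsigned height = 4;
		unsigned long count = estimate_one_insert_item(height) /* 1. */
				    + estimate_insert_flow(height)     /* 2. */
				    + estimate_one_insert_item(height);/* 3. */

		printf("grab %lu blocks before writing\n", count);
		return 0;
	}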
55362 +
55363 +#define PAGE_PER_FLOW 4
55364 +
55365 +static loff_t faultin_user_pages(const char __user *buf, size_t count)
55366 +{
55367 + loff_t faulted;
55368 + int to_fault;
55369 +
55370 + if (count > PAGE_PER_FLOW * PAGE_CACHE_SIZE)
55371 + count = PAGE_PER_FLOW * PAGE_CACHE_SIZE;
55372 + faulted = 0;
55373 + while (count > 0) {
55374 + to_fault = PAGE_CACHE_SIZE;
55375 + if (count < to_fault)
55376 + to_fault = count;
55377 + fault_in_pages_readable(buf + faulted, to_fault);
55378 + count -= to_fault;
55379 + faulted += to_fault;
55380 + }
55381 + return faulted;
55382 +}
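faultin_user_pages() caps one flow at PAGE_PER_FLOW pages and touches the buffer page by page, so no major fault can happen later while long-term tree locks are held. The chunking arithmetic on its own (the fault call itself is elided):

	#include <stdio.h>

	#define PAGE_SZ 4096
	#define PAGE_PER_FLOW 4

	static long chunked(long count)
	{
		long faulted = 0;

		if (count > PAGE_PER_FLOW * PAGE_SZ)
			count = PAGE_PER_FLOW * PAGE_SZ;	/* cap the flow */
		while (count > 0) {
			long chunk = count < PAGE_SZ ? count : PAGE_SZ;

			/* fault_in_pages_readable(buf + faulted, chunk) here */
			count -= chunk;
			faulted += chunk;
		}
		return faulted;
	}

	int main(void)
	{
		printf("%ld\n", chunked(10000));	/* 10000: under the cap */
		printf("%ld\n", chunked(50000));	/* 16384: capped at 4 pages */
		return 0;
	}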
55383 +
55384 +/**
55385 + * reiser4_write_tail - write method of tail item plugin
55386 + * @file: file to write to
55387 + * @buf: address of user-space buffer
55388 + * @count: number of bytes to write
55389 + * @pos: position in file to write to
55390 + *
55391 + * Returns number of written bytes or error code.
55392 + */
55393 +ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
55394 + size_t count, loff_t *pos)
55395 +{
55396 + struct inode *inode;
55397 + struct hint hint;
55398 + int result;
55399 + flow_t flow;
55400 + coord_t *coord;
55401 + lock_handle *lh;
55402 + znode *loaded;
55403 +
55404 + inode = file->f_dentry->d_inode;
55405 +
55406 + if (write_extent_reserve_space(inode))
55407 + return RETERR(-ENOSPC);
55408 +
55409 + result = load_file_hint(file, &hint);
55410 + BUG_ON(result != 0);
55411 +
55412 + flow.length = faultin_user_pages(buf, count);
55413 + flow.user = 1;
55414 + memcpy(&flow.data, &buf, sizeof(buf));
55415 + flow.op = WRITE_OP;
55416 + key_by_inode_and_offset_common(inode, *pos, &flow.key);
55417 +
55418 + result = find_file_item(&hint, &flow.key, ZNODE_WRITE_LOCK, inode);
55419 + if (IS_CBKERR(result))
55420 + return result;
55421 +
55422 + coord = &hint.ext_coord.coord;
55423 + lh = hint.ext_coord.lh;
55424 +
55425 + result = zload(coord->node);
55426 + BUG_ON(result != 0);
55427 + loaded = coord->node;
55428 +
55429 + if (coord->between == AFTER_UNIT) {
55430 + /* append with data or hole */
55431 + result = append_tail(inode, &flow, coord, lh);
55432 + } else if (coord->between == AT_UNIT) {
55433 + /* overwrite */
55434 + result = overwrite_tail(&flow, coord);
55435 + } else {
55436 + /* no items of this file yet. insert data or hole */
55437 + result = insert_first_tail(inode, &flow, coord, lh);
55438 + }
55439 + zrelse(loaded);
55440 + if (result < 0) {
55441 + done_lh(lh);
55442 + return result;
55443 + }
55444 +
55445 +	/* with valid forced to 0, the branch below always unsets the hint */
55446 +	hint.ext_coord.valid = 0;
55447 + if (hint.ext_coord.valid)
55448 + reiser4_set_hint(&hint, &flow.key, ZNODE_WRITE_LOCK);
55449 + else
55450 + reiser4_unset_hint(&hint);
55451 +
55452 + save_file_hint(file, &hint);
55453 + return result;
55454 +}
55455 +
55456 +#if REISER4_DEBUG
55457 +
55458 +static int
55459 +coord_matches_key_tail(const coord_t * coord, const reiser4_key * key)
55460 +{
55461 + reiser4_key item_key;
55462 +
55463 + assert("vs-1356", coord_is_existing_unit(coord));
55464 + assert("vs-1354", keylt(key, append_key_tail(coord, &item_key)));
55465 + assert("vs-1355", keyge(key, item_key_by_coord(coord, &item_key)));
55466 + return get_key_offset(key) ==
55467 + get_key_offset(&item_key) + coord->unit_pos;
55468 +
55469 +}
55470 +
55471 +#endif
55472 +
55473 +/* plugin->u.item.s.file.read */
55474 +int reiser4_read_tail(struct file *file UNUSED_ARG, flow_t *f, hint_t *hint)
55475 +{
55476 + unsigned count;
55477 + int item_length;
55478 + coord_t *coord;
55479 + uf_coord_t *uf_coord;
55480 +
55481 + uf_coord = &hint->ext_coord;
55482 + coord = &uf_coord->coord;
55483 +
55484 + assert("vs-571", f->user == 1);
55485 + assert("vs-571", f->data);
55486 + assert("vs-967", coord && coord->node);
55487 + assert("vs-1117", znode_is_rlocked(coord->node));
55488 + assert("vs-1118", znode_is_loaded(coord->node));
55489 +
55490 + assert("nikita-3037", reiser4_schedulable());
55491 + assert("vs-1357", coord_matches_key_tail(coord, &f->key));
55492 +
55493 + /* calculate number of bytes to read off the item */
55494 + item_length = item_length_by_coord(coord);
55495 +	count = item_length - coord->unit_pos;
55496 + if (count > f->length)
55497 + count = f->length;
55498 +
55499 + /* user page has to be brought in so that major page fault does not
55500 +	 * occur here while a long-term lock is held */
55501 + if (__copy_to_user((char __user *)f->data,
55502 + ((char *)item_body_by_coord(coord) + coord->unit_pos),
55503 + count))
55504 + return RETERR(-EFAULT);
55505 +
55506 + /* probably mark_page_accessed() should only be called if
55507 + * coord->unit_pos is zero. */
55508 + mark_page_accessed(znode_page(coord->node));
55509 + move_flow_forward(f, count);
55510 +
55511 + coord->unit_pos += count;
55512 + if (item_length == coord->unit_pos) {
55513 + coord->unit_pos--;
55514 + coord->between = AFTER_UNIT;
55515 + }
55516 +
55517 + return 0;
55518 +}
55519 +
55520 +/*
55521 + plugin->u.item.s.file.append_key
55522 +   key of the first byte right after the last byte addressed by this item
55523 +*/
55524 +reiser4_key *append_key_tail(const coord_t * coord, reiser4_key * key)
55525 +{
55526 + item_key_by_coord(coord, key);
55527 + set_key_offset(key, get_key_offset(key) + item_length_by_coord(coord));
55528 + return key;
55529 +}
55530 +
55531 +/* plugin->u.item.s.file.init_coord_extension */
55532 +void init_coord_extension_tail(uf_coord_t * uf_coord, loff_t lookuped)
55533 +{
55534 + uf_coord->valid = 1;
55535 +}
55536 +
55537 +/*
55538 + plugin->u.item.s.file.get_block
55539 +*/
55540 +int
55541 +get_block_address_tail(const coord_t * coord, sector_t lblock, sector_t * block)
55542 +{
55543 + assert("nikita-3252", znode_get_level(coord->node) == LEAF_LEVEL);
55544 +
55545 + if (reiser4_blocknr_is_fake(znode_get_block(coord->node)))
55546 +		/* if the node hasn't obtained its block number yet, return 0.
55547 +		 * Let's avoid upsetting users with some cosmic numbers beyond
55548 +		 * the device capacity. */
55549 + *block = 0;
55550 + else
55551 + *block = *znode_get_block(coord->node);
55552 + return 0;
55553 +}
55554 +
55555 +/*
55556 + * Local variables:
55557 + * c-indentation-style: "K&R"
55558 + * mode-name: "LC"
55559 + * c-basic-offset: 8
55560 + * tab-width: 8
55561 + * fill-column: 79
55562 + * scroll-step: 1
55563 + * End:
55564 + */
55565 diff --git a/fs/reiser4/plugin/item/tail.h b/fs/reiser4/plugin/item/tail.h
55566 new file mode 100644
55567 index 0000000..459fa27
55568 --- /dev/null
55569 +++ b/fs/reiser4/plugin/item/tail.h
55570 @@ -0,0 +1,58 @@
55571 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55572 +
55573 +#if !defined( __REISER4_TAIL_H__ )
55574 +#define __REISER4_TAIL_H__
55575 +
55576 +typedef struct {
55577 + int not_used;
55578 +} tail_coord_extension_t;
55579 +
55580 +struct cut_list;
55581 +
55582 +/* plugin->u.item.b.* */
55583 +reiser4_key *max_key_inside_tail(const coord_t *, reiser4_key *);
55584 +int can_contain_key_tail(const coord_t * coord, const reiser4_key * key,
55585 + const reiser4_item_data *);
55586 +int mergeable_tail(const coord_t * p1, const coord_t * p2);
55587 +pos_in_node_t nr_units_tail(const coord_t *);
55588 +lookup_result lookup_tail(const reiser4_key *, lookup_bias, coord_t *);
55589 +int paste_tail(coord_t *, reiser4_item_data *, carry_plugin_info *);
55590 +int can_shift_tail(unsigned free_space, coord_t * source,
55591 + znode * target, shift_direction, unsigned *size,
55592 + unsigned want);
55593 +void copy_units_tail(coord_t * target, coord_t * source, unsigned from,
55594 + unsigned count, shift_direction, unsigned free_space);
55595 +int kill_hook_tail(const coord_t *, pos_in_node_t from, pos_in_node_t count,
55596 + struct carry_kill_data *);
55597 +int cut_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55598 + struct carry_cut_data *, reiser4_key * smallest_removed,
55599 + reiser4_key * new_first);
55600 +int kill_units_tail(coord_t *, pos_in_node_t from, pos_in_node_t to,
55601 + struct carry_kill_data *, reiser4_key * smallest_removed,
55602 + reiser4_key * new_first);
55603 +reiser4_key *unit_key_tail(const coord_t *, reiser4_key *);
55604 +
55605 +/* plugin->u.item.s.* */
55606 +ssize_t reiser4_write_tail(struct file *file, const char __user *buf,
55607 + size_t count, loff_t *pos);
55608 +int reiser4_read_tail(struct file *, flow_t *, hint_t *);
55609 +int readpage_tail(void *vp, struct page *page);
55610 +reiser4_key *append_key_tail(const coord_t *, reiser4_key *);
55611 +void init_coord_extension_tail(uf_coord_t *, loff_t offset);
55612 +int get_block_address_tail(const coord_t *, sector_t, sector_t *);
55613 +int item_balance_dirty_pages(struct address_space *, const flow_t *,
55614 + hint_t *, int back_to_dirty, int set_hint);
55615 +
55616 +/* __REISER4_TAIL_H__ */
55617 +#endif
55618 +
55619 +/* Make Linus happy.
55620 + Local variables:
55621 + c-indentation-style: "K&R"
55622 + mode-name: "LC"
55623 + c-basic-offset: 8
55624 + tab-width: 8
55625 + fill-column: 120
55626 + scroll-step: 1
55627 + End:
55628 +*/
55629 diff --git a/fs/reiser4/plugin/node/Makefile b/fs/reiser4/plugin/node/Makefile
55630 new file mode 100644
55631 index 0000000..9400627
55632 --- /dev/null
55633 +++ b/fs/reiser4/plugin/node/Makefile
55634 @@ -0,0 +1,5 @@
55635 +obj-$(CONFIG_REISER4_FS) += node_plugins.o
55636 +
55637 +node_plugins-objs := \
55638 + node.o \
55639 + node40.o
55640 diff --git a/fs/reiser4/plugin/node/node.c b/fs/reiser4/plugin/node/node.c
55641 new file mode 100644
55642 index 0000000..179a4a7
55643 --- /dev/null
55644 +++ b/fs/reiser4/plugin/node/node.c
55645 @@ -0,0 +1,131 @@
55646 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55647 +
55648 +/* Node plugin interface.
55649 +
55650 + Description: The tree provides the abstraction of flows, which it
55651 + internally fragments into items which it stores in nodes.
55652 +
55653 + A key_atom is a piece of data bound to a single key.
55654 +
55655 + For reasonable space efficiency to be achieved it is often
55656 + necessary to store key_atoms in the nodes in the form of items, where
55657 + an item is a sequence of key_atoms of the same or similar type. It is
55658 + more space-efficient, because the item can implement (very)
55659 + efficient compression of key_atom's bodies using internal knowledge
55660 + about their semantics, and it can often avoid having a key for each
55661 + key_atom. Each type of item has specific operations implemented by its
55662 + item handler (see balance.c).
55663 +
55664 + Rationale: the rest of the code (specifically balancing routines)
55665 + accesses leaf level nodes through this interface. This way we can
55666 + implement various block layouts and even combine various layouts
55667 + within the same tree. Balancing/allocating algorithms should not
55668 + care about peculiarities of splitting/merging specific item types,
55669 + but rather should leave that to the item's item handler.
55670 +
55671 + Items, including those that provide the abstraction of flows, have
55672 + the property that if you move them in part or in whole to another
55673 + node, the balancing code invokes their is_left_mergeable()
55674 + item_operation to determine if they are mergeable with their new
55675 + neighbor in the node you have moved them to. For some items the
55676 +   is_left_mergeable() function always returns false.
55677 +
55678 + When moving the bodies of items from one node to another:
55679 +
55680 + if a partial item is shifted to another node the balancing code invokes
55681 + an item handler method to handle the item splitting.
55682 +
55683 + if the balancing code needs to merge with an item in the node it
55684 + is shifting to, it will invoke an item handler method to handle
55685 + the item merging.
55686 +
55687 +   if it needs to move whole item bodies unchanged, the balancing code uses xmemcpy(),
55688 +   adjusting the item headers after the move via the node handler.
55689 +*/
55690 +
55691 +#include "../../forward.h"
55692 +#include "../../debug.h"
55693 +#include "../../key.h"
55694 +#include "../../coord.h"
55695 +#include "../plugin_header.h"
55696 +#include "../item/item.h"
55697 +#include "node.h"
55698 +#include "../plugin.h"
55699 +#include "../../znode.h"
55700 +#include "../../tree.h"
55701 +#include "../../super.h"
55702 +#include "../../reiser4.h"
55703 +
55704 +/**
55705 + * leftmost_key_in_node - get the smallest key in node
55706 + * @node:
55707 + * @key: store result here
55708 + *
55709 + * Stores the leftmost key of @node in @key.
55710 + */
55711 +reiser4_key *leftmost_key_in_node(const znode *node, reiser4_key *key)
55712 +{
55713 + assert("nikita-1634", node != NULL);
55714 + assert("nikita-1635", key != NULL);
55715 +
55716 + if (!node_is_empty(node)) {
55717 + coord_t first_item;
55718 +
55719 + coord_init_first_unit(&first_item, (znode *) node);
55720 + item_key_by_coord(&first_item, key);
55721 + } else
55722 + *key = *reiser4_max_key();
55723 + return key;
55724 +}
55725 +
55726 +node_plugin node_plugins[LAST_NODE_ID] = {
55727 + [NODE40_ID] = {
55728 + .h = {
55729 + .type_id = REISER4_NODE_PLUGIN_TYPE,
55730 + .id = NODE40_ID,
55731 + .pops = NULL,
55732 + .label = "unified",
55733 + .desc = "unified node layout",
55734 + .linkage = {NULL, NULL}
55735 + },
55736 + .item_overhead = item_overhead_node40,
55737 + .free_space = free_space_node40,
55738 + .lookup = lookup_node40,
55739 + .num_of_items = num_of_items_node40,
55740 + .item_by_coord = item_by_coord_node40,
55741 + .length_by_coord = length_by_coord_node40,
55742 + .plugin_by_coord = plugin_by_coord_node40,
55743 + .key_at = key_at_node40,
55744 + .estimate = estimate_node40,
55745 + .check = check_node40,
55746 + .parse = parse_node40,
55747 + .init = init_node40,
55748 +#ifdef GUESS_EXISTS
55749 + .guess = guess_node40,
55750 +#endif
55751 + .change_item_size = change_item_size_node40,
55752 + .create_item = create_item_node40,
55753 + .update_item_key = update_item_key_node40,
55754 + .cut_and_kill = kill_node40,
55755 + .cut = cut_node40,
55756 + .shift = shift_node40,
55757 + .shrink_item = shrink_item_node40,
55758 + .fast_insert = fast_insert_node40,
55759 + .fast_paste = fast_paste_node40,
55760 + .fast_cut = fast_cut_node40,
55761 + .max_item_size = max_item_size_node40,
55762 + .prepare_removal = prepare_removal_node40,
55763 + .set_item_plugin = set_item_plugin_node40
55764 + }
55765 +};
55766 +
55767 +/*
55768 + Local variables:
55769 + c-indentation-style: "K&R"
55770 + mode-name: "LC"
55771 + c-basic-offset: 8
55772 + tab-width: 8
55773 + fill-column: 120
55774 + scroll-step: 1
55775 + End:
55776 +*/
55777 diff --git a/fs/reiser4/plugin/node/node.h b/fs/reiser4/plugin/node/node.h
55778 new file mode 100644
55779 index 0000000..af0c641
55780 --- /dev/null
55781 +++ b/fs/reiser4/plugin/node/node.h
55782 @@ -0,0 +1,272 @@
55783 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
55784 +
55785 +/* We need a definition of the default node layout here. */
55786 +
55787 +/* Generally speaking, it is best to have free space in the middle of the
55788 + node so that two sets of things can grow towards it, and to have the
55789 + item bodies on the left so that the last one of them grows into free
55790 + space. We optimize for the case where we append new items to the end
55791 + of the node, or grow the last item, because it hurts nothing to so
55792 + optimize and it is a common special case to do massive insertions in
55793 +   increasing key order (and one of the cases where a real user is more
55794 +   likely to notice the delay).
55795 +
55796 + formatted leaf default layout: (leaf1)
55797 +
55798 + |node header:item bodies:free space:key + pluginid + item offset|
55799 +
55800 + We grow towards the middle, optimizing layout for the case where we
55801 + append new items to the end of the node. The node header is fixed
55802 + length. Keys, and item offsets plus pluginids for the items
55803 + corresponding to them are in increasing key order, and are fixed
55804 + length. Item offsets are relative to start of node (16 bits creating
55805 + a node size limit of 64k, 12 bits might be a better choice....). Item
55806 + bodies are in decreasing key order. Item bodies have a variable size.
55807 + There is a one to one to one mapping of keys to item offsets to item
55808 + bodies. Item offsets consist of pointers to the zeroth byte of the
55809 + item body. Item length equals the start of the next item minus the
55810 + start of this item, except the zeroth item whose length equals the end
55811 + of the node minus the start of that item (plus a byte). In other
55812 + words, the item length is not recorded anywhere, and it does not need
55813 + to be since it is computable.
55814 +
55815 + Leaf variable length items and keys layout : (lvar)
55816 +
55817 + |node header:key offset + item offset + pluginid triplets:free space:key bodies:item bodies|
55818 +
55819 + We grow towards the middle, optimizing layout for the case where we
55820 + append new items to the end of the node. The node header is fixed
55821 + length. Keys and item offsets for the items corresponding to them are
55822 + in increasing key order, and keys are variable length. Item offsets
55823 + are relative to start of node (16 bits). Item bodies are in
55824 + decreasing key order. Item bodies have a variable size. There is a
55825 + one to one to one mapping of keys to item offsets to item bodies.
55826 + Item offsets consist of pointers to the zeroth byte of the item body.
55827 + Item length equals the start of the next item's key minus the start of
55828 + this item, except the zeroth item whose length equals the end of the
55829 + node minus the start of that item (plus a byte).
55830 +
55831 + leaf compressed keys layout: (lcomp)
55832 +
55833 + |node header:key offset + key inherit + item offset pairs:free space:key bodies:item bodies|
55834 +
55835 + We grow towards the middle, optimizing layout for the case where we
55836 + append new items to the end of the node. The node header is fixed
55837 + length. Keys and item offsets for the items corresponding to them are
55838 + in increasing key order, and keys are variable length. The "key
55839 + inherit" field indicates how much of the key prefix is identical to
55840 + the previous key (stem compression as described in "Managing
55841 + Gigabytes" is used). key_inherit is a one byte integer. The
55842 + intra-node searches performed through this layout are linear searches,
55843 + and this is theorized to not hurt performance much due to the high
55844 + cost of processor stalls on modern CPUs, and the small number of keys
55845 + in a single node. Item offsets are relative to start of node (16
55846 + bits). Item bodies are in decreasing key order. Item bodies have a
55847 + variable size. There is a one to one to one mapping of keys to item
55848 + offsets to item bodies. Item offsets consist of pointers to the
55849 + zeroth byte of the item body. Item length equals the start of the
55850 + next item minus the start of this item, except the zeroth item whose
55851 + length equals the end of the node minus the start of that item (plus a
55852 + byte). In other words, item length and key length is not recorded
55853 + anywhere, and it does not need to be since it is computable.
55854 +
55855 + internal node default layout: (idef1)
55856 +
55857 +   just like leaf1 except that item bodies are either blocknrs of
55858 + children or extents, and moving them may require updating parent
55859 + pointers in the nodes that they point to.
55860 +*/
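Since item bodies are contiguous and their start offsets are recorded per item, an item's length never needs to be stored: it is the gap between its body start and the closest following body start in address order (or the end of the node). A standalone sketch of that computation with hypothetical 16-bit offsets, as in the unified layout:

	#include <stdio.h>
	#include <stdint.h>

	#define NODE_SIZE 4096

	/* item body start offsets within the node, one per item header;
	 * lengths are never stored because they are computable */
	static int item_length(const uint16_t *start, int nitems, int i)
	{
		int j, next = NODE_SIZE;	/* last-in-address-order case */

		/* the body ends where the closest following body begins */
		for (j = 0; j < nitems; j++)
			if (start[j] > start[i] && start[j] < next)
				next = start[j];
		return next - start[i];
	}

	int main(void)
	{
		uint16_t start[] = { 4000, 3100, 2000 }; /* decreasing key order */

		printf("%d %d %d\n",
		       item_length(start, 3, 0),	/* 96   = 4096 - 4000 */
		       item_length(start, 3, 1),	/* 900  = 4000 - 3100 */
		       item_length(start, 3, 2));	/* 1100 = 3100 - 2000 */
		return 0;
	}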
55861 +
55862 +/* There is an inherent 3-way tradeoff between performance, the ability to
55863 +   exchange disks between different architectures, and code
55864 +   complexity. This format is optimal and simple and inexchangeable.
55865 + Someone else can do the code for exchanging disks and make it
55866 + complex. It would not be that hard. Using other than the PAGE_SIZE
55867 + might be suboptimal.
55868 +*/
55869 +
55870 +#if !defined( __REISER4_NODE_H__ )
55871 +#define __REISER4_NODE_H__
55872 +
55873 +#define LEAF40_NODE_SIZE PAGE_CACHE_SIZE
55874 +
55875 +#include "../../dformat.h"
55876 +#include "../plugin_header.h"
55877 +
55878 +#include <linux/types.h>
55879 +
55880 +typedef enum {
55881 + NS_FOUND = 0,
55882 + NS_NOT_FOUND = -ENOENT
55883 +} node_search_result;
55884 +
55885 +/* Maximal possible space overhead for creation of new item in a node */
55886 +#define REISER4_NODE_MAX_OVERHEAD ( sizeof( reiser4_key ) + 32 )
55887 +
55888 +typedef enum {
55889 + REISER4_NODE_DKEYS = (1 << 0),
55890 + REISER4_NODE_TREE_STABLE = (1 << 1)
55891 +} reiser4_node_check_flag;
55892 +
55893 +/* cut and cut_and_kill have too long a list of parameters. This structure is just to save some space on the stack */
55894 +struct cut_list {
55895 + coord_t *from;
55896 + coord_t *to;
55897 + const reiser4_key *from_key;
55898 + const reiser4_key *to_key;
55899 + reiser4_key *smallest_removed;
55900 + carry_plugin_info *info;
55901 + __u32 flags;
55902 + struct inode *inode; /* this is to pass list of eflushed jnodes down to extent_kill_hook */
55903 + lock_handle *left;
55904 + lock_handle *right;
55905 +};
55906 +
55907 +struct carry_cut_data;
55908 +struct carry_kill_data;
55909 +
55910 +/* The responsibility of the node plugin is to store and give access
55911 + to the sequence of items within the node. */
55912 +typedef struct node_plugin {
55913 + /* generic plugin fields */
55914 + plugin_header h;
55915 +
55916 +	/* calculates the amount of space required to store an item, over and
55917 +	   above the space consumed by the item body.
55918 + (the space consumed by the item body can be gotten by calling
55919 + item->estimate) */
55920 + size_t(*item_overhead) (const znode * node, flow_t * f);
55921 +
55922 + /* returns free space by looking into node (i.e., without using
55923 + znode->free_space). */
55924 + size_t(*free_space) (znode * node);
55925 + /* search within the node for the one item which might
55926 + contain the key, invoking item->search_within to search within
55927 + that item to see if it is in there */
55928 + node_search_result(*lookup) (znode * node, const reiser4_key * key,
55929 + lookup_bias bias, coord_t * coord);
55930 + /* number of items in node */
55931 + int (*num_of_items) (const znode * node);
55932 +
55933 + /* store information about item in @coord in @data */
55934 + /* break into several node ops, don't add any more uses of this before doing so */
55935 + /*int ( *item_at )( const coord_t *coord, reiser4_item_data *data ); */
55936 + char *(*item_by_coord) (const coord_t * coord);
55937 + int (*length_by_coord) (const coord_t * coord);
55938 + item_plugin *(*plugin_by_coord) (const coord_t * coord);
55939 +
55940 + /* store item key in @key */
55941 + reiser4_key *(*key_at) (const coord_t * coord, reiser4_key * key);
55942 +	/* conservatively estimate how large a unit can still fit
55943 +	   into the node. This estimation should be performed without
55944 + actually looking into the node's content (free space is saved in
55945 + znode). */
55946 + size_t(*estimate) (znode * node);
55947 +
55948 + /* performs every consistency check the node plugin author could
55949 + imagine. Optional. */
55950 + int (*check) (const znode * node, __u32 flags, const char **error);
55951 +
55952 + /* Called when node is read into memory and node plugin is
55953 + already detected. This should read some data into znode (like free
55954 + space counter) and, optionally, check data consistency.
55955 + */
55956 + int (*parse) (znode * node);
55957 + /* This method is called on a new node to initialise plugin specific
55958 + data (header, etc.) */
55959 + int (*init) (znode * node);
55960 + /* Check whether @node content conforms to this plugin format.
55961 + Probably only useful after support for old V3.x formats is added.
55962 + Uncomment after 4.0 only.
55963 + */
55964 + /* int ( *guess )( const znode *node ); */
55965 +#if REISER4_DEBUG
55966 + void (*print) (const char *prefix, const znode * node, __u32 flags);
55967 +#endif
55968 + /* change size of @item by @by bytes. @item->node has enough free
55969 + space. When @by > 0 - free space is appended to the end of the item.
55970 + When @by < 0 - the item is truncated - it is assumed that the last
55971 + @by bytes of the item have been freed already */
55972 + void (*change_item_size) (coord_t * item, int by);
55973 +
55974 + /* create new item @length bytes long in coord @target */
55975 + int (*create_item) (coord_t * target, const reiser4_key * key,
55976 + reiser4_item_data * data, carry_plugin_info * info);
55977 +
55978 + /* update key of item. */
55979 + void (*update_item_key) (coord_t * target, const reiser4_key * key,
55980 + carry_plugin_info * info);
55981 +
55982 + int (*cut_and_kill) (struct carry_kill_data *, carry_plugin_info *);
55983 + int (*cut) (struct carry_cut_data *, carry_plugin_info *);
55984 +
55985 + /*
55986 + * shrink item pointed to by @coord by @delta bytes.
55987 + */
55988 + int (*shrink_item) (coord_t * coord, int delta);
55989 +
55990 + /* copy as much as possible, but not beyond @stop, from
55991 + @stop->node to @target. If (pend == append) then data from the beginning
55992 + of @stop->node are copied to the end of @target. If (pend == prepend)
55993 + then data from the end of @stop->node are copied to the beginning of
55994 + @target. Copied data are removed from @stop->node. Information
55995 + about what to do on the upper level is stored in @todo */
55996 + int (*shift) (coord_t * stop, znode * target, shift_direction pend,
55997 + int delete_node, int including_insert_coord,
55998 + carry_plugin_info * info);
55999 + /* return true if this node allows skipping carry() in some situations
56000 + (see fs/reiser4/tree.c:insert_by_coord()). Reiser3.x format
56001 + emulation doesn't.
56002 +
56003 + This speeds up insertions that don't require updates to the
56004 + parent, by bypassing initialisation of carry() structures. It is
56005 + believed that the majority of insertions will qualify.
56006 +
56007 + */
56008 + int (*fast_insert) (const coord_t * coord);
56009 + int (*fast_paste) (const coord_t * coord);
56010 + int (*fast_cut) (const coord_t * coord);
56011 + /* this limits the max size of an item which can be inserted into a node
56012 + and the number of bytes an item in a node may be appended with */
56013 + int (*max_item_size) (void);
56014 + int (*prepare_removal) (znode * empty, carry_plugin_info * info);
56015 + /* change plugin id of items which are in a node already. Currently it is used in tail conversion for regular
56016 + * files */
56017 + int (*set_item_plugin) (coord_t * coord, item_id);
56018 +} node_plugin;
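+/*
+ * Editorial sketch (not part of the original patch): a concrete node
+ * plugin fills the operations table above with its methods. The node40
+ * plugin defined later in this patch does so roughly as below; the
+ * authoritative initializer lives elsewhere in the patch and may
+ * differ in fields and ordering.
+ */
+#if 0
+static node_plugin node40_plugin_sketch = {
+ .item_overhead = item_overhead_node40,
+ .free_space = free_space_node40,
+ .lookup = lookup_node40,
+ .num_of_items = num_of_items_node40,
+ .item_by_coord = item_by_coord_node40,
+ .length_by_coord = length_by_coord_node40,
+ .plugin_by_coord = plugin_by_coord_node40,
+ .key_at = key_at_node40,
+ .estimate = estimate_node40,
+ .check = check_node40,
+ .parse = parse_node40,
+ .init = init_node40,
+ .change_item_size = change_item_size_node40,
+ .create_item = create_item_node40,
+ .update_item_key = update_item_key_node40,
+ .cut_and_kill = kill_node40,
+ .cut = cut_node40,
+ .shrink_item = shrink_item_node40,
+};
+#endif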
56019 +
56020 +typedef enum {
56021 + /* standard unified node layout used for both leaf and internal
56022 + nodes */
56023 + NODE40_ID,
56024 + LAST_NODE_ID
56025 +} reiser4_node_id;
56026 +
56027 +extern reiser4_key *leftmost_key_in_node(const znode * node, reiser4_key * key);
56028 +#if REISER4_DEBUG
56029 +extern void print_node_content(const char *prefix, const znode * node,
56030 + __u32 flags);
56031 +#endif
56032 +
56033 +extern void indent_znode(const znode * node);
56034 +
56035 +typedef struct common_node_header {
56036 + /*
56037 + * identifier of node plugin. Must be located at the very beginning of
56038 + * a node.
56039 + */
56040 + __le16 plugin_id;
56041 +} common_node_header;
56042 +
56043 +/* __REISER4_NODE_H__ */
56044 +#endif
56045 +/*
56046 + * Local variables:
56047 + * c-indentation-style: "K&R"
56048 + * mode-name: "LC"
56049 + * c-basic-offset: 8
56050 + * tab-width: 8
56051 + * fill-column: 79
56052 + * scroll-step: 1
56053 + * End:
56054 + */
56055 diff --git a/fs/reiser4/plugin/node/node40.c b/fs/reiser4/plugin/node/node40.c
56056 new file mode 100644
56057 index 0000000..6a9cc73
56058 --- /dev/null
56059 +++ b/fs/reiser4/plugin/node/node40.c
56060 @@ -0,0 +1,2924 @@
56061 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
56062 +
56063 +#include "../../debug.h"
56064 +#include "../../key.h"
56065 +#include "../../coord.h"
56066 +#include "../plugin_header.h"
56067 +#include "../item/item.h"
56068 +#include "node.h"
56069 +#include "node40.h"
56070 +#include "../plugin.h"
56071 +#include "../../jnode.h"
56072 +#include "../../znode.h"
56073 +#include "../../pool.h"
56074 +#include "../../carry.h"
56075 +#include "../../tap.h"
56076 +#include "../../tree.h"
56077 +#include "../../super.h"
56078 +#include "../../reiser4.h"
56079 +
56080 +#include <asm/uaccess.h>
56081 +#include <linux/types.h>
56082 +#include <linux/prefetch.h>
56083 +
56084 +/* leaf 40 format:
56085 +
56086 + [node header | item 0, item 1, .., item N-1 | free space | item_head N-1, .. item_head 1, item head 0 ]
56087 + node header fields: plugin_id (16), free_space (16),
56088 + free_space_start (16), level (8), num_items (16),
56089 + magic (32), flush_time (32)
56090 +
56091 + item_head fields: key, pluginid (16), offset (16)
56092 +
56093 + (field widths in bits)
56094 +*/
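+/* Editorial sketch (not part of the original patch): the layout comment
+ above rendered as C structures. Field names and widths follow the
+ comment; the authoritative definitions live in node40.h and differ in
+ details (for instance, the accessors below also know a 32-bit mkfs_id
+ and treat flush_id as a 64-bit field). */
+#if 0
+struct node40_header_sketch {
+ __le16 plugin_id; /* common_node_header, must come first */
+ __le16 free_space;
+ __le16 free_space_start;
+ __u8 level;
+ __le16 nr_items;
+ __le32 magic;
+ __le32 flush_time;
+};
+
+struct item_header40_sketch {
+ reiser4_key key; /* key of the first unit of the item */
+ __le16 plugin_id; /* item plugin id */
+ __le16 offset; /* offset of the item body from node start */
+};
+#endif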
56095 +/* NIKITA-FIXME-HANS: I told you guys not less than 10 times to not call it r4fs. Change to "ReIs". */
56096 +/* magic number that is stored in ->magic field of node header */
56097 +static const __u32 REISER4_NODE_MAGIC = 0x52344653; /* (*(__u32 *)"R4FS"); */
56098 +
56099 +static int prepare_for_update(znode * left, znode * right,
56100 + carry_plugin_info * info);
56101 +
56102 +/* header of node of reiser40 format is at the beginning of node */
56103 +static inline node40_header *node40_node_header(const znode * node /* node to
56104 + * query */ )
56105 +{
56106 + assert("nikita-567", node != NULL);
56107 + assert("nikita-568", znode_page(node) != NULL);
56108 + assert("nikita-569", zdata(node) != NULL);
56109 + return (node40_header *) zdata(node);
56110 +}
56111 +
56112 +/* functions to get/set fields of node40_header */
56113 +#define nh40_get_magic(nh) le32_to_cpu(get_unaligned(&(nh)->magic))
56114 +#define nh40_get_free_space(nh) le16_to_cpu(get_unaligned(&(nh)->free_space))
56115 +#define nh40_get_free_space_start(nh) le16_to_cpu(get_unaligned(&(nh)->free_space_start))
56116 +#define nh40_get_level(nh) get_unaligned(&(nh)->level)
56117 +#define nh40_get_num_items(nh) le16_to_cpu(get_unaligned(&(nh)->nr_items))
56118 +#define nh40_get_flush_id(nh) le64_to_cpu(get_unaligned(&(nh)->flush_id))
56119 +
56120 +#define nh40_set_magic(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->magic)
56121 +#define nh40_set_free_space(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space)
56122 +#define nh40_set_free_space_start(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->free_space_start)
56123 +#define nh40_set_level(nh, value) put_unaligned(value, &(nh)->level)
56124 +#define nh40_set_num_items(nh, value) put_unaligned(cpu_to_le16(value), &(nh)->nr_items)
56125 +#define nh40_set_mkfs_id(nh, value) put_unaligned(cpu_to_le32(value), &(nh)->mkfs_id)
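+/* Editorial note (not part of the original patch): on-disk fields are
+ little-endian and may be unaligned, hence every accessor pairs
+ get_unaligned()/put_unaligned() with le*_to_cpu()/cpu_to_le*(). For
+ example, consuming @len bytes of free space reads, converts, adjusts
+ and writes back:
+
+ nh40_set_free_space(nh, nh40_get_free_space(nh) - len);
+*/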
56126 +
56127 +/* plugin field of node header should be read/set by
56128 + plugin_by_disk_id/save_disk_plugin */
56129 +
56130 +/* array of item headers is at the end of node */
56131 +static inline item_header40 *node40_ih_at(const znode * node, unsigned pos)
56132 +{
56133 + return (item_header40 *) (zdata(node) + znode_size(node)) - pos - 1;
56134 +}
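+/* Editorial note (not part of the original patch): item headers grow
+ from the end of the node towards its middle, so
+
+ node40_ih_at(node, pos) == zdata(node) + znode_size(node)
+ - (pos + 1) * sizeof(item_header40)
+
+ i.e. header 0 occupies the last sizeof(item_header40) bytes of the
+ node, header 1 the bytes just before it, and so on. */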
56135 +
56136 +/* ( page_address( node -> pg ) + PAGE_CACHE_SIZE ) - pos - 1
56137 + */
56138 +static inline item_header40 *node40_ih_at_coord(const coord_t * coord)
56139 +{
56140 + return (item_header40 *) (zdata(coord->node) +
56141 + znode_size(coord->node)) - (coord->item_pos) -
56142 + 1;
56143 +}
56144 +
56145 +/* functions to get/set fields of item_header40 */
56146 +#define ih40_get_offset(ih) le16_to_cpu(get_unaligned(&(ih)->offset))
56147 +
56148 +#define ih40_set_offset(ih, value) put_unaligned(cpu_to_le16(value), &(ih)->offset)
56149 +
56150 +/* plugin field of item header should be read/set by
56151 + plugin_by_disk_id/save_disk_plugin */
56152 +
56153 +/* plugin methods */
56154 +
56155 +/* plugin->u.node.item_overhead
56156 + look for description of this method in plugin/node/node.h */
56157 +size_t
56158 +item_overhead_node40(const znode * node UNUSED_ARG, flow_t * f UNUSED_ARG)
56159 +{
56160 + return sizeof(item_header40);
56161 +}
56162 +
56163 +/* plugin->u.node.free_space
56164 + look for description of this method in plugin/node/node.h */
56165 +size_t free_space_node40(znode * node)
56166 +{
56167 + assert("nikita-577", node != NULL);
56168 + assert("nikita-578", znode_is_loaded(node));
56169 + assert("nikita-579", zdata(node) != NULL);
56170 +
56171 + return nh40_get_free_space(node40_node_header(node));
56172 +}
56173 +
56174 +/* private inline version of num_of_items_node40() for use in this file. This
56175 + is necessary because the address of num_of_items_node40() is taken, and it
56176 + is therefore never inlined. */
56177 +static inline short node40_num_of_items_internal(const znode * node)
56178 +{
56179 + return nh40_get_num_items(node40_node_header(node));
56180 +}
56181 +
56182 +#if REISER4_DEBUG
56183 +static inline void check_num_items(const znode * node)
56184 +{
56185 + assert("nikita-2749",
56186 + node40_num_of_items_internal(node) == node->nr_items);
56187 + assert("nikita-2746", znode_is_write_locked(node));
56188 +}
56189 +#else
56190 +#define check_num_items(node) noop
56191 +#endif
56192 +
56193 +/* plugin->u.node.num_of_items
56194 + look for description of this method in plugin/node/node.h */
56195 +int num_of_items_node40(const znode * node)
56196 +{
56197 + return node40_num_of_items_internal(node);
56198 +}
56199 +
56200 +static void
56201 +node40_set_num_items(znode * node, node40_header * nh, unsigned value)
56202 +{
56203 + assert("nikita-2751", node != NULL);
56204 + assert("nikita-2750", nh == node40_node_header(node));
56205 +
56206 + check_num_items(node);
56207 + nh40_set_num_items(nh, value);
56208 + node->nr_items = value;
56209 + check_num_items(node);
56210 +}
56211 +
56212 +/* plugin->u.node.item_by_coord
56213 + look for description of this method in plugin/node/node.h */
56214 +char *item_by_coord_node40(const coord_t * coord)
56215 +{
56216 + item_header40 *ih;
56217 + char *p;
56218 +
56219 + /* @coord is set to existing item */
56220 + assert("nikita-596", coord != NULL);
56221 + assert("vs-255", coord_is_existing_item(coord));
56222 +
56223 + ih = node40_ih_at_coord(coord);
56224 + p = zdata(coord->node) + ih40_get_offset(ih);
56225 + return p;
56226 +}
56227 +
56228 +/* plugin->u.node.length_by_coord
56229 + look for description of this method in plugin/node/node.h */
56230 +int length_by_coord_node40(const coord_t * coord)
56231 +{
56232 + item_header40 *ih;
56233 + int result;
56234 +
56235 + /* @coord is set to existing item */
56236 + assert("vs-256", coord != NULL);
56237 + assert("vs-257", coord_is_existing_item(coord));
56238 +
56239 + ih = node40_ih_at_coord(coord);
56240 + if ((int)coord->item_pos ==
56241 + node40_num_of_items_internal(coord->node) - 1)
56242 + result =
56243 + nh40_get_free_space_start(node40_node_header(coord->node)) -
56244 + ih40_get_offset(ih);
56245 + else
56246 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
56247 +
56248 + return result;
56249 +}
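+/* Editorial example (not part of the original patch): item lengths are
+ not stored; they are derived from neighbouring offsets. Suppose a node
+ holds three items with body offsets 40, 100 and 180 and
+ free_space_start is 300. Then the lengths are 100 - 40 = 60,
+ 180 - 100 = 80 and 300 - 180 = 120 bytes; only the last item needs
+ free_space_start, every other item uses the offset of the next one. */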
56250 +
56251 +static pos_in_node_t
56252 +node40_item_length(const znode * node, pos_in_node_t item_pos)
56253 +{
56254 + item_header40 *ih;
56255 + pos_in_node_t result;
56256 +
56257 + /* @coord is set to existing item */
56258 + assert("vs-256", node != NULL);
56259 + assert("vs-257", node40_num_of_items_internal(node) > item_pos);
56260 +
56261 + ih = node40_ih_at(node, item_pos);
56262 + if (item_pos == node40_num_of_items_internal(node) - 1)
56263 + result =
56264 + nh40_get_free_space_start(node40_node_header(node)) -
56265 + ih40_get_offset(ih);
56266 + else
56267 + result = ih40_get_offset(ih - 1) - ih40_get_offset(ih);
56268 +
56269 + return result;
56270 +}
56271 +
56272 +/* plugin->u.node.plugin_by_coord
56273 + look for description of this method in plugin/node/node.h */
56274 +item_plugin *plugin_by_coord_node40(const coord_t * coord)
56275 +{
56276 + item_header40 *ih;
56277 + item_plugin *result;
56278 +
56279 + /* @coord is set to existing item */
56280 + assert("vs-258", coord != NULL);
56281 + assert("vs-259", coord_is_existing_item(coord));
56282 +
56283 + ih = node40_ih_at_coord(coord);
56284 + /* pass NULL instead of the current tree. This is a time-critical call. */
56285 + result = item_plugin_by_disk_id(NULL, &ih->plugin_id);
56286 + return result;
56287 +}
56288 +
56289 +/* plugin->u.node.key_at
56290 + look for description of this method in plugin/node/node.h */
56291 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key)
56292 +{
56293 + item_header40 *ih;
56294 +
56295 + assert("nikita-1765", coord_is_existing_item(coord));
56296 +
56297 + /* @coord is set to existing item */
56298 + ih = node40_ih_at_coord(coord);
56299 + memcpy(key, &ih->key, sizeof(reiser4_key));
56300 + return key;
56301 +}
56302 +
56303 +/* VS-FIXME-HANS: please review whether the below are properly disabled when debugging is disabled */
56304 +
56305 +#define NODE_INCSTAT(n, counter) \
56306 + reiser4_stat_inc_at_level(znode_get_level(n), node.lookup.counter)
56307 +
56308 +#define NODE_ADDSTAT(n, counter, val) \
56309 + reiser4_stat_add_at_level(znode_get_level(n), node.lookup.counter, val)
56310 +
56311 +/* plugin->u.node.lookup
56312 + look for description of this method in plugin/node/node.h */
56313 +node_search_result lookup_node40(znode * node /* node to query */ ,
56314 + const reiser4_key * key /* key to look for */ ,
56315 + lookup_bias bias /* search bias */ ,
56316 + coord_t * coord /* resulting coord */ )
56317 +{
56318 + int left;
56319 + int right;
56320 + int found;
56321 + int items;
56322 +
56323 + item_header40 *lefth;
56324 + item_header40 *righth;
56325 +
56326 + item_plugin *iplug;
56327 + item_header40 *bstop;
56328 + item_header40 *ih;
56329 + cmp_t order;
56330 +
56331 + assert("nikita-583", node != NULL);
56332 + assert("nikita-584", key != NULL);
56333 + assert("nikita-585", coord != NULL);
56334 + assert("nikita-2693", znode_is_any_locked(node));
56335 + cassert(REISER4_SEQ_SEARCH_BREAK > 2);
56336 +
56337 + items = node_num_items(node);
56338 +
56339 + if (unlikely(items == 0)) {
56340 + coord_init_first_unit(coord, node);
56341 + return NS_NOT_FOUND;
56342 + }
56343 +
56344 + /* binary search for item that can contain given key */
56345 + left = 0;
56346 + right = items - 1;
56347 + coord->node = node;
56348 + coord_clear_iplug(coord);
56349 + found = 0;
56350 +
56351 + lefth = node40_ih_at(node, left);
56352 + righth = node40_ih_at(node, right);
56353 +
56354 + /* It is known that for small arrays sequential search is on average
56355 + more efficient than binary. This is because sequential search is
56356 + coded as a tight loop that can be better optimized by compilers, and
56357 + for small array sizes the gain from this optimization makes sequential
56358 + search the winner. Another, maybe more important, reason is that
56359 + sequential access is more CPU cache friendly, whereas binary
56360 + search effectively destroys CPU caching. (A standalone sketch of
56361 + this hybrid scheme follows lookup_node40() below.)
56362 +
56363 + Critical here is the notion of "smallness". A reasonable value of
56364 + REISER4_SEQ_SEARCH_BREAK can be found by playing with the code in
56365 + fs/reiser4/ulevel/ulevel.c:test_search().
56366 +
56367 + Don't try to further optimize sequential search by scanning from
56368 + right to left in an attempt to use a more efficient loop termination
56369 + condition (comparison with 0). This doesn't work.
56370 + */
56371 +
56372 + while (right - left >= REISER4_SEQ_SEARCH_BREAK) {
56373 + int median;
56374 + item_header40 *medianh;
56375 +
56376 + median = (left + right) / 2;
56377 + medianh = node40_ih_at(node, median);
56378 +
56379 + assert("nikita-1084", median >= 0);
56380 + assert("nikita-1085", median < items);
56381 + switch (keycmp(key, &medianh->key)) {
56382 + case LESS_THAN:
56383 + right = median;
56384 + righth = medianh;
56385 + break;
56386 + default:
56387 + wrong_return_value("nikita-586", "keycmp");
56388 + case GREATER_THAN:
56389 + left = median;
56390 + lefth = medianh;
56391 + break;
56392 + case EQUAL_TO:
56393 + do {
56394 + --median;
56395 + /* headers are ordered from right to left */
56396 + ++medianh;
56397 + } while (median >= 0 && keyeq(key, &medianh->key));
56398 + right = left = median + 1;
56399 + ih = lefth = righth = medianh - 1;
56400 + found = 1;
56401 + break;
56402 + }
56403 + }
56404 + /* sequential scan. Item headers, and, therefore, keys are stored at
56405 + the rightmost part of a node from right to left. We are trying to
56406 + access memory from left to right, and hence, scan in _descending_
56407 + order of item numbers.
56408 + */
56409 + if (!found) {
56410 + for (left = right, ih = righth; left >= 0; ++ih, --left) {
56411 + cmp_t comparison;
56412 +
56413 + prefetchkey(&(ih + 1)->key);
56414 + comparison = keycmp(&ih->key, key);
56415 + if (comparison == GREATER_THAN)
56416 + continue;
56417 + if (comparison == EQUAL_TO) {
56418 + found = 1;
56419 + do {
56420 + --left;
56421 + ++ih;
56422 + } while (left >= 0 && keyeq(&ih->key, key));
56423 + ++left;
56424 + --ih;
56425 + } else {
56426 + assert("nikita-1256", comparison == LESS_THAN);
56427 + }
56428 + break;
56429 + }
56430 + if (unlikely(left < 0))
56431 + left = 0;
56432 + }
56433 +
56434 + assert("nikita-3212", right >= left);
56435 + assert("nikita-3214",
56436 + equi(found, keyeq(&node40_ih_at(node, left)->key, key)));
56437 +
56438 + coord_set_item_pos(coord, left);
56439 + coord->unit_pos = 0;
56440 + coord->between = AT_UNIT;
56441 +
56442 + /* key < leftmost key in a node, or node is corrupted and keys
56443 + are not sorted */
56444 + bstop = node40_ih_at(node, (unsigned)left);
56445 + order = keycmp(&bstop->key, key);
56446 + if (unlikely(order == GREATER_THAN)) {
56447 + if (unlikely(left != 0)) {
56448 + /* screw up */
56449 + warning("nikita-587", "Key less than %i key in a node",
56450 + left);
56451 + reiser4_print_key("key", key);
56452 + reiser4_print_key("min", &bstop->key);
56453 + print_coord_content("coord", coord);
56454 + return RETERR(-EIO);
56455 + } else {
56456 + coord->between = BEFORE_UNIT;
56457 + return NS_NOT_FOUND;
56458 + }
56459 + }
56460 + /* left <= key, ok */
56461 + iplug = item_plugin_by_disk_id(znode_get_tree(node), &bstop->plugin_id);
56462 +
56463 + if (unlikely(iplug == NULL)) {
56464 + warning("nikita-588", "Unknown plugin %i",
56465 + le16_to_cpu(get_unaligned(&bstop->plugin_id)));
56466 + reiser4_print_key("key", key);
56467 + print_coord_content("coord", coord);
56468 + return RETERR(-EIO);
56469 + }
56470 +
56471 + coord_set_iplug(coord, iplug);
56472 +
56473 + /* if exact key from item header was found by binary search, no
56474 + further checks are necessary. */
56475 + if (found) {
56476 + assert("nikita-1259", order == EQUAL_TO);
56477 + return NS_FOUND;
56478 + }
56479 + if (iplug->b.max_key_inside != NULL) {
56480 + reiser4_key max_item_key;
56481 +
56482 + /* key > max_item_key --- outside of an item */
56483 + if (keygt(key, iplug->b.max_key_inside(coord, &max_item_key))) {
56484 + coord->unit_pos = 0;
56485 + coord->between = AFTER_ITEM;
56486 + /* FIXME-VS: key we are looking for does not fit into
56487 + found item. Return NS_NOT_FOUND then. Without that
56488 + the following case does not work: there is extent of
56489 + file 10000, 10001. File 10000, 10002 has been just
56490 + created. When writing to position 0 in that file -
56491 + traverse_tree will stop here on twig level. When we
56492 + want it to go down to leaf level
56493 + */
56494 + return NS_NOT_FOUND;
56495 + }
56496 + }
56497 +
56498 + if (iplug->b.lookup != NULL) {
56499 + return iplug->b.lookup(key, bias, coord);
56500 + } else {
56501 + assert("nikita-1260", order == LESS_THAN);
56502 + coord->between = AFTER_UNIT;
56503 + return (bias == FIND_EXACT) ? NS_NOT_FOUND : NS_FOUND;
56504 + }
56505 +}
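+/* Editorial sketch (not part of the original patch): the hybrid
+ binary-then-sequential strategy of lookup_node40() above, shown on a
+ plain sorted array of ints. The names and the threshold value are
+ invented for the illustration. */
+#if 0
+#define SEQ_BREAK 8 /* assumed threshold, cf. REISER4_SEQ_SEARCH_BREAK */
+
+/* return an index i with a[i] == key, or -1 if @key is absent */
+static int hybrid_search(const int *a, int nr, int key)
+{
+ int left = 0;
+ int right = nr - 1;
+
+ /* binary search narrows the interval while it is still large */
+ while (right - left >= SEQ_BREAK) {
+ int median = (left + right) / 2;
+
+ if (key < a[median])
+ right = median;
+ else
+ left = median;
+ }
+ /* a cache-friendly sequential scan finishes the job */
+ for (; left <= right; ++left)
+ if (a[left] >= key)
+ break;
+ return (left <= right && a[left] == key) ? left : -1;
+}
+#endif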
56506 +
56507 +#undef NODE_ADDSTAT
56508 +#undef NODE_INCSTAT
56509 +
56510 +/* plugin->u.node.estimate
56511 + look for description of this method in plugin/node/node.h */
56512 +size_t estimate_node40(znode * node)
56513 +{
56514 + size_t result;
56515 +
56516 + assert("nikita-597", node != NULL);
56517 +
56518 + result = free_space_node40(node) - sizeof(item_header40);
56519 +
56520 + return (result > 0) ? result : 0;
56521 +}
56522 +
56523 +/* plugin->u.node.check
56524 + look for description of this method in plugin/node/node.h */
56525 +int check_node40(const znode * node /* node to check */ ,
56526 + __u32 flags /* check flags */ ,
56527 + const char **error /* where to store error message */ )
56528 +{
56529 + int nr_items;
56530 + int i;
56531 + reiser4_key prev;
56532 + unsigned old_offset;
56533 + tree_level level;
56534 + coord_t coord;
56535 + int result;
56536 +
56537 + assert("nikita-580", node != NULL);
56538 + assert("nikita-581", error != NULL);
56539 + assert("nikita-2948", znode_is_loaded(node));
56540 +
56541 + if (ZF_ISSET(node, JNODE_HEARD_BANSHEE))
56542 + return 0;
56543 +
56544 + assert("nikita-582", zdata(node) != NULL);
56545 +
56546 + nr_items = node40_num_of_items_internal(node);
56547 + if (nr_items < 0) {
56548 + *error = "Negative number of items";
56549 + return -1;
56550 + }
56551 +
56552 + if (flags & REISER4_NODE_DKEYS)
56553 + prev = *znode_get_ld_key((znode *) node);
56554 + else
56555 + prev = *reiser4_min_key();
56556 +
56557 + old_offset = 0;
56558 + coord_init_zero(&coord);
56559 + coord.node = (znode *) node;
56560 + coord.unit_pos = 0;
56561 + coord.between = AT_UNIT;
56562 + level = znode_get_level(node);
56563 + for (i = 0; i < nr_items; i++) {
56564 + item_header40 *ih;
56565 + reiser4_key unit_key;
56566 + unsigned j;
56567 +
56568 + ih = node40_ih_at(node, (unsigned)i);
56569 + coord_set_item_pos(&coord, i);
56570 + if ((ih40_get_offset(ih) >=
56571 + znode_size(node) - nr_items * sizeof(item_header40)) ||
56572 + (ih40_get_offset(ih) < sizeof(node40_header))) {
56573 + *error = "Offset is out of bounds";
56574 + return -1;
56575 + }
56576 + if (ih40_get_offset(ih) <= old_offset) {
56577 + *error = "Offsets are in wrong order";
56578 + return -1;
56579 + }
56580 + if ((i == 0) && (ih40_get_offset(ih) != sizeof(node40_header))) {
56581 + *error = "Wrong offset of first item";
56582 + return -1;
56583 + }
56584 + old_offset = ih40_get_offset(ih);
56585 +
56586 + if (keygt(&prev, &ih->key)) {
56587 + *error = "Keys are in wrong order";
56588 + return -1;
56589 + }
56590 + if (!keyeq(&ih->key, unit_key_by_coord(&coord, &unit_key))) {
56591 + *error = "Wrong key of first unit";
56592 + return -1;
56593 + }
56594 + prev = ih->key;
56595 + for (j = 0; j < coord_num_units(&coord); ++j) {
56596 + coord.unit_pos = j;
56597 + unit_key_by_coord(&coord, &unit_key);
56598 + if (keygt(&prev, &unit_key)) {
56599 + *error = "Unit keys are in wrong order";
56600 + return -1;
56601 + }
56602 + prev = unit_key;
56603 + }
56604 + coord.unit_pos = 0;
56605 + if (level != TWIG_LEVEL && item_is_extent(&coord)) {
56606 + *error = "extent on the wrong level";
56607 + return -1;
56608 + }
56609 + if (level == LEAF_LEVEL && item_is_internal(&coord)) {
56610 + *error = "internal item on the wrong level";
56611 + return -1;
56612 + }
56613 + if (level != LEAF_LEVEL &&
56614 + !item_is_internal(&coord) && !item_is_extent(&coord)) {
56615 + *error = "wrong item on the internal level";
56616 + return -1;
56617 + }
56618 + if (level > TWIG_LEVEL && !item_is_internal(&coord)) {
56619 + *error = "non-internal item on the internal level";
56620 + return -1;
56621 + }
56622 +#if REISER4_DEBUG
56623 + if (item_plugin_by_coord(&coord)->b.check
56624 + && item_plugin_by_coord(&coord)->b.check(&coord, error))
56625 + return -1;
56626 +#endif
56627 + if (i) {
56628 + coord_t prev_coord;
56629 + /* two neighboring items can not be mergeable */
56630 + coord_dup(&prev_coord, &coord);
56631 + coord_prev_item(&prev_coord);
56632 + if (are_items_mergeable(&prev_coord, &coord)) {
56633 + *error = "mergeable items in one node";
56634 + return -1;
56635 + }
56636 +
56637 + }
56638 + }
56639 +
56640 + if ((flags & REISER4_NODE_DKEYS) && !node_is_empty(node)) {
56641 + coord_t coord;
56642 + item_plugin *iplug;
56643 +
56644 + coord_init_last_unit(&coord, node);
56645 + iplug = item_plugin_by_coord(&coord);
56646 + if ((item_is_extent(&coord) || item_is_tail(&coord)) &&
56647 + iplug->s.file.append_key != NULL) {
56648 + reiser4_key mkey;
56649 +
56650 + iplug->s.file.append_key(&coord, &mkey);
56651 + set_key_offset(&mkey, get_key_offset(&mkey) - 1);
56652 + read_lock_dk(current_tree);
56653 + result = keygt(&mkey, znode_get_rd_key((znode *) node));
56654 + read_unlock_dk(current_tree);
56655 + if (result) {
56656 + *error = "key of rightmost item is too large";
56657 + return -1;
56658 + }
56659 + }
56660 + }
56661 + if (flags & REISER4_NODE_DKEYS) {
56662 + read_lock_tree(current_tree);
56663 + read_lock_dk(current_tree);
56664 +
56665 + flags |= REISER4_NODE_TREE_STABLE;
56666 +
56667 + if (keygt(&prev, znode_get_rd_key((znode *) node))) {
56668 + if (flags & REISER4_NODE_TREE_STABLE) {
56669 + *error = "Last key is greater than rdkey";
56670 + read_unlock_dk(current_tree);
56671 + read_unlock_tree(current_tree);
56672 + return -1;
56673 + }
56674 + }
56675 + if (keygt
56676 + (znode_get_ld_key((znode *) node),
56677 + znode_get_rd_key((znode *) node))) {
56678 + *error = "ldkey is greater than rdkey";
56679 + read_unlock_dk(current_tree);
56680 + read_unlock_tree(current_tree);
56681 + return -1;
56682 + }
56683 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) &&
56684 + (node->left != NULL) &&
56685 + !ZF_ISSET(node->left, JNODE_HEARD_BANSHEE) &&
56686 + ergo(flags & REISER4_NODE_TREE_STABLE,
56687 + !keyeq(znode_get_rd_key(node->left),
56688 + znode_get_ld_key((znode *) node)))
56689 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
56690 + keygt(znode_get_rd_key(node->left),
56691 + znode_get_ld_key((znode *) node)))) {
56692 + *error = "left rdkey or ldkey is wrong";
56693 + read_unlock_dk(current_tree);
56694 + read_unlock_tree(current_tree);
56695 + return -1;
56696 + }
56697 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
56698 + (node->right != NULL) &&
56699 + !ZF_ISSET(node->right, JNODE_HEARD_BANSHEE) &&
56700 + ergo(flags & REISER4_NODE_TREE_STABLE,
56701 + !keyeq(znode_get_rd_key((znode *) node),
56702 + znode_get_ld_key(node->right)))
56703 + && ergo(!(flags & REISER4_NODE_TREE_STABLE),
56704 + keygt(znode_get_rd_key((znode *) node),
56705 + znode_get_ld_key(node->right)))) {
56706 + *error = "rdkey or right ldkey is wrong";
56707 + read_unlock_dk(current_tree);
56708 + read_unlock_tree(current_tree);
56709 + return -1;
56710 + }
56711 +
56712 + read_unlock_dk(current_tree);
56713 + read_unlock_tree(current_tree);
56714 + }
56715 +
56716 + return 0;
56717 +}
56718 +
56719 +/* plugin->u.node.parse
56720 + look for description of this method in plugin/node/node.h */
56721 +int parse_node40(znode * node /* node to parse */ )
56722 +{
56723 + node40_header *header;
56724 + int result;
56725 + d8 level;
56726 +
56727 + header = node40_node_header((znode *) node);
56728 + result = -EIO;
56729 + level = nh40_get_level(header);
56730 + if (unlikely(((__u8) znode_get_level(node)) != level))
56731 + warning("nikita-494", "Wrong level found in node: %i != %i",
56732 + znode_get_level(node), level);
56733 + else if (unlikely(nh40_get_magic(header) != REISER4_NODE_MAGIC))
56734 + warning("nikita-495",
56735 + "Wrong magic in tree node: want %x, got %x",
56736 + REISER4_NODE_MAGIC, nh40_get_magic(header));
56737 + else {
56738 + node->nr_items = node40_num_of_items_internal(node);
56739 + result = 0;
56740 + }
56741 + return RETERR(result);
56742 +}
56743 +
56744 +/* plugin->u.node.init
56745 + look for description of this method in plugin/node/node.h */
56746 +int init_node40(znode * node /* node to initialise */ )
56747 +{
56748 + node40_header *header;
56749 +
56750 + assert("nikita-570", node != NULL);
56751 + assert("nikita-572", zdata(node) != NULL);
56752 +
56753 + header = node40_node_header(node);
56754 + memset(header, 0, sizeof(node40_header));
56755 + nh40_set_free_space(header, znode_size(node) - sizeof(node40_header));
56756 + nh40_set_free_space_start(header, sizeof(node40_header));
56757 + /* sane hypothesis: 0 in CPU format is 0 in disk format */
56758 + /* items: 0 */
56759 + save_plugin_id(node_plugin_to_plugin(node->nplug),
56760 + &header->common_header.plugin_id);
56761 + nh40_set_level(header, znode_get_level(node));
56762 + nh40_set_magic(header, REISER4_NODE_MAGIC);
56763 + node->nr_items = 0;
56764 + nh40_set_mkfs_id(header, reiser4_mkfs_id(reiser4_get_current_sb()));
56765 +
56766 + /* flags: 0 */
56767 + return 0;
56768 +}
56769 +
56770 +#ifdef GUESS_EXISTS
56771 +int guess_node40(const znode * node /* node to guess plugin of */ )
56772 +{
56773 + node40_header *nethack;
56774 +
56775 + assert("nikita-1058", node != NULL);
56776 + nethack = node40_node_header(node);
56777 + return
56778 + (nh40_get_magic(nethack) == REISER4_NODE_MAGIC) &&
56779 + (plugin_by_disk_id(znode_get_tree(node),
56780 + REISER4_NODE_PLUGIN_TYPE,
56781 + &nethack->common_header.plugin_id)->h.id ==
56782 + NODE40_ID);
56783 +}
56784 +#endif
56785 +
56786 +/* plugin->u.node.change_item_size
56787 + look for description of this method in plugin/node/node.h */
56788 +void change_item_size_node40(coord_t * coord, int by)
56789 +{
56790 + node40_header *nh;
56791 + item_header40 *ih;
56792 + char *item_data;
56793 + int item_length;
56794 + unsigned i;
56795 +
56796 + /* make sure that @item is coord of existing item */
56797 + assert("vs-210", coord_is_existing_item(coord));
56798 +
56799 + nh = node40_node_header(coord->node);
56800 +
56801 + item_data = item_by_coord_node40(coord);
56802 + item_length = length_by_coord_node40(coord);
56803 +
56804 + /* move item bodies */
56805 + ih = node40_ih_at_coord(coord);
56806 + memmove(item_data + item_length + by, item_data + item_length,
56807 + nh40_get_free_space_start(node40_node_header(coord->node)) -
56808 + (ih40_get_offset(ih) + item_length));
56809 +
56810 + /* update offsets of moved items */
56811 + for (i = coord->item_pos + 1; i < nh40_get_num_items(nh); i++) {
56812 + ih = node40_ih_at(coord->node, i);
56813 + ih40_set_offset(ih, ih40_get_offset(ih) + by);
56814 + }
56815 +
56816 + /* update node header */
56817 + nh40_set_free_space(nh, nh40_get_free_space(nh) - by);
56818 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) + by);
56819 +}
56820 +
56821 +static int should_notify_parent(const znode * node)
56822 +{
56823 + /* FIXME_JMACD This looks equivalent to znode_is_root(), right? -josh */
56824 + return !disk_addr_eq(znode_get_block(node),
56825 + &znode_get_tree(node)->root_block);
56826 +}
56827 +
56828 +/* plugin->u.node.create_item
56829 + look for description of this method in plugin/node/node.h */
56830 +int
56831 +create_item_node40(coord_t *target, const reiser4_key *key,
56832 + reiser4_item_data *data, carry_plugin_info *info)
56833 +{
56834 + node40_header *nh;
56835 + item_header40 *ih;
56836 + unsigned offset;
56837 + unsigned i;
56838 +
56839 + nh = node40_node_header(target->node);
56840 +
56841 + assert("vs-212", coord_is_between_items(target));
56842 + /* node must have enough free space */
56843 + assert("vs-254",
56844 + free_space_node40(target->node) >=
56845 + data->length + sizeof(item_header40));
56846 + assert("vs-1410", data->length >= 0);
56847 +
56848 + if (coord_set_to_right(target))
56849 + /* there are no items to the right of @target, so the new item
56850 + will be inserted after the last one */
56851 + coord_set_item_pos(target, nh40_get_num_items(nh));
56852 +
56853 + if (target->item_pos < nh40_get_num_items(nh)) {
56854 + /* there are items to be moved to prepare space for new
56855 + item */
56856 + ih = node40_ih_at_coord(target);
56857 + /* new item will start at this offset */
56858 + offset = ih40_get_offset(ih);
56859 +
56860 + memmove(zdata(target->node) + offset + data->length,
56861 + zdata(target->node) + offset,
56862 + nh40_get_free_space_start(nh) - offset);
56863 + /* update headers of moved items */
56864 + for (i = target->item_pos; i < nh40_get_num_items(nh); i++) {
56865 + ih = node40_ih_at(target->node, i);
56866 + ih40_set_offset(ih, ih40_get_offset(ih) + data->length);
56867 + }
56868 +
56869 + /* @ih is set to item header of the last item, move item headers */
56870 + memmove(ih - 1, ih,
56871 + sizeof(item_header40) * (nh40_get_num_items(nh) -
56872 + target->item_pos));
56873 + } else {
56874 + /* new item will start at this offset */
56875 + offset = nh40_get_free_space_start(nh);
56876 + }
56877 +
56878 + /* make item header for the new item */
56879 + ih = node40_ih_at_coord(target);
56880 + memcpy(&ih->key, key, sizeof(reiser4_key));
56881 + ih40_set_offset(ih, offset);
56882 + save_plugin_id(item_plugin_to_plugin(data->iplug), &ih->plugin_id);
56883 +
56884 + /* update node header */
56885 + nh40_set_free_space(nh,
56886 + nh40_get_free_space(nh) - data->length -
56887 + sizeof(item_header40));
56888 + nh40_set_free_space_start(nh,
56889 + nh40_get_free_space_start(nh) + data->length);
56890 + node40_set_num_items(target->node, nh, nh40_get_num_items(nh) + 1);
56891 +
56892 + /* FIXME: check how create_item works when between is set to BEFORE_UNIT */
56893 + target->unit_pos = 0;
56894 + target->between = AT_UNIT;
56895 + coord_clear_iplug(target);
56896 +
56897 + /* initialize item */
56898 + if (data->iplug->b.init != NULL) {
56899 + data->iplug->b.init(target, NULL, data);
56900 + }
56901 + /* copy item body */
56902 + if (data->iplug->b.paste != NULL) {
56903 + data->iplug->b.paste(target, data, info);
56904 + } else if (data->data != NULL) {
56905 + if (data->user) {
56906 + /* AUDIT: Should we really not check that the pointer
56907 + from userspace was valid and the data bytes were
56908 + available? How will we return -EFAULT or some such
56909 + without this check? */
56910 + assert("nikita-3038", reiser4_schedulable());
56911 + /* copy data from user space */
56912 + __copy_from_user(zdata(target->node) + offset,
56913 + (const char __user *)data->data,
56914 + (unsigned)data->length);
56915 + } else
56916 + /* copy from kernel space */
56917 + memcpy(zdata(target->node) + offset, data->data,
56918 + (unsigned)data->length);
56919 + }
56920 +
56921 + if (target->item_pos == 0) {
56922 + /* left delimiting key has to be updated */
56923 + prepare_for_update(NULL, target->node, info);
56924 + }
56925 +
56926 + if (item_plugin_by_coord(target)->b.create_hook != NULL) {
56927 + item_plugin_by_coord(target)->b.create_hook(target, data->arg);
56928 + }
56929 +
56930 + return 0;
56931 +}
56932 +
56933 +/* plugin->u.node.update_item_key
56934 + look for description of this method in plugin/node/node.h */
56935 +void
56936 +update_item_key_node40(coord_t * target, const reiser4_key * key,
56937 + carry_plugin_info * info)
56938 +{
56939 + item_header40 *ih;
56940 +
56941 + ih = node40_ih_at_coord(target);
56942 + memcpy(&ih->key, key, sizeof(reiser4_key));
56943 +
56944 + if (target->item_pos == 0) {
56945 + prepare_for_update(NULL, target->node, info);
56946 + }
56947 +}
56948 +
56949 +/* these bits encode the cut mode */
56950 +#define CMODE_TAIL 1
56951 +#define CMODE_WHOLE 2
56952 +#define CMODE_HEAD 4
56953 +
56954 +struct cut40_info {
56955 + int mode;
56956 + pos_in_node_t tail_removed; /* position of item which gets tail removed */
56957 + pos_in_node_t first_removed; /* position of the leftmost item among items removed completely */
56958 + pos_in_node_t removed_count; /* number of items removed completely */
56959 + pos_in_node_t head_removed; /* position of item which gets head removed */
56960 +
56961 + pos_in_node_t freed_space_start;
56962 + pos_in_node_t freed_space_end;
56963 + pos_in_node_t first_moved;
56964 + pos_in_node_t head_removed_location;
56965 +};
56966 +
56967 +static void init_cinfo(struct cut40_info *cinfo)
56968 +{
56969 + cinfo->mode = 0;
56970 + cinfo->tail_removed = MAX_POS_IN_NODE;
56971 + cinfo->first_removed = MAX_POS_IN_NODE;
56972 + cinfo->removed_count = MAX_POS_IN_NODE;
56973 + cinfo->head_removed = MAX_POS_IN_NODE;
56974 + cinfo->freed_space_start = MAX_POS_IN_NODE;
56975 + cinfo->freed_space_end = MAX_POS_IN_NODE;
56976 + cinfo->first_moved = MAX_POS_IN_NODE;
56977 + cinfo->head_removed_location = MAX_POS_IN_NODE;
56978 +}
56979 +
56980 +/* complete cut_node40/kill_node40: remove the gap in the node created by the cut/kill */
56981 +static void compact(znode * node, struct cut40_info *cinfo)
56982 +{
56983 + node40_header *nh;
56984 + item_header40 *ih;
56985 + pos_in_node_t freed;
56986 + pos_in_node_t pos, nr_items;
56987 +
56988 + assert("vs-1526", (cinfo->freed_space_start != MAX_POS_IN_NODE &&
56989 + cinfo->freed_space_end != MAX_POS_IN_NODE &&
56990 + cinfo->first_moved != MAX_POS_IN_NODE));
56991 + assert("vs-1523", cinfo->freed_space_end >= cinfo->freed_space_start);
56992 +
56993 + nh = node40_node_header(node);
56994 + nr_items = nh40_get_num_items(nh);
56995 +
56996 + /* remove gap made up by removal */
56997 + memmove(zdata(node) + cinfo->freed_space_start,
56998 + zdata(node) + cinfo->freed_space_end,
56999 + nh40_get_free_space_start(nh) - cinfo->freed_space_end);
57000 +
57001 + /* update item headers of moved items - change their locations */
57002 + pos = cinfo->first_moved;
57003 + ih = node40_ih_at(node, pos);
57004 + if (cinfo->head_removed_location != MAX_POS_IN_NODE) {
57005 + assert("vs-1580", pos == cinfo->head_removed);
57006 + ih40_set_offset(ih, cinfo->head_removed_location);
57007 + pos++;
57008 + ih--;
57009 + }
57010 +
57011 + freed = cinfo->freed_space_end - cinfo->freed_space_start;
57012 + for (; pos < nr_items; pos++, ih--) {
57013 + assert("vs-1581", ih == node40_ih_at(node, pos));
57014 + ih40_set_offset(ih, ih40_get_offset(ih) - freed);
57015 + }
57016 +
57017 + /* free space start moved to right */
57018 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - freed);
57019 +
57020 + if (cinfo->removed_count != MAX_POS_IN_NODE) {
57021 + /* number of items changed. Remove item headers of those items */
57022 + ih = node40_ih_at(node, nr_items - 1);
57023 + memmove(ih + cinfo->removed_count, ih,
57024 + sizeof(item_header40) * (nr_items -
57025 + cinfo->removed_count -
57026 + cinfo->first_removed));
57027 + freed += sizeof(item_header40) * cinfo->removed_count;
57028 + node40_set_num_items(node, nh, nr_items - cinfo->removed_count);
57029 + }
57030 +
57031 + /* total amount of free space increased */
57032 + nh40_set_free_space(nh, nh40_get_free_space(nh) + freed);
57033 +}
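+/* Editorial example (not part of the original patch): suppose a node
+ holds items A, B and C with bodies at offsets 40, 100 and 180,
+ free_space_start is 300, and B is removed completely, so that
+ freed_space_start = 100, freed_space_end = 180 and first_moved = 2.
+ compact() then memmove()s C's body down to offset 100, decreases C's
+ header offset by freed = 80, slides C's item header into B's old
+ slot, drops the item count to 2 and grows free_space by
+ 80 + sizeof(item_header40). */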
57034 +
57035 +int shrink_item_node40(coord_t * coord, int delta)
57036 +{
57037 + node40_header *nh;
57038 + item_header40 *ih;
57039 + pos_in_node_t pos;
57040 + pos_in_node_t nr_items;
57041 + char *end;
57042 + znode *node;
57043 + int off;
57044 +
57045 + assert("nikita-3487", coord != NULL);
57046 + assert("nikita-3488", delta >= 0);
57047 +
57048 + node = coord->node;
57049 + nh = node40_node_header(node);
57050 + nr_items = nh40_get_num_items(nh);
57051 +
57052 + ih = node40_ih_at_coord(coord);
57053 + assert("nikita-3489", delta <= length_by_coord_node40(coord));
57054 + off = ih40_get_offset(ih) + length_by_coord_node40(coord);
57055 + end = zdata(node) + off;
57056 +
57057 + /* remove gap made up by removal */
57058 + memmove(end - delta, end, nh40_get_free_space_start(nh) - off);
57059 +
57060 + /* update item headers of moved items - change their locations */
57061 + pos = coord->item_pos + 1;
57062 + ih = node40_ih_at(node, pos);
57063 + for (; pos < nr_items; pos++, ih--) {
57064 + assert("nikita-3490", ih == node40_ih_at(node, pos));
57065 + ih40_set_offset(ih, ih40_get_offset(ih) - delta);
57066 + }
57067 +
57068 + /* free space start moved to left */
57069 + nh40_set_free_space_start(nh, nh40_get_free_space_start(nh) - delta);
57070 + /* total amount of free space increased */
57071 + nh40_set_free_space(nh, nh40_get_free_space(nh) + delta);
57072 + /*
57073 + * This method does _not_ change the number of items. Hence, it cannot
57074 + * make the node empty. Also it doesn't remove items at all, which means
57075 + * that no keys have to be updated either.
57076 + */
57077 + return 0;
57078 +}
57079 +
57080 +/* this is used by cut_node40 and kill_node40. It analyses the input parameters and calculates the cut mode. There are
57081 + 2 types of cut. The first is when units are removed from the middle of an item. In this case the function returns 1.
57082 + All the rest fits into the second case: 0 or 1 items getting their tail cut, 0 or more items removed completely, and
57083 + 0 or 1 items getting their head cut. The function returns 0 in this case */
57084 +static int
57085 +parse_cut(struct cut40_info *cinfo, const struct cut_kill_params *params)
57086 +{
57087 + reiser4_key left_key, right_key;
57088 + reiser4_key min_from_key, max_to_key;
57089 + const reiser4_key *from_key, *to_key;
57090 +
57091 + init_cinfo(cinfo);
57092 +
57093 + /* calculate minimal key stored in first item of items to be cut (params->from) */
57094 + item_key_by_coord(params->from, &min_from_key);
57095 + /* and max key stored in last item of items to be cut (params->to) */
57096 + max_item_key_by_coord(params->to, &max_to_key);
57097 +
57098 + /* if cut key range is not defined in input parameters - define it using cut coord range */
57099 + if (params->from_key == NULL) {
57100 + assert("vs-1513", params->to_key == NULL);
57101 + unit_key_by_coord(params->from, &left_key);
57102 + from_key = &left_key;
57103 + max_unit_key_by_coord(params->to, &right_key);
57104 + to_key = &right_key;
57105 + } else {
57106 + from_key = params->from_key;
57107 + to_key = params->to_key;
57108 + }
57109 +
57110 + if (params->from->item_pos == params->to->item_pos) {
57111 + if (keylt(&min_from_key, from_key)
57112 + && keylt(to_key, &max_to_key))
57113 + return 1;
57114 +
57115 + if (keygt(from_key, &min_from_key)) {
57116 + /* tail of item is to be cut */
57117 + cinfo->tail_removed = params->from->item_pos;
57118 + cinfo->mode |= CMODE_TAIL;
57119 + } else if (keylt(to_key, &max_to_key)) {
57120 + /* head of item is to be cut */
57121 + cinfo->head_removed = params->from->item_pos;
57122 + cinfo->mode |= CMODE_HEAD;
57123 + } else {
57124 + /* item is removed completely */
57125 + cinfo->first_removed = params->from->item_pos;
57126 + cinfo->removed_count = 1;
57127 + cinfo->mode |= CMODE_WHOLE;
57128 + }
57129 + } else {
57130 + cinfo->first_removed = params->from->item_pos + 1;
57131 + cinfo->removed_count =
57132 + params->to->item_pos - params->from->item_pos - 1;
57133 +
57134 + if (keygt(from_key, &min_from_key)) {
57135 + /* first item is not cut completely */
57136 + cinfo->tail_removed = params->from->item_pos;
57137 + cinfo->mode |= CMODE_TAIL;
57138 + } else {
57139 + cinfo->first_removed--;
57140 + cinfo->removed_count++;
57141 + }
57142 + if (keylt(to_key, &max_to_key)) {
57143 + /* last item is not cut completely */
57144 + cinfo->head_removed = params->to->item_pos;
57145 + cinfo->mode |= CMODE_HEAD;
57146 + } else {
57147 + cinfo->removed_count++;
57148 + }
57149 + if (cinfo->removed_count)
57150 + cinfo->mode |= CMODE_WHOLE;
57151 + }
57152 +
57153 + return 0;
57154 +}
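+/* Editorial example (not part of the original patch): cutting units
+ 2..3 out of an item with units 0..5 is a cut from the middle of a
+ single item, so parse_cut() returns 1. Cutting from the middle of
+ item 4 through the very end of item 7 instead sets tail_removed = 4,
+ first_removed = 5, removed_count = 3 and
+ mode = CMODE_TAIL | CMODE_WHOLE, and parse_cut() returns 0. */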
57155 +
57156 +static void
57157 +call_kill_hooks(znode * node, pos_in_node_t from, pos_in_node_t count,
57158 + carry_kill_data * kdata)
57159 +{
57160 + coord_t coord;
57161 + item_plugin *iplug;
57162 + pos_in_node_t pos;
57163 +
57164 + coord.node = node;
57165 + coord.unit_pos = 0;
57166 + coord.between = AT_UNIT;
57167 + for (pos = 0; pos < count; pos++) {
57168 + coord_set_item_pos(&coord, from + pos);
57169 + coord.unit_pos = 0;
57170 + coord.between = AT_UNIT;
57171 + iplug = item_plugin_by_coord(&coord);
57172 + if (iplug->b.kill_hook) {
57173 + iplug->b.kill_hook(&coord, 0, coord_num_units(&coord),
57174 + kdata);
57175 + }
57176 + }
57177 +}
57178 +
57179 +/* this is used to kill item partially */
57180 +static pos_in_node_t
57181 +kill_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
57182 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
57183 +{
57184 + struct carry_kill_data *kdata;
57185 + item_plugin *iplug;
57186 +
57187 + kdata = data;
57188 + iplug = item_plugin_by_coord(coord);
57189 +
57190 + assert("vs-1524", iplug->b.kill_units);
57191 + return iplug->b.kill_units(coord, from, to, kdata, smallest_removed,
57192 + new_first_key);
57193 +}
57194 +
57195 +/* call item plugin to cut tail of file */
57196 +static pos_in_node_t
57197 +kill_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
57198 +{
57199 + struct carry_kill_data *kdata;
57200 + pos_in_node_t to;
57201 +
57202 + kdata = data;
57203 + to = coord_last_unit_pos(coord);
57204 + return kill_units(coord, coord->unit_pos, to, kdata, smallest_removed,
57205 + NULL);
57206 +}
57207 +
57208 +/* call item plugin to cut head of item */
57209 +static pos_in_node_t
57210 +kill_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
57211 + reiser4_key * new_first_key)
57212 +{
57213 + return kill_units(coord, 0, coord->unit_pos, data, smallest_removed,
57214 + new_first_key);
57215 +}
57216 +
57217 +/* this is used to cut item partially */
57218 +static pos_in_node_t
57219 +cut_units(coord_t * coord, pos_in_node_t from, pos_in_node_t to, void *data,
57220 + reiser4_key * smallest_removed, reiser4_key * new_first_key)
57221 +{
57222 + carry_cut_data *cdata;
57223 + item_plugin *iplug;
57224 +
57225 + cdata = data;
57226 + iplug = item_plugin_by_coord(coord);
57227 + assert("vs-302", iplug->b.cut_units);
57228 + return iplug->b.cut_units(coord, from, to, cdata, smallest_removed,
57229 + new_first_key);
57230 +}
57231 +
57232 +/* call item plugin to cut tail of file */
57233 +static pos_in_node_t
57234 +cut_tail(coord_t * coord, void *data, reiser4_key * smallest_removed)
57235 +{
57236 + carry_cut_data *cdata;
57237 + pos_in_node_t to;
57238 +
57239 + cdata = data;
57240 + to = coord_last_unit_pos(cdata->params.from);
57241 + return cut_units(coord, coord->unit_pos, to, data, smallest_removed, NULL);
57242 +}
57243 +
57244 +/* call item plugin to cut head of item */
57245 +static pos_in_node_t
57246 +cut_head(coord_t * coord, void *data, reiser4_key * smallest_removed,
57247 + reiser4_key * new_first_key)
57248 +{
57249 + return cut_units(coord, 0, coord->unit_pos, data, smallest_removed,
57250 + new_first_key);
57251 +}
57252 +
57253 +/* this returns 1 if the key of the first item changed, 0 if it did not */
57254 +static int
57255 +prepare_for_compact(struct cut40_info *cinfo,
57256 + const struct cut_kill_params *params, int is_cut,
57257 + void *data, carry_plugin_info * info)
57258 +{
57259 + znode *node;
57260 + item_header40 *ih;
57261 + pos_in_node_t freed;
57262 + pos_in_node_t item_pos;
57263 + coord_t coord;
57264 + reiser4_key new_first_key;
57265 + pos_in_node_t(*kill_units_f) (coord_t *, pos_in_node_t, pos_in_node_t,
57266 + void *, reiser4_key *, reiser4_key *);
57267 + pos_in_node_t(*kill_tail_f) (coord_t *, void *, reiser4_key *);
57268 + pos_in_node_t(*kill_head_f) (coord_t *, void *, reiser4_key *,
57269 + reiser4_key *);
57270 + int retval;
57271 +
57272 + retval = 0;
57273 +
57274 + node = params->from->node;
57275 +
57276 + assert("vs-184", node == params->to->node);
57277 + assert("vs-312", !node_is_empty(node));
57278 + assert("vs-297",
57279 + coord_compare(params->from, params->to) != COORD_CMP_ON_RIGHT);
57280 +
57281 + if (is_cut) {
57282 + kill_units_f = cut_units;
57283 + kill_tail_f = cut_tail;
57284 + kill_head_f = cut_head;
57285 + } else {
57286 + kill_units_f = kill_units;
57287 + kill_tail_f = kill_tail;
57288 + kill_head_f = kill_head;
57289 + }
57290 +
57291 + if (parse_cut(cinfo, params) == 1) {
57292 + /* cut from the middle of item */
57293 + freed =
57294 + kill_units_f(params->from, params->from->unit_pos,
57295 + params->to->unit_pos, data,
57296 + params->smallest_removed, NULL);
57297 +
57298 + item_pos = params->from->item_pos;
57299 + ih = node40_ih_at(node, item_pos);
57300 + cinfo->freed_space_start =
57301 + ih40_get_offset(ih) + node40_item_length(node,
57302 + item_pos) - freed;
57303 + cinfo->freed_space_end = cinfo->freed_space_start + freed;
57304 + cinfo->first_moved = item_pos + 1;
57305 + } else {
57306 + assert("vs-1521", (cinfo->tail_removed != MAX_POS_IN_NODE ||
57307 + cinfo->first_removed != MAX_POS_IN_NODE ||
57308 + cinfo->head_removed != MAX_POS_IN_NODE));
57309 +
57310 + switch (cinfo->mode) {
57311 + case CMODE_TAIL:
57312 + /* one item gets cut partially from its end */
57313 + assert("vs-1562",
57314 + cinfo->tail_removed == params->from->item_pos);
57315 +
57316 + freed =
57317 + kill_tail_f(params->from, data,
57318 + params->smallest_removed);
57319 +
57320 + item_pos = cinfo->tail_removed;
57321 + ih = node40_ih_at(node, item_pos);
57322 + cinfo->freed_space_start =
57323 + ih40_get_offset(ih) + node40_item_length(node,
57324 + item_pos) -
57325 + freed;
57326 + cinfo->freed_space_end =
57327 + cinfo->freed_space_start + freed;
57328 + cinfo->first_moved = cinfo->tail_removed + 1;
57329 + break;
57330 +
57331 + case CMODE_WHOLE:
57332 + /* one or more items get removed completely */
57333 + assert("vs-1563",
57334 + cinfo->first_removed == params->from->item_pos);
57335 + assert("vs-1564", cinfo->removed_count > 0
57336 + && cinfo->removed_count != MAX_POS_IN_NODE);
57337 +
57338 + /* call kill hook for all items removed completely */
57339 + if (is_cut == 0)
57340 + call_kill_hooks(node, cinfo->first_removed,
57341 + cinfo->removed_count, data);
57342 +
57343 + item_pos = cinfo->first_removed;
57344 + ih = node40_ih_at(node, item_pos);
57345 +
57346 + if (params->smallest_removed)
57347 + memcpy(params->smallest_removed, &ih->key,
57348 + sizeof(reiser4_key));
57349 +
57350 + cinfo->freed_space_start = ih40_get_offset(ih);
57351 +
57352 + item_pos += (cinfo->removed_count - 1);
57353 + ih -= (cinfo->removed_count - 1);
57354 + cinfo->freed_space_end =
57355 + ih40_get_offset(ih) + node40_item_length(node,
57356 + item_pos);
57357 + cinfo->first_moved = item_pos + 1;
57358 + if (cinfo->first_removed == 0)
57359 + /* key of first item of the node changes */
57360 + retval = 1;
57361 + break;
57362 +
57363 + case CMODE_HEAD:
57364 + /* one item gets cut partially from its head */
57365 + assert("vs-1565",
57366 + cinfo->head_removed == params->from->item_pos);
57367 +
57368 + freed =
57369 + kill_head_f(params->to, data,
57370 + params->smallest_removed,
57371 + &new_first_key);
57372 +
57373 + item_pos = cinfo->head_removed;
57374 + ih = node40_ih_at(node, item_pos);
57375 + cinfo->freed_space_start = ih40_get_offset(ih);
57376 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
57377 + cinfo->first_moved = cinfo->head_removed + 1;
57378 +
57379 + /* item head is removed, therefore, item key changed */
57380 + coord.node = node;
57381 + coord_set_item_pos(&coord, item_pos);
57382 + coord.unit_pos = 0;
57383 + coord.between = AT_UNIT;
57384 + update_item_key_node40(&coord, &new_first_key, NULL);
57385 + if (item_pos == 0)
57386 + /* key of first item of the node changes */
57387 + retval = 1;
57388 + break;
57389 +
57390 + case CMODE_TAIL | CMODE_WHOLE:
57391 + /* one item gets cut from its end and one or more items get removed completely */
57392 + assert("vs-1566",
57393 + cinfo->tail_removed == params->from->item_pos);
57394 + assert("vs-1567",
57395 + cinfo->first_removed == cinfo->tail_removed + 1);
57396 + assert("vs-1564", cinfo->removed_count > 0
57397 + && cinfo->removed_count != MAX_POS_IN_NODE);
57398 +
57399 + freed =
57400 + kill_tail_f(params->from, data,
57401 + params->smallest_removed);
57402 +
57403 + item_pos = cinfo->tail_removed;
57404 + ih = node40_ih_at(node, item_pos);
57405 + cinfo->freed_space_start =
57406 + ih40_get_offset(ih) + node40_item_length(node,
57407 + item_pos) -
57408 + freed;
57409 +
57410 + /* call kill hook for all items removed completely */
57411 + if (is_cut == 0)
57412 + call_kill_hooks(node, cinfo->first_removed,
57413 + cinfo->removed_count, data);
57414 +
57415 + item_pos += cinfo->removed_count;
57416 + ih -= cinfo->removed_count;
57417 + cinfo->freed_space_end =
57418 + ih40_get_offset(ih) + node40_item_length(node,
57419 + item_pos);
57420 + cinfo->first_moved = item_pos + 1;
57421 + break;
57422 +
57423 + case CMODE_WHOLE | CMODE_HEAD:
57424 + /* one or more items get removed completely and one item gets cut partially from its head */
57425 + assert("vs-1568",
57426 + cinfo->first_removed == params->from->item_pos);
57427 + assert("vs-1564", cinfo->removed_count > 0
57428 + && cinfo->removed_count != MAX_POS_IN_NODE);
57429 + assert("vs-1569",
57430 + cinfo->head_removed ==
57431 + cinfo->first_removed + cinfo->removed_count);
57432 +
57433 + /* call kill hook for all items removed completely */
57434 + if (is_cut == 0)
57435 + call_kill_hooks(node, cinfo->first_removed,
57436 + cinfo->removed_count, data);
57437 +
57438 + item_pos = cinfo->first_removed;
57439 + ih = node40_ih_at(node, item_pos);
57440 +
57441 + if (params->smallest_removed)
57442 + memcpy(params->smallest_removed, &ih->key,
57443 + sizeof(reiser4_key));
57444 +
57445 + freed =
57446 + kill_head_f(params->to, data, NULL, &new_first_key);
57447 +
57448 + cinfo->freed_space_start = ih40_get_offset(ih);
57449 +
57450 + ih = node40_ih_at(node, cinfo->head_removed);
57451 + /* this is the most complex case. Item which got head removed and items which are to be moved
57452 + intact change their location differently. */
57453 + cinfo->freed_space_end = ih40_get_offset(ih) + freed;
57454 + cinfo->first_moved = cinfo->head_removed;
57455 + cinfo->head_removed_location = cinfo->freed_space_start;
57456 +
57457 + /* item head is removed, therefore, item key changed */
57458 + coord.node = node;
57459 + coord_set_item_pos(&coord, cinfo->head_removed);
57460 + coord.unit_pos = 0;
57461 + coord.between = AT_UNIT;
57462 + update_item_key_node40(&coord, &new_first_key, NULL);
57463 +
57464 + assert("vs-1579", cinfo->first_removed == 0);
57465 + /* key of first item of the node changes */
57466 + retval = 1;
57467 + break;
57468 +
57469 + case CMODE_TAIL | CMODE_HEAD:
57470 + /* one item gets cut from its end and its neighbor gets cut from its head */
57471 + impossible("vs-1576", "this can not happen currently");
57472 + break;
57473 +
57474 + case CMODE_TAIL | CMODE_WHOLE | CMODE_HEAD:
57475 + impossible("vs-1577", "this can not happen currently");
57476 + break;
57477 + default:
57478 + impossible("vs-1578", "unexpected cut mode");
57479 + break;
57480 + }
57481 + }
57482 + return retval;
57483 +}
57484 +
57485 +/* plugin->u.node.kill
57486 + return value is number of items removed completely */
57487 +int kill_node40(struct carry_kill_data *kdata, carry_plugin_info * info)
57488 +{
57489 + znode *node;
57490 + struct cut40_info cinfo;
57491 + int first_key_changed;
57492 +
57493 + node = kdata->params.from->node;
57494 +
57495 + first_key_changed =
57496 + prepare_for_compact(&cinfo, &kdata->params, 0 /* not cut */ , kdata,
57497 + info);
57498 + compact(node, &cinfo);
57499 +
57500 + if (info) {
57501 + /* it is not called by node40_shift, so we have to take care
57502 + of changes on upper levels */
57503 + if (node_is_empty(node)
57504 + && !(kdata->flags & DELETE_RETAIN_EMPTY))
57505 + /* all contents of the node are deleted */
57506 + prepare_removal_node40(node, info);
57507 + else if (first_key_changed) {
57508 + prepare_for_update(NULL, node, info);
57509 + }
57510 + }
57511 +
57512 + coord_clear_iplug(kdata->params.from);
57513 + coord_clear_iplug(kdata->params.to);
57514 +
57515 + znode_make_dirty(node);
57516 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
57517 +}
57518 +
57519 +/* plugin->u.node.cut
57520 + return value is number of items removed completely */
57521 +int cut_node40(struct carry_cut_data *cdata, carry_plugin_info * info)
57522 +{
57523 + znode *node;
57524 + struct cut40_info cinfo;
57525 + int first_key_changed;
57526 +
57527 + node = cdata->params.from->node;
57528 +
57529 + first_key_changed =
57530 +	    prepare_for_compact(&cinfo, &cdata->params, 1 /* cut */ , cdata,
57531 + info);
57532 + compact(node, &cinfo);
57533 +
57534 + if (info) {
57535 +		/* it is not called by shift_node40, so we have to take care
57536 +		   of changes on upper levels */
57537 + if (node_is_empty(node))
57538 +			/* the entire contents of the node are deleted */
57539 + prepare_removal_node40(node, info);
57540 + else if (first_key_changed) {
57541 + prepare_for_update(NULL, node, info);
57542 + }
57543 + }
57544 +
57545 + coord_clear_iplug(cdata->params.from);
57546 + coord_clear_iplug(cdata->params.to);
57547 +
57548 + znode_make_dirty(node);
57549 + return cinfo.removed_count == MAX_POS_IN_NODE ? 0 : cinfo.removed_count;
57550 +}
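+
+/* A minimal usage sketch (it mirrors delete_copied() below): to cut all
+   units between two coords of one node, without invoking kill hooks, fill
+   a carry_cut_data and call cut_node40:
+
+	struct carry_cut_data cdata;
+
+	cdata.params.from = &from;
+	cdata.params.to = &to;
+	cdata.params.from_key = NULL;
+	cdata.params.to_key = NULL;
+	cdata.params.smallest_removed = NULL;
+	removed = cut_node40(&cdata, NULL);
+
+   passing NULL for carry_plugin_info skips the upper-level bookkeeping.
+   kill_node40() differs in that it also calls the kill hooks of completely
+   removed items and honors the DELETE_RETAIN_EMPTY flag. */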
57551 +
57552 +/* this structure is used by shift method of node40 plugin */
57553 +struct shift_params {
57554 +	shift_direction pend;	/* when @pend == append we are shifting to the
57555 +				   left; when @pend == prepend, to the right */
57556 + coord_t wish_stop; /* when shifting to left this is last unit we
57557 + want shifted, when shifting to right - this
57558 + is set to unit we want to start shifting
57559 + from */
57560 + znode *target;
57561 + int everything; /* it is set to 1 if everything we have to shift is
57562 + shifted, 0 - otherwise */
57563 +
57564 + /* FIXME-VS: get rid of read_stop */
57565 +
57566 + /* these are set by estimate_shift */
57567 + coord_t real_stop; /* this will be set to last unit which will be
57568 + really shifted */
57569 +
57570 +	/* coordinate in the source node, before the operation, of the unit
57571 +	   which becomes first after a shift to left or last after a shift to right */
57572 + union {
57573 + coord_t future_first;
57574 + coord_t future_last;
57575 + } u;
57576 +
57577 + unsigned merging_units; /* number of units of first item which have to
57578 + be merged with last item of target node */
57579 + unsigned merging_bytes; /* number of bytes in those units */
57580 +
57581 + unsigned entire; /* items shifted in their entirety */
57582 + unsigned entire_bytes; /* number of bytes in those items */
57583 +
57584 + unsigned part_units; /* number of units of partially copied item */
57585 + unsigned part_bytes; /* number of bytes in those units */
57586 +
57587 + unsigned shift_bytes; /* total number of bytes in items shifted (item
57588 + headers not included) */
57589 +
57590 +};
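+
+/* A minimal sketch of how this structure is driven (this is what
+   shift_node40() below does): zero it, fill the three input fields, then
+   run the estimate/copy/delete pipeline:
+
+	struct shift_params shift;
+
+	memset(&shift, 0, sizeof(shift));
+	shift.pend = SHIFT_LEFT;
+	shift.wish_stop = *from;
+	shift.target = to;
+
+	estimate_shift(&shift, get_current_context());
+	if (shift.shift_bytes) {
+		copy(&shift);
+		removed = delete_copied(&shift);
+		adjust_coord(from, &shift, removed, including_stop_coord);
+	}
+ */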
57591 +
57592 +static int item_creation_overhead(coord_t *item)
57593 +{
57594 + return node_plugin_by_coord(item)->item_overhead(item->node, NULL);
57595 +}
57596 +
57597 +/* how many units are there in @source starting from source->unit_pos
57598 +   but no further than @stop_coord */
57599 +static int
57600 +wanted_units(coord_t *source, coord_t *stop_coord, shift_direction pend)
57601 +{
57602 + if (pend == SHIFT_LEFT) {
57603 + assert("vs-181", source->unit_pos == 0);
57604 + } else {
57605 + assert("vs-182",
57606 + source->unit_pos == coord_last_unit_pos(source));
57607 + }
57608 +
57609 + if (source->item_pos != stop_coord->item_pos) {
57610 + /* @source and @stop_coord are different items */
57611 + return coord_last_unit_pos(source) + 1;
57612 + }
57613 +
57614 + if (pend == SHIFT_LEFT) {
57615 + return stop_coord->unit_pos + 1;
57616 + } else {
57617 + return source->unit_pos - stop_coord->unit_pos + 1;
57618 + }
57619 +}
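+
+/* Example: shifting to the left with @source and @stop_coord in the same
+   item, source->unit_pos == 0 and stop_coord->unit_pos == 4, gives
+   wanted_units() == 5: units 0..4 are wanted. */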
57620 +
57621 +/* this calculates what can be copied from @shift->wish_stop.node to
57622 + @shift->target */
57623 +static void
57624 +estimate_shift(struct shift_params *shift, const reiser4_context * ctx)
57625 +{
57626 + unsigned target_free_space, size;
57627 + pos_in_node_t stop_item; /* item which estimating should not consider */
57628 + unsigned want; /* number of units of item we want shifted */
57629 + coord_t source; /* item being estimated */
57630 + item_plugin *iplug;
57631 +
57632 + /* shifting to left/right starts from first/last units of
57633 + @shift->wish_stop.node */
57634 + if (shift->pend == SHIFT_LEFT) {
57635 + coord_init_first_unit(&source, shift->wish_stop.node);
57636 + } else {
57637 + coord_init_last_unit(&source, shift->wish_stop.node);
57638 + }
57639 + shift->real_stop = source;
57640 +
57641 +	/* free space in the target node */
57642 + target_free_space = znode_free_space(shift->target);
57643 +
57644 + shift->everything = 0;
57645 + if (!node_is_empty(shift->target)) {
57646 +		/* target node is not empty, check whether the boundary items
57647 +		   are mergeable */
57648 + coord_t to;
57649 +
57650 + /* item we try to merge @source with */
57651 + if (shift->pend == SHIFT_LEFT) {
57652 + coord_init_last_unit(&to, shift->target);
57653 + } else {
57654 + coord_init_first_unit(&to, shift->target);
57655 + }
57656 +
57657 + if ((shift->pend == SHIFT_LEFT) ? are_items_mergeable(&to,
57658 + &source) :
57659 + are_items_mergeable(&source, &to)) {
57660 + /* how many units of @source do we want to merge to
57661 + item @to */
57662 + want =
57663 + wanted_units(&source, &shift->wish_stop,
57664 + shift->pend);
57665 +
57666 + /* how many units of @source we can merge to item
57667 + @to */
57668 + iplug = item_plugin_by_coord(&source);
57669 + if (iplug->b.can_shift != NULL)
57670 + shift->merging_units =
57671 + iplug->b.can_shift(target_free_space,
57672 + &source, shift->target,
57673 + shift->pend, &size,
57674 + want);
57675 + else {
57676 + shift->merging_units = 0;
57677 + size = 0;
57678 + }
57679 + shift->merging_bytes = size;
57680 + shift->shift_bytes += size;
57681 + /* update stop coord to be set to last unit of @source
57682 + we can merge to @target */
57683 + if (shift->merging_units)
57684 + /* at least one unit can be shifted */
57685 + shift->real_stop.unit_pos =
57686 + (shift->merging_units - source.unit_pos -
57687 + 1) * shift->pend;
57688 + else {
57689 + /* nothing can be shifted */
57690 + if (shift->pend == SHIFT_LEFT)
57691 + coord_init_before_first_item(&shift->
57692 + real_stop,
57693 + source.
57694 + node);
57695 + else
57696 + coord_init_after_last_item(&shift->
57697 + real_stop,
57698 + source.node);
57699 + }
57700 + assert("nikita-2081", shift->real_stop.unit_pos + 1);
57701 +
57702 + if (shift->merging_units != want) {
57703 + /* we could not copy as many as we want, so,
57704 + there is no reason for estimating any
57705 + longer */
57706 + return;
57707 + }
57708 +
57709 + target_free_space -= size;
57710 + coord_add_item_pos(&source, shift->pend);
57711 + }
57712 + }
57713 +
57714 +	/* index of the item nothing of which we want to shift */
57715 + stop_item = shift->wish_stop.item_pos + shift->pend;
57716 +
57717 + /* calculate how many items can be copied into given free
57718 + space as whole */
57719 + for (; source.item_pos != stop_item;
57720 + coord_add_item_pos(&source, shift->pend)) {
57721 + if (shift->pend == SHIFT_RIGHT)
57722 + source.unit_pos = coord_last_unit_pos(&source);
57723 +
57724 + /* how many units of @source do we want to copy */
57725 + want = wanted_units(&source, &shift->wish_stop, shift->pend);
57726 +
57727 + if (want == coord_last_unit_pos(&source) + 1) {
57728 + /* we want this item to be copied entirely */
57729 + size =
57730 + item_length_by_coord(&source) +
57731 + item_creation_overhead(&source);
57732 + if (size <= target_free_space) {
57733 + /* item fits into target node as whole */
57734 + target_free_space -= size;
57735 + shift->shift_bytes +=
57736 + size - item_creation_overhead(&source);
57737 + shift->entire_bytes +=
57738 + size - item_creation_overhead(&source);
57739 + shift->entire++;
57740 +
57741 + /* update shift->real_stop coord to be set to
57742 + last unit of @source we can merge to
57743 + @target */
57744 + shift->real_stop = source;
57745 + if (shift->pend == SHIFT_LEFT)
57746 + shift->real_stop.unit_pos =
57747 + coord_last_unit_pos(&shift->
57748 + real_stop);
57749 + else
57750 + shift->real_stop.unit_pos = 0;
57751 + continue;
57752 + }
57753 + }
57754 +
57755 +		/* we reach here only for an item which does not fit into the
57756 +		   target node in its entirety. This item may be either
57757 +		   partially shifted, or not shifted at all. We will have to
57758 +		   create a new item in the target node, so decrease the amount
57759 +		   of free space by an item creation overhead. We can also reach
57760 +		   here if the stop coord is in this item */
57761 + if (target_free_space >=
57762 + (unsigned)item_creation_overhead(&source)) {
57763 + target_free_space -= item_creation_overhead(&source);
57764 + iplug = item_plugin_by_coord(&source);
57765 + if (iplug->b.can_shift) {
57766 + shift->part_units = iplug->b.can_shift(target_free_space,
57767 + &source,
57768 + NULL, /* target */
57769 + shift->pend,
57770 + &size,
57771 + want);
57772 + } else {
57773 + target_free_space = 0;
57774 + shift->part_units = 0;
57775 + size = 0;
57776 + }
57777 + } else {
57778 + target_free_space = 0;
57779 + shift->part_units = 0;
57780 + size = 0;
57781 + }
57782 + shift->part_bytes = size;
57783 + shift->shift_bytes += size;
57784 +
57785 + /* set @shift->real_stop to last unit of @source we can merge
57786 + to @shift->target */
57787 + if (shift->part_units) {
57788 + shift->real_stop = source;
57789 + shift->real_stop.unit_pos =
57790 + (shift->part_units - source.unit_pos -
57791 + 1) * shift->pend;
57792 + assert("nikita-2082", shift->real_stop.unit_pos + 1);
57793 + }
57794 +
57795 + if (want != shift->part_units)
57796 +			/* not everything wanted was shifted */
57797 + return;
57798 + break;
57799 + }
57800 +
57801 + shift->everything = 1;
57802 +}
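+
+/* To summarize: the estimated shift always decomposes into at most three
+   parts, applied in this order: @merging_units units merged into the
+   boundary item of @target, @entire items copied in their entirety (each
+   paying item_creation_overhead()), and @part_units units of one more item
+   copied as a new, partial item. @shift_bytes is the sum of the three byte
+   counters and does not include item header overhead. */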
57803 +
57804 +static void
57805 +copy_units(coord_t * target, coord_t * source, unsigned from, unsigned count,
57806 + shift_direction dir, unsigned free_space)
57807 +{
57808 + item_plugin *iplug;
57809 +
57810 + assert("nikita-1463", target != NULL);
57811 + assert("nikita-1464", source != NULL);
57812 + assert("nikita-1465", from + count <= coord_num_units(source));
57813 +
57814 + iplug = item_plugin_by_coord(source);
57815 + assert("nikita-1468", iplug == item_plugin_by_coord(target));
57816 + iplug->b.copy_units(target, source, from, count, dir, free_space);
57817 +
57818 + if (dir == SHIFT_RIGHT) {
57819 +		/* FIXME-VS: this looks unnecessary. update_item_key was
57820 +		   already called by the copy_units method */
57821 + reiser4_key split_key;
57822 +
57823 + assert("nikita-1469", target->unit_pos == 0);
57824 +
57825 + unit_key_by_coord(target, &split_key);
57826 + node_plugin_by_coord(target)->update_item_key(target,
57827 + &split_key, NULL);
57828 + }
57829 +}
57830 +
57831 +/* copy part of @shift->real_stop.node starting either from its beginning or
57832 + from its end and ending at @shift->real_stop to either the end or the
57833 + beginning of @shift->target */
57834 +static void copy(struct shift_params *shift)
57835 +{
57836 + node40_header *nh;
57837 + coord_t from;
57838 + coord_t to;
57839 + item_header40 *from_ih, *to_ih;
57840 + int free_space_start;
57841 + int new_items;
57842 + unsigned old_items;
57843 + int old_offset;
57844 + unsigned i;
57845 +
57846 + nh = node40_node_header(shift->target);
57847 + free_space_start = nh40_get_free_space_start(nh);
57848 + old_items = nh40_get_num_items(nh);
57849 + new_items = shift->entire + (shift->part_units ? 1 : 0);
57850 + assert("vs-185",
57851 + shift->shift_bytes ==
57852 + shift->merging_bytes + shift->entire_bytes + shift->part_bytes);
57853 +
57854 + from = shift->wish_stop;
57855 +
57856 + coord_init_first_unit(&to, shift->target);
57857 +
57858 + /* NOTE:NIKITA->VS not sure what I am doing: shift->target is empty,
57859 + hence to.between is set to EMPTY_NODE above. Looks like we want it
57860 + to be AT_UNIT.
57861 +
57862 + Oh, wonders of ->betweeness...
57863 +
57864 + */
57865 + to.between = AT_UNIT;
57866 +
57867 + if (shift->pend == SHIFT_LEFT) {
57868 + /* copying to left */
57869 +
57870 + coord_set_item_pos(&from, 0);
57871 + from_ih = node40_ih_at(from.node, 0);
57872 +
57873 + coord_set_item_pos(&to,
57874 + node40_num_of_items_internal(to.node) - 1);
57875 + if (shift->merging_units) {
57876 + /* expand last item, so that plugin methods will see
57877 + correct data */
57878 + free_space_start += shift->merging_bytes;
57879 + nh40_set_free_space_start(nh,
57880 + (unsigned)free_space_start);
57881 + nh40_set_free_space(nh,
57882 + nh40_get_free_space(nh) -
57883 + shift->merging_bytes);
57884 +
57885 + /* appending last item of @target */
57886 + copy_units(&to, &from, 0, /* starting from 0-th unit */
57887 + shift->merging_units, SHIFT_LEFT,
57888 + shift->merging_bytes);
57889 + coord_inc_item_pos(&from);
57890 + from_ih--;
57891 + coord_inc_item_pos(&to);
57892 + }
57893 +
57894 + to_ih = node40_ih_at(shift->target, old_items);
57895 + if (shift->entire) {
57896 + /* copy @entire items entirely */
57897 +
57898 + /* copy item headers */
57899 + memcpy(to_ih - shift->entire + 1,
57900 + from_ih - shift->entire + 1,
57901 + shift->entire * sizeof(item_header40));
57902 + /* update item header offset */
57903 + old_offset = ih40_get_offset(from_ih);
57904 + /* AUDIT: Looks like if we calculate old_offset + free_space_start here instead of just old_offset, we can perform one "add" operation less per each iteration */
57905 + for (i = 0; i < shift->entire; i++, to_ih--, from_ih--)
57906 + ih40_set_offset(to_ih,
57907 + ih40_get_offset(from_ih) -
57908 + old_offset + free_space_start);
57909 +
57910 + /* copy item bodies */
57911 + memcpy(zdata(shift->target) + free_space_start, zdata(from.node) + old_offset, /*ih40_get_offset (from_ih), */
57912 + shift->entire_bytes);
57913 +
57914 + coord_add_item_pos(&from, (int)shift->entire);
57915 + coord_add_item_pos(&to, (int)shift->entire);
57916 + }
57917 +
57918 + nh40_set_free_space_start(nh,
57919 + free_space_start +
57920 + shift->shift_bytes -
57921 + shift->merging_bytes);
57922 + nh40_set_free_space(nh,
57923 + nh40_get_free_space(nh) -
57924 + (shift->shift_bytes - shift->merging_bytes +
57925 + sizeof(item_header40) * new_items));
57926 +
57927 + /* update node header */
57928 + node40_set_num_items(shift->target, nh, old_items + new_items);
57929 + assert("vs-170",
57930 + nh40_get_free_space(nh) < znode_size(shift->target));
57931 +
57932 + if (shift->part_units) {
57933 + /* copy heading part (@part units) of @source item as
57934 + a new item into @target->node */
57935 +
57936 + /* copy item header of partially copied item */
57937 + coord_set_item_pos(&to,
57938 + node40_num_of_items_internal(to.node)
57939 + - 1);
57940 + memcpy(to_ih, from_ih, sizeof(item_header40));
57941 + ih40_set_offset(to_ih,
57942 + nh40_get_free_space_start(nh) -
57943 + shift->part_bytes);
57944 + if (item_plugin_by_coord(&to)->b.init)
57945 + item_plugin_by_coord(&to)->b.init(&to, &from,
57946 + NULL);
57947 + copy_units(&to, &from, 0, shift->part_units, SHIFT_LEFT,
57948 + shift->part_bytes);
57949 + }
57950 +
57951 + } else {
57952 + /* copying to right */
57953 +
57954 + coord_set_item_pos(&from,
57955 + node40_num_of_items_internal(from.node) - 1);
57956 + from_ih = node40_ih_at_coord(&from);
57957 +
57958 + coord_set_item_pos(&to, 0);
57959 +
57960 + /* prepare space for new items */
57961 + memmove(zdata(to.node) + sizeof(node40_header) +
57962 + shift->shift_bytes,
57963 + zdata(to.node) + sizeof(node40_header),
57964 + free_space_start - sizeof(node40_header));
57965 + /* update item headers of moved items */
57966 + to_ih = node40_ih_at(to.node, 0);
57967 + /* first item gets @merging_bytes longer. free space appears
57968 + at its beginning */
57969 + if (!node_is_empty(to.node))
57970 + ih40_set_offset(to_ih,
57971 + ih40_get_offset(to_ih) +
57972 + shift->shift_bytes -
57973 + shift->merging_bytes);
57974 +
57975 + for (i = 1; i < old_items; i++)
57976 + ih40_set_offset(to_ih - i,
57977 + ih40_get_offset(to_ih - i) +
57978 + shift->shift_bytes);
57979 +
57980 + /* move item headers to make space for new items */
57981 + memmove(to_ih - old_items + 1 - new_items,
57982 + to_ih - old_items + 1,
57983 + sizeof(item_header40) * old_items);
57984 + to_ih -= (new_items - 1);
57985 +
57986 + nh40_set_free_space_start(nh,
57987 + free_space_start +
57988 + shift->shift_bytes);
57989 + nh40_set_free_space(nh,
57990 + nh40_get_free_space(nh) -
57991 + (shift->shift_bytes +
57992 + sizeof(item_header40) * new_items));
57993 +
57994 + /* update node header */
57995 + node40_set_num_items(shift->target, nh, old_items + new_items);
57996 + assert("vs-170",
57997 + nh40_get_free_space(nh) < znode_size(shift->target));
57998 +
57999 + if (shift->merging_units) {
58000 + coord_add_item_pos(&to, new_items);
58001 + to.unit_pos = 0;
58002 + to.between = AT_UNIT;
58003 + /* prepend first item of @to */
58004 + copy_units(&to, &from,
58005 + coord_last_unit_pos(&from) -
58006 + shift->merging_units + 1,
58007 + shift->merging_units, SHIFT_RIGHT,
58008 + shift->merging_bytes);
58009 + coord_dec_item_pos(&from);
58010 + from_ih++;
58011 + }
58012 +
58013 + if (shift->entire) {
58014 + /* copy @entire items entirely */
58015 +
58016 + /* copy item headers */
58017 + memcpy(to_ih, from_ih,
58018 + shift->entire * sizeof(item_header40));
58019 +
58020 + /* update item header offset */
58021 + old_offset =
58022 + ih40_get_offset(from_ih + shift->entire - 1);
58023 + /* AUDIT: old_offset + sizeof (node40_header) + shift->part_bytes calculation can be taken off the loop. */
58024 + for (i = 0; i < shift->entire; i++, to_ih++, from_ih++)
58025 + ih40_set_offset(to_ih,
58026 + ih40_get_offset(from_ih) -
58027 + old_offset +
58028 + sizeof(node40_header) +
58029 + shift->part_bytes);
58030 + /* copy item bodies */
58031 + coord_add_item_pos(&from, -(int)(shift->entire - 1));
58032 + memcpy(zdata(to.node) + sizeof(node40_header) +
58033 + shift->part_bytes, item_by_coord_node40(&from),
58034 + shift->entire_bytes);
58035 + coord_dec_item_pos(&from);
58036 + }
58037 +
58038 + if (shift->part_units) {
58039 + coord_set_item_pos(&to, 0);
58040 + to.unit_pos = 0;
58041 + to.between = AT_UNIT;
58042 + /* copy heading part (@part units) of @source item as
58043 + a new item into @target->node */
58044 +
58045 + /* copy item header of partially copied item */
58046 + memcpy(to_ih, from_ih, sizeof(item_header40));
58047 + ih40_set_offset(to_ih, sizeof(node40_header));
58048 + if (item_plugin_by_coord(&to)->b.init)
58049 + item_plugin_by_coord(&to)->b.init(&to, &from,
58050 + NULL);
58051 + copy_units(&to, &from,
58052 + coord_last_unit_pos(&from) -
58053 + shift->part_units + 1, shift->part_units,
58054 + SHIFT_RIGHT, shift->part_bytes);
58055 + }
58056 + }
58057 +}
58058 +
58059 +/* remove everything either before or after @shift->real_stop. The number
58060 +   of items removed completely is returned */
58061 +static int delete_copied(struct shift_params *shift)
58062 +{
58063 + coord_t from;
58064 + coord_t to;
58065 + struct carry_cut_data cdata;
58066 +
58067 + if (shift->pend == SHIFT_LEFT) {
58068 +		/* we were shifting to the left, remove everything from the
58069 +		   beginning of @shift->wish_stop.node up to
58070 +		   @shift->wish_stop */
58071 + coord_init_first_unit(&from, shift->real_stop.node);
58072 + to = shift->real_stop;
58073 +
58074 + /* store old coordinate of unit which will be first after
58075 + shift to left */
58076 + shift->u.future_first = to;
58077 + coord_next_unit(&shift->u.future_first);
58078 + } else {
58079 +		/* we were shifting to the right, remove everything from
58080 +		   @shift->real_stop up to the end of
58081 +		   @shift->real_stop.node */
58082 + from = shift->real_stop;
58083 + coord_init_last_unit(&to, from.node);
58084 +
58085 + /* store old coordinate of unit which will be last after
58086 + shift to right */
58087 + shift->u.future_last = from;
58088 + coord_prev_unit(&shift->u.future_last);
58089 + }
58090 +
58091 + cdata.params.from = &from;
58092 + cdata.params.to = &to;
58093 + cdata.params.from_key = NULL;
58094 + cdata.params.to_key = NULL;
58095 + cdata.params.smallest_removed = NULL;
58096 + return cut_node40(&cdata, NULL);
58097 +}
58098 +
58099 +/* something was moved between @left and @right. Add a carry operation to the
58100 +   @info list so that carry will update the delimiting key between them */
58101 +static int
58102 +prepare_for_update(znode * left, znode * right, carry_plugin_info * info)
58103 +{
58104 + carry_op *op;
58105 + carry_node *cn;
58106 +
58107 + if (info == NULL)
58108 + /* nowhere to send operation to. */
58109 + return 0;
58110 +
58111 + if (!should_notify_parent(right))
58112 + return 0;
58113 +
58114 + op = node_post_carry(info, COP_UPDATE, right, 1);
58115 + if (IS_ERR(op) || op == NULL)
58116 + return op ? PTR_ERR(op) : -EIO;
58117 +
58118 + if (left != NULL) {
58119 + carry_node *reference;
58120 +
58121 + if (info->doing)
58122 + reference = insert_carry_node(info->doing,
58123 + info->todo, left);
58124 + else
58125 + reference = op->node;
58126 + assert("nikita-2992", reference != NULL);
58127 + cn = reiser4_add_carry(info->todo, POOLO_BEFORE, reference);
58128 + if (IS_ERR(cn))
58129 + return PTR_ERR(cn);
58130 + cn->parent = 1;
58131 + cn->node = left;
58132 + if (ZF_ISSET(left, JNODE_ORPHAN))
58133 + cn->left_before = 1;
58134 + op->u.update.left = cn;
58135 + } else
58136 + op->u.update.left = NULL;
58137 + return 0;
58138 +}
58139 +
58140 +/* plugin->u.node.prepare_removal
58141 + to delete a pointer to @empty from the tree add corresponding carry
58142 + operation (delete) to @info list */
58143 +int prepare_removal_node40(znode * empty, carry_plugin_info * info)
58144 +{
58145 + carry_op *op;
58146 + reiser4_tree *tree;
58147 +
58148 + if (!should_notify_parent(empty))
58149 + return 0;
58150 + /* already on a road to Styx */
58151 + if (ZF_ISSET(empty, JNODE_HEARD_BANSHEE))
58152 + return 0;
58153 + op = node_post_carry(info, COP_DELETE, empty, 1);
58154 + if (IS_ERR(op) || op == NULL)
58155 + return RETERR(op ? PTR_ERR(op) : -EIO);
58156 +
58157 + op->u.delete.child = NULL;
58158 + op->u.delete.flags = 0;
58159 +
58160 + /* fare thee well */
58161 + tree = znode_get_tree(empty);
58162 + read_lock_tree(tree);
58163 + write_lock_dk(tree);
58164 + znode_set_ld_key(empty, znode_get_rd_key(empty));
58165 + if (znode_is_left_connected(empty) && empty->left)
58166 + znode_set_rd_key(empty->left, znode_get_rd_key(empty));
58167 + write_unlock_dk(tree);
58168 + read_unlock_tree(tree);
58169 +
58170 + ZF_SET(empty, JNODE_HEARD_BANSHEE);
58171 + return 0;
58172 +}
58173 +
58174 +/* something was shifted from @insert_coord->node to @shift->target, update
58175 +   @insert_coord correspondingly */
58176 +static void
58177 +adjust_coord(coord_t * insert_coord, struct shift_params *shift, int removed,
58178 + int including_insert_coord)
58179 +{
58180 + /* item plugin was invalidated by shifting */
58181 + coord_clear_iplug(insert_coord);
58182 +
58183 + if (node_is_empty(shift->wish_stop.node)) {
58184 + assert("vs-242", shift->everything);
58185 + if (including_insert_coord) {
58186 + if (shift->pend == SHIFT_RIGHT) {
58187 + /* set @insert_coord before first unit of
58188 + @shift->target node */
58189 + coord_init_before_first_item(insert_coord,
58190 + shift->target);
58191 + } else {
58192 + /* set @insert_coord after last in target node */
58193 + coord_init_after_last_item(insert_coord,
58194 + shift->target);
58195 + }
58196 + } else {
58197 + /* set @insert_coord inside of empty node. There is
58198 + only one possible coord within an empty
58199 + node. init_first_unit will set that coord */
58200 + coord_init_first_unit(insert_coord,
58201 + shift->wish_stop.node);
58202 + }
58203 + return;
58204 + }
58205 +
58206 + if (shift->pend == SHIFT_RIGHT) {
58207 + /* there was shifting to right */
58208 + if (shift->everything) {
58209 + /* everything wanted was shifted */
58210 + if (including_insert_coord) {
58211 + /* @insert_coord is set before first unit of
58212 + @to node */
58213 + coord_init_before_first_item(insert_coord,
58214 + shift->target);
58215 + insert_coord->between = BEFORE_UNIT;
58216 + } else {
58217 + /* @insert_coord is set after last unit of
58218 + @insert->node */
58219 + coord_init_last_unit(insert_coord,
58220 + shift->wish_stop.node);
58221 + insert_coord->between = AFTER_UNIT;
58222 + }
58223 + }
58224 + return;
58225 + }
58226 +
58227 + /* there was shifting to left */
58228 + if (shift->everything) {
58229 + /* everything wanted was shifted */
58230 + if (including_insert_coord) {
58231 + /* @insert_coord is set after last unit in @to node */
58232 + coord_init_after_last_item(insert_coord, shift->target);
58233 + } else {
58234 + /* @insert_coord is set before first unit in the same
58235 + node */
58236 + coord_init_before_first_item(insert_coord,
58237 + shift->wish_stop.node);
58238 + }
58239 + return;
58240 + }
58241 +
58242 + /* FIXME-VS: the code below is complicated because with between ==
58243 + AFTER_ITEM unit_pos is set to 0 */
58244 +
58245 + if (!removed) {
58246 + /* no items were shifted entirely */
58247 + assert("vs-195", shift->merging_units == 0
58248 + || shift->part_units == 0);
58249 +
58250 + if (shift->real_stop.item_pos == insert_coord->item_pos) {
58251 + if (shift->merging_units) {
58252 + if (insert_coord->between == AFTER_UNIT) {
58253 + assert("nikita-1441",
58254 + insert_coord->unit_pos >=
58255 + shift->merging_units);
58256 + insert_coord->unit_pos -=
58257 + shift->merging_units;
58258 + } else if (insert_coord->between == BEFORE_UNIT) {
58259 + assert("nikita-2090",
58260 + insert_coord->unit_pos >
58261 + shift->merging_units);
58262 + insert_coord->unit_pos -=
58263 + shift->merging_units;
58264 + }
58265 +
58266 + assert("nikita-2083",
58267 + insert_coord->unit_pos + 1);
58268 + } else {
58269 + if (insert_coord->between == AFTER_UNIT) {
58270 + assert("nikita-1442",
58271 + insert_coord->unit_pos >=
58272 + shift->part_units);
58273 + insert_coord->unit_pos -=
58274 + shift->part_units;
58275 + } else if (insert_coord->between == BEFORE_UNIT) {
58276 + assert("nikita-2089",
58277 + insert_coord->unit_pos >
58278 + shift->part_units);
58279 + insert_coord->unit_pos -=
58280 + shift->part_units;
58281 + }
58282 +
58283 + assert("nikita-2084",
58284 + insert_coord->unit_pos + 1);
58285 + }
58286 + }
58287 + return;
58288 + }
58289 +
58290 +	/* we shifted to the left and there was not enough space for everything */
58291 + switch (insert_coord->between) {
58292 + case AFTER_UNIT:
58293 + case BEFORE_UNIT:
58294 + if (shift->real_stop.item_pos == insert_coord->item_pos)
58295 + insert_coord->unit_pos -= shift->part_units;
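+		/* fall through */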
58296 + case AFTER_ITEM:
58297 + coord_add_item_pos(insert_coord, -removed);
58298 + break;
58299 + default:
58300 + impossible("nikita-2087", "not ready");
58301 + }
58302 + assert("nikita-2085", insert_coord->unit_pos + 1);
58303 +}
58304 +
58305 +static int call_shift_hooks(struct shift_params *shift)
58306 +{
58307 + unsigned i, shifted;
58308 + coord_t coord;
58309 + item_plugin *iplug;
58310 +
58311 + assert("vs-275", !node_is_empty(shift->target));
58312 +
58313 + /* number of items shift touches */
58314 + shifted =
58315 + shift->entire + (shift->merging_units ? 1 : 0) +
58316 + (shift->part_units ? 1 : 0);
58317 +
58318 + if (shift->pend == SHIFT_LEFT) {
58319 + /* moved items are at the end */
58320 + coord_init_last_unit(&coord, shift->target);
58321 + coord.unit_pos = 0;
58322 +
58323 + assert("vs-279", shift->pend == 1);
58324 + for (i = 0; i < shifted; i++) {
58325 + unsigned from, count;
58326 +
58327 + iplug = item_plugin_by_coord(&coord);
58328 + if (i == 0 && shift->part_units) {
58329 + assert("vs-277",
58330 + coord_num_units(&coord) ==
58331 + shift->part_units);
58332 + count = shift->part_units;
58333 + from = 0;
58334 + } else if (i == shifted - 1 && shift->merging_units) {
58335 + count = shift->merging_units;
58336 + from = coord_num_units(&coord) - count;
58337 + } else {
58338 + count = coord_num_units(&coord);
58339 + from = 0;
58340 + }
58341 +
58342 + if (iplug->b.shift_hook) {
58343 + iplug->b.shift_hook(&coord, from, count,
58344 + shift->wish_stop.node);
58345 + }
58346 + coord_add_item_pos(&coord, -shift->pend);
58347 + }
58348 + } else {
58349 + /* moved items are at the beginning */
58350 + coord_init_first_unit(&coord, shift->target);
58351 +
58352 + assert("vs-278", shift->pend == -1);
58353 + for (i = 0; i < shifted; i++) {
58354 + unsigned from, count;
58355 +
58356 + iplug = item_plugin_by_coord(&coord);
58357 + if (i == 0 && shift->part_units) {
58358 + assert("vs-277",
58359 + coord_num_units(&coord) ==
58360 + shift->part_units);
58361 + count = coord_num_units(&coord);
58362 + from = 0;
58363 + } else if (i == shifted - 1 && shift->merging_units) {
58364 + count = shift->merging_units;
58365 + from = 0;
58366 + } else {
58367 + count = coord_num_units(&coord);
58368 + from = 0;
58369 + }
58370 +
58371 + if (iplug->b.shift_hook) {
58372 + iplug->b.shift_hook(&coord, from, count,
58373 + shift->wish_stop.node);
58374 + }
58375 + coord_add_item_pos(&coord, -shift->pend);
58376 + }
58377 + }
58378 +
58379 + return 0;
58380 +}
58381 +
58382 +/* shift to left is completed. Return 1 if unit @old was moved to left neighbor */
58383 +static int
58384 +unit_moved_left(const struct shift_params *shift, const coord_t * old)
58385 +{
58386 + assert("vs-944", shift->real_stop.node == old->node);
58387 +
58388 + if (shift->real_stop.item_pos < old->item_pos)
58389 + return 0;
58390 + if (shift->real_stop.item_pos == old->item_pos) {
58391 + if (shift->real_stop.unit_pos < old->unit_pos)
58392 + return 0;
58393 + }
58394 + return 1;
58395 +}
58396 +
58397 +/* shift to right is completed. Return 1 if unit @old was moved to right
58398 + neighbor */
58399 +static int
58400 +unit_moved_right(const struct shift_params *shift, const coord_t * old)
58401 +{
58402 + assert("vs-944", shift->real_stop.node == old->node);
58403 +
58404 + if (shift->real_stop.item_pos > old->item_pos)
58405 + return 0;
58406 + if (shift->real_stop.item_pos == old->item_pos) {
58407 + if (shift->real_stop.unit_pos > old->unit_pos)
58408 + return 0;
58409 + }
58410 + return 1;
58411 +}
58412 +
58413 +/* coord @old was set in the node from which the shift was performed. What was
58414 +   shifted is stored in @shift. Update @old to reflect the performed shift */
58415 +static coord_t *adjust_coord2(const struct shift_params *shift,
58416 + const coord_t * old, coord_t * new)
58417 +{
58418 + coord_clear_iplug(new);
58419 + new->between = old->between;
58420 +
58422 + if (old->node == shift->target) {
58423 + if (shift->pend == SHIFT_LEFT) {
58424 + /* coord which is set inside of left neighbor does not
58425 + change during shift to left */
58426 + coord_dup(new, old);
58427 + return new;
58428 + }
58429 + new->node = old->node;
58430 + coord_set_item_pos(new,
58431 + old->item_pos + shift->entire +
58432 + (shift->part_units ? 1 : 0));
58433 + new->unit_pos = old->unit_pos;
58434 + if (old->item_pos == 0 && shift->merging_units)
58435 + new->unit_pos += shift->merging_units;
58436 + return new;
58437 + }
58438 +
58439 + assert("vs-977", old->node == shift->wish_stop.node);
58440 + if (shift->pend == SHIFT_LEFT) {
58441 + if (unit_moved_left(shift, old)) {
58442 + /* unit @old moved to left neighbor. Calculate its
58443 + coordinate there */
58444 + new->node = shift->target;
58445 + coord_set_item_pos(new,
58446 + node_num_items(shift->target) -
58447 + shift->entire -
58448 + (shift->part_units ? 1 : 0) +
58449 + old->item_pos);
58450 +
58451 + new->unit_pos = old->unit_pos;
58452 + if (shift->merging_units) {
58453 + coord_dec_item_pos(new);
58454 + if (old->item_pos == 0) {
58455 + /* unit_pos only changes if item got
58456 + merged */
58457 + new->unit_pos =
58458 + coord_num_units(new) -
58459 + (shift->merging_units -
58460 + old->unit_pos);
58461 + }
58462 + }
58463 + } else {
58464 + /* unit @old did not move to left neighbor.
58465 +
58466 + Use _nocheck, because @old is outside of its node.
58467 + */
58468 + coord_dup_nocheck(new, old);
58469 + coord_add_item_pos(new,
58470 + -shift->u.future_first.item_pos);
58471 + if (new->item_pos == 0)
58472 + new->unit_pos -= shift->u.future_first.unit_pos;
58473 + }
58474 + } else {
58475 + if (unit_moved_right(shift, old)) {
58476 + /* unit @old moved to right neighbor */
58477 + new->node = shift->target;
58478 + coord_set_item_pos(new,
58479 + old->item_pos -
58480 + shift->real_stop.item_pos);
58481 + if (new->item_pos == 0) {
58482 + /* unit @old might change unit pos */
58483 +				new->unit_pos = old->unit_pos -
58484 +					shift->real_stop.unit_pos;
58486 + }
58487 + } else {
58488 + /* unit @old did not move to right neighbor, therefore
58489 + it did not change */
58490 + coord_dup(new, old);
58491 + }
58492 + }
58493 + coord_set_iplug(new, item_plugin_by_coord(new));
58494 + return new;
58495 +}
58496 +
58497 +/* this is called when shift is completed (something of source node is copied
58498 + to target and deleted in source) to update all taps set in current
58499 + context */
58500 +static void update_taps(const struct shift_params *shift)
58501 +{
58502 + tap_t *tap;
58503 + coord_t new;
58504 +
58505 + for_all_taps(tap) {
58506 + /* update only taps set to nodes participating in shift */
58507 + if (tap->coord->node == shift->wish_stop.node
58508 + || tap->coord->node == shift->target)
58509 + tap_to_coord(tap,
58510 + adjust_coord2(shift, tap->coord, &new));
58511 + }
58512 +}
58513 +
58514 +#if REISER4_DEBUG
58515 +
58516 +struct shift_check {
58517 + reiser4_key key;
58518 + __u16 plugin_id;
58519 + union {
58520 + __u64 bytes;
58521 + __u64 entries;
58522 + void *unused;
58523 + } u;
58524 +};
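+
+/* A minimal sketch of the debugging protocol implemented by the two
+   functions below: snapshot keys and sizes before a shift, then verify
+   (and free the snapshot) after it:
+
+	void *vp = shift_check_prepare(left, right);
+	... shift items between left and right ...
+	shift_check(vp, left, right);
+
+   shift_check() kfree()s @vp itself. */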
58525 +
58526 +void *shift_check_prepare(const znode * left, const znode * right)
58527 +{
58528 + pos_in_node_t i, nr_items;
58529 + int mergeable;
58530 + struct shift_check *data;
58531 + item_header40 *ih;
58532 +
58533 + if (node_is_empty(left) || node_is_empty(right))
58534 + mergeable = 0;
58535 + else {
58536 + coord_t l, r;
58537 +
58538 + coord_init_last_unit(&l, left);
58539 + coord_init_first_unit(&r, right);
58540 + mergeable = are_items_mergeable(&l, &r);
58541 + }
58542 + nr_items =
58543 + node40_num_of_items_internal(left) +
58544 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
58545 + data =
58546 + kmalloc(sizeof(struct shift_check) * nr_items,
58547 + reiser4_ctx_gfp_mask_get());
58548 + if (data != NULL) {
58549 + coord_t coord;
58550 + pos_in_node_t item_pos;
58551 +
58552 + coord_init_first_unit(&coord, left);
58553 + i = 0;
58554 +
58555 + for (item_pos = 0;
58556 + item_pos < node40_num_of_items_internal(left);
58557 + item_pos++) {
58558 +
58559 + coord_set_item_pos(&coord, item_pos);
58560 + ih = node40_ih_at_coord(&coord);
58561 +
58562 + data[i].key = ih->key;
58563 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
58564 + switch (data[i].plugin_id) {
58565 + case CTAIL_ID:
58566 + case FORMATTING_ID:
58567 + data[i].u.bytes = coord_num_units(&coord);
58568 + break;
58569 + case EXTENT_POINTER_ID:
58570 + data[i].u.bytes =
58571 + reiser4_extent_size(&coord,
58572 + coord_num_units(&coord));
58573 + break;
58574 + case COMPOUND_DIR_ID:
58575 + data[i].u.entries = coord_num_units(&coord);
58576 + break;
58577 + default:
58578 + data[i].u.unused = NULL;
58579 + break;
58580 + }
58581 + i++;
58582 + }
58583 +
58584 + coord_init_first_unit(&coord, right);
58585 +
58586 + if (mergeable) {
58587 + assert("vs-1609", i != 0);
58588 +
58589 + ih = node40_ih_at_coord(&coord);
58590 +
58591 + assert("vs-1589",
58592 + data[i - 1].plugin_id ==
58593 + le16_to_cpu(get_unaligned(&ih->plugin_id)));
58594 + switch (data[i - 1].plugin_id) {
58595 + case CTAIL_ID:
58596 + case FORMATTING_ID:
58597 + data[i - 1].u.bytes += coord_num_units(&coord);
58598 + break;
58599 + case EXTENT_POINTER_ID:
58600 + data[i - 1].u.bytes +=
58601 + reiser4_extent_size(&coord,
58602 + coord_num_units(&coord));
58603 + break;
58604 + case COMPOUND_DIR_ID:
58605 + data[i - 1].u.entries +=
58606 + coord_num_units(&coord);
58607 + break;
58608 + default:
58609 + impossible("vs-1605", "wrong mergeable item");
58610 + break;
58611 + }
58612 + item_pos = 1;
58613 + } else
58614 + item_pos = 0;
58615 + for (; item_pos < node40_num_of_items_internal(right);
58616 + item_pos++) {
58617 +
58618 + assert("vs-1604", i < nr_items);
58619 + coord_set_item_pos(&coord, item_pos);
58620 + ih = node40_ih_at_coord(&coord);
58621 +
58622 + data[i].key = ih->key;
58623 + data[i].plugin_id = le16_to_cpu(get_unaligned(&ih->plugin_id));
58624 + switch (data[i].plugin_id) {
58625 + case CTAIL_ID:
58626 + case FORMATTING_ID:
58627 + data[i].u.bytes = coord_num_units(&coord);
58628 + break;
58629 + case EXTENT_POINTER_ID:
58630 + data[i].u.bytes =
58631 + reiser4_extent_size(&coord,
58632 + coord_num_units(&coord));
58633 + break;
58634 + case COMPOUND_DIR_ID:
58635 + data[i].u.entries = coord_num_units(&coord);
58636 + break;
58637 + default:
58638 + data[i].u.unused = NULL;
58639 + break;
58640 + }
58641 + i++;
58642 + }
58643 + assert("vs-1606", i == nr_items);
58644 + }
58645 + return data;
58646 +}
58647 +
58648 +void shift_check(void *vp, const znode * left, const znode * right)
58649 +{
58650 + pos_in_node_t i, nr_items;
58651 + coord_t coord;
58652 + __u64 last_bytes;
58653 + int mergeable;
58654 + item_header40 *ih;
58655 + pos_in_node_t item_pos;
58656 + struct shift_check *data;
58657 +
58658 + data = (struct shift_check *)vp;
58659 +
58660 + if (data == NULL)
58661 + return;
58662 +
58663 + if (node_is_empty(left) || node_is_empty(right))
58664 + mergeable = 0;
58665 + else {
58666 + coord_t l, r;
58667 +
58668 + coord_init_last_unit(&l, left);
58669 + coord_init_first_unit(&r, right);
58670 + mergeable = are_items_mergeable(&l, &r);
58671 + }
58672 +
58673 + nr_items =
58674 + node40_num_of_items_internal(left) +
58675 + node40_num_of_items_internal(right) - (mergeable ? 1 : 0);
58676 +
58677 + i = 0;
58678 + last_bytes = 0;
58679 +
58680 + coord_init_first_unit(&coord, left);
58681 +
58682 + for (item_pos = 0; item_pos < node40_num_of_items_internal(left);
58683 + item_pos++) {
58684 +
58685 + coord_set_item_pos(&coord, item_pos);
58686 + ih = node40_ih_at_coord(&coord);
58687 +
58688 + assert("vs-1611", i == item_pos);
58689 + assert("vs-1590", keyeq(&ih->key, &data[i].key));
58690 + assert("vs-1591",
58691 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
58692 + if ((i < (node40_num_of_items_internal(left) - 1))
58693 + || !mergeable) {
58694 + switch (data[i].plugin_id) {
58695 + case CTAIL_ID:
58696 + case FORMATTING_ID:
58697 + assert("vs-1592",
58698 + data[i].u.bytes ==
58699 + coord_num_units(&coord));
58700 + break;
58701 + case EXTENT_POINTER_ID:
58702 + assert("vs-1593",
58703 + data[i].u.bytes ==
58704 + reiser4_extent_size(&coord,
58705 + coord_num_units
58706 + (&coord)));
58707 + break;
58708 + case COMPOUND_DIR_ID:
58709 + assert("vs-1594",
58710 + data[i].u.entries ==
58711 + coord_num_units(&coord));
58712 + break;
58713 + default:
58714 + break;
58715 + }
58716 + }
58717 + if (item_pos == (node40_num_of_items_internal(left) - 1)
58718 + && mergeable) {
58719 + switch (data[i].plugin_id) {
58720 + case CTAIL_ID:
58721 + case FORMATTING_ID:
58722 + last_bytes = coord_num_units(&coord);
58723 + break;
58724 + case EXTENT_POINTER_ID:
58725 + last_bytes =
58726 + reiser4_extent_size(&coord,
58727 + coord_num_units(&coord));
58728 + break;
58729 + case COMPOUND_DIR_ID:
58730 + last_bytes = coord_num_units(&coord);
58731 + break;
58732 + default:
58733 + impossible("vs-1595", "wrong mergeable item");
58734 + break;
58735 + }
58736 + }
58737 + i++;
58738 + }
58739 +
58740 + coord_init_first_unit(&coord, right);
58741 + if (mergeable) {
58742 + ih = node40_ih_at_coord(&coord);
58743 +
58744 + assert("vs-1589",
58745 + data[i - 1].plugin_id == le16_to_cpu(get_unaligned(&ih->plugin_id)));
58746 + assert("vs-1608", last_bytes != 0);
58747 + switch (data[i - 1].plugin_id) {
58748 + case CTAIL_ID:
58749 + case FORMATTING_ID:
58750 + assert("vs-1596",
58751 + data[i - 1].u.bytes ==
58752 + last_bytes + coord_num_units(&coord));
58753 + break;
58754 +
58755 + case EXTENT_POINTER_ID:
58756 + assert("vs-1597",
58757 + data[i - 1].u.bytes ==
58758 + last_bytes + reiser4_extent_size(&coord,
58759 + coord_num_units
58760 + (&coord)));
58761 + break;
58762 +
58763 + case COMPOUND_DIR_ID:
58764 + assert("vs-1598",
58765 + data[i - 1].u.bytes ==
58766 + last_bytes + coord_num_units(&coord));
58767 + break;
58768 + default:
58769 + impossible("vs-1599", "wrong mergeable item");
58770 + break;
58771 + }
58772 + item_pos = 1;
58773 + } else
58774 + item_pos = 0;
58775 +
58776 + for (; item_pos < node40_num_of_items_internal(right); item_pos++) {
58777 +
58778 + coord_set_item_pos(&coord, item_pos);
58779 + ih = node40_ih_at_coord(&coord);
58780 +
58781 + assert("vs-1612", keyeq(&ih->key, &data[i].key));
58782 + assert("vs-1613",
58783 + le16_to_cpu(get_unaligned(&ih->plugin_id)) == data[i].plugin_id);
58784 + switch (data[i].plugin_id) {
58785 + case CTAIL_ID:
58786 + case FORMATTING_ID:
58787 + assert("vs-1600",
58788 + data[i].u.bytes == coord_num_units(&coord));
58789 + break;
58790 + case EXTENT_POINTER_ID:
58791 + assert("vs-1601",
58792 + data[i].u.bytes ==
58793 + reiser4_extent_size(&coord,
58794 + coord_num_units
58795 + (&coord)));
58796 + break;
58797 + case COMPOUND_DIR_ID:
58798 + assert("vs-1602",
58799 + data[i].u.entries == coord_num_units(&coord));
58800 + break;
58801 + default:
58802 + break;
58803 + }
58804 + i++;
58805 + }
58806 +
58807 + assert("vs-1603", i == nr_items);
58808 + kfree(data);
58809 +}
58810 +
58811 +#endif
58812 +
58813 +/* plugin->u.node.shift
58814 + look for description of this method in plugin/node/node.h */
58815 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
58816 +		 int delete_child /* if @from->node becomes empty it will be deleted from the tree if this is set to 1 */ ,
58817 +		 int including_stop_coord, carry_plugin_info * info)
58818 +{
58819 + struct shift_params shift;
58820 + int result;
58821 + znode *left, *right;
58822 + znode *source;
58823 + int target_empty;
58824 +
58825 + assert("nikita-2161", coord_check(from));
58826 +
58827 + memset(&shift, 0, sizeof(shift));
58828 + shift.pend = pend;
58829 + shift.wish_stop = *from;
58830 + shift.target = to;
58831 +
58832 + assert("nikita-1473", znode_is_write_locked(from->node));
58833 + assert("nikita-1474", znode_is_write_locked(to));
58834 +
58835 + source = from->node;
58836 +
58837 + /* set @shift.wish_stop to rightmost/leftmost unit among units we want
58838 + shifted */
58839 + if (pend == SHIFT_LEFT) {
58840 + result = coord_set_to_left(&shift.wish_stop);
58841 + left = to;
58842 + right = from->node;
58843 + } else {
58844 + result = coord_set_to_right(&shift.wish_stop);
58845 + left = from->node;
58846 + right = to;
58847 + }
58848 +
58849 + if (result) {
58850 + /* move insertion coord even if there is nothing to move */
58851 + if (including_stop_coord) {
58852 + /* move insertion coord (@from) */
58853 + if (pend == SHIFT_LEFT) {
58854 + /* after last item in target node */
58855 + coord_init_after_last_item(from, to);
58856 + } else {
58857 + /* before first item in target node */
58858 + coord_init_before_first_item(from, to);
58859 + }
58860 + }
58861 +
58862 + if (delete_child && node_is_empty(shift.wish_stop.node))
58863 + result =
58864 + prepare_removal_node40(shift.wish_stop.node, info);
58865 + else
58866 + result = 0;
58867 + /* there is nothing to shift */
58868 + assert("nikita-2078", coord_check(from));
58869 + return result;
58870 + }
58871 +
58872 + target_empty = node_is_empty(to);
58873 +
58874 + /* when first node plugin with item body compression is implemented,
58875 + this must be changed to call node specific plugin */
58876 +
58877 +	/* @shift.real_stop is updated to the last unit which will really be
58878 +	   shifted */
58879 + estimate_shift(&shift, get_current_context());
58880 + if (!shift.shift_bytes) {
58881 + /* we could not shift anything */
58882 + assert("nikita-2079", coord_check(from));
58883 + return 0;
58884 + }
58885 +
58886 + copy(&shift);
58887 +
58888 + /* result value of this is important. It is used by adjust_coord below */
58889 + result = delete_copied(&shift);
58890 +
58891 + assert("vs-1610", result >= 0);
58892 + assert("vs-1471",
58893 + ((reiser4_context *) current->journal_info)->magic ==
58894 + context_magic);
58895 +
58896 +	/* an item which has been moved from one node to another might want to
58897 +	   do something on that event. This can be done by the item's shift_hook
58898 +	   method, which will now be called for every moved item */
58899 + call_shift_hooks(&shift);
58900 +
58901 + assert("vs-1472",
58902 + ((reiser4_context *) current->journal_info)->magic ==
58903 + context_magic);
58904 +
58905 + update_taps(&shift);
58906 +
58907 + assert("vs-1473",
58908 + ((reiser4_context *) current->journal_info)->magic ==
58909 + context_magic);
58910 +
58911 + /* adjust @from pointer in accordance with @including_stop_coord flag
58912 + and amount of data which was really shifted */
58913 + adjust_coord(from, &shift, result, including_stop_coord);
58914 +
58915 + if (target_empty)
58916 + /*
58917 + * items were shifted into empty node. Update delimiting key.
58918 + */
58919 + result = prepare_for_update(NULL, left, info);
58920 +
58921 + /* add update operation to @info, which is the list of operations to
58922 + be performed on a higher level */
58923 + result = prepare_for_update(left, right, info);
58924 + if (!result && node_is_empty(source) && delete_child) {
58925 +		/* the entire contents of @from->node were moved to @to and
58926 +		   @from->node has to be removed from the tree, so, on a higher
58927 +		   level, we will be removing the pointer to node @from->node */
58928 + result = prepare_removal_node40(source, info);
58929 + }
58930 + assert("nikita-2080", coord_check(from));
58931 + return result ? result : (int)shift.shift_bytes;
58932 +}
58933 +
58934 +/* plugin->u.node.fast_insert()
58935 + look for description of this method in plugin/node/node.h */
58936 +int fast_insert_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58937 +{
58938 + return 1;
58939 +}
58940 +
58941 +/* plugin->u.node.fast_paste()
58942 + look for description of this method in plugin/node/node.h */
58943 +int fast_paste_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58944 +{
58945 + return 1;
58946 +}
58947 +
58948 +/* plugin->u.node.fast_cut()
58949 + look for description of this method in plugin/node/node.h */
58950 +int fast_cut_node40(const coord_t * coord UNUSED_ARG /* node to query */ )
58951 +{
58952 + return 1;
58953 +}
58954 +
58955 +/* plugin->u.node.modify - not defined */
58956 +
58957 +/* plugin->u.node.max_item_size */
58958 +int max_item_size_node40(void)
58959 +{
58960 + return reiser4_get_current_sb()->s_blocksize - sizeof(node40_header) -
58961 + sizeof(item_header40);
58962 +}
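+
+/* A worked example, assuming a 4096-byte block size: with the layouts from
+   node40.h, sizeof(node40_header) == 28 and sizeof(item_header40) == 30,
+   so the largest possible item body is 4096 - 28 - 30 == 4038 bytes. */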
58963 +
58964 +/* plugin->u.node.set_item_plugin */
58965 +int set_item_plugin_node40(coord_t *coord, item_id id)
58966 +{
58967 + item_header40 *ih;
58968 +
58969 + ih = node40_ih_at_coord(coord);
58970 + put_unaligned(cpu_to_le16(id), &ih->plugin_id);
58971 + coord->iplugid = id;
58972 + return 0;
58973 +}
58974 +
58975 +/*
58976 + Local variables:
58977 + c-indentation-style: "K&R"
58978 + mode-name: "LC"
58979 + c-basic-offset: 8
58980 + tab-width: 8
58981 + fill-column: 120
58982 + scroll-step: 1
58983 + End:
58984 +*/
58985 diff --git a/fs/reiser4/plugin/node/node40.h b/fs/reiser4/plugin/node/node40.h
58986 new file mode 100644
58987 index 0000000..8ae375b
58988 --- /dev/null
58989 +++ b/fs/reiser4/plugin/node/node40.h
58990 @@ -0,0 +1,125 @@
58991 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
58992 +
58993 +#if !defined( __REISER4_NODE40_H__ )
58994 +#define __REISER4_NODE40_H__
58995 +
58996 +#include "../../forward.h"
58997 +#include "../../dformat.h"
58998 +#include "node.h"
58999 +
59000 +#include <linux/types.h>
59001 +
59002 +/* format of node header for 40 node layouts. Keep bloat out of this struct. */
59003 +typedef struct node40_header {
59004 + /* identifier of node plugin. Must be located at the very beginning
59005 + of a node. */
59006 + common_node_header common_header; /* this is 16 bits */
59007 + /* number of items. Should be first element in the node header,
59008 + because we haven't yet finally decided whether it shouldn't go into
59009 + common_header.
59010 + */
59011 +/* NIKITA-FIXME-HANS: Create a macro such that if there is only one
59012 + * node format at compile time, and it is this one, accesses to these fields do not
59013 + * go through a function dereference (and otherwise they do). Probably 80% of users will only have one node format at a time throughout the life of reiser4. */
59014 + d16 nr_items;
59015 + /* free space in node measured in bytes */
59016 + d16 free_space;
59017 + /* offset to start of free space in node */
59018 + d16 free_space_start;
59019 + /* for reiser4_fsck. When information about what is a free
59020 + block is corrupted, and we try to recover everything even
59021 + if marked as freed, then old versions of data may
59022 + duplicate newer versions, and this field allows us to
59023 + restore the newer version. Also useful for when users
59024 + who don't have the new trashcan installed on their linux distro
59025 + delete the wrong files and send us desperate emails
59026 + offering $25 for them back. */
59027 +
59028 +	/* magic we use to recognize formatted nodes NIKITA-FIXME-HANS: improve this comment */
59029 + d32 magic;
59030 + /* flushstamp is made of mk_id and write_counter. mk_id is an
59031 + id generated randomly at mkreiserfs time. So we can just
59032 + skip all nodes with different mk_id. write_counter is d64
59033 + incrementing counter of writes on disk. It is used for
59034 + choosing the newest data at fsck time. NIKITA-FIXME-HANS: why was field name changed but not comment? */
59035 +
59036 + d32 mkfs_id;
59037 + d64 flush_id;
59038 + /* node flags to be used by fsck (reiser4ck or reiser4fsck?)
59039 + and repacker NIKITA-FIXME-HANS: say more or reference elsewhere that says more */
59040 + d16 flags;
59041 +
59042 + /* 1 is leaf level, 2 is twig level, root is the numerically
59043 + largest level */
59044 + d8 level;
59045 +
59046 + d8 pad;
59047 +} PACKED node40_header;
59048 +
59049 +/* item headers are not standard across all node layouts, pass
59050 + pos_in_node to functions instead */
59051 +typedef struct item_header40 {
59052 + /* key of item */
59053 + /* 0 */ reiser4_key key;
59054 +	/* offset of item body from start of a node, measured in bytes */
59055 + /* 24 */ d16 offset;
59056 + /* 26 */ d16 flags;
59057 + /* 28 */ d16 plugin_id;
59058 +} PACKED item_header40;
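+
+/* A sketch of the resulting node layout (as implied by the code in
+   node40.c): item bodies grow upward from just past the node header, the
+   array of item headers grows downward from the end of the block, and the
+   free space lies in between; free_space_start is the byte offset of the
+   first free byte:
+
+	+---------------+--------+-   -+--------+- free -+--------+-   -+--------+
+	| node40_header | body 0 | ... | body N | space  | ih (N) | ... | ih (0) |
+	+---------------+--------+-   -+--------+--------+--------+-   -+--------+
+ */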
59059 +
59060 +size_t item_overhead_node40(const znode * node, flow_t * aflow);
59061 +size_t free_space_node40(znode * node);
59062 +node_search_result lookup_node40(znode * node, const reiser4_key * key,
59063 + lookup_bias bias, coord_t * coord);
59064 +int num_of_items_node40(const znode * node);
59065 +char *item_by_coord_node40(const coord_t * coord);
59066 +int length_by_coord_node40(const coord_t * coord);
59067 +item_plugin *plugin_by_coord_node40(const coord_t * coord);
59068 +reiser4_key *key_at_node40(const coord_t * coord, reiser4_key * key);
59069 +size_t estimate_node40(znode * node);
59070 +int check_node40(const znode * node, __u32 flags, const char **error);
59071 +int parse_node40(znode * node);
59072 +int init_node40(znode * node);
59073 +#ifdef GUESS_EXISTS
59074 +int guess_node40(const znode * node);
59075 +#endif
59076 +void change_item_size_node40(coord_t * coord, int by);
59077 +int create_item_node40(coord_t * target, const reiser4_key * key,
59078 + reiser4_item_data * data, carry_plugin_info * info);
59079 +void update_item_key_node40(coord_t * target, const reiser4_key * key,
59080 + carry_plugin_info * info);
59081 +int kill_node40(struct carry_kill_data *, carry_plugin_info *);
59082 +int cut_node40(struct carry_cut_data *, carry_plugin_info *);
59083 +int shift_node40(coord_t * from, znode * to, shift_direction pend,
59084 + /* if @from->node becomes
59085 + empty - it will be deleted from
59086 + the tree if this is set to 1
59087 + */
59088 + int delete_child, int including_stop_coord,
59089 + carry_plugin_info * info);
59090 +
59091 +int fast_insert_node40(const coord_t * coord);
59092 +int fast_paste_node40(const coord_t * coord);
59093 +int fast_cut_node40(const coord_t * coord);
59094 +int max_item_size_node40(void);
59095 +int prepare_removal_node40(znode * empty, carry_plugin_info * info);
59096 +int set_item_plugin_node40(coord_t * coord, item_id id);
59097 +int shrink_item_node40(coord_t * coord, int delta);
59098 +
59099 +#if REISER4_DEBUG
59100 +void *shift_check_prepare(const znode *left, const znode *right);
59101 +void shift_check(void *vp, const znode *left, const znode *right);
59102 +#endif
59103 +
59104 +/* __REISER4_NODE40_H__ */
59105 +#endif
59106 +/*
59107 + Local variables:
59108 + c-indentation-style: "K&R"
59109 + mode-name: "LC"
59110 + c-basic-offset: 8
59111 + tab-width: 8
59112 + fill-column: 120
59113 + scroll-step: 1
59114 + End:
59115 +*/
59116 diff --git a/fs/reiser4/plugin/object.c b/fs/reiser4/plugin/object.c
59117 new file mode 100644
59118 index 0000000..ae999e3
59119 --- /dev/null
59120 +++ b/fs/reiser4/plugin/object.c
59121 @@ -0,0 +1,516 @@
59122 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59123 + * reiser4/README */
59124 +
59125 +/*
59126 + * Examples of object plugins: file, directory, symlink, special file.
59127 + *
59128 + * Plugins associated with inode:
59129 + *
59130 + * Plugin of inode is plugin referenced by plugin-id field of on-disk
59131 + * stat-data. How we store this plugin in in-core inode is not
59132 + * important. Currently pointers are used, another variant is to store offsets
59133 + * and do array lookup on each access.
59134 + *
59135 + * Now, each inode has one selected plugin: object plugin that
59136 + * determines what type of file this object is: directory, regular etc.
59137 + *
59138 + * This main plugin can use other plugins that are thus subordinated to
59139 + * it. Directory instance of object plugin uses hash; regular file
59140 + * instance uses tail policy plugin.
59141 + *
59142 + * Object plugin is either taken from id in stat-data or guessed from
59143 + * i_mode bits. Once it is established we ask it to install its
59144 + * subordinate plugins, by looking again in stat-data or inheriting them
59145 + * from parent.
59146 + *
59147 + * How new inode is initialized during ->read_inode():
59148 + * 1 read stat-data and initialize inode fields: i_size, i_mode,
59149 + * i_generation, capabilities etc.
59150 + * 2 read plugin id from stat data or try to guess plugin id
59151 + * from inode->i_mode bits if plugin id is missing.
59152 + * 3 Call ->init_inode() method of stat-data plugin to initialise inode fields.
59153 + *
59154 + * NIKITA-FIXME-HANS: can you say a little about 1 being done before 3? What
59155 + * if stat data does contain i_size, etc., due to it being an unusual plugin?
59156 + *
59157 + * 4 Call ->activate() method of object's plugin. Plugin is either read
59158 + *   from stat-data or guessed from mode bits
59159 + * 5 Call ->inherit() method of object plugin to inherit as yet uninitialized
59160 + *   plugins from parent.
59161 + *
59162 + * An easy induction shows that by the last step all plugins of the inode
59163 + * will be initialized.
59164 + *
59165 + * When creating new object:
59166 + * 1 obtain object plugin id (see next period)
59167 + * NIKITA-FIXME-HANS: period?
59168 + * 2 ->install() this plugin
59169 + * 3 ->inherit() the rest from the parent
59170 + *
59171 + * We need some examples of creating an object with default and non-default
59172 + * plugin ids. Nikita, please create them.
59173 + */
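+
+/* A hedged sketch of the creation flow described above (step names follow
+   the comment; the exact helpers are defined elsewhere): with the default
+   plugin id, creating a regular file selects UNIX_FILE_PLUGIN_ID,
+   ->install()s it on the new inode, and ->inherit()s the still unset
+   subordinate plugins (hash, tail policy, ...) from the parent; with a
+   non-default id, step 1 instead uses the explicitly requested id, and
+   steps 2 and 3 proceed identically. */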
59174 +
59175 +#include "../inode.h"
59176 +
59177 +static int _bugop(void)
59178 +{
59179 + BUG_ON(1);
59180 + return 0;
59181 +}
59182 +
59183 +#define bugop ((void *)_bugop)
59184 +
59185 +static int _dummyop(void)
59186 +{
59187 + return 0;
59188 +}
59189 +
59190 +#define dummyop ((void *)_dummyop)
59191 +
59192 +static int change_file(struct inode *inode,
59193 + reiser4_plugin * plugin,
59194 + pset_member memb)
59195 +{
59196 + /* cannot change object plugin of already existing object */
59197 + if (memb == PSET_FILE)
59198 + return RETERR(-EINVAL);
59199 +
59200 +	/* change any other plugin-set member */
59201 + return aset_set_unsafe(&reiser4_inode_data(inode)->pset, memb, plugin);
59202 +}
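+
+/* For example, an attempt to switch the object plugin (PSET_FILE) of an
+   existing inode fails with -EINVAL, while changing any other plugin-set
+   member goes through aset_set_unsafe() on the inode's pset. */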
59203 +
59204 +static reiser4_plugin_ops file_plugin_ops = {
59205 + .change = change_file
59206 +};
59207 +
59208 +/*
59209 + * Definitions of object plugins.
59210 + */
59211 +
59212 +file_plugin file_plugins[LAST_FILE_PLUGIN_ID] = {
59213 + [UNIX_FILE_PLUGIN_ID] = {
59214 + .h = {
59215 + .type_id = REISER4_FILE_PLUGIN_TYPE,
59216 + .id = UNIX_FILE_PLUGIN_ID,
59217 + .groups = (1 << REISER4_REGULAR_FILE),
59218 + .pops = &file_plugin_ops,
59219 + .label = "reg",
59220 + .desc = "regular file",
59221 + .linkage = {NULL, NULL},
59222 + },
59223 + .inode_ops = {
59224 + .permission = reiser4_permission_common,
59225 + .setattr = setattr_unix_file,
59226 + .getattr = reiser4_getattr_common
59227 + },
59228 + .file_ops = {
59229 + .llseek = generic_file_llseek,
59230 + .read = read_unix_file,
59231 + .write = write_unix_file,
59232 + .aio_read = generic_file_aio_read,
59233 + .ioctl = ioctl_unix_file,
59234 + .mmap = mmap_unix_file,
59235 + .open = open_unix_file,
59236 + .release = release_unix_file,
59237 + .fsync = sync_unix_file,
59238 + .sendfile = sendfile_unix_file
59239 + },
59240 + .as_ops = {
59241 + .writepage = reiser4_writepage,
59242 + .readpage = readpage_unix_file,
59243 + .sync_page = block_sync_page,
59244 + .writepages = writepages_unix_file,
59245 + .set_page_dirty = reiser4_set_page_dirty,
59246 + .readpages = readpages_unix_file,
59247 + .prepare_write = prepare_write_unix_file,
59248 + .commit_write = commit_write_unix_file,
59249 + .bmap = bmap_unix_file,
59250 + .invalidatepage = reiser4_invalidatepage,
59251 + .releasepage = reiser4_releasepage
59252 + },
59253 + .write_sd_by_inode = write_sd_by_inode_common,
59254 + .flow_by_inode = flow_by_inode_unix_file,
59255 + .key_by_inode = key_by_inode_and_offset_common,
59256 + .set_plug_in_inode = set_plug_in_inode_common,
59257 + .adjust_to_parent = adjust_to_parent_common,
59258 + .create_object = reiser4_create_object_common,
59259 + .delete_object = delete_object_unix_file,
59260 + .add_link = reiser4_add_link_common,
59261 + .rem_link = reiser4_rem_link_common,
59262 + .owns_item = owns_item_unix_file,
59263 + .can_add_link = can_add_link_common,
59264 + .detach = dummyop,
59265 + .bind = dummyop,
59266 + .safelink = safelink_common,
59267 + .estimate = {
59268 + .create = estimate_create_common,
59269 + .update = estimate_update_common,
59270 + .unlink = estimate_unlink_common
59271 + },
59272 + .init_inode_data = init_inode_data_unix_file,
59273 + .cut_tree_worker = cut_tree_worker_common,
59274 + .wire = {
59275 + .write = wire_write_common,
59276 + .read = wire_read_common,
59277 + .get = wire_get_common,
59278 + .size = wire_size_common,
59279 + .done = wire_done_common
59280 + }
59281 + },
59282 + [DIRECTORY_FILE_PLUGIN_ID] = {
59283 + .h = {
59284 + .type_id = REISER4_FILE_PLUGIN_TYPE,
59285 + .id = DIRECTORY_FILE_PLUGIN_ID,
59286 + .groups = (1 << REISER4_DIRECTORY_FILE),
59287 + .pops = &file_plugin_ops,
59288 + .label = "dir",
59289 + .desc = "directory",
59290 + .linkage = {NULL, NULL}
59291 + },
59292 + .inode_ops = {.create = NULL},
59293 + .file_ops = {.owner = NULL},
59294 + .as_ops = {.writepage = NULL},
59295 +
59296 + .write_sd_by_inode = write_sd_by_inode_common,
59297 + .flow_by_inode = bugop,
59298 + .key_by_inode = bugop,
59299 + .set_plug_in_inode = set_plug_in_inode_common,
59300 + .adjust_to_parent = adjust_to_parent_common_dir,
59301 + .create_object = reiser4_create_object_common,
59302 + .delete_object = reiser4_delete_dir_common,
59303 + .add_link = reiser4_add_link_common,
59304 + .rem_link = rem_link_common_dir,
59305 + .owns_item = owns_item_common_dir,
59306 + .can_add_link = can_add_link_common,
59307 + .can_rem_link = can_rem_link_common_dir,
59308 + .detach = reiser4_detach_common_dir,
59309 + .bind = reiser4_bind_common_dir,
59310 + .safelink = safelink_common,
59311 + .estimate = {
59312 + .create = estimate_create_common_dir,
59313 + .update = estimate_update_common,
59314 + .unlink = estimate_unlink_common_dir
59315 + },
59316 + .wire = {
59317 + .write = wire_write_common,
59318 + .read = wire_read_common,
59319 + .get = wire_get_common,
59320 + .size = wire_size_common,
59321 + .done = wire_done_common
59322 + },
59323 + .init_inode_data = init_inode_ordering,
59324 + .cut_tree_worker = cut_tree_worker_common,
59325 + },
59326 + [SYMLINK_FILE_PLUGIN_ID] = {
59327 + .h = {
59328 + .type_id = REISER4_FILE_PLUGIN_TYPE,
59329 + .id = SYMLINK_FILE_PLUGIN_ID,
59330 + .groups = (1 << REISER4_SYMLINK_FILE),
59331 + .pops = &file_plugin_ops,
59332 + .label = "symlink",
59333 + .desc = "symbolic link",
59334 + .linkage = {NULL,NULL}
59335 + },
59336 + .inode_ops = {
59337 + .readlink = generic_readlink,
59338 + .follow_link = reiser4_follow_link_common,
59339 + .permission = reiser4_permission_common,
59340 + .setattr = reiser4_setattr_common,
59341 + .getattr = reiser4_getattr_common
59342 + },
59343 + /* inode->i_fop of a symlink is initialized to NULL in setup_inode_ops */
59344 + .file_ops = {.owner = NULL},
59345 + .as_ops = {.writepage = NULL},
59346 +
59347 + .write_sd_by_inode = write_sd_by_inode_common,
59348 + .set_plug_in_inode = set_plug_in_inode_common,
59349 + .adjust_to_parent = adjust_to_parent_common,
59350 + .create_object = reiser4_create_symlink,
59351 + .delete_object = reiser4_delete_object_common,
59352 + .add_link = reiser4_add_link_common,
59353 + .rem_link = reiser4_rem_link_common,
59354 + .can_add_link = can_add_link_common,
59355 + .detach = dummyop,
59356 + .bind = dummyop,
59357 + .safelink = safelink_common,
59358 + .estimate = {
59359 + .create = estimate_create_common,
59360 + .update = estimate_update_common,
59361 + .unlink = estimate_unlink_common
59362 + },
59363 + .init_inode_data = init_inode_ordering,
59364 + .cut_tree_worker = cut_tree_worker_common,
59365 + .destroy_inode = destroy_inode_symlink,
59366 + .wire = {
59367 + .write = wire_write_common,
59368 + .read = wire_read_common,
59369 + .get = wire_get_common,
59370 + .size = wire_size_common,
59371 + .done = wire_done_common
59372 + }
59373 + },
59374 + [SPECIAL_FILE_PLUGIN_ID] = {
59375 + .h = {
59376 + .type_id = REISER4_FILE_PLUGIN_TYPE,
59377 + .id = SPECIAL_FILE_PLUGIN_ID,
59378 + .groups = (1 << REISER4_SPECIAL_FILE),
59379 + .pops = &file_plugin_ops,
59380 + .label = "special",
59381 + .desc = "special: fifo, device or socket",
59383 + .linkage = {NULL, NULL}
59384 + },
59385 + .inode_ops = {
59386 + .permission = reiser4_permission_common,
59387 + .setattr = reiser4_setattr_common,
59388 + .getattr = reiser4_getattr_common
59389 + },
59390 + /* file_ops of special files (sockets, block, char, fifo) are
59391 + initialized by init_special_inode. */
59392 + .file_ops = {.owner = NULL},
59393 + .as_ops = {.writepage = NULL},
59394 +
59395 + .write_sd_by_inode = write_sd_by_inode_common,
59396 + .set_plug_in_inode = set_plug_in_inode_common,
59397 + .adjust_to_parent = adjust_to_parent_common,
59398 + .create_object = reiser4_create_object_common,
59399 + .delete_object = reiser4_delete_object_common,
59400 + .add_link = reiser4_add_link_common,
59401 + .rem_link = reiser4_rem_link_common,
59402 + .owns_item = owns_item_common,
59403 + .can_add_link = can_add_link_common,
59404 + .detach = dummyop,
59405 + .bind = dummyop,
59406 + .safelink = safelink_common,
59407 + .estimate = {
59408 + .create = estimate_create_common,
59409 + .update = estimate_update_common,
59410 + .unlink = estimate_unlink_common
59411 + },
59412 + .init_inode_data = init_inode_ordering,
59413 + .cut_tree_worker = cut_tree_worker_common,
59414 + .wire = {
59415 + .write = wire_write_common,
59416 + .read = wire_read_common,
59417 + .get = wire_get_common,
59418 + .size = wire_size_common,
59419 + .done = wire_done_common
59420 + }
59421 + },
59422 + [CRYPTCOMPRESS_FILE_PLUGIN_ID] = {
59423 + .h = {
59424 + .type_id = REISER4_FILE_PLUGIN_TYPE,
59425 + .id = CRYPTCOMPRESS_FILE_PLUGIN_ID,
59426 + .groups = (1 << REISER4_REGULAR_FILE),
59427 + .pops = &file_plugin_ops,
59428 + .label = "cryptcompress",
59429 + .desc = "cryptcompress file",
59430 + .linkage = {NULL, NULL}
59431 + },
59432 + .inode_ops = {
59433 + .permission = reiser4_permission_common,
59434 + .setattr = prot_setattr_cryptcompress,
59435 + .getattr = reiser4_getattr_common
59436 + },
59437 + .file_ops = {
59438 + .llseek = generic_file_llseek,
59439 + .read = prot_read_cryptcompress,
59440 + .write = prot_write_cryptcompress,
59441 + .aio_read = generic_file_aio_read,
59442 + .mmap = prot_mmap_cryptcompress,
59443 + .release = prot_release_cryptcompress,
59444 + .fsync = reiser4_sync_common,
59445 + .sendfile = prot_sendfile_cryptcompress
59446 + },
59447 + .as_ops = {
59448 + .writepage = reiser4_writepage,
59449 + .readpage = readpage_cryptcompress,
59450 + .sync_page = block_sync_page,
59451 + .writepages = writepages_cryptcompress,
59452 + .set_page_dirty = reiser4_set_page_dirty,
59453 + .readpages = readpages_cryptcompress,
59454 + .prepare_write = prepare_write_common,
59455 + .invalidatepage = reiser4_invalidatepage,
59456 + .releasepage = reiser4_releasepage
59457 + },
59458 + .write_sd_by_inode = write_sd_by_inode_common,
59459 + .flow_by_inode = flow_by_inode_cryptcompress,
59460 + .key_by_inode = key_by_inode_cryptcompress,
59461 + .set_plug_in_inode = set_plug_in_inode_common,
59462 + .adjust_to_parent = adjust_to_parent_cryptcompress,
59463 + .create_object = create_cryptcompress,
59464 + .open_object = open_object_cryptcompress,
59465 + .delete_object = delete_object_cryptcompress,
59466 + .add_link = reiser4_add_link_common,
59467 + .rem_link = reiser4_rem_link_common,
59468 + .owns_item = owns_item_common,
59469 + .can_add_link = can_add_link_common,
59470 + .detach = dummyop,
59471 + .bind = dummyop,
59472 + .safelink = safelink_common,
59473 + .estimate = {
59474 + .create = estimate_create_common,
59475 + .update = estimate_update_common,
59476 + .unlink = estimate_unlink_common
59477 + },
59478 + .init_inode_data = init_inode_data_cryptcompress,
59479 + .cut_tree_worker = cut_tree_worker_cryptcompress,
59480 + .destroy_inode = destroy_inode_cryptcompress,
59481 + .wire = {
59482 + .write = wire_write_common,
59483 + .read = wire_read_common,
59484 + .get = wire_get_common,
59485 + .size = wire_size_common,
59486 + .done = wire_done_common
59487 + }
59488 + }
59489 +};
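+/*
+ * Usage sketch (hypothetical helper; the wrapper name is an assumption):
+ * entries of file_plugins[] above are reached through the checked lookup
+ * defined in plugin.c:
+ */
+#if 0
+static file_plugin *example_lookup_unix_file_plugin(void)
+{
+	reiser4_plugin *plug;
+
+	plug = plugin_by_unsafe_id(REISER4_FILE_PLUGIN_TYPE,
+				   UNIX_FILE_PLUGIN_ID);
+	/* cast is valid: every plugin shares the address of its header */
+	return (file_plugin *)plug;
+}
+#endif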
59490 +
59491 +static int change_dir(struct inode *inode,
59492 + reiser4_plugin * plugin,
59493 + pset_member memb)
59494 +{
59495 + /* cannot change dir plugin of already existing object */
59496 + return RETERR(-EINVAL);
59497 +}
59498 +
59499 +static reiser4_plugin_ops dir_plugin_ops = {
59500 + .change = change_dir
59501 +};
59502 +
59503 +/*
59504 + * definition of directory plugins
59505 + */
59506 +
59507 +dir_plugin dir_plugins[LAST_DIR_ID] = {
59508 + /* standard hashed directory plugin */
59509 + [HASHED_DIR_PLUGIN_ID] = {
59510 + .h = {
59511 + .type_id = REISER4_DIR_PLUGIN_TYPE,
59512 + .id = HASHED_DIR_PLUGIN_ID,
59513 + .pops = &dir_plugin_ops,
59514 + .label = "dir",
59515 + .desc = "hashed directory",
59516 + .linkage = {NULL, NULL}
59517 + },
59518 + .inode_ops = {
59519 + .create = reiser4_create_common,
59520 + .lookup = reiser4_lookup_common,
59521 + .link = reiser4_link_common,
59522 + .unlink = reiser4_unlink_common,
59523 + .symlink = reiser4_symlink_common,
59524 + .mkdir = reiser4_mkdir_common,
59525 + .rmdir = reiser4_unlink_common,
59526 + .mknod = reiser4_mknod_common,
59527 + .rename = reiser4_rename_common,
59528 + .permission = reiser4_permission_common,
59529 + .setattr = reiser4_setattr_common,
59530 + .getattr = reiser4_getattr_common
59531 + },
59532 + .file_ops = {
59533 + .llseek = reiser4_llseek_dir_common,
59534 + .read = generic_read_dir,
59535 + .readdir = reiser4_readdir_common,
59536 + .release = reiser4_release_dir_common,
59537 + .fsync = reiser4_sync_common
59538 + },
59539 + .as_ops = {
59540 + .writepage = bugop,
59541 + .sync_page = bugop,
59542 + .writepages = dummyop,
59543 + .set_page_dirty = bugop,
59544 + .readpages = bugop,
59545 + .prepare_write = bugop,
59546 + .commit_write = bugop,
59547 + .bmap = bugop,
59548 + .invalidatepage = bugop,
59549 + .releasepage = bugop
59550 + },
59551 + .get_parent = get_parent_common,
59552 + .is_name_acceptable = is_name_acceptable_common,
59553 + .build_entry_key = build_entry_key_hashed,
59554 + .build_readdir_key = build_readdir_key_common,
59555 + .add_entry = reiser4_add_entry_common,
59556 + .rem_entry = reiser4_rem_entry_common,
59557 + .init = reiser4_dir_init_common,
59558 + .done = reiser4_dir_done_common,
59559 + .attach = reiser4_attach_common,
59560 + .detach = reiser4_detach_common,
59561 + .estimate = {
59562 + .add_entry = estimate_add_entry_common,
59563 + .rem_entry = estimate_rem_entry_common,
59564 + .unlink = dir_estimate_unlink_common
59565 + }
59566 + },
59567 + /* hashed directory for which seekdir/telldir are guaranteed to
59568 + * work. Brain-damage. */
59569 + [SEEKABLE_HASHED_DIR_PLUGIN_ID] = {
59570 + .h = {
59571 + .type_id = REISER4_DIR_PLUGIN_TYPE,
59572 + .id = SEEKABLE_HASHED_DIR_PLUGIN_ID,
59573 + .pops = &dir_plugin_ops,
59574 + .label = "dir32",
59575 + .desc = "directory hashed with 31 bit hash",
59576 + .linkage = {NULL, NULL}
59577 + },
59578 + .inode_ops = {
59579 + .create = reiser4_create_common,
59580 + .lookup = reiser4_lookup_common,
59581 + .link = reiser4_link_common,
59582 + .unlink = reiser4_unlink_common,
59583 + .symlink = reiser4_symlink_common,
59584 + .mkdir = reiser4_mkdir_common,
59585 + .rmdir = reiser4_unlink_common,
59586 + .mknod = reiser4_mknod_common,
59587 + .rename = reiser4_rename_common,
59588 + .permission = reiser4_permission_common,
59589 + .setattr = reiser4_setattr_common,
59590 + .getattr = reiser4_getattr_common
59591 + },
59592 + .file_ops = {
59593 + .llseek = reiser4_llseek_dir_common,
59594 + .read = generic_read_dir,
59595 + .readdir = reiser4_readdir_common,
59596 + .release = reiser4_release_dir_common,
59597 + .fsync = reiser4_sync_common
59598 + },
59599 + .as_ops = {
59600 + .writepage = bugop,
59601 + .sync_page = bugop,
59602 + .writepages = dummyop,
59603 + .set_page_dirty = bugop,
59604 + .readpages = bugop,
59605 + .prepare_write = bugop,
59606 + .commit_write = bugop,
59607 + .bmap = bugop,
59608 + .invalidatepage = bugop,
59609 + .releasepage = bugop
59610 + },
59611 + .get_parent = get_parent_common,
59612 + .is_name_acceptable = is_name_acceptable_common,
59613 + .build_entry_key = build_entry_key_seekable,
59614 + .build_readdir_key = build_readdir_key_common,
59615 + .add_entry = reiser4_add_entry_common,
59616 + .rem_entry = reiser4_rem_entry_common,
59617 + .init = reiser4_dir_init_common,
59618 + .done = reiser4_dir_done_common,
59619 + .attach = reiser4_attach_common,
59620 + .detach = reiser4_detach_common,
59621 + .estimate = {
59622 + .add_entry = estimate_add_entry_common,
59623 + .rem_entry = estimate_rem_entry_common,
59624 + .unlink = dir_estimate_unlink_common
59625 + }
59626 + }
59627 +};
59628 +
59629 +/* Make Linus happy.
59630 + Local variables:
59631 + c-indentation-style: "K&R"
59632 + mode-name: "LC"
59633 + c-basic-offset: 8
59634 + tab-width: 8
59635 + fill-column: 120
59636 + End:
59637 +*/
59638 diff --git a/fs/reiser4/plugin/object.h b/fs/reiser4/plugin/object.h
59639 new file mode 100644
59640 index 0000000..440c369
59641 --- /dev/null
59642 +++ b/fs/reiser4/plugin/object.h
59643 @@ -0,0 +1,121 @@
59644 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by
59645 + * reiser4/README */
59646 +
59647 +/* Declaration of object plugin functions. */
59648 +
59649 +#if !defined( __FS_REISER4_PLUGIN_OBJECT_H__ )
59650 +#define __FS_REISER4_PLUGIN_OBJECT_H__
59651 +
59652 +#include "../type_safe_hash.h"
59653 +
59654 +/* common implementations of inode operations */
59655 +int reiser4_create_common(struct inode *parent, struct dentry *dentry,
59656 + int mode, struct nameidata *);
59657 +struct dentry * reiser4_lookup_common(struct inode *parent,
59658 + struct dentry *dentry,
59659 + struct nameidata *nameidata);
59660 +int reiser4_link_common(struct dentry *existing, struct inode *parent,
59661 + struct dentry *newname);
59662 +int reiser4_unlink_common(struct inode *parent, struct dentry *victim);
59663 +int reiser4_mkdir_common(struct inode *parent, struct dentry *dentry, int mode);
59664 +int reiser4_symlink_common(struct inode *parent, struct dentry *dentry,
59665 + const char *linkname);
59666 +int reiser4_mknod_common(struct inode *parent, struct dentry *dentry,
59667 + int mode, dev_t rdev);
59668 +int reiser4_rename_common(struct inode *old_dir, struct dentry *old_name,
59669 + struct inode *new_dir, struct dentry *new_name);
59670 +void *reiser4_follow_link_common(struct dentry *, struct nameidata *data);
59671 +int reiser4_permission_common(struct inode *, int mask,
59672 + struct nameidata *nameidata);
59673 +int reiser4_setattr_common(struct dentry *, struct iattr *);
59674 +int reiser4_getattr_common(struct vfsmount *mnt, struct dentry *,
59675 + struct kstat *);
59676 +
59677 +/* common implementations of file operations */
59678 +loff_t reiser4_llseek_dir_common(struct file *, loff_t off, int origin);
59679 +int reiser4_readdir_common(struct file *, void *dirent, filldir_t);
59680 +int reiser4_release_dir_common(struct inode *, struct file *);
59681 +int reiser4_sync_common(struct file *, struct dentry *, int datasync);
59682 +
59683 +/* common implementations of address space operations */
59684 +int prepare_write_common(struct file *, struct page *, unsigned from,
59685 + unsigned to);
59686 +
59687 +/* file plugin operations: common implementations */
59688 +int write_sd_by_inode_common(struct inode *);
59689 +int key_by_inode_and_offset_common(struct inode *, loff_t, reiser4_key *);
59690 +int set_plug_in_inode_common(struct inode *object, struct inode *parent,
59691 + reiser4_object_create_data *);
59692 +int adjust_to_parent_common(struct inode *object, struct inode *parent,
59693 + struct inode *root);
59694 +int adjust_to_parent_common_dir(struct inode *object, struct inode *parent,
59695 + struct inode *root);
59696 +int adjust_to_parent_cryptcompress(struct inode *object, struct inode *parent,
59697 + struct inode *root);
59698 +int reiser4_create_object_common(struct inode *object, struct inode *parent,
59699 + reiser4_object_create_data *);
59700 +int reiser4_delete_object_common(struct inode *);
59701 +int reiser4_delete_dir_common(struct inode *);
59702 +int reiser4_add_link_common(struct inode *object, struct inode *parent);
59703 +int reiser4_rem_link_common(struct inode *object, struct inode *parent);
59704 +int rem_link_common_dir(struct inode *object, struct inode *parent);
59705 +int owns_item_common(const struct inode *, const coord_t *);
59706 +int owns_item_common_dir(const struct inode *, const coord_t *);
59707 +int can_add_link_common(const struct inode *);
59708 +int can_rem_link_common_dir(const struct inode *);
59709 +int reiser4_detach_common_dir(struct inode *child, struct inode *parent);
59710 +int reiser4_bind_common_dir(struct inode *child, struct inode *parent);
59711 +int safelink_common(struct inode *, reiser4_safe_link_t, __u64 value);
59712 +reiser4_block_nr estimate_create_common(const struct inode *);
59713 +reiser4_block_nr estimate_create_common_dir(const struct inode *);
59714 +reiser4_block_nr estimate_update_common(const struct inode *);
59715 +reiser4_block_nr estimate_unlink_common(const struct inode *,
59716 + const struct inode *);
59717 +reiser4_block_nr estimate_unlink_common_dir(const struct inode *,
59718 + const struct inode *);
59719 +char *wire_write_common(struct inode *, char *start);
59720 +char *wire_read_common(char *addr, reiser4_object_on_wire *);
59721 +struct dentry *wire_get_common(struct super_block *, reiser4_object_on_wire *);
59722 +int wire_size_common(struct inode *);
59723 +void wire_done_common(reiser4_object_on_wire *);
59724 +
59725 +/* dir plugin operations: common implementations */
59726 +struct dentry *get_parent_common(struct inode *child);
59727 +int is_name_acceptable_common(const struct inode *, const char *name, int len);
59728 +void build_entry_key_common(const struct inode *,
59729 + const struct qstr *qname, reiser4_key *);
59730 +int build_readdir_key_common(struct file *dir, reiser4_key *);
59731 +int reiser4_add_entry_common(struct inode *object, struct dentry *where,
59732 + reiser4_object_create_data *, reiser4_dir_entry_desc *);
59733 +int reiser4_rem_entry_common(struct inode *object, struct dentry *where,
59734 + reiser4_dir_entry_desc *);
59735 +int reiser4_dir_init_common(struct inode *object, struct inode *parent,
59736 + reiser4_object_create_data *);
59737 +int reiser4_dir_done_common(struct inode *);
59738 +int reiser4_attach_common(struct inode *child, struct inode *parent);
59739 +int reiser4_detach_common(struct inode *object, struct inode *parent);
59740 +reiser4_block_nr estimate_add_entry_common(const struct inode *);
59741 +reiser4_block_nr estimate_rem_entry_common(const struct inode *);
59742 +reiser4_block_nr dir_estimate_unlink_common(const struct inode *,
59743 + const struct inode *);
59744 +
59745 +/* these are essential parts of the common implementations; they exist to
59746 + make customized implementations easier */
59747 +int do_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
59748 +
59749 +/* merely useful functions */
59750 +int lookup_sd(struct inode *, znode_lock_mode, coord_t *, lock_handle *,
59751 + const reiser4_key *, int silent);
59752 +
59753 +/* __FS_REISER4_PLUGIN_OBJECT_H__ */
59754 +#endif
59755 +
59756 +/* Make Linus happy.
59757 + Local variables:
59758 + c-indentation-style: "K&R"
59759 + mode-name: "LC"
59760 + c-basic-offset: 8
59761 + tab-width: 8
59762 + fill-column: 120
59763 + End:
59764 +*/
59765 diff --git a/fs/reiser4/plugin/plugin.c b/fs/reiser4/plugin/plugin.c
59766 new file mode 100644
59767 index 0000000..8261878
59768 --- /dev/null
59769 +++ b/fs/reiser4/plugin/plugin.c
59770 @@ -0,0 +1,578 @@
59771 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
59772 + * reiser4/README */
59773 +
59774 +/* Basic plugin infrastructure, lookup etc. */
59775 +
59776 +/* PLUGINS:
59777 +
59778 + Plugins are internal Reiser4 "modules" or "objects" used to increase
59779 + extensibility and allow external users to easily adapt reiser4 to
59780 + their needs.
59781 +
59782 + Plugins are classified into several disjoint "types". Plugins
59783 + belonging to a particular plugin type are termed "instances" of
59784 + this type. Currently the following types are present:
59785 +
59786 + . object plugin
59787 + . hash plugin
59788 + . tail plugin
59789 + . perm plugin
59790 + . item plugin
59791 + . node layout plugin
59792 +
59793 +NIKITA-FIXME-HANS: update this list, and review this entire comment for currency
59794 +
59795 + An object (file) plugin determines how a given file-system object serves
59796 + standard VFS requests for read, write, seek, mmap, etc. Instances of
59797 + file plugins are: regular file, directory, symlink. Another example
59798 + of a file plugin is an audit plugin, which optionally records accesses to
59799 + the underlying object and forwards requests to it.
59800 +
59801 + Hash plugins compute hashes used by reiser4 to store and locate
59802 + files within directories. Instances of hash plugin type are: r5,
59803 + tea, rupasov.
59804 +
59805 + Tail plugins (or, more precisely, tail policy plugins) determine
59806 + when the last part of a file should be stored in a formatted item.
59807 +
59808 + Perm plugins control permissions granted for a process accessing a file.
59809 +
59810 + Scope and lookup:
59811 +
59812 + Each plugin type and each plugin proper has a label, chosen so that the
59813 + pair ( type_label, plugin_label ) is unique. This pair is a globally
59814 + persistent and user-visible plugin identifier. Internally the kernel
59815 + maintains plugins and plugin types in arrays, using an index into those
59816 + arrays as the plugin and plugin type identifiers. The file-system, in
59817 + turn, also maintains a persistent "dictionary" mapping each plugin
59818 + label to the numerical identifier stored in file-system objects. That is, we
59819 + store the offset into the plugin array for that plugin type as the
59820 + plugin id in the stat data of the filesystem object.
59821 +
59822 + plugin_labels have meaning for the user interface that assigns
59823 + plugins to files, and may someday have meaning for dynamic loading of
59824 + plugins and for copying of plugins from one fs instance to
59825 + another by utilities like cp and tar.
59826 +
59827 + The internal kernel plugin type identifier (an index into the plugins[]
59828 + array) is of type reiser4_plugin_type. The set of available plugin types
59829 + is currently static, but dynamic loading doesn't seem to pose
59830 + insurmountable problems.
59831 +
59832 + Within each type plugins are addressed by the identifiers of type
59833 + reiser4_plugin_id (indices in
59834 + reiser4_plugin_type_data.builtin[]). Such identifiers are only
59835 + required to be unique within one type, not globally.
59836 +
59837 + Thus, plugin in memory is uniquely identified by the pair (type_id,
59838 + id).
59839 +
59840 + Usage:
59841 +
59842 + There exists only one instance of each plugin, but this
59843 + single instance can be associated with many entities (file-system
59844 + objects, items, nodes, transactions, file-descriptors, etc.). An entity
59845 + to which a plugin of a given type is attached is termed (due to the lack
59846 + of imagination) the "subject" of this plugin type and, by abuse of
59847 + terminology, the subject of the particular instance of this type to
59848 + which it is currently attached. For example, an inode is a subject of
59849 + the object plugin type. An inode representing a directory is a subject
59850 + of the directory plugin, of the hash plugin type, and of some particular
59851 + instance of the hash plugin type. An inode representing a regular file
59852 + is a subject of the "regular file" plugin, the tail-policy plugin type, etc.
59853 +
59854 + With each subject the plugin possibly stores some state. For example,
59855 + the state of a directory plugin (an instance of the object plugin type) is
59856 + a pointer to a hash plugin (if directories always use hashing, that is).
59857 + The state of an audit plugin is the file descriptor (struct file) of its
59858 + log file, or some magic value to do logging through printk().
59859 +
59860 + Interface:
59861 +
59862 + In addition to a scalar identifier, each plugin type and plugin
59863 + proper has a "label" (a short string) and a "description" (a longer
59864 + descriptive string). Labels and descriptions of plugin types are
59865 + hard-coded into the plugins[] array, declared and defined in
59866 + plugin.c. The label and description of a plugin are stored in the .label
59867 + and .desc fields of reiser4_plugin_header respectively. It is possible
59868 + to locate a plugin by the pair of labels.
59869 +
59870 + Features:
59871 +
59872 + . user-level plugin manipulations:
59873 + + reiser4("filename/..file_plugin<='audit'");
59874 + + write(open("filename/..file_plugin"), "audit", 8);
59875 +
59876 + . user level utilities lsplug and chplug to manipulate plugins.
59877 + Utilities are not of primary priority. Possibly they will not be
59878 + working in v4.0.
59879 +
59880 +NIKITA-FIXME-HANS: this should be a mkreiserfs option not a mount option, do you agree? I don't think that specifying it at mount time, and then changing it with each mount, is a good model for usage.
59881 +
59882 + . mount option "plug" to set-up plugins of root-directory.
59883 + "plug=foo:bar" will set "bar" as default plugin of type "foo".
59884 +
59885 + Limitations:
59886 +
59887 + . each plugin type has to provide at least one builtin
59888 + plugin. This is a technical limitation and it can be lifted in the
59889 + future.
59890 +
59891 + TODO:
59892 +
59893 + New plugin types/plugins:
59894 + Things we should be able to separately choose to inherit:
59895 +
59896 + security plugins
59897 +
59898 + stat data
59899 +
59900 + file bodies
59901 +
59902 + file plugins
59903 +
59904 + dir plugins
59905 +
59906 + . perm:acl
59907 +
59908 + . audit---audit plugin intercepting and possibly logging all
59909 + accesses to an object. Requires putting stub functions in file_operations
59910 + instead of generic_file_*.
59911 +
59912 +NIKITA-FIXME-HANS: why make overflows a plugin?
59913 + . over---handle hash overflows
59914 +
59915 + . sqnt---handle different access patterns and instrument read-ahead
59916 +
59917 +NIKITA-FIXME-HANS: describe the line below in more detail.
59918 +
59919 + . hier---handle inheritance of plugins along file-system hierarchy
59920 +
59921 + Different kinds of inheritance: on creation vs. on access.
59922 + Compatible/incompatible plugins.
59923 + Inheritance for multi-linked files.
59924 + Layered plugins.
59925 + Notion of plugin context is abandoned.
59926 +
59927 + Each file is associated
59928 + with one plugin, and dependent plugins (hash, etc.) are stored as
59929 + main plugin state. Now, if we have plugins used for regular files
59930 + but not for directories, how would such plugins be inherited?
59931 + . always store them with directories also
59932 +
59933 +NIKITA-FIXME-HANS: Do the line above. It is not exclusive of doing the line below, which is also useful.
59934 +
59935 + . use inheritance hierarchy, independent of file-system namespace
59936 +
59937 +*/
59938 +
59939 +#include "../debug.h"
59940 +#include "../dformat.h"
59941 +#include "plugin_header.h"
59942 +#include "item/static_stat.h"
59943 +#include "node/node.h"
59944 +#include "security/perm.h"
59945 +#include "space/space_allocator.h"
59946 +#include "disk_format/disk_format.h"
59947 +#include "plugin.h"
59948 +#include "../reiser4.h"
59949 +#include "../jnode.h"
59950 +#include "../inode.h"
59951 +
59952 +#include <linux/fs.h> /* for struct super_block */
59953 +
59954 +/* public interface */
59955 +
59956 +/* initialize the plugin sub-system. Just call this once on reiser4 startup. */
59957 +int init_plugins(void);
59958 +int setup_plugins(struct super_block *super, reiser4_plugin ** area);
59959 +int locate_plugin(struct inode *inode, plugin_locator * loc);
59960 +
59961 +/**
59962 + * init_plugins - initialize plugins
59963 + *
59964 + * Initializes the plugin sub-system. It is part of reiser4 module
59965 + * initialization. For each plugin of each type the init method is called,
59966 + * and each plugin is put onto the list of plugins of its type.
59967 + */
59968 +int init_plugins(void)
59969 +{
59970 + reiser4_plugin_type type_id;
59971 +
59972 + for (type_id = 0; type_id < REISER4_PLUGIN_TYPES; ++type_id) {
59973 + reiser4_plugin_type_data *ptype;
59974 + int i;
59975 +
59976 + ptype = &plugins[type_id];
59977 + assert("nikita-3508", ptype->label != NULL);
59978 + assert("nikita-3509", ptype->type_id == type_id);
59979 +
59980 + INIT_LIST_HEAD(&ptype->plugins_list);
59981 +/* NIKITA-FIXME-HANS: change builtin_num to some other name lacking the term builtin. */
59982 + for (i = 0; i < ptype->builtin_num; ++i) {
59983 + reiser4_plugin *plugin;
59984 +
59985 + plugin = plugin_at(ptype, i);
59986 +
59987 + if (plugin->h.label == NULL)
59988 + /* uninitialized slot encountered */
59989 + continue;
59990 + assert("nikita-3445", plugin->h.type_id == type_id);
59991 + plugin->h.id = i;
59992 + if (plugin->h.pops != NULL &&
59993 + plugin->h.pops->init != NULL) {
59994 + int result;
59995 +
59996 + result = plugin->h.pops->init(plugin);
59997 + if (result != 0)
59998 + return result;
59999 + }
60000 + INIT_LIST_HEAD(&plugin->h.linkage);
60001 + list_add_tail(&plugin->h.linkage, &ptype->plugins_list);
60002 + }
60003 + }
60004 + return 0;
60005 +}
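+/*
+ * Usage sketch (the caller name below is hypothetical; the real call site
+ * is in the reiser4 module initialization code):
+ */
+#if 0
+static int __init example_reiser4_init(void)
+{
+	int result;
+
+	/* must run before any plugin lookup is attempted */
+	result = init_plugins();
+	if (result != 0)
+		return result;
+	/* ... register the filesystem, etc. ... */
+	return 0;
+}
+#endif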
60006 +
60007 +/* true if plugin type id is valid */
60008 +int is_plugin_type_valid(reiser4_plugin_type type)
60009 +{
60010 + /* "type" is unsigned, so no comparison with 0 is
60011 + necessary */
60012 + return (type < REISER4_PLUGIN_TYPES);
60013 +}
60014 +
60015 +/* true if plugin id is valid */
60016 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id)
60017 +{
60018 + assert("nikita-1653", is_plugin_type_valid(type));
60019 + return id < plugins[type].builtin_num;
60020 +}
60021 +
60022 +/* return plugin by its @type and @id.
60023 +
60024 + Both arguments are checked for validity: this is supposed to be called
60025 + from user-level.
60026 +
60027 +NIKITA-FIXME-HANS: Do you instead mean that this checks ids created in
60028 +user space, and passed to the filesystem by use of method files? Your
60029 +comment really confused me on the first reading....
60030 +
60031 +*/
60032 +reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type /* plugin type
60033 + * unchecked */,
60034 + reiser4_plugin_id id /* plugin id,
60035 + * unchecked */)
60036 +{
60037 + if (is_plugin_type_valid(type)) {
60038 + if (is_plugin_id_valid(type, id))
60039 + return plugin_at(&plugins[type], id);
60040 + else
60041 + /* id out of bounds */
60042 + warning("nikita-2913",
60043 + "Invalid plugin id: [%i:%i]", type, id);
60044 + } else
60045 + /* type_id out of bounds */
60046 + warning("nikita-2914", "Invalid type_id: %i", type);
60047 + return NULL;
60048 +}
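+/*
+ * Example (sketch; the wrapper name is hypothetical): resolving an
+ * untrusted pair, e.g. one decoded from disk or handed in from user space:
+ */
+#if 0
+static reiser4_plugin *example_resolve(reiser4_plugin_type type,
+				       reiser4_plugin_id id)
+{
+	reiser4_plugin *plug;
+
+	plug = plugin_by_unsafe_id(type, id);
+	if (plug == NULL)
+		return NULL;	/* warning already printed by the lookup */
+	return plug;
+}
+#endif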
60049 +
60050 +/**
60051 + * save_plugin_id - store plugin id in disk format
60052 + * @plugin: plugin to convert
60053 + * @area: where to store result
60054 + *
60055 + * Puts id of @plugin in little endian format to address @area.
60056 + */
60057 +int save_plugin_id(reiser4_plugin *plugin /* plugin to convert */ ,
60058 + d16 *area /* where to store result */ )
60059 +{
60060 + assert("nikita-1261", plugin != NULL);
60061 + assert("nikita-1262", area != NULL);
60062 +
60063 + put_unaligned(cpu_to_le16(plugin->h.id), area);
60064 + return 0;
60065 +}
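+/*
+ * The inverse direction (sketch; assumes d16 is a little endian 16-bit
+ * on-disk type, as the cpu_to_le16() above implies, and the helper name
+ * is hypothetical):
+ */
+#if 0
+static reiser4_plugin *example_load_plugin_id(reiser4_plugin_type type,
+					      const d16 *area)
+{
+	reiser4_plugin_id id = le16_to_cpu(get_unaligned(area));
+
+	return plugin_by_unsafe_id(type, id);
+}
+#endif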
60066 +
60067 +/* list of all plugins of given type */
60068 +struct list_head *get_plugin_list(reiser4_plugin_type type)
60069 +{
60070 + assert("nikita-1056", is_plugin_type_valid(type));
60071 + return &plugins[type].plugins_list;
60072 +}
60073 +
60074 +static void update_pset_mask(reiser4_inode * info, pset_member memb)
60075 +{
60076 + struct dentry *rootdir;
60077 + reiser4_inode *root;
60078 +
60079 + assert("edward-1443", memb != PSET_FILE);
60080 +
60081 + rootdir = inode_by_reiser4_inode(info)->i_sb->s_root;
60082 + if (rootdir != NULL) {
60083 + root = reiser4_inode_data(rootdir->d_inode);
60084 + /*
60085 + * if the inode's plugin differs from the default one, or we are
60086 + * changing a plugin of the root directory, update plugin_mask
60087 + */
60088 + if (aset_get(info->pset, memb) !=
60089 + aset_get(root->pset, memb) ||
60090 + info == root)
60091 + info->plugin_mask |= (1 << memb);
60092 + else
60093 + info->plugin_mask &= ~(1 << memb);
60094 + }
60095 +}
60096 +
60097 +/* Get the specified plugin set member from the parent,
60098 + or from the fs defaults (if no parent is given), and
60099 + install the result into the pset of @self */
60100 +int grab_plugin_pset(struct inode *self,
60101 + struct inode *ancestor,
60102 + pset_member memb)
60103 +{
60104 + reiser4_plugin *plug;
60105 + reiser4_inode *info;
60106 + int result = 0;
60107 +
60108 + /* Do not grab if initialized already. */
60109 + info = reiser4_inode_data(self);
60110 + if (aset_get(info->pset, memb) != NULL)
60111 + return 0;
60112 + if (ancestor) {
60113 + reiser4_inode *parent;
60114 +
60115 + parent = reiser4_inode_data(ancestor);
60116 + plug = aset_get(parent->hset, memb) ? :
60117 + aset_get(parent->pset, memb);
60118 + }
60119 + else
60120 + plug = get_default_plugin(memb);
60121 +
60122 + result = set_plugin(&info->pset, memb, plug);
60123 + if (result == 0) {
60124 + if (!ancestor || self->i_sb->s_root->d_inode != self)
60125 + update_pset_mask(info, memb);
60126 + }
60127 + return result;
60128 +}
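+/*
+ * Typical use (sketch; the helper name is hypothetical): when setting up a
+ * new @object created under @parent, pull each still-missing member from
+ * the parent, or from the fs defaults:
+ */
+#if 0
+static int example_inherit_pair(struct inode *object, struct inode *parent)
+{
+	int result;
+
+	result = grab_plugin_pset(object, parent, PSET_FORMATTING);
+	if (result == 0)
+		result = grab_plugin_pset(object, parent, PSET_HASH);
+	return result;
+}
+#endif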
60129 +
60130 +/* Take missing pset members from root inode */
60131 +int finish_pset(struct inode *inode)
60132 +{
60133 + reiser4_plugin *plug;
60134 + reiser4_inode *root;
60135 + reiser4_inode *info;
60136 + pset_member memb;
60137 + int result = 0;
60138 +
60139 + root = reiser4_inode_data(inode->i_sb->s_root->d_inode);
60140 + info = reiser4_inode_data(inode);
60141 +
60142 + assert("edward-1455", root != NULL);
60143 + assert("edward-1456", info != NULL);
60144 +
60145 + /* file and directory plugins are already initialized. */
60146 + for (memb = PSET_DIR + 1; memb < PSET_LAST; ++memb) {
60147 +
60148 + /* Do not grab if initialized already. */
60149 + if (aset_get(info->pset, memb) != NULL)
60150 + continue;
60151 +
60152 + plug = aset_get(root->pset, memb);
60153 + result = set_plugin(&info->pset, memb, plug);
60154 + if (result != 0)
60155 + break;
60156 + }
60157 + if (result != 0) {
60158 + warning("nikita-3447",
60159 + "Cannot set up plugins for %lli",
60160 + (unsigned long long)
60161 + get_inode_oid(inode));
60162 + }
60163 + return result;
60164 +}
60165 +
60166 +int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin * plug)
60167 +{
60168 + reiser4_inode *info;
60169 + int result = 0;
60170 +
60171 + if (!self->i_sb->s_root || self->i_sb->s_root->d_inode == self) {
60172 + /* Changing pset in the root object. */
60173 + return RETERR(-EINVAL);
60174 + }
60175 +
60176 + info = reiser4_inode_data(self);
60177 + if (plug->h.pops != NULL && plug->h.pops->change != NULL)
60178 + result = plug->h.pops->change(self, plug, memb);
60179 + else
60180 + result = aset_set_unsafe(&info->pset, memb, plug);
60181 + if (result == 0) {
60182 + __u16 oldmask = info->plugin_mask;
60183 +
60184 + update_pset_mask(info, memb);
60185 + if (oldmask != info->plugin_mask)
60186 + reiser4_inode_clr_flag(self, REISER4_SDLEN_KNOWN);
60187 + }
60188 + return result;
60189 +}
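+/*
+ * Example (sketch; the helper name is hypothetical): forcing a new
+ * compression mode plugin onto an inode. The new plugin's ->change() hook
+ * (for file plugins, change_file() in object.c) gets the final say, so
+ * e.g. PSET_FILE of an existing object cannot be replaced this way:
+ */
+#if 0
+static int example_set_compression_mode(struct inode *inode,
+					reiser4_plugin_id id)
+{
+	reiser4_plugin *plug;
+
+	plug = plugin_by_unsafe_id(REISER4_COMPRESSION_MODE_PLUGIN_TYPE, id);
+	if (plug == NULL)
+		return RETERR(-EINVAL);
+	return force_plugin_pset(inode, PSET_COMPRESSION_MODE, plug);
+}
+#endif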
60190 +
60191 +reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES] = {
60192 + /* C99 designated initializers */
60193 + [REISER4_FILE_PLUGIN_TYPE] = {
60194 + .type_id = REISER4_FILE_PLUGIN_TYPE,
60195 + .label = "file",
60196 + .desc = "Object plugins",
60197 + .builtin_num = sizeof_array(file_plugins),
60198 + .builtin = file_plugins,
60199 + .plugins_list = {NULL, NULL},
60200 + .size = sizeof(file_plugin)
60201 + },
60202 + [REISER4_DIR_PLUGIN_TYPE] = {
60203 + .type_id = REISER4_DIR_PLUGIN_TYPE,
60204 + .label = "dir",
60205 + .desc = "Directory plugins",
60206 + .builtin_num = sizeof_array(dir_plugins),
60207 + .builtin = dir_plugins,
60208 + .plugins_list = {NULL, NULL},
60209 + .size = sizeof(dir_plugin)
60210 + },
60211 + [REISER4_HASH_PLUGIN_TYPE] = {
60212 + .type_id = REISER4_HASH_PLUGIN_TYPE,
60213 + .label = "hash",
60214 + .desc = "Directory hashes",
60215 + .builtin_num = sizeof_array(hash_plugins),
60216 + .builtin = hash_plugins,
60217 + .plugins_list = {NULL, NULL},
60218 + .size = sizeof(hash_plugin)
60219 + },
60220 + [REISER4_FIBRATION_PLUGIN_TYPE] = {
60221 + .type_id = REISER4_FIBRATION_PLUGIN_TYPE,
60223 + .label = "fibration",
60224 + .desc = "Directory fibrations",
60225 + .builtin_num = sizeof_array(fibration_plugins),
60226 + .builtin = fibration_plugins,
60227 + .plugins_list = {NULL, NULL},
60228 + .size = sizeof(fibration_plugin)
60229 + },
60230 + [REISER4_CIPHER_PLUGIN_TYPE] = {
60231 + .type_id = REISER4_CIPHER_PLUGIN_TYPE,
60232 + .label = "cipher",
60233 + .desc = "Cipher plugins",
60234 + .builtin_num = sizeof_array(cipher_plugins),
60235 + .builtin = cipher_plugins,
60236 + .plugins_list = {NULL, NULL},
60237 + .size = sizeof(cipher_plugin)
60238 + },
60239 + [REISER4_DIGEST_PLUGIN_TYPE] = {
60240 + .type_id = REISER4_DIGEST_PLUGIN_TYPE,
60241 + .label = "digest",
60242 + .desc = "Digest plugins",
60243 + .builtin_num = sizeof_array(digest_plugins),
60244 + .builtin = digest_plugins,
60245 + .plugins_list = {NULL, NULL},
60246 + .size = sizeof(digest_plugin)
60247 + },
60248 + [REISER4_COMPRESSION_PLUGIN_TYPE] = {
60249 + .type_id = REISER4_COMPRESSION_PLUGIN_TYPE,
60250 + .label = "compression",
60251 + .desc = "Compression plugins",
60252 + .builtin_num = sizeof_array(compression_plugins),
60253 + .builtin = compression_plugins,
60254 + .plugins_list = {NULL, NULL},
60255 + .size = sizeof(compression_plugin)
60256 + },
60257 + [REISER4_FORMATTING_PLUGIN_TYPE] = {
60258 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
60259 + .label = "formatting",
60260 + .desc = "Tail inlining policies",
60261 + .builtin_num = sizeof_array(formatting_plugins),
60262 + .builtin = formatting_plugins,
60263 + .plugins_list = {NULL, NULL},
60264 + .size = sizeof(formatting_plugin)
60265 + },
60266 + [REISER4_PERM_PLUGIN_TYPE] = {
60267 + .type_id = REISER4_PERM_PLUGIN_TYPE,
60268 + .label = "perm",
60269 + .desc = "Permission checks",
60270 + .builtin_num = sizeof_array(perm_plugins),
60271 + .builtin = perm_plugins,
60272 + .plugins_list = {NULL, NULL},
60273 + .size = sizeof(perm_plugin)
60274 + },
60275 + [REISER4_ITEM_PLUGIN_TYPE] = {
60276 + .type_id = REISER4_ITEM_PLUGIN_TYPE,
60277 + .label = "item",
60278 + .desc = "Item handlers",
60279 + .builtin_num = sizeof_array(item_plugins),
60280 + .builtin = item_plugins,
60281 + .plugins_list = {NULL, NULL},
60282 + .size = sizeof(item_plugin)
60283 + },
60284 + [REISER4_NODE_PLUGIN_TYPE] = {
60285 + .type_id = REISER4_NODE_PLUGIN_TYPE,
60286 + .label = "node",
60287 + .desc = "node layout handlers",
60288 + .builtin_num = sizeof_array(node_plugins),
60289 + .builtin = node_plugins,
60290 + .plugins_list = {NULL, NULL},
60291 + .size = sizeof(node_plugin)
60292 + },
60293 + [REISER4_SD_EXT_PLUGIN_TYPE] = {
60294 + .type_id = REISER4_SD_EXT_PLUGIN_TYPE,
60295 + .label = "sd_ext",
60296 + .desc = "Parts of stat-data",
60297 + .builtin_num = sizeof_array(sd_ext_plugins),
60298 + .builtin = sd_ext_plugins,
60299 + .plugins_list = {NULL, NULL},
60300 + .size = sizeof(sd_ext_plugin)
60301 + },
60302 + [REISER4_FORMAT_PLUGIN_TYPE] = {
60303 + .type_id = REISER4_FORMAT_PLUGIN_TYPE,
60304 + .label = "disk_layout",
60305 + .desc = "defines filesystem on disk layout",
60306 + .builtin_num = sizeof_array(format_plugins),
60307 + .builtin = format_plugins,
60308 + .plugins_list = {NULL, NULL},
60309 + .size = sizeof(disk_format_plugin)
60310 + },
60311 + [REISER4_JNODE_PLUGIN_TYPE] = {
60312 + .type_id = REISER4_JNODE_PLUGIN_TYPE,
60313 + .label = "jnode",
60314 + .desc = "defines kind of jnode",
60315 + .builtin_num = sizeof_array(jnode_plugins),
60316 + .builtin = jnode_plugins,
60317 + .plugins_list = {NULL, NULL},
60318 + .size = sizeof(jnode_plugin)
60319 + },
60320 + [REISER4_COMPRESSION_MODE_PLUGIN_TYPE] = {
60321 + .type_id = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
60322 + .label = "compression_mode",
60323 + .desc = "Defines compression mode",
60324 + .builtin_num = sizeof_array(compression_mode_plugins),
60325 + .builtin = compression_mode_plugins,
60326 + .plugins_list = {NULL, NULL},
60327 + .size = sizeof(compression_mode_plugin)
60328 + },
60329 + [REISER4_CLUSTER_PLUGIN_TYPE] = {
60330 + .type_id = REISER4_CLUSTER_PLUGIN_TYPE,
60331 + .label = "cluster",
60332 + .desc = "Defines cluster size",
60333 + .builtin_num = sizeof_array(cluster_plugins),
60334 + .builtin = cluster_plugins,
60335 + .plugins_list = {NULL, NULL},
60336 + .size = sizeof(cluster_plugin)
60337 + }
60338 +};
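+/*
+ * Example (sketch; the helper name is hypothetical): walking every
+ * registered plugin of one type through the list that init_plugins()
+ * builds above:
+ */
+#if 0
+static void example_list_hash_plugins(void)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, get_plugin_list(REISER4_HASH_PLUGIN_TYPE)) {
+		reiser4_plugin *plug;
+
+		plug = container_of(pos, reiser4_plugin, h.linkage);
+		printk(KERN_INFO "hash plugin: %s (%s)\n",
+		       plug->h.label, plug->h.desc);
+	}
+}
+#endif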
60339 +
60340 +/*
60341 + * Local variables:
60342 + * c-indentation-style: "K&R"
60343 + * mode-name: "LC"
60344 + * c-basic-offset: 8
60345 + * tab-width: 8
60346 + * fill-column: 120
60347 + * End:
60348 + */
60349 diff --git a/fs/reiser4/plugin/plugin.h b/fs/reiser4/plugin/plugin.h
60350 new file mode 100644
60351 index 0000000..a1d1097
60352 --- /dev/null
60353 +++ b/fs/reiser4/plugin/plugin.h
60354 @@ -0,0 +1,920 @@
60355 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
60356 +
60357 +/* Basic plugin data-types.
60358 + see fs/reiser4/plugin/plugin.c for details */
60359 +
60360 +#if !defined( __FS_REISER4_PLUGIN_TYPES_H__ )
60361 +#define __FS_REISER4_PLUGIN_TYPES_H__
60362 +
60363 +#include "../forward.h"
60364 +#include "../debug.h"
60365 +#include "../dformat.h"
60366 +#include "../key.h"
60367 +#include "compress/compress.h"
60368 +#include "crypto/cipher.h"
60369 +#include "plugin_header.h"
60370 +#include "item/static_stat.h"
60371 +#include "item/internal.h"
60372 +#include "item/sde.h"
60373 +#include "item/cde.h"
60374 +#include "item/item.h"
60375 +#include "node/node.h"
60376 +#include "node/node40.h"
60377 +#include "security/perm.h"
60378 +#include "fibration.h"
60379 +
60380 +#include "space/bitmap.h"
60381 +#include "space/space_allocator.h"
60382 +
60383 +#include "disk_format/disk_format40.h"
60384 +#include "disk_format/disk_format.h"
60385 +
60386 +#include <linux/fs.h> /* for struct super_block, address_space */
60387 +#include <linux/mm.h> /* for struct page */
60388 +#include <linux/buffer_head.h> /* for struct buffer_head */
60389 +#include <linux/dcache.h> /* for struct dentry */
60390 +#include <linux/types.h>
60391 +#include <linux/crypto.h>
60392 +
60393 +typedef struct reiser4_object_on_wire reiser4_object_on_wire;
60394 +
60395 +/*
60396 + * File plugin. Defines the set of methods that file plugins implement, some
60397 + * of which are optional.
60398 + *
60399 + * A file plugin offers to the caller an interface for IO (writing to and/or
60400 + * reading from) what the caller sees as one sequence of bytes. An IO to it
60401 + * may affect more than one physical sequence of bytes, or no physical sequence
60402 + * of bytes, it may affect sequences of bytes offered by other file plugins to
60403 + * the semantic layer, and the file plugin may invoke other plugins and
60404 + * delegate work to them, but its interface is structured for offering the
60405 + * caller the ability to read and/or write what the caller sees as being a
60406 + * single sequence of bytes.
60407 + *
60408 + * The file plugin must present a sequence of bytes to the caller, but it does
60409 + * not necessarily have to store a sequence of bytes; it does not necessarily
60410 + * have to support efficient tree traversal to any offset in the sequence of
60411 + * bytes (tail and extent items, whose keys contain offsets, do however provide
60412 + * efficient non-sequential lookup of any offset in the sequence of bytes).
60413 + *
60414 + * Directory plugins provide methods for selecting file plugins by resolving a
60415 + * name for them.
60416 + *
60417 + * The functionality other filesystems call an attribute, and rigidly tie
60418 + * together, we decompose into orthogonal selectable features of files. Using
60419 + * the terminology we will define next, an attribute is a perhaps constrained,
60420 + * perhaps static length, file whose parent has a uni-count-intra-link to it,
60421 + * which might be grandparent-major-packed, and whose parent has a deletion
60422 + * method that deletes it.
60423 + *
60424 + * File plugins can implement constraints.
60425 + *
60426 + * Files can be of variable length (e.g. regular unix files), or of static
60427 + * length (e.g. static sized attributes).
60428 + *
60429 + * An object may have many sequences of bytes, and many file plugins, but it
60430 + * has exactly one objectid. It is usually desirable that an object has a
60431 + * deletion method which deletes every item with that objectid. Items cannot
60432 + * in general be found by just their objectids. This means that an object must
60433 + * have either a method built into its deletion plugin method for knowing what
60434 + * items need to be deleted, or links stored with the object that provide the
60435 + * plugin with a method for finding those items. Deleting a file within an
60436 + * object may or may not have the effect of deleting the entire object,
60437 + * depending on the file plugin's deletion method.
60438 + *
60439 + * LINK TAXONOMY:
60440 + *
60441 + * Many objects have a reference count, and when the reference count reaches 0
60442 + * the object's deletion method is invoked. Some links embody a reference
60443 + * count increase ("countlinks"), and others do not ("nocountlinks").
60444 + *
60445 + * Some links are bi-directional links ("bilinks"), and some are
60446 + * uni-directional("unilinks").
60447 + *
60448 + * Some links are between parts of the same object ("intralinks"), and some are
60449 + * between different objects ("interlinks").
60450 + *
60451 + * PACKING TAXONOMY:
60452 + *
60453 + * Some items of an object are stored with a major packing locality based on
60454 + * their object's objectid (e.g. unix directory items in plan A), and these are
60455 + * called "self-major-packed".
60456 + *
60457 + * Some items of an object are stored with a major packing locality based on
60458 + * their semantic parent object's objectid (e.g. unix file bodies in plan A),
60459 + * and these are called "parent-major-packed".
60460 + *
60461 + * Some items of an object are stored with a major packing locality based on
60462 + * their semantic grandparent, and these are called "grandparent-major-packed".
60463 + * Now carefully notice that we run into trouble with key length if we have to
60464 + * store a 8 byte major+minor grandparent based packing locality, an 8 byte
60465 + * parent objectid, an 8 byte attribute objectid, and an 8 byte offset, all in
60466 + * a 24 byte key. One of these fields must be sacrificed if an item is to be
60467 + * grandparent-major-packed, and which to sacrifice is left to the item author
60468 + * choosing to make the item grandparent-major-packed. You cannot make tail
60469 + * items and extent items grandparent-major-packed, though you could make them
60470 + * self-major-packed (usually they are parent-major-packed).
60471 + *
60472 + * In the case of ACLs (which are composed of fixed length ACEs which consist
60473 + * of {subject-type, subject, and permission bitmask} triples), it makes sense
60474 + * to not have an offset field in the ACE item key, and to allow duplicate keys
60475 + * for ACEs. Thus, the set of ACEs for a given file is found by looking for a
60476 + * key consisting of the objectid of the grandparent (thus grouping all ACLs in
60477 + * a directory together), the minor packing locality of ACE, the objectid of
60478 + * the file, and 0.
60479 + *
60480 + * IO involves moving data from one location to another, which means that two
60481 + * locations must be specified, source and destination.
60482 + *
60483 + * This source and destination can be in the filesystem, or they can be a
60484 + * pointer in the user process address space plus a byte count.
60485 + *
60486 + * If both source and destination are in the filesystem, then at least one of
60487 + * them must be representable as a pure stream of bytes (which we call a flow,
60488 + * and define as a struct containing a key, a data pointer, and a length).
60489 + * This may mean converting one of them into a flow. We provide a generic
60490 + * cast_into_flow() method, which will work for any plugin supporting
60491 + * read_flow(), though it is inefficiently implemented in that it temporarily
60492 + * stores the flow in a buffer (Question: what to do with huge flows that
60493 + * cannot fit into memory? Answer: we must not convert them all at once.)
60494 + *
60495 + * Performing a write requires resolving the write request into a flow defining
60496 + * the source, and a method that performs the write, and a key that defines
60497 + * where in the tree the write is to go.
60498 + *
60499 + * Performing a read requires resolving the read request into a flow defining
60500 + * the target, and a method that performs the read, and a key that defines
60501 + * where in the tree the read is to come from.
60502 + *
60503 + * There will exist file plugins which have no pluginid stored on the disk for
60504 + * them, and which are only invoked by other plugins.
60505 + */
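+/*
+ * For reference, the flow mentioned above is roughly (the authoritative
+ * definition lives elsewhere in the tree; the field names here mirror the
+ * ->flow_by_inode() parameters below and are partly an assumption):
+ *
+ *	typedef struct flow {
+ *		reiser4_key key;	where in the tree the bytes live
+ *		loff_t length;		bytes remaining in this flow
+ *		char __user *data;	source or destination buffer
+ *		int user;		set when @data is a user-space address
+ *		rw_op op;		READ_OP or WRITE_OP
+ *	} flow_t;
+ */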
60506 +
60507 +/* This should be incremented with each new contributed
60508 + pair (plugin type, plugin id).
60509 + NOTE: Make sure there is a release of reiser4progs
60510 + with the corresponding version number */
60511 +#define PLUGIN_LIBRARY_VERSION 0
60512 +
60513 + /* enumeration of fields within plugin_set */
60514 +typedef enum {
60515 + PSET_FILE,
60516 + PSET_DIR, /* PSET_FILE and PSET_DIR should be first elements:
60517 + * inode.c:read_inode() depends on this. */
60518 + PSET_PERM,
60519 + PSET_FORMATTING,
60520 + PSET_HASH,
60521 + PSET_FIBRATION,
60522 + PSET_SD,
60523 + PSET_DIR_ITEM,
60524 + PSET_CIPHER,
60525 + PSET_DIGEST,
60526 + PSET_COMPRESSION,
60527 + PSET_COMPRESSION_MODE,
60528 + PSET_CLUSTER,
60529 + PSET_CREATE,
60530 + PSET_LAST
60531 +} pset_member;
60532 +
60533 +/* builtin file-plugins */
60534 +typedef enum {
60535 + /* regular file */
60536 + UNIX_FILE_PLUGIN_ID,
60537 + /* directory */
60538 + DIRECTORY_FILE_PLUGIN_ID,
60539 + /* symlink */
60540 + SYMLINK_FILE_PLUGIN_ID,
60541 + /* for objects completely handled by the VFS: fifos, devices,
60542 + sockets */
60543 + SPECIAL_FILE_PLUGIN_ID,
60544 + /* regular cryptcompress file */
60545 + CRYPTCOMPRESS_FILE_PLUGIN_ID,
60546 + /* number of file plugins. Used as size of arrays to hold
60547 + file plugins. */
60548 + LAST_FILE_PLUGIN_ID
60549 +} reiser4_file_id;
60550 +
60551 +typedef struct file_plugin {
60552 +
60553 + /* generic fields */
60554 + plugin_header h;
60555 +
60556 + struct inode_operations inode_ops;
60557 + struct file_operations file_ops;
60558 + struct address_space_operations as_ops;
60559 +
60560 + /* save inode cached stat-data onto disk. It was called
60561 + reiserfs_update_sd() in 3.x */
60562 + int (*write_sd_by_inode) (struct inode *);
60563 +
60564 + /*
60565 + * private methods: These are optional. If used, they allow you to
60566 + * minimize the amount of code needed to implement a deviation from
60567 + * some other method that also uses them.
60568 + */
60569 +
60570 + /*
60571 + * Construct flow into @flow according to user-supplied data.
60572 + *
60573 + * This is used by the read/write methods to construct a flow to
60574 + * write/read. ->flow_by_inode() is a plugin method, rather than a single
60575 + * global implementation, because the key in a flow used by a plugin may
60576 + * depend on the data in @buf.
60577 + *
60578 + * NIKITA-FIXME-HANS: please create statistics on what functions are
60579 + * dereferenced how often for the mongo benchmark. You can supervise
60580 + * Elena doing this for you if that helps. Email me the list of the
60581 + * top 10, with their counts, and an estimate of the total number of
60582 + * CPU cycles spent dereferencing as a percentage of CPU cycles spent
60583 + * processing (non-idle processing). If the total percent is, say,
60584 + * less than 1%, it will make our coding discussions much easier, and
60585 + * keep me from questioning whether functions like the below are too
60586 + * frequently called to be dereferenced. If the total percent is more
60587 + * than 1%, perhaps private methods should be listed in a "required"
60588 + * comment at the top of each plugin (with stern language about how if
60589 + * the comment is missing it will not be accepted by the maintainer),
60590 + * and implemented using macros not dereferenced functions. How about
60591 + * replacing this whole private methods part of the struct with a
60592 + * thorough documentation of what the standard helper functions are for
60593 + * use in constructing plugins? I think users have been asking for
60594 + * that, though not in so many words.
60595 + */
60596 + int (*flow_by_inode) (struct inode *, const char __user *buf,
60597 + int user, loff_t size,
60598 + loff_t off, rw_op op, flow_t *);
60599 +
60600 + /*
60601 + * Return the key used to retrieve an offset of a file. It is used by
60602 + * default implementation of ->flow_by_inode() method
60603 + * (common_build_flow()) and, among other things, to get to the extent
60604 + * from the jnode of an unformatted node.
60605 + */
60606 + int (*key_by_inode) (struct inode *, loff_t off, reiser4_key *);
60607 +
60608 + /* NIKITA-FIXME-HANS: this comment is not as clear to others as you think.... */
60609 + /*
60610 + * set the plugin for a file. Called during file creation in creat()
60611 + * but not reiser4() unless an inode already exists for the file.
60612 + */
60613 + int (*set_plug_in_inode) (struct inode *inode, struct inode *parent,
60614 + reiser4_object_create_data *);
60615 +
60616 + /* NIKITA-FIXME-HANS: comment and name seem to say different things,
60617 + * are you setting up the object itself also or just adjusting the
60618 + * parent?.... */
60619 + /* set up plugins for new @object created in @parent. @root is root
60620 + directory. */
60621 + int (*adjust_to_parent) (struct inode *object, struct inode *parent,
60622 + struct inode *root);
60623 + /*
60624 + * this does whatever is necessary when an object is created. For
60625 + * instance, for unix files the stat data is inserted. It is supposed to
60626 + * be called by the ->create() method of struct inode_operations.
60627 + */
60628 + int (*create_object) (struct inode *object, struct inode *parent,
60629 + reiser4_object_create_data *);
60630 +
60631 + /* this does whatever is necessary when an object is opened */
60632 + int (*open_object) (struct inode * inode, struct file * file);
60633 + /*
60634 + * this method should check REISER4_NO_SD and set REISER4_NO_SD on
60635 + * success. Deletion of an object usually includes removal of items
60636 + * building file body (for directories this is removal of "." and "..")
60637 + * and removal of stat-data item.
60638 + */
60639 + int (*delete_object) (struct inode *);
60640 +
60641 + /* add link from @parent to @object */
60642 + int (*add_link) (struct inode *object, struct inode *parent);
60643 +
60644 + /* remove link from @parent to @object */
60645 + int (*rem_link) (struct inode *object, struct inode *parent);
60646 +
60647 + /*
60648 +	 * return true if the item addressed by @coord belongs to @inode. This
60649 +	 * is used by read/write to properly slice a flow into items in the
60650 +	 * presence of multiple key assignment policies, because the items of
60651 +	 * a file are not necessarily contiguous in key space, for example, in
60652 +	 * a plan-b.
60652 + */
60653 + int (*owns_item) (const struct inode *, const coord_t *);
60654 +
60655 +	/* checks whether yet another hard link to this object can be
60656 +	   added */
60657 + int (*can_add_link) (const struct inode *);
60658 +
60659 + /* checks whether hard links to this object can be removed */
60660 + int (*can_rem_link) (const struct inode *);
60661 +
60662 +	/* currently non-NULL only for DIRECTORY_FILE_PLUGIN_ID. It calls
60663 +	   detach of the directory plugin to remove ".." */
60664 + int (*detach) (struct inode * child, struct inode * parent);
60665 +
60666 +	/* called when @child has just been looked up in @parent. It is
60667 +	   non-NULL only for DIRECTORY_FILE_PLUGIN_ID, where it calls attach
60668 +	   of the directory plugin */
60669 + int (*bind) (struct inode * child, struct inode * parent);
60670 +
60671 + /* process safe-link during mount */
60672 + int (*safelink) (struct inode * object, reiser4_safe_link_t link,
60673 + __u64 value);
60674 +
60675 +	/* a set of estimate methods for file operations */
60676 + struct {
60677 + reiser4_block_nr(*create) (const struct inode *);
60678 + reiser4_block_nr(*update) (const struct inode *);
60679 + reiser4_block_nr(*unlink) (const struct inode *,
60680 + const struct inode *);
60681 + } estimate;
60682 +
60683 + /*
60684 +	 * the reiser4-specific part of the inode has a union of structures
60685 +	 * which are specific to a plugin. This method is called when an inode
60686 +	 * is read (read_inode) and when a file is created (common_create_child)
60687 +	 * so that the file plugin can initialize its inode data
60688 + */
60689 + void (*init_inode_data) (struct inode *, reiser4_object_create_data *,
60690 + int);
60691 +
60692 + /*
60693 + * This method performs progressive deletion of items and whole nodes
60694 + * from right to left.
60695 + *
60696 + * @tap: the point deletion process begins from,
60697 + * @from_key: the beginning of the deleted key range,
60698 + * @to_key: the end of the deleted key range,
60699 + * @smallest_removed: the smallest removed key,
60700 + *
60701 +	 * @return: 0 on success, error code otherwise; -E_REPEAT means that
60702 +	 * the long cut_tree operation was interrupted to allow an atom commit.
60703 + */
60704 + int (*cut_tree_worker) (tap_t *, const reiser4_key * from_key,
60705 + const reiser4_key * to_key,
60706 + reiser4_key * smallest_removed, struct inode *,
60707 + int, int *);
60708 +
60709 + /* called from ->destroy_inode() */
60710 + void (*destroy_inode) (struct inode *);
60711 +
60712 + /*
60713 +	 * methods to serialize object identity. This is used, for example, by
60714 + * reiser4_{en,de}code_fh().
60715 + */
60716 + struct {
60717 + /* store object's identity at @area */
60718 + char *(*write) (struct inode * inode, char *area);
60719 + /* parse object from wire to the @obj */
60720 + char *(*read) (char *area, reiser4_object_on_wire * obj);
60721 + /* given object identity in @obj, find or create its dentry */
60722 + struct dentry *(*get) (struct super_block * s,
60723 + reiser4_object_on_wire * obj);
60724 + /* how many bytes ->wire.write() consumes */
60725 + int (*size) (struct inode * inode);
60726 +		/* finish with object identity */
60727 + void (*done) (reiser4_object_on_wire * obj);
60728 + } wire;
60729 +} file_plugin;
60730 +
60731 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
60732 +
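A hedged usage sketch (the helper below is hypothetical; inode_file_plugin() is assumed to be the accessor declared in inode.h, and the error code is illustrative): callers dispatch through the method table above rather than calling concrete implementations directly.

	/* hypothetical caller: drop a link through the file plugin's methods */
	static int demo_rem_link(struct inode *object, struct inode *parent)
	{
		file_plugin *fplug = inode_file_plugin(object);

		/* optional capability check before removing the link */
		if (fplug->can_rem_link != NULL && !fplug->can_rem_link(object))
			return -EBUSY;	/* illustrative error code */
		return fplug->rem_link(object, parent);
	}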
60733 +struct reiser4_object_on_wire {
60734 + file_plugin *plugin;
60735 + union {
60736 + struct {
60737 + obj_key_id key_id;
60738 + } std;
60739 + void *generic;
60740 + } u;
60741 +};
60742 +
60743 +/* builtin dir-plugins */
60744 +typedef enum {
60745 + HASHED_DIR_PLUGIN_ID,
60746 + SEEKABLE_HASHED_DIR_PLUGIN_ID,
60747 + LAST_DIR_ID
60748 +} reiser4_dir_id;
60749 +
60750 +typedef struct dir_plugin {
60751 + /* generic fields */
60752 + plugin_header h;
60753 +
60754 + struct inode_operations inode_ops;
60755 + struct file_operations file_ops;
60756 + struct address_space_operations as_ops;
60757 +
60758 + /*
60759 + * private methods: These are optional. If used they will allow you to
60760 + * minimize the amount of code needed to implement a deviation from
60761 + * some other method that uses them. You could logically argue that
60762 + * they should be a separate type of plugin.
60763 + */
60764 +
60765 + struct dentry *(*get_parent) (struct inode * childdir);
60766 +
60767 + /*
60768 +	 * check whether "name" is an acceptable name to be inserted into this
60769 +	 * object. Optionally implemented by directory-like objects. Can check
60770 +	 * for maximal length, reserved symbols, etc.
60771 + */
60772 + int (*is_name_acceptable) (const struct inode * inode, const char *name,
60773 + int len);
60774 +
60775 + void (*build_entry_key) (const struct inode * dir /* directory where
60776 + * entry is (or will
60777 + * be) in.*/ ,
60778 + const struct qstr * name /* name of file
60779 + * referenced by this
60780 + * entry */ ,
60781 + reiser4_key * result /* resulting key of
60782 + * directory entry */ );
60783 + int (*build_readdir_key) (struct file * dir, reiser4_key * result);
60784 + int (*add_entry) (struct inode * object, struct dentry * where,
60785 + reiser4_object_create_data * data,
60786 + reiser4_dir_entry_desc * entry);
60787 + int (*rem_entry) (struct inode * object, struct dentry * where,
60788 + reiser4_dir_entry_desc * entry);
60789 +
60790 + /*
60791 + * initialize directory structure for newly created object. For normal
60792 + * unix directories, insert dot and dotdot.
60793 + */
60794 + int (*init) (struct inode * object, struct inode * parent,
60795 + reiser4_object_create_data * data);
60796 +
60797 + /* destroy directory */
60798 + int (*done) (struct inode * child);
60799 +
60800 + /* called when @subdir was just looked up in the @dir */
60801 + int (*attach) (struct inode * subdir, struct inode * dir);
60802 + int (*detach) (struct inode * subdir, struct inode * dir);
60803 +
60804 + struct {
60805 + reiser4_block_nr(*add_entry) (const struct inode *);
60806 + reiser4_block_nr(*rem_entry) (const struct inode *);
60807 + reiser4_block_nr(*unlink) (const struct inode *,
60808 + const struct inode *);
60809 + } estimate;
60810 +} dir_plugin;
60811 +
60812 +extern dir_plugin dir_plugins[LAST_DIR_ID];
60813 +
60814 +typedef struct formatting_plugin {
60815 + /* generic fields */
60816 + plugin_header h;
60817 + /* returns non-zero iff file's tail has to be stored
60818 + in a direct item. */
60819 + int (*have_tail) (const struct inode * inode, loff_t size);
60820 +} formatting_plugin;
60821 +
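For orientation, a hedged sketch of a tail policy (hypothetical implementation; the real policies live in plugin/tail.c and are exported later as formatting_plugins[], and the threshold below is purely illustrative):

	/* hypothetical ->have_tail() policy: store the file body as a
	 * direct item (tail) only for small files */
	static int demo_have_tail(const struct inode *inode, loff_t size)
	{
		return size <= 4096;	/* illustrative threshold */
	}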
60822 +typedef struct hash_plugin {
60823 + /* generic fields */
60824 + plugin_header h;
60825 + /* computes hash of the given name */
60826 + __u64(*hash) (const unsigned char *name, int len);
60827 +} hash_plugin;
60828 +
60829 +typedef struct cipher_plugin {
60830 + /* generic fields */
60831 + plugin_header h;
60832 + struct crypto_blkcipher * (*alloc) (void);
60833 + void (*free) (struct crypto_blkcipher * tfm);
60834 + /* Offset translator. For each offset this returns (k * offset), where
60835 + k (k >= 1) is an expansion factor of the cipher algorithm.
60836 + For all symmetric algorithms k == 1. For asymmetric algorithms (which
60837 +	   inflate data) offset translation guarantees that all of a disk
60838 +	   cluster's units will have keys smaller than the next cluster's.
60839 + */
60840 + loff_t(*scale) (struct inode * inode, size_t blocksize, loff_t src);
60841 +	/* Cipher algorithms accept data only in chunks of the cipher block
60842 +	   size. This method aligns a flow up to the cipher block size before
60843 +	   it is passed to the cipher algorithm. To align means to append
60844 +	   padding in a format specific to the cipher algorithm */
60845 + int (*align_stream) (__u8 * tail, int clust_size, int blocksize);
60846 + /* low-level key manager (check, install, etc..) */
60847 + int (*setkey) (struct crypto_tfm * tfm, const __u8 * key,
60848 + unsigned int keylen);
60849 + /* main text processing procedures */
60850 + void (*encrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60851 + void (*decrypt) (__u32 * expkey, __u8 * dst, const __u8 * src);
60852 +} cipher_plugin;
60853 +
60854 +typedef struct digest_plugin {
60855 + /* generic fields */
60856 + plugin_header h;
60857 + /* fingerprint size in bytes */
60858 + int fipsize;
60859 + struct crypto_hash * (*alloc) (void);
60860 + void (*free) (struct crypto_hash * tfm);
60861 +} digest_plugin;
60862 +
60863 +typedef struct compression_plugin {
60864 + /* generic fields */
60865 + plugin_header h;
60866 + int (*init) (void);
60867 +	/* the maximum number of bytes by which the size of the "compressed"
60868 +	 * data can exceed the size of the uncompressed data. */
60869 + int (*overrun) (unsigned src_len);
60870 + coa_t(*alloc) (tfm_action act);
60871 + void (*free) (coa_t coa, tfm_action act);
60872 + /* minimal size of the flow we still try to compress */
60873 + int (*min_size_deflate) (void);
60874 + __u32(*checksum) (char *data, __u32 length);
60875 + /* main transform procedures */
60876 + void (*compress) (coa_t coa, __u8 * src_first, unsigned src_len,
60877 + __u8 * dst_first, unsigned *dst_len);
60878 + void (*decompress) (coa_t coa, __u8 * src_first, unsigned src_len,
60879 + __u8 * dst_first, unsigned *dst_len);
60880 +} compression_plugin;
60881 +
60882 +typedef struct compression_mode_plugin {
60883 + /* generic fields */
60884 + plugin_header h;
60885 + /* this is called when estimating compressibility
60886 + of a logical cluster by its content */
60887 + int (*should_deflate) (struct inode * inode, cloff_t index);
60888 + /* this is called when results of compression should be saved */
60889 + int (*accept_hook) (struct inode * inode, cloff_t index);
60890 + /* this is called when results of compression should be discarded */
60891 + int (*discard_hook) (struct inode * inode, cloff_t index);
60892 +} compression_mode_plugin;
60893 +
60894 +typedef struct cluster_plugin {
60895 + /* generic fields */
60896 + plugin_header h;
60897 + int shift;
60898 +} cluster_plugin;
60899 +
60900 +typedef struct sd_ext_plugin {
60901 + /* generic fields */
60902 + plugin_header h;
60903 + int (*present) (struct inode * inode, char **area, int *len);
60904 + int (*absent) (struct inode * inode);
60905 + int (*save_len) (struct inode * inode);
60906 + int (*save) (struct inode * inode, char **area);
60907 + /* alignment requirement for this stat-data part */
60908 + int alignment;
60909 +} sd_ext_plugin;
60910 +
60911 +/* this plugin contains methods to allocate an objectid for a newly created
60912 +   file, to deallocate the objectid when the file is removed, and to report
60913 +   the numbers of used and free objectids */
60914 +typedef struct oid_allocator_plugin {
60915 + /* generic fields */
60916 + plugin_header h;
60917 + int (*init_oid_allocator) (reiser4_oid_allocator * map, __u64 nr_files,
60918 + __u64 oids);
60919 + /* used to report statfs->f_files */
60920 + __u64(*oids_used) (reiser4_oid_allocator * map);
60921 + /* get next oid to use */
60922 + __u64(*next_oid) (reiser4_oid_allocator * map);
60923 + /* used to report statfs->f_ffree */
60924 + __u64(*oids_free) (reiser4_oid_allocator * map);
60925 + /* allocate new objectid */
60926 + int (*allocate_oid) (reiser4_oid_allocator * map, oid_t *);
60927 + /* release objectid */
60928 + int (*release_oid) (reiser4_oid_allocator * map, oid_t);
60929 + /* how many pages to reserve in transaction for allocation of new
60930 + objectid */
60931 + int (*oid_reserve_allocate) (reiser4_oid_allocator * map);
60932 + /* how many pages to reserve in transaction for freeing of an
60933 + objectid */
60934 + int (*oid_reserve_release) (reiser4_oid_allocator * map);
60935 + void (*print_info) (const char *, reiser4_oid_allocator *);
60936 +} oid_allocator_plugin;
60937 +
60938 +/* disk layout plugin: this specifies super block, journal, bitmap (if there
60939 + are any) locations, etc */
60940 +typedef struct disk_format_plugin {
60941 + /* generic fields */
60942 + plugin_header h;
60943 + /* replay journal, initialize super_info_data, etc */
60944 + int (*init_format) (struct super_block *, void *data);
60945 +
60946 + /* key of root directory stat data */
60947 + const reiser4_key *(*root_dir_key) (const struct super_block *);
60948 +
60949 + int (*release) (struct super_block *);
60950 + jnode *(*log_super) (struct super_block *);
60951 + int (*check_open) (const struct inode * object);
60952 + int (*version_update) (struct super_block *);
60953 +} disk_format_plugin;
60954 +
60955 +struct jnode_plugin {
60956 + /* generic fields */
60957 + plugin_header h;
60958 + int (*init) (jnode * node);
60959 + int (*parse) (jnode * node);
60960 + struct address_space *(*mapping) (const jnode * node);
60961 + unsigned long (*index) (const jnode * node);
60962 + jnode *(*clone) (jnode * node);
60963 +};
60964 +
60965 +/*
60966 + * plugin instance.
60967 + *
60968 + * This is a "wrapper" union for all types of plugins. Most of the code uses
60969 + * plugins of a particular type (file_plugin, dir_plugin, etc.) rather than
60970 + * operating on pointers to reiser4_plugin. This union is only used in some
60971 + * generic code in plugin/plugin.c that operates on all plugins. Technically
60972 + * speaking, the purpose of this union is to add type safety to said generic
60973 + * code: each plugin type (file_plugin, for example) contains plugin_header
60974 + * as its first member, located at the same place in memory as the .h member
60975 + * of reiser4_plugin. Generic code obtains a pointer to reiser4_plugin and
60976 + * looks at .h, which is the header of the plugin type located in the
60977 + * union. This makes type-casts unnecessary. */
60978 +union reiser4_plugin {
60979 + /* generic fields */
60980 + plugin_header h;
60981 + /* file plugin */
60982 + file_plugin file;
60983 + /* directory plugin */
60984 + dir_plugin dir;
60985 + /* hash plugin, used by directory plugin */
60986 + hash_plugin hash;
60987 + /* fibration plugin used by directory plugin */
60988 + fibration_plugin fibration;
60989 + /* cipher transform plugin, used by file plugin */
60990 + cipher_plugin cipher;
60991 + /* digest transform plugin, used by file plugin */
60992 + digest_plugin digest;
60993 + /* compression transform plugin, used by file plugin */
60994 + compression_plugin compression;
60995 + /* tail plugin, used by file plugin */
60996 + formatting_plugin formatting;
60997 + /* permission plugin */
60998 + perm_plugin perm;
60999 + /* node plugin */
61000 + node_plugin node;
61001 + /* item plugin */
61002 + item_plugin item;
61003 + /* stat-data extension plugin */
61004 + sd_ext_plugin sd_ext;
61005 + /* disk layout plugin */
61006 + disk_format_plugin format;
61007 + /* object id allocator plugin */
61008 + oid_allocator_plugin oid_allocator;
61009 + /* plugin for different jnode types */
61010 + jnode_plugin jnode;
61011 + /* compression mode plugin, used by object plugin */
61012 + compression_mode_plugin compression_mode;
61013 + /* cluster plugin, used by object plugin */
61014 + cluster_plugin clust;
61015 + /* place-holder for new plugin types that can be registered
61016 + dynamically, and used by other dynamically loaded plugins. */
61017 + void *generic;
61018 +};
61019 +
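To illustrate the type-punning described above (a hedged sketch; the accessor below is hypothetical), generic code can inspect any plugin through .h because plugin_header is the first member of every arm of the union:

	/* hypothetical helper: no cast is needed, since &plugin->h aliases
	 * the plugin_header that starts every concrete plugin structure */
	static const char *demo_plugin_label(reiser4_plugin *plugin)
	{
		return plugin->h.label;
	}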
61020 +struct reiser4_plugin_ops {
61021 + /* called when plugin is initialized */
61022 + int (*init) (reiser4_plugin * plugin);
61023 + /* called when plugin is unloaded */
61024 + int (*done) (reiser4_plugin * plugin);
61025 + /* load given plugin from disk */
61026 + int (*load) (struct inode * inode,
61027 + reiser4_plugin * plugin, char **area, int *len);
61028 +	/* how much space is required to store this plugin's state
61029 +	   in stat-data */
61030 + int (*save_len) (struct inode * inode, reiser4_plugin * plugin);
61031 + /* save persistent plugin-data to disk */
61032 + int (*save) (struct inode * inode, reiser4_plugin * plugin,
61033 + char **area);
61034 + /* alignment requirement for on-disk state of this plugin
61035 + in number of bytes */
61036 + int alignment;
61037 + /* install itself into given inode. This can return error
61038 + (e.g., you cannot change hash of non-empty directory). */
61039 + int (*change) (struct inode * inode, reiser4_plugin * plugin,
61040 + pset_member memb);
61041 +	/* inherit plugin state from @parent into @inode. This can return
61042 +	   an error. */
61043 + int (*inherit) (struct inode * inode, struct inode * parent,
61044 + reiser4_plugin * plugin);
61045 +};
61046 +
61047 +/* functions implemented in fs/reiser4/plugin/plugin.c */
61048 +
61049 +/* stores plugin reference in reiser4-specific part of inode */
61050 +extern int set_object_plugin(struct inode *inode, reiser4_plugin_id id);
61051 +extern int setup_plugins(struct super_block *super, reiser4_plugin ** area);
61052 +extern int init_plugins(void);
61053 +
61054 +/* builtin plugins */
61055 +
61056 +/* builtin hash-plugins */
61057 +
61058 +typedef enum {
61059 + RUPASOV_HASH_ID,
61060 + R5_HASH_ID,
61061 + TEA_HASH_ID,
61062 + FNV1_HASH_ID,
61063 + DEGENERATE_HASH_ID,
61064 + LAST_HASH_ID
61065 +} reiser4_hash_id;
61066 +
61067 +/* builtin cipher plugins */
61068 +
61069 +typedef enum {
61070 + NONE_CIPHER_ID,
61071 + LAST_CIPHER_ID
61072 +} reiser4_cipher_id;
61073 +
61074 +/* builtin digest plugins */
61075 +
61076 +typedef enum {
61077 + SHA256_32_DIGEST_ID,
61078 + LAST_DIGEST_ID
61079 +} reiser4_digest_id;
61080 +
61081 +/* builtin compression mode plugins */
61082 +typedef enum {
61083 + NONE_COMPRESSION_MODE_ID,
61084 + LATTD_COMPRESSION_MODE_ID,
61085 + ULTIM_COMPRESSION_MODE_ID,
61086 + FORCE_COMPRESSION_MODE_ID,
61087 + CONVX_COMPRESSION_MODE_ID,
61088 + LAST_COMPRESSION_MODE_ID
61089 +} reiser4_compression_mode_id;
61090 +
61091 +/* builtin cluster plugins */
61092 +typedef enum {
61093 + CLUSTER_64K_ID,
61094 + CLUSTER_32K_ID,
61095 + CLUSTER_16K_ID,
61096 + CLUSTER_8K_ID,
61097 + CLUSTER_4K_ID,
61098 + LAST_CLUSTER_ID
61099 +} reiser4_cluster_id;
61100 +
61101 +/* builtin tail-plugins */
61102 +
61103 +typedef enum {
61104 + NEVER_TAILS_FORMATTING_ID,
61105 + ALWAYS_TAILS_FORMATTING_ID,
61106 + SMALL_FILE_FORMATTING_ID,
61107 + LAST_TAIL_FORMATTING_ID
61108 +} reiser4_formatting_id;
61109 +
61110 +/* compression/clustering specific data */
61111 +typedef struct compression_data {
61112 + reiser4_compression_id coa; /* id of the compression algorithm */
61113 +} compression_data_t;
61114 +
61115 +typedef __u8 cluster_data_t; /* cluster info */
61116 +
61117 +/* data type used to pack parameters that we pass to vfs object creation
61118 + function create_object() */
61119 +struct reiser4_object_create_data {
61120 + /* plugin to control created object */
61121 + reiser4_file_id id;
61122 + /* mode of regular file, directory or special file */
61123 +/* what happens if some other sort of perm plugin is in use? */
61124 + int mode;
61125 + /* rdev of special file */
61126 + dev_t rdev;
61127 + /* symlink target */
61128 + const char *name;
61129 + /* add here something for non-standard objects you invent, like
61130 + query for interpolation file etc. */
61131 +
61132 + crypto_stat_t * crypto;
61133 + compression_data_t *compression;
61134 + cluster_data_t *cluster;
61135 +
61136 + struct inode *parent;
61137 + struct dentry *dentry;
61138 +};
61139 +
61140 +/* description of directory entry being created/destroyed/sought for
61141 +
61142 + It is passed down to the directory plugin and farther to the
61143 +   It is passed down to the directory plugin and further to the
61144 +   directory item plugin methods. Creation of a new directory entry is done
61145 +   in several stages: first we search for an entry with the same name, then
61146 +   create the new one. reiser4_dir_entry_desc is used to store information
61147 +   collected at one stage of this process and required later: the key of the
61148 +   item that we want to insert/delete and a pointer to the object that will
61149 +   be bound by the new directory entry. Probably some more fields will
61150 +   be added here.
61151 +*/
61152 +struct reiser4_dir_entry_desc {
61153 + /* key of directory entry */
61154 + reiser4_key key;
61155 + /* object bound by this entry. */
61156 + struct inode *obj;
61157 +};
61158 +
61159 +#define MAX_PLUGIN_TYPE_LABEL_LEN 32
61160 +#define MAX_PLUGIN_PLUG_LABEL_LEN 32
61161 +
61162 +/* used for interface with user-land: table-driven parsing in
61163 + reiser4(). */
61164 +typedef struct plugin_locator {
61165 + reiser4_plugin_type type_id;
61166 + reiser4_plugin_id id;
61167 + char type_label[MAX_PLUGIN_TYPE_LABEL_LEN];
61168 + char plug_label[MAX_PLUGIN_PLUG_LABEL_LEN];
61169 +} plugin_locator;
61170 +
61171 +extern int locate_plugin(struct inode *inode, plugin_locator * loc);
61172 +
61173 +#define PLUGIN_BY_ID(TYPE,ID,FIELD) \
61174 +static inline TYPE *TYPE ## _by_id( reiser4_plugin_id id ) \
61175 +{ \
61176 + reiser4_plugin *plugin = plugin_by_id ( ID, id ); \
61177 + return plugin ? & plugin -> FIELD : NULL; \
61178 +} \
61179 +static inline TYPE *TYPE ## _by_disk_id( reiser4_tree *tree, d16 *id ) \
61180 +{ \
61181 + reiser4_plugin *plugin = plugin_by_disk_id ( tree, ID, id ); \
61182 + return plugin ? & plugin -> FIELD : NULL; \
61183 +} \
61184 +static inline TYPE *TYPE ## _by_unsafe_id( reiser4_plugin_id id ) \
61185 +{ \
61186 + reiser4_plugin *plugin = plugin_by_unsafe_id ( ID, id ); \
61187 + return plugin ? & plugin -> FIELD : NULL; \
61188 +} \
61189 +static inline reiser4_plugin* TYPE ## _to_plugin( TYPE* plugin ) \
61190 +{ \
61191 + return ( reiser4_plugin * ) plugin; \
61192 +} \
61193 +static inline reiser4_plugin_id TYPE ## _id( TYPE* plugin ) \
61194 +{ \
61195 + return TYPE ## _to_plugin (plugin) -> h.id; \
61196 +} \
61197 +typedef struct { int foo; } TYPE ## _plugin_dummy
61198 +
61199 +PLUGIN_BY_ID(item_plugin, REISER4_ITEM_PLUGIN_TYPE, item);
61200 +PLUGIN_BY_ID(file_plugin, REISER4_FILE_PLUGIN_TYPE, file);
61201 +PLUGIN_BY_ID(dir_plugin, REISER4_DIR_PLUGIN_TYPE, dir);
61202 +PLUGIN_BY_ID(node_plugin, REISER4_NODE_PLUGIN_TYPE, node);
61203 +PLUGIN_BY_ID(sd_ext_plugin, REISER4_SD_EXT_PLUGIN_TYPE, sd_ext);
61204 +PLUGIN_BY_ID(perm_plugin, REISER4_PERM_PLUGIN_TYPE, perm);
61205 +PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash);
61206 +PLUGIN_BY_ID(fibration_plugin, REISER4_FIBRATION_PLUGIN_TYPE, fibration);
61207 +PLUGIN_BY_ID(cipher_plugin, REISER4_CIPHER_PLUGIN_TYPE, cipher);
61208 +PLUGIN_BY_ID(digest_plugin, REISER4_DIGEST_PLUGIN_TYPE, digest);
61209 +PLUGIN_BY_ID(compression_plugin, REISER4_COMPRESSION_PLUGIN_TYPE, compression);
61210 +PLUGIN_BY_ID(formatting_plugin, REISER4_FORMATTING_PLUGIN_TYPE, formatting);
61211 +PLUGIN_BY_ID(disk_format_plugin, REISER4_FORMAT_PLUGIN_TYPE, format);
61212 +PLUGIN_BY_ID(jnode_plugin, REISER4_JNODE_PLUGIN_TYPE, jnode);
61213 +PLUGIN_BY_ID(compression_mode_plugin, REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
61214 + compression_mode);
61215 +PLUGIN_BY_ID(cluster_plugin, REISER4_CLUSTER_PLUGIN_TYPE, clust);
61216 +
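As a hedged example of what the macro above generates (the caller below is hypothetical; R5_HASH_ID and the generated hash_plugin_by_id() are declared in this header):

	/* hypothetical caller of the typed lookup helpers generated by
	 * PLUGIN_BY_ID(hash_plugin, REISER4_HASH_PLUGIN_TYPE, hash) */
	static __u64 demo_hash_name(const unsigned char *name, int len)
	{
		/* typed lookup, no casts; plugin_by_id() asserts validity */
		hash_plugin *hplug = hash_plugin_by_id(R5_HASH_ID);

		return hplug->hash(name, len);
	}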
61217 +extern int save_plugin_id(reiser4_plugin * plugin, d16 * area);
61218 +
61219 +extern struct list_head *get_plugin_list(reiser4_plugin_type type_id);
61220 +
61221 +#define for_all_plugins(ptype, plugin) \
61222 +for (plugin = list_entry(get_plugin_list(ptype)->next, reiser4_plugin, h.linkage); \
61223 + get_plugin_list(ptype) != &plugin->h.linkage; \
61224 + plugin = list_entry(plugin->h.linkage.next, reiser4_plugin, h.linkage))
61225 +
61226 +
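A hedged sketch of iterating with for_all_plugins() (the function below is hypothetical and the printk format is illustrative):

	/* hypothetical: walk every registered hash plugin, print its label */
	static void demo_list_hash_plugins(void)
	{
		reiser4_plugin *plugin;

		for_all_plugins(REISER4_HASH_PLUGIN_TYPE, plugin)
			printk(KERN_INFO "hash plugin: %s\n", plugin->h.label);
	}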
61227 +extern int grab_plugin_pset(struct inode *self, struct inode *ancestor, pset_member memb);
61228 +extern int force_plugin_pset(struct inode *self, pset_member memb, reiser4_plugin *plug);
61229 +extern int finish_pset(struct inode *inode);
61230 +
61231 +/* defined in fs/reiser4/plugin/object.c */
61232 +extern file_plugin file_plugins[LAST_FILE_PLUGIN_ID];
61233 +/* defined in fs/reiser4/plugin/object.c */
61234 +extern dir_plugin dir_plugins[LAST_DIR_ID];
61235 +/* defined in fs/reiser4/plugin/item/static_stat.c */
61236 +extern sd_ext_plugin sd_ext_plugins[LAST_SD_EXTENSION];
61237 +/* defined in fs/reiser4/plugin/hash.c */
61238 +extern hash_plugin hash_plugins[LAST_HASH_ID];
61239 +/* defined in fs/reiser4/plugin/fibration.c */
61240 +extern fibration_plugin fibration_plugins[LAST_FIBRATION_ID];
61241 +/* defined in fs/reiser4/plugin/crypt.c */
61242 +extern cipher_plugin cipher_plugins[LAST_CIPHER_ID];
61243 +/* defined in fs/reiser4/plugin/digest.c */
61244 +extern digest_plugin digest_plugins[LAST_DIGEST_ID];
61245 +/* defined in fs/reiser4/plugin/compress/compress.c */
61246 +extern compression_plugin compression_plugins[LAST_COMPRESSION_ID];
61247 +/* defined in fs/reiser4/plugin/compress/compression_mode.c */
61248 +extern compression_mode_plugin
61249 +compression_mode_plugins[LAST_COMPRESSION_MODE_ID];
61250 +/* defined in fs/reiser4/plugin/cluster.c */
61251 +extern cluster_plugin cluster_plugins[LAST_CLUSTER_ID];
61252 +/* defined in fs/reiser4/plugin/tail.c */
61253 +extern formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID];
61254 +/* defined in fs/reiser4/plugin/security/security.c */
61255 +extern perm_plugin perm_plugins[LAST_PERM_ID];
61256 +/* defined in fs/reiser4/plugin/item/item.c */
61257 +extern item_plugin item_plugins[LAST_ITEM_ID];
61258 +/* defined in fs/reiser4/plugin/node/node.c */
61259 +extern node_plugin node_plugins[LAST_NODE_ID];
61260 +/* defined in fs/reiser4/plugin/disk_format/disk_format.c */
61261 +extern disk_format_plugin format_plugins[LAST_FORMAT_ID];
61262 +
61263 +/* __FS_REISER4_PLUGIN_TYPES_H__ */
61264 +#endif
61265 +
61266 +/* Make Linus happy.
61267 + Local variables:
61268 + c-indentation-style: "K&R"
61269 + mode-name: "LC"
61270 + c-basic-offset: 8
61271 + tab-width: 8
61272 + fill-column: 120
61273 + End:
61274 +*/
61275 diff --git a/fs/reiser4/plugin/plugin_header.h b/fs/reiser4/plugin/plugin_header.h
61276 new file mode 100644
61277 index 0000000..68cf5b0
61278 --- /dev/null
61279 +++ b/fs/reiser4/plugin/plugin_header.h
61280 @@ -0,0 +1,144 @@
61281 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61282 +
61283 +/* plugin header. Data structures required by all plugin types. */
61284 +
61285 +#if !defined( __PLUGIN_HEADER_H__ )
61286 +#define __PLUGIN_HEADER_H__
61287 +
61288 +/* plugin data-types and constants */
61289 +
61290 +#include "../debug.h"
61291 +#include "../dformat.h"
61292 +
61293 +typedef enum {
61294 + REISER4_FILE_PLUGIN_TYPE,
61295 + REISER4_DIR_PLUGIN_TYPE,
61296 + REISER4_ITEM_PLUGIN_TYPE,
61297 + REISER4_NODE_PLUGIN_TYPE,
61298 + REISER4_HASH_PLUGIN_TYPE,
61299 + REISER4_FIBRATION_PLUGIN_TYPE,
61300 + REISER4_FORMATTING_PLUGIN_TYPE,
61301 + REISER4_PERM_PLUGIN_TYPE,
61302 + REISER4_SD_EXT_PLUGIN_TYPE,
61303 + REISER4_FORMAT_PLUGIN_TYPE,
61304 + REISER4_JNODE_PLUGIN_TYPE,
61305 + REISER4_CIPHER_PLUGIN_TYPE,
61306 + REISER4_DIGEST_PLUGIN_TYPE,
61307 + REISER4_COMPRESSION_PLUGIN_TYPE,
61308 + REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
61309 + REISER4_CLUSTER_PLUGIN_TYPE,
61310 + REISER4_PLUGIN_TYPES
61311 +} reiser4_plugin_type;
61312 +
61313 +typedef enum {
61314 + REISER4_DIRECTORY_FILE,
61315 + REISER4_REGULAR_FILE,
61316 + REISER4_SYMLINK_FILE,
61317 + REISER4_SPECIAL_FILE,
61318 +} reiser4_plugin_group;
61319 +
61320 +struct reiser4_plugin_ops;
61321 +/* generic plugin operations, supported by each
61322 + plugin type. */
61323 +typedef struct reiser4_plugin_ops reiser4_plugin_ops;
61324 +
61325 +/* the common part of all plugin instances. */
61326 +typedef struct plugin_header {
61327 + /* plugin type */
61328 + reiser4_plugin_type type_id;
61329 + /* id of this plugin */
61330 + reiser4_plugin_id id;
61331 + /* bitmask of groups the plugin belongs to. */
61332 + reiser4_plugin_groups groups;
61333 + /* plugin operations */
61334 + reiser4_plugin_ops *pops;
61335 +/* NIKITA-FIXME-HANS: usage of and access to label and desc is not commented and defined. */
61336 + /* short label of this plugin */
61337 + const char *label;
61338 +	/* descriptive string */
61339 + const char *desc;
61340 + /* list linkage */
61341 + struct list_head linkage;
61342 +} plugin_header;
61343 +
61344 +#define plugin_of_group(plug, group) (plug->h.groups & (1 << group))
61345 +
61346 +/* PRIVATE INTERFACES */
61347 +/* NIKITA-FIXME-HANS: what is this for and why does it duplicate what is in plugin_header? */
61348 +/* plugin type representation. */
61349 +typedef struct reiser4_plugin_type_data {
61350 + /* internal plugin type identifier. Should coincide with
61351 + index of this item in plugins[] array. */
61352 + reiser4_plugin_type type_id;
61353 + /* short symbolic label of this plugin type. Should be no longer
61354 + than MAX_PLUGIN_TYPE_LABEL_LEN characters including '\0'. */
61355 + const char *label;
61356 + /* plugin type description longer than .label */
61357 + const char *desc;
61358 +
61359 +/* NIKITA-FIXME-HANS: define built-in */
61360 + /* number of built-in plugin instances of this type */
61361 + int builtin_num;
61362 + /* array of built-in plugins */
61363 + void *builtin;
61364 + struct list_head plugins_list;
61365 + size_t size;
61366 +} reiser4_plugin_type_data;
61367 +
61368 +extern reiser4_plugin_type_data plugins[REISER4_PLUGIN_TYPES];
61369 +
61370 +int is_plugin_type_valid(reiser4_plugin_type type);
61371 +int is_plugin_id_valid(reiser4_plugin_type type, reiser4_plugin_id id);
61372 +
61373 +static inline reiser4_plugin *plugin_at(reiser4_plugin_type_data * ptype, int i)
61374 +{
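	/* .builtin points to an array of concrete plugin structures, each
	 * ptype->size bytes long, so index @i strides by size rather than
	 * by sizeof(reiser4_plugin) */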
61375 + char *builtin;
61376 +
61377 + builtin = ptype->builtin;
61378 + return (reiser4_plugin *) (builtin + i * ptype->size);
61379 +}
61380 +
61381 +/* return plugin by its @type_id and @id */
61382 +static inline reiser4_plugin *plugin_by_id(reiser4_plugin_type type,
61383 + reiser4_plugin_id id)
61384 +{
61385 + assert("nikita-1651", is_plugin_type_valid(type));
61386 + assert("nikita-1652", is_plugin_id_valid(type, id));
61387 + return plugin_at(&plugins[type], id);
61388 +}
61389 +
61390 +extern reiser4_plugin *plugin_by_unsafe_id(reiser4_plugin_type type_id,
61391 + reiser4_plugin_id id);
61392 +
61393 +/**
61394 + * plugin_by_disk_id - get reiser4_plugin
61395 + * @type_id: plugin type id
61396 + * @plugin_id: plugin id in disk format
61397 + *
61398 + * Returns reiser4_plugin by plugin type id and plugin id.
61399 + */
61400 +static inline reiser4_plugin *plugin_by_disk_id(reiser4_tree * tree UNUSED_ARG,
61401 + reiser4_plugin_type type_id,
61402 + __le16 *plugin_id)
61403 +{
61404 + /*
61405 + * what we should do properly is to maintain within each file-system a
61406 + * dictionary that maps on-disk plugin ids to "universal" ids. This
61407 + * dictionary will be resolved on mount time, so that this function
61408 + * will perform just one additional array lookup.
61409 + */
61410 + return plugin_by_unsafe_id(type_id, le16_to_cpu(*plugin_id));
61411 +}
61412 +
61413 +/* __PLUGIN_HEADER_H__ */
61414 +#endif
61415 +
61416 +/*
61417 + * Local variables:
61418 + * c-indentation-style: "K&R"
61419 + * mode-name: "LC"
61420 + * c-basic-offset: 8
61421 + * tab-width: 8
61422 + * fill-column: 79
61423 + * End:
61424 + */
61425 diff --git a/fs/reiser4/plugin/plugin_set.c b/fs/reiser4/plugin/plugin_set.c
61426 new file mode 100644
61427 index 0000000..528632d
61428 --- /dev/null
61429 +++ b/fs/reiser4/plugin/plugin_set.c
61430 @@ -0,0 +1,379 @@
61431 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
61432 + * reiser4/README */
61433 +/* This file contains Reiser4 plugin set operations */
61434 +
61435 +/* plugin sets
61436 + *
61437 + * Each file in reiser4 is controlled by a whole set of plugins (file plugin,
61438 + * directory plugin, hash plugin, tail policy plugin, security plugin, etc.)
61439 + * assigned (inherited, deduced from mode bits, etc.) at creation time. This
61440 + * set of plugins (so called pset) is described by structure plugin_set (see
61441 + * plugin/plugin_set.h), which contains pointers to all required plugins.
61442 + *
61443 + * Children can inherit some pset members from their parent; however, it is
61444 + * sometimes useful to specify members different from the parent's. Since an
61445 + * object's pset cannot be easily changed without fatal consequences, we use
61446 + * for this purpose another special plugin table (the so-called hset, or heir
61447 + * set), described by the same structure.
61448 + *
61449 + * An inode only stores pointers to its pset and hset. Different inodes with
61450 + * the same set of pset (hset) members point to the same pset (hset). This is
61451 + * achieved by storing psets and hsets in a global hash table. Races are
61452 + * avoided by the simple (and so far efficient) solution of never recycling
61453 + * psets, even when the last inode pointing to one is destroyed.
61454 + */
61455 +
61456 +#include "../debug.h"
61457 +#include "../super.h"
61458 +#include "plugin_set.h"
61459 +
61460 +#include <linux/slab.h>
61461 +#include <linux/stddef.h>
61462 +
61463 +/* slab for plugin sets */
61464 +static struct kmem_cache *plugin_set_slab;
61465 +
61466 +static spinlock_t plugin_set_lock[8] __cacheline_aligned_in_smp = {
61467 + [0 ... 7] = SPIN_LOCK_UNLOCKED
61468 +};
61469 +
61470 +/* hash table support */
61471 +
61472 +#define PS_TABLE_SIZE (32)
61473 +
61474 +static inline plugin_set *cast_to(const unsigned long *a)
61475 +{
61476 + return container_of(a, plugin_set, hashval);
61477 +}
61478 +
61479 +static inline int pseq(const unsigned long *a1, const unsigned long *a2)
61480 +{
61481 + plugin_set *set1;
61482 + plugin_set *set2;
61483 +
61484 + /* make sure fields are not missed in the code below */
61485 + cassert(sizeof *set1 ==
61486 + sizeof set1->hashval +
61487 + sizeof set1->link +
61488 + sizeof set1->file +
61489 + sizeof set1->dir +
61490 + sizeof set1->perm +
61491 + sizeof set1->formatting +
61492 + sizeof set1->hash +
61493 + sizeof set1->fibration +
61494 + sizeof set1->sd +
61495 + sizeof set1->dir_item +
61496 + sizeof set1->cipher +
61497 + sizeof set1->digest +
61498 + sizeof set1->compression +
61499 + sizeof set1->compression_mode +
61500 + sizeof set1->cluster +
61501 + sizeof set1->create);
61502 +
61503 + set1 = cast_to(a1);
61504 + set2 = cast_to(a2);
61505 + return
61506 + set1->hashval == set2->hashval &&
61507 + set1->file == set2->file &&
61508 + set1->dir == set2->dir &&
61509 + set1->perm == set2->perm &&
61510 + set1->formatting == set2->formatting &&
61511 + set1->hash == set2->hash &&
61512 + set1->fibration == set2->fibration &&
61513 + set1->sd == set2->sd &&
61514 + set1->dir_item == set2->dir_item &&
61515 + set1->cipher == set2->cipher &&
61516 + set1->digest == set2->digest &&
61517 + set1->compression == set2->compression &&
61518 + set1->compression_mode == set2->compression_mode &&
61519 + set1->cluster == set2->cluster &&
61520 + set1->create == set2->create;
61521 +}
61522 +
61523 +#define HASH_FIELD(hash, set, field) \
61524 +({ \
61525 + (hash) += (unsigned long)(set)->field >> 2; \
61526 +})
61527 +
61528 +static inline unsigned long calculate_hash(const plugin_set * set)
61529 +{
61530 + unsigned long result;
61531 +
61532 + result = 0;
61533 + HASH_FIELD(result, set, file);
61534 + HASH_FIELD(result, set, dir);
61535 + HASH_FIELD(result, set, perm);
61536 + HASH_FIELD(result, set, formatting);
61537 + HASH_FIELD(result, set, hash);
61538 + HASH_FIELD(result, set, fibration);
61539 + HASH_FIELD(result, set, sd);
61540 + HASH_FIELD(result, set, dir_item);
61541 + HASH_FIELD(result, set, cipher);
61542 + HASH_FIELD(result, set, digest);
61543 + HASH_FIELD(result, set, compression);
61544 + HASH_FIELD(result, set, compression_mode);
61545 + HASH_FIELD(result, set, cluster);
61546 + HASH_FIELD(result, set, create);
61547 + return result & (PS_TABLE_SIZE - 1);
61548 +}
61549 +
61550 +static inline unsigned long
61551 +pshash(ps_hash_table * table, const unsigned long *a)
61552 +{
61553 + return *a;
61554 +}
61555 +
61556 +/* The hash table definition */
61557 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
61558 +#define KFREE(ptr, size) kfree(ptr)
61559 +TYPE_SAFE_HASH_DEFINE(ps, plugin_set, unsigned long, hashval, link, pshash,
61560 + pseq);
61561 +#undef KFREE
61562 +#undef KMALLOC
61563 +
61564 +static ps_hash_table ps_table;
61565 +static plugin_set empty_set = {
61566 + .hashval = 0,
61567 + .file = NULL,
61568 + .dir = NULL,
61569 + .perm = NULL,
61570 + .formatting = NULL,
61571 + .hash = NULL,
61572 + .fibration = NULL,
61573 + .sd = NULL,
61574 + .dir_item = NULL,
61575 + .cipher = NULL,
61576 + .digest = NULL,
61577 + .compression = NULL,
61578 + .compression_mode = NULL,
61579 + .cluster = NULL,
61580 + .create = NULL,
61581 + .link = {NULL}
61582 +};
61583 +
61584 +plugin_set *plugin_set_get_empty(void)
61585 +{
61586 + return &empty_set;
61587 +}
61588 +
61589 +void plugin_set_put(plugin_set * set)
61590 +{
61591 +}
61592 +
61593 +static inline unsigned long *pset_field(plugin_set * set, int offset)
61594 +{
61595 + return (unsigned long *)(((char *)set) + offset);
61596 +}
61597 +
61598 +static int plugin_set_field(plugin_set ** set, const unsigned long val,
61599 + const int offset)
61600 +{
61601 + unsigned long *spot;
61602 + spinlock_t *lock;
61603 + plugin_set replica;
61604 + plugin_set *twin;
61605 + plugin_set *psal;
61606 + plugin_set *orig;
61607 +
61608 + assert("nikita-2902", set != NULL);
61609 + assert("nikita-2904", *set != NULL);
61610 +
61611 + spot = pset_field(*set, offset);
61612 + if (unlikely(*spot == val))
61613 + return 0;
61614 +
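	/*
	 * psets are immutable and shared: build a local copy with one field
	 * changed, hash it, and look for an existing identical set under
	 * RCU; a new set is allocated and inserted only if no twin exists.
	 */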
61615 + replica = *(orig = *set);
61616 + *pset_field(&replica, offset) = val;
61617 + replica.hashval = calculate_hash(&replica);
61618 + rcu_read_lock();
61619 + twin = ps_hash_find(&ps_table, &replica.hashval);
61620 + if (unlikely(twin == NULL)) {
61621 + rcu_read_unlock();
61622 + psal = kmem_cache_alloc(plugin_set_slab,
61623 + reiser4_ctx_gfp_mask_get());
61624 + if (psal == NULL)
61625 + return RETERR(-ENOMEM);
61626 + *psal = replica;
61627 + lock = &plugin_set_lock[replica.hashval & 7];
61628 + spin_lock(lock);
61629 + twin = ps_hash_find(&ps_table, &replica.hashval);
61630 + if (likely(twin == NULL)) {
61631 + *set = psal;
61632 + ps_hash_insert_rcu(&ps_table, psal);
61633 + } else {
61634 + *set = twin;
61635 + kmem_cache_free(plugin_set_slab, psal);
61636 + }
61637 + spin_unlock(lock);
61638 + } else {
61639 + rcu_read_unlock();
61640 + *set = twin;
61641 + }
61642 + return 0;
61643 +}
61644 +
61645 +static struct {
61646 + int offset;
61647 + reiser4_plugin_groups groups;
61648 + reiser4_plugin_type type;
61649 +} pset_descr[PSET_LAST] = {
61650 + [PSET_FILE] = {
61651 + .offset = offsetof(plugin_set, file),
61652 + .type = REISER4_FILE_PLUGIN_TYPE,
61653 + .groups = 0
61654 + },
61655 + [PSET_DIR] = {
61656 + .offset = offsetof(plugin_set, dir),
61657 + .type = REISER4_DIR_PLUGIN_TYPE,
61658 + .groups = 0
61659 + },
61660 + [PSET_PERM] = {
61661 + .offset = offsetof(plugin_set, perm),
61662 + .type = REISER4_PERM_PLUGIN_TYPE,
61663 + .groups = 0
61664 + },
61665 + [PSET_FORMATTING] = {
61666 + .offset = offsetof(plugin_set, formatting),
61667 + .type = REISER4_FORMATTING_PLUGIN_TYPE,
61668 + .groups = 0
61669 + },
61670 + [PSET_HASH] = {
61671 + .offset = offsetof(plugin_set, hash),
61672 + .type = REISER4_HASH_PLUGIN_TYPE,
61673 + .groups = 0
61674 + },
61675 + [PSET_FIBRATION] = {
61676 + .offset = offsetof(plugin_set, fibration),
61677 + .type = REISER4_FIBRATION_PLUGIN_TYPE,
61678 + .groups = 0
61679 + },
61680 + [PSET_SD] = {
61681 + .offset = offsetof(plugin_set, sd),
61682 + .type = REISER4_ITEM_PLUGIN_TYPE,
61683 + .groups = (1 << STAT_DATA_ITEM_TYPE)
61684 + },
61685 + [PSET_DIR_ITEM] = {
61686 + .offset = offsetof(plugin_set, dir_item),
61687 + .type = REISER4_ITEM_PLUGIN_TYPE,
61688 + .groups = (1 << DIR_ENTRY_ITEM_TYPE)
61689 + },
61690 + [PSET_CIPHER] = {
61691 + .offset = offsetof(plugin_set, cipher),
61692 + .type = REISER4_CIPHER_PLUGIN_TYPE,
61693 + .groups = 0
61694 + },
61695 + [PSET_DIGEST] = {
61696 + .offset = offsetof(plugin_set, digest),
61697 + .type = REISER4_DIGEST_PLUGIN_TYPE,
61698 + .groups = 0
61699 + },
61700 + [PSET_COMPRESSION] = {
61701 + .offset = offsetof(plugin_set, compression),
61702 + .type = REISER4_COMPRESSION_PLUGIN_TYPE,
61703 + .groups = 0
61704 + },
61705 + [PSET_COMPRESSION_MODE] = {
61706 + .offset = offsetof(plugin_set, compression_mode),
61707 + .type = REISER4_COMPRESSION_MODE_PLUGIN_TYPE,
61708 + .groups = 0
61709 + },
61710 + [PSET_CLUSTER] = {
61711 + .offset = offsetof(plugin_set, cluster),
61712 + .type = REISER4_CLUSTER_PLUGIN_TYPE,
61713 + .groups = 0
61714 + },
61715 + [PSET_CREATE] = {
61716 + .offset = offsetof(plugin_set, create),
61717 + .type = REISER4_FILE_PLUGIN_TYPE,
61718 + .groups = (1 << REISER4_REGULAR_FILE)
61719 + }
61720 +};
61721 +
61722 +#define DEFINE_PSET_OPS(PREFIX) \
61723 + reiser4_plugin_type PREFIX##_member_to_type_unsafe(pset_member memb) \
61724 +{ \
61725 + if (memb > PSET_LAST) \
61726 + return REISER4_PLUGIN_TYPES; \
61727 + return pset_descr[memb].type; \
61728 +} \
61729 + \
61730 +int PREFIX##_set_unsafe(plugin_set ** set, pset_member memb, \
61731 + reiser4_plugin * plugin) \
61732 +{ \
61733 + assert("nikita-3492", set != NULL); \
61734 + assert("nikita-3493", *set != NULL); \
61735 + assert("nikita-3494", plugin != NULL); \
61736 + assert("nikita-3495", 0 <= memb && memb < PSET_LAST); \
61737 + assert("nikita-3496", plugin->h.type_id == pset_descr[memb].type); \
61738 + \
61739 + if (pset_descr[memb].groups) \
61740 + if (!(pset_descr[memb].groups & plugin->h.groups)) \
61741 + return -EINVAL; \
61742 + \
61743 + return plugin_set_field(set, \
61744 + (unsigned long)plugin, pset_descr[memb].offset); \
61745 +} \
61746 + \
61747 +reiser4_plugin *PREFIX##_get(plugin_set * set, pset_member memb) \
61748 +{ \
61749 + assert("nikita-3497", set != NULL); \
61750 + assert("nikita-3498", 0 <= memb && memb < PSET_LAST); \
61751 + \
61752 + return *(reiser4_plugin **) (((char *)set) + pset_descr[memb].offset); \
61753 +}
61754 +
61755 +DEFINE_PSET_OPS(aset);
61756 +
61757 +int set_plugin(plugin_set ** set, pset_member memb, reiser4_plugin * plugin)
61758 +{
61758 + return plugin_set_field(set,
61759 + (unsigned long)plugin, pset_descr[memb].offset);
61760 +}
61761 +
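A hedged usage sketch of the pset API (the function below is hypothetical; PSET_HASH, set_plugin() and aset_get() are the real entry points defined in this file and declared in plugin_set.h):

	/* hypothetical: swap the hash plugin in a pset and read it back */
	static int demo_set_hash(plugin_set **pset)
	{
		reiser4_plugin *plug;
		int result;

		plug = hash_plugin_to_plugin(hash_plugin_by_id(R5_HASH_ID));
		result = set_plugin(pset, PSET_HASH, plug);
		if (result == 0)
			plug = aset_get(*pset, PSET_HASH);
		return result;
	}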
61762 +/**
61763 + * init_plugin_set - create plugin set cache and hash table
61764 + *
61765 + * Initializes slab cache of plugin_set-s and their hash table. It is part of
61766 + * reiser4 module initialization.
61767 + */
61768 +int init_plugin_set(void)
61769 +{
61770 + int result;
61771 +
61772 + result = ps_hash_init(&ps_table, PS_TABLE_SIZE);
61773 + if (result == 0) {
61774 + plugin_set_slab = kmem_cache_create("plugin_set",
61775 + sizeof(plugin_set), 0,
61776 + SLAB_HWCACHE_ALIGN,
61777 + NULL, NULL);
61778 + if (plugin_set_slab == NULL)
61779 + result = RETERR(-ENOMEM);
61780 + }
61781 + return result;
61782 +}
61783 +
61784 +/**
61785 + * done_plugin_set - delete plugin_set cache and plugin_set hash table
61786 + *
61787 + * This is called on reiser4 module unloading or system shutdown.
61788 + */
61789 +void done_plugin_set(void)
61790 +{
61791 + plugin_set *cur, *next;
61792 +
61793 + for_all_in_htable(&ps_table, ps, cur, next) {
61794 + ps_hash_remove(&ps_table, cur);
61795 + kmem_cache_free(plugin_set_slab, cur);
61796 + }
61797 + destroy_reiser4_cache(&plugin_set_slab);
61798 + ps_hash_done(&ps_table);
61799 +}
61800 +
61801 +/*
61802 + * Local variables:
61803 + * c-indentation-style: "K&R"
61804 + * mode-name: "LC"
61805 + * c-basic-offset: 8
61806 + * tab-width: 8
61807 + * fill-column: 120
61808 + * End:
61809 + */
61810 diff --git a/fs/reiser4/plugin/plugin_set.h b/fs/reiser4/plugin/plugin_set.h
61811 new file mode 100644
61812 index 0000000..8edcaea
61813 --- /dev/null
61814 +++ b/fs/reiser4/plugin/plugin_set.h
61815 @@ -0,0 +1,77 @@
61816 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61817 +
61818 +/* Reiser4 plugin set definition.
61819 + See fs/reiser4/plugin/plugin_set.c for details */
61820 +
61821 +#if !defined( __PLUGIN_SET_H__ )
61822 +#define __PLUGIN_SET_H__
61823 +
61824 +#include "../type_safe_hash.h"
61825 +#include "plugin.h"
61826 +
61827 +#include <linux/rcupdate.h>
61828 +
61829 +struct plugin_set;
61830 +typedef struct plugin_set plugin_set;
61831 +
61832 +TYPE_SAFE_HASH_DECLARE(ps, plugin_set);
61833 +
61834 +struct plugin_set {
61835 + unsigned long hashval;
61836 + /* plugin of file */
61837 + file_plugin *file;
61838 + /* plugin of dir */
61839 + dir_plugin *dir;
61840 + /* perm plugin for this file */
61841 + perm_plugin *perm;
61842 + /* tail policy plugin. Only meaningful for regular files */
61843 + formatting_plugin *formatting;
61844 + /* hash plugin. Only meaningful for directories. */
61845 + hash_plugin *hash;
61846 + /* fibration plugin. Only meaningful for directories. */
61847 + fibration_plugin *fibration;
61848 + /* plugin of stat-data */
61849 + item_plugin *sd;
61850 + /* plugin of items a directory is built of */
61851 + item_plugin *dir_item;
61852 + /* cipher plugin */
61853 + cipher_plugin *cipher;
61854 + /* digest plugin */
61855 + digest_plugin *digest;
61856 + /* compression plugin */
61857 + compression_plugin *compression;
61858 + /* compression mode plugin */
61859 + compression_mode_plugin *compression_mode;
61860 + /* cluster plugin */
61861 + cluster_plugin *cluster;
61862 + /* this specifies file plugin of regular children.
61863 + only meaningful for directories */
61864 + file_plugin *create;
61865 + ps_hash_link link;
61866 +};
61867 +
61868 +extern plugin_set *plugin_set_get_empty(void);
61869 +extern void plugin_set_put(plugin_set * set);
61870 +
61871 +extern int init_plugin_set(void);
61872 +extern void done_plugin_set(void);
61873 +
61874 +extern reiser4_plugin *aset_get(plugin_set * set, pset_member memb);
61875 +extern int set_plugin(plugin_set ** set, pset_member memb,
61876 + reiser4_plugin * plugin);
61877 +extern int aset_set_unsafe(plugin_set ** set, pset_member memb,
61878 + reiser4_plugin * plugin);
61879 +extern reiser4_plugin_type aset_member_to_type_unsafe(pset_member memb);
61880 +
61881 +/* __PLUGIN_SET_H__ */
61882 +#endif
61883 +
61884 +/* Make Linus happy.
61885 + Local variables:
61886 + c-indentation-style: "K&R"
61887 + mode-name: "LC"
61888 + c-basic-offset: 8
61889 + tab-width: 8
61890 + fill-column: 120
61891 + End:
61892 +*/
61893 diff --git a/fs/reiser4/plugin/security/Makefile b/fs/reiser4/plugin/security/Makefile
61894 new file mode 100644
61895 index 0000000..645dbb5
61896 --- /dev/null
61897 +++ b/fs/reiser4/plugin/security/Makefile
61898 @@ -0,0 +1,4 @@
61899 +obj-$(CONFIG_REISER4_FS) += security_plugins.o
61900 +
61901 +security_plugins-objs := \
61902 + perm.o
61903 diff --git a/fs/reiser4/plugin/security/perm.c b/fs/reiser4/plugin/security/perm.c
61904 new file mode 100644
61905 index 0000000..ab3b4fc
61906 --- /dev/null
61907 +++ b/fs/reiser4/plugin/security/perm.c
61908 @@ -0,0 +1,44 @@
61909 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61910 +
61911 +/*
61912 + * this file contains the implementation of permission plugins. Currently,
61913 + * only NULL_PERM_ID is implemented
61914 + */
61915 +
61916 +#include "../plugin.h"
61917 +#include "../plugin_header.h"
61918 +#include "../../debug.h"
61919 +
61920 +perm_plugin perm_plugins[LAST_PERM_ID] = {
61921 + [NULL_PERM_ID] = {
61922 + .h = {
61923 + .type_id = REISER4_PERM_PLUGIN_TYPE,
61924 + .id = NULL_PERM_ID,
61925 + .pops = NULL,
61926 + .label = "null",
61927 + .desc = "stub permission plugin",
61928 + .linkage = {NULL, NULL}
61929 + },
61930 + .read_ok = NULL,
61931 + .write_ok = NULL,
61932 + .lookup_ok = NULL,
61933 + .create_ok = NULL,
61934 + .link_ok = NULL,
61935 + .unlink_ok = NULL,
61936 + .delete_ok = NULL,
61937 + .mask_ok = NULL,
61938 + .setattr_ok = NULL,
61939 + .getattr_ok = NULL,
61940 + .rename_ok = NULL,
61941 + }
61942 +};
61943 +
61944 +/*
61945 + * Local variables:
61946 + * c-indentation-style: "K&R"
61947 + * mode-name: "LC"
61948 + * c-basic-offset: 8
61949 + * tab-width: 8
61950 + * fill-column: 79
61951 + * End:
61952 + */
61953 diff --git a/fs/reiser4/plugin/security/perm.h b/fs/reiser4/plugin/security/perm.h
61954 new file mode 100644
61955 index 0000000..747e8f7
61956 --- /dev/null
61957 +++ b/fs/reiser4/plugin/security/perm.h
61958 @@ -0,0 +1,82 @@
61959 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
61960 +
61961 +/* Perm (short for "permissions") plugins common stuff. */
61962 +
61963 +#if !defined( __REISER4_PERM_H__ )
61964 +#define __REISER4_PERM_H__
61965 +
61966 +#include "../../forward.h"
61967 +#include "../plugin_header.h"
61968 +
61969 +#include <linux/types.h>
61970 +#include <linux/fs.h> /* for struct file */
61971 +#include <linux/dcache.h> /* for struct dentry */
61972 +
61973 +/* interface for perm plugin.
61974 +
61975 + Perm plugin method can be implemented through:
61976 +
61977 + 1. consulting ->i_mode bits in stat data
61978 +
61979 + 2. obtaining acl from the tree and inspecting it
61980 +
61981 + 3. asking some kernel module or user-level program to authorize access.
61982 +
61983 + This allows for integration with things like capabilities, SELinux-style
61984 +   security contexts, etc.
61985 +
61986 +*/
61987 +/* NIKITA-FIXME-HANS: define what this is targeted for. It does not seem to be intended for use with sys_reiser4. Explain. */
61988 +typedef struct perm_plugin {
61989 + /* generic plugin fields */
61990 + plugin_header h;
61991 +
61992 + /* check permissions for read/write */
61993 + int (*read_ok) (struct file *file, const char __user *buf,
61994 + size_t size, loff_t *off);
61995 + int (*write_ok) (struct file *file, const char __user *buf,
61996 + size_t size, loff_t *off);
61997 +
61998 + /* check permissions for lookup */
61999 + int (*lookup_ok) (struct inode * parent, struct dentry * dentry);
62000 +
62001 + /* check permissions for create */
62002 + int (*create_ok) (struct inode * parent, struct dentry * dentry,
62003 + reiser4_object_create_data * data);
62004 +
62005 + /* check permissions for linking @where to @existing */
62006 + int (*link_ok) (struct dentry * existing, struct inode * parent,
62007 + struct dentry * where);
62008 +
62009 + /* check permissions for unlinking @victim from @parent */
62010 + int (*unlink_ok) (struct inode * parent, struct dentry * victim);
62011 +
62012 + /* check permissions for deletion of @object whose last reference is
62013 + by @parent */
62014 + int (*delete_ok) (struct inode * parent, struct dentry * victim);
62015 + int (*mask_ok) (struct inode * inode, int mask);
62016 + /* check whether attribute change is acceptable */
62017 + int (*setattr_ok) (struct dentry * dentry, struct iattr * attr);
62018 +
62019 + /* check whether stat(2) is allowed */
62020 + int (*getattr_ok) (struct vfsmount * mnt UNUSED_ARG,
62021 + struct dentry * dentry, struct kstat * stat);
62022 + /* check whether rename(2) is allowed */
62023 + int (*rename_ok) (struct inode * old_dir, struct dentry * old,
62024 + struct inode * new_dir, struct dentry * new);
62025 +} perm_plugin;
62026 +
62027 +typedef enum { NULL_PERM_ID, LAST_PERM_ID } reiser4_perm_id;
62028 +
62029 +/* __REISER4_PERM_H__ */
62030 +#endif
62031 +
62032 +/* Make Linus happy.
62033 + Local variables:
62034 + c-indentation-style: "K&R"
62035 + mode-name: "LC"
62036 + c-basic-offset: 8
62037 + tab-width: 8
62038 + fill-column: 120
62039 + End:
62040 +*/
62041 diff --git a/fs/reiser4/plugin/space/Makefile b/fs/reiser4/plugin/space/Makefile
62042 new file mode 100644
62043 index 0000000..5a0c94f
62044 --- /dev/null
62045 +++ b/fs/reiser4/plugin/space/Makefile
62046 @@ -0,0 +1,4 @@
62047 +obj-$(CONFIG_REISER4_FS) += space_plugins.o
62048 +
62049 +space_plugins-objs := \
62050 + bitmap.o
62051 diff --git a/fs/reiser4/plugin/space/bitmap.c b/fs/reiser4/plugin/space/bitmap.c
62052 new file mode 100644
62053 index 0000000..a0ff17a
62054 --- /dev/null
62055 +++ b/fs/reiser4/plugin/space/bitmap.c
62056 @@ -0,0 +1,1585 @@
62057 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
62058 +
62059 +#include "../../debug.h"
62060 +#include "../../dformat.h"
62061 +#include "../../txnmgr.h"
62062 +#include "../../jnode.h"
62063 +#include "../../block_alloc.h"
62064 +#include "../../tree.h"
62065 +#include "../../super.h"
62066 +#include "../plugin.h"
62067 +#include "space_allocator.h"
62068 +#include "bitmap.h"
62069 +
62070 +#include <linux/types.h>
62071 +#include <linux/fs.h> /* for struct super_block */
62072 +#include <linux/mutex.h>
62073 +#include <asm/div64.h>
62074 +
62075 +/* Proposed (but discarded) optimization: dynamic loading/unloading of bitmap
62076 + * blocks
62077 +
62078 + A useful optimization of reiser4 bitmap handling would be dynamic bitmap
62079 + blocks loading/unloading which is different from v3.x where all bitmap
62080 + blocks are loaded at mount time.
62081 +
62082 + To implement bitmap block unloading we need to count bitmap block usage
62083 + and detect currently unused blocks, allowing them to be unloaded. It is
62084 + not a simple task, since we allow several threads to modify one bitmap
62085 + block simultaneously.
62086 +
62087 + Briefly speaking, the following scheme is proposed: we keep a usage
62088 + counter in a special variable associated with each bitmap block, counting
62089 + block alloc/dealloc operations on that bitmap block. With reiser4's
62090 + deferred block deallocation feature, all those operations are represented
62091 + in the atom's dirty/deleted lists as jnodes for freshly allocated or
62092 + deleted nodes.
62093 +
62094 + So, we increment the usage counter for each node allocated or deleted,
62095 + and decrement it once at atom commit for each node on the atom's
62096 + dirty/deleted lists. Of course, deletion of a freshly allocated node, and
62097 + node reuse from the atom's deleted list (if we do so), should also
62098 + decrement the bitmap usage counter.
62099 +
62100 + This scheme seems workable, but such reference counting is not easy to
62101 + debug. I think we should agree with Hans and not implement it in v4.0.
62102 + The current code implements "on-demand" bitmap block loading only.
62103 +
62104 + For simplicity, all bitmap nodes (both commit and working bitmap blocks)
62105 + are either loaded into memory at fs mount time or loaded at the first
62106 + access to them; the "dont_load_bitmap" mount option controls whether
62107 + bitmap nodes are loaded at mount time. Dynamic unloading of bitmap nodes
62108 + is currently not supported. */
62109 +
62110 +#define CHECKSUM_SIZE 4
62111 +
62112 +#define BYTES_PER_LONG (sizeof(long))
62113 +
62114 +#if BITS_PER_LONG == 64
62115 +# define LONG_INT_SHIFT (6)
62116 +#else
62117 +# define LONG_INT_SHIFT (5)
62118 +#endif
62119 +
62120 +#define LONG_INT_MASK (BITS_PER_LONG - 1UL)
62121 +
62122 +typedef unsigned long ulong_t;
62123 +
62124 +#define bmap_size(blocksize) ((blocksize) - CHECKSUM_SIZE)
62125 +#define bmap_bit_count(blocksize) (bmap_size(blocksize) << 3)
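/* worked example: with a 4096-byte block, bmap_size() = 4092 bytes of bitmap
 * payload and bmap_bit_count() = 4092 * 8 = 32736 disk blocks addressable
 * per bitmap block */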
62126 +
62127 +/* Block allocation/deallocation are done through special bitmap objects which
62128 + are allocated in an array at fs mount. */
62129 +struct bitmap_node {
62130 + struct mutex mutex; /* long term lock object */
62131 +
62132 + jnode *wjnode; /* j-nodes for WORKING ... */
62133 + jnode *cjnode; /* ... and COMMIT bitmap blocks */
62134 +
62135 + bmap_off_t first_zero_bit; /* for skip_busy option implementation */
62136 +
62137 + atomic_t loaded; /* a flag which shows that bnode is loaded
62138 + * already */
62139 +};
62140 +
62141 +static inline char *bnode_working_data(struct bitmap_node *bnode)
62142 +{
62143 + char *data;
62144 +
62145 + data = jdata(bnode->wjnode);
62146 + assert("zam-429", data != NULL);
62147 +
62148 + return data + CHECKSUM_SIZE;
62149 +}
62150 +
62151 +static inline char *bnode_commit_data(const struct bitmap_node *bnode)
62152 +{
62153 + char *data;
62154 +
62155 + data = jdata(bnode->cjnode);
62156 + assert("zam-430", data != NULL);
62157 +
62158 + return data + CHECKSUM_SIZE;
62159 +}
62160 +
62161 +static inline __u32 bnode_commit_crc(const struct bitmap_node *bnode)
62162 +{
62163 + char *data;
62164 +
62165 + data = jdata(bnode->cjnode);
62166 + assert("vpf-261", data != NULL);
62167 +
62168 + return le32_to_cpu(get_unaligned((d32 *)data));
62169 +}
62170 +
62171 +static inline void bnode_set_commit_crc(struct bitmap_node *bnode, __u32 crc)
62172 +{
62173 + char *data;
62174 +
62175 + data = jdata(bnode->cjnode);
62176 + assert("vpf-261", data != NULL);
62177 +
62178 + put_unaligned(cpu_to_le32(crc), (d32 *)data);
62179 +}
62180 +
62181 +/* ZAM-FIXME-HANS: is the idea that this might be a union someday? having
62182 + * written the code, does this added abstraction still have */
62183 +/* ANSWER(Zam): No, the abstraction is at the level above (the exact place is the
62184 + * reiser4_space_allocator structure) */
62185 +/* ZAM-FIXME-HANS: I don't understand your english in comment above. */
62186 +/* FIXME-HANS(Zam): I don't understand the questions like "might be a union
62187 + * someday?". What they about? If there is a reason to have a union, it should
62188 + * be a union, if not, it should not be a union. "..might be someday" means no
62189 + * reason. */
62190 +struct bitmap_allocator_data {
62191 + /* an array for bitmap blocks direct access */
62192 + struct bitmap_node *bitmap;
62193 +};
62194 +
62195 +#define get_barray(super) \
62196 +(((struct bitmap_allocator_data *)(get_super_private(super)->space_allocator.u.generic)) -> bitmap)
62197 +
62198 +#define get_bnode(super, i) (get_barray(super) + i)
62199 +
62200 +/* allocate and initialize jnode with JNODE_BITMAP type */
62201 +static jnode *bnew(void)
62202 +{
62203 + jnode *jal = jalloc();
62204 +
62205 + if (jal)
62206 + jnode_init(jal, current_tree, JNODE_BITMAP);
62207 +
62208 + return jal;
62209 +}
62210 +
62211 +/* this file contains:
62212 + - bitmap based implementation of space allocation plugin
62213 + - all the helper functions like set bit, find_first_zero_bit, etc */
62214 +
62215 +/* Audited by: green(2002.06.12) */
62216 +static int find_next_zero_bit_in_word(ulong_t word, int start_bit)
62217 +{
62218 + ulong_t mask = 1UL << start_bit;
62219 + int i = start_bit;
62220 +
62221 + while ((word & mask) != 0) {
62222 + mask <<= 1;
62223 + if (++i >= BITS_PER_LONG)
62224 + break;
62225 + }
62226 +
62227 + return i;
62228 +}
62229 +
62230 +#include <asm/bitops.h>
62231 +
62232 +#if BITS_PER_LONG == 64
62233 +
62234 +#define OFF(addr) (((ulong_t)(addr) & (BYTES_PER_LONG - 1)) << 3)
62235 +#define BASE(addr) ((ulong_t*) ((ulong_t)(addr) & ~(BYTES_PER_LONG - 1)))
62236 +
62237 +static inline void reiser4_set_bit(int nr, void *addr)
62238 +{
62239 + ext2_set_bit(nr + OFF(addr), BASE(addr));
62240 +}
62241 +
62242 +static inline void reiser4_clear_bit(int nr, void *addr)
62243 +{
62244 + ext2_clear_bit(nr + OFF(addr), BASE(addr));
62245 +}
62246 +
62247 +static inline int reiser4_test_bit(int nr, void *addr)
62248 +{
62249 + return ext2_test_bit(nr + OFF(addr), BASE(addr));
62250 +}
62251 +static inline int reiser4_find_next_zero_bit(void *addr, int maxoffset,
62252 + int offset)
62253 +{
62254 + int off = OFF(addr);
62255 +
62256 + return ext2_find_next_zero_bit(BASE(addr), maxoffset + off,
62257 + offset + off) - off;
62258 +}
62259 +
62260 +#else
62261 +
62262 +#define reiser4_set_bit(nr, addr) ext2_set_bit(nr, addr)
62263 +#define reiser4_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
62264 +#define reiser4_test_bit(nr, addr) ext2_test_bit(nr, addr)
62265 +
62266 +#define reiser4_find_next_zero_bit(addr, maxoffset, offset) \
62267 +ext2_find_next_zero_bit(addr, maxoffset, offset)
62268 +#endif
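+
+/* Why the OFF()/BASE() games above (a sketch of the reasoning, not an
+ * original comment): bitmap data starts CHECKSUM_SIZE == 4 bytes into the
+ * block, so on 64-bit platforms the data pointer may not be long-aligned,
+ * while the ext2_* bit operations on some architectures expect a
+ * long-aligned base. BASE() rounds the address down to a long boundary and
+ * OFF() converts the discarded bytes into a bit offset; e.g. for
+ * addr == base + 4, OFF(addr) == 32, so bit n of @addr becomes bit n + 32
+ * of BASE(addr). */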
62269 +
62270 +/* Search for a set bit in the bit array [@start_offset, @max_offset); offsets
62271 + * are counted from @addr. Return the offset of the first set bit if one is
62272 + * found, @max_offset otherwise. */
62273 +static bmap_off_t __reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
62274 + bmap_off_t start_offset)
62275 +{
62276 + ulong_t *base = addr;
62277 +	/* start_offset is in bits, convert it to a word offset within the bitmap. */
62278 + int word_nr = start_offset >> LONG_INT_SHIFT;
62279 +	/* bit number within the word. */
62280 + int bit_nr = start_offset & LONG_INT_MASK;
62281 + int max_word_nr = (max_offset - 1) >> LONG_INT_SHIFT;
62282 +
62283 + assert("zam-387", max_offset != 0);
62284 +
62285 + /* Unaligned @start_offset case. */
62286 + if (bit_nr != 0) {
62287 + bmap_nr_t nr;
62288 +
62289 + nr = find_next_zero_bit_in_word(~(base[word_nr]), bit_nr);
62290 +
62291 + if (nr < BITS_PER_LONG)
62292 + return (word_nr << LONG_INT_SHIFT) + nr;
62293 +
62294 + ++word_nr;
62295 + }
62296 +
62297 +	/* Fast scan through aligned words. */
62298 + while (word_nr <= max_word_nr) {
62299 + if (base[word_nr] != 0) {
62300 + return (word_nr << LONG_INT_SHIFT)
62301 + + find_next_zero_bit_in_word(~(base[word_nr]), 0);
62302 + }
62303 +
62304 + ++word_nr;
62305 + }
62306 +
62307 + return max_offset;
62308 +}
62309 +
62310 +#if BITS_PER_LONG == 64
62311 +
62312 +static bmap_off_t reiser4_find_next_set_bit(void *addr, bmap_off_t max_offset,
62313 + bmap_off_t start_offset)
62314 +{
62315 + bmap_off_t off = OFF(addr);
62316 +
62317 + return __reiser4_find_next_set_bit(BASE(addr), max_offset + off,
62318 + start_offset + off) - off;
62319 +}
62320 +
62321 +#else
62322 +#define reiser4_find_next_set_bit(addr, max_offset, start_offset) \
62323 + __reiser4_find_next_set_bit(addr, max_offset, start_offset)
62324 +#endif
62325 +
62326 +/* search a single word backward, from @start_bit down to bit 0, for a set bit;
 returns BITS_PER_LONG if none is found. */
62327 +static int find_last_set_bit_in_word(ulong_t word, int start_bit)
62328 +{
62329 + ulong_t bit_mask;
62330 + int nr = start_bit;
62331 +
62332 + assert("zam-965", start_bit < BITS_PER_LONG);
62333 + assert("zam-966", start_bit >= 0);
62334 +
62335 + bit_mask = (1UL << nr);
62336 +
62337 + while (bit_mask != 0) {
62338 + if (bit_mask & word)
62339 + return nr;
62340 + bit_mask >>= 1;
62341 + nr--;
62342 + }
62343 + return BITS_PER_LONG;
62344 +}
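+
+/* Illustrative example (not an original comment): for word == 0x5 (bits 0
+ * and 2 set) and start_bit == 3, the scan tests bit 3 (clear), then bit 2
+ * (set) and returns 2; for word == 0x0 it runs past bit 0 and returns
+ * BITS_PER_LONG to signal "not found". */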
62345 +
62346 +/* Search the bitmap for a set bit in the backward direction, from the end
62347 + * to the beginning of the given region
62348 + *
62349 + * @result: resulting offset of the last set bit
62350 + * @addr: base memory address,
62351 + * @low_off: low end of the search region, edge bit included in the region,
62352 + * @high_off: high end of the search region, edge bit included in the region,
62353 + *
62354 + * @return: 0 - set bit was found, -1 otherwise.
62355 + */
62356 +static int
62357 +reiser4_find_last_set_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
62358 + bmap_off_t high_off)
62359 +{
62360 + ulong_t *base = addr;
62361 + int last_word;
62362 + int first_word;
62363 + int last_bit;
62364 + int nr;
62365 +
62366 + assert("zam-962", high_off >= low_off);
62367 +
62368 + last_word = high_off >> LONG_INT_SHIFT;
62369 + last_bit = high_off & LONG_INT_MASK;
62370 + first_word = low_off >> LONG_INT_SHIFT;
62371 +
62372 + if (last_bit < BITS_PER_LONG) {
62373 + nr = find_last_set_bit_in_word(base[last_word], last_bit);
62374 + if (nr < BITS_PER_LONG) {
62375 + *result = (last_word << LONG_INT_SHIFT) + nr;
62376 + return 0;
62377 + }
62378 + --last_word;
62379 + }
62380 + while (last_word >= first_word) {
62381 + if (base[last_word] != 0x0) {
62382 + last_bit =
62383 + find_last_set_bit_in_word(base[last_word],
62384 + BITS_PER_LONG - 1);
62385 + assert("zam-972", last_bit < BITS_PER_LONG);
62386 + *result = (last_word << LONG_INT_SHIFT) + last_bit;
62387 + return 0;
62388 + }
62389 + --last_word;
62390 + }
62391 +
62392 + return -1; /* set bit not found */
62393 +}
62394 +
62395 +/* Search bitmap for a clear bit in backward direction from the end to the
62396 + * beginning of given region */
62397 +static int
62398 +reiser4_find_last_zero_bit(bmap_off_t * result, void *addr, bmap_off_t low_off,
62399 + bmap_off_t high_off)
62400 +{
62401 + ulong_t *base = addr;
62402 + int last_word;
62403 + int first_word;
62404 + int last_bit;
62405 + int nr;
62406 +
62407 + last_word = high_off >> LONG_INT_SHIFT;
62408 + last_bit = high_off & LONG_INT_MASK;
62409 + first_word = low_off >> LONG_INT_SHIFT;
62410 +
62411 + if (last_bit < BITS_PER_LONG) {
62412 + nr = find_last_set_bit_in_word(~base[last_word], last_bit);
62413 + if (nr < BITS_PER_LONG) {
62414 + *result = (last_word << LONG_INT_SHIFT) + nr;
62415 + return 0;
62416 + }
62417 + --last_word;
62418 + }
62419 + while (last_word >= first_word) {
62420 + if (base[last_word] != (ulong_t) (-1)) {
62421 + *result = (last_word << LONG_INT_SHIFT) +
62422 + find_last_set_bit_in_word(~base[last_word],
62423 + BITS_PER_LONG - 1);
62424 + return 0;
62425 + }
62426 + --last_word;
62427 + }
62428 +
62429 + return -1; /* zero bit not found */
62430 +}
62431 +
62432 +/* Audited by: green(2002.06.12) */
62433 +static void reiser4_clear_bits(char *addr, bmap_off_t start, bmap_off_t end)
62434 +{
62435 + int first_byte;
62436 + int last_byte;
62437 +
62438 + unsigned char first_byte_mask = 0xFF;
62439 + unsigned char last_byte_mask = 0xFF;
62440 +
62441 + assert("zam-410", start < end);
62442 +
62443 + first_byte = start >> 3;
62444 + last_byte = (end - 1) >> 3;
62445 +
62446 + if (last_byte > first_byte + 1)
62447 + memset(addr + first_byte + 1, 0,
62448 + (size_t) (last_byte - first_byte - 1));
62449 +
62450 + first_byte_mask >>= 8 - (start & 0x7);
62451 + last_byte_mask <<= ((end - 1) & 0x7) + 1;
62452 +
62453 + if (first_byte == last_byte) {
62454 + addr[first_byte] &= (first_byte_mask | last_byte_mask);
62455 + } else {
62456 + addr[first_byte] &= first_byte_mask;
62457 + addr[last_byte] &= last_byte_mask;
62458 + }
62459 +}
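+
+/* Worked example (illustrative): clearing bits [3, 10) of the byte array at
+ * @addr gives first_byte == 0, last_byte == 1; first_byte_mask becomes
+ * 0xFF >> 5 == 0x07 (preserving bits 0..2) and last_byte_mask becomes
+ * 0xFF << 2 == 0xFC (preserving bits 2..7 of byte 1). So addr[0] &= 0x07
+ * clears bits 3..7 and addr[1] &= 0xFC clears bits 8..9, exactly the
+ * requested range. */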
62460 +
62461 +/* Audited by: green(2002.06.12) */
62462 +/* set all bits in the region [@start, @end) of the byte array at @addr; the
 mirror image of reiser4_clear_bits() above */
62463 +static void reiser4_set_bits(char *addr, bmap_off_t start, bmap_off_t end)
62464 +{
62465 + int first_byte;
62466 + int last_byte;
62467 +
62468 + unsigned char first_byte_mask = 0xFF;
62469 + unsigned char last_byte_mask = 0xFF;
62470 +
62471 + assert("zam-386", start < end);
62472 +
62473 + first_byte = start >> 3;
62474 + last_byte = (end - 1) >> 3;
62475 +
62476 + if (last_byte > first_byte + 1)
62477 + memset(addr + first_byte + 1, 0xFF,
62478 + (size_t) (last_byte - first_byte - 1));
62479 +
62480 + first_byte_mask <<= start & 0x7;
62481 + last_byte_mask >>= 7 - ((end - 1) & 0x7);
62482 +
62483 + if (first_byte == last_byte) {
62484 + addr[first_byte] |= (first_byte_mask & last_byte_mask);
62485 + } else {
62486 + addr[first_byte] |= first_byte_mask;
62487 + addr[last_byte] |= last_byte_mask;
62488 + }
62489 +}
62490 +
62491 +#define ADLER_BASE 65521
62492 +#define ADLER_NMAX 5552
62493 +
62494 +/* Calculates the adler32 checksum for the data pointed to by `data` of the
62495 + length `len`. This function was originally taken from zlib, version 1.1.3,
62496 + July 9th, 1998.
62497 +
62498 + Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
62499 +
62500 + This software is provided 'as-is', without any express or implied
62501 + warranty. In no event will the authors be held liable for any damages
62502 + arising from the use of this software.
62503 +
62504 + Permission is granted to anyone to use this software for any purpose,
62505 + including commercial applications, and to alter it and redistribute it
62506 + freely, subject to the following restrictions:
62507 +
62508 + 1. The origin of this software must not be misrepresented; you must not
62509 + claim that you wrote the original software. If you use this software
62510 + in a product, an acknowledgment in the product documentation would be
62511 + appreciated but is not required.
62512 + 2. Altered source versions must be plainly marked as such, and must not be
62513 + misrepresented as being the original software.
62514 + 3. This notice may not be removed or altered from any source distribution.
62515 +
62516 + Jean-loup Gailly Mark Adler
62517 + jloup@gzip.org madler@alumni.caltech.edu
62518 +
62519 + The above comment applies only to the reiser4_adler32 function.
62520 +*/
62521 +
62522 +__u32 reiser4_adler32(char *data, __u32 len)
62523 +{
62524 + unsigned char *t = data;
62525 + __u32 s1 = 1;
62526 + __u32 s2 = 0;
62527 + int k;
62528 +
62529 + while (len > 0) {
62530 + k = len < ADLER_NMAX ? len : ADLER_NMAX;
62531 + len -= k;
62532 +
62533 + while (k--) {
62534 + s1 += *t++;
62535 + s2 += s1;
62536 + }
62537 +
62538 + s1 %= ADLER_BASE;
62539 + s2 %= ADLER_BASE;
62540 + }
62541 + return (s2 << 16) | s1;
62542 +}
62543 +
62544 +#define sb_by_bnode(bnode) \
62545 + ((struct super_block *)jnode_get_tree(bnode->wjnode)->super)
62546 +
62547 +static __u32 bnode_calc_crc(const struct bitmap_node *bnode, unsigned long size)
62548 +{
62549 + return reiser4_adler32(bnode_commit_data(bnode), bmap_size(size));
62550 +}
62551 +
62552 +static int
62553 +bnode_check_adler32(const struct bitmap_node *bnode, unsigned long size)
62554 +{
62555 + if (bnode_calc_crc(bnode, size) != bnode_commit_crc(bnode)) {
62556 + bmap_nr_t bmap;
62557 +
62558 + bmap = bnode - get_bnode(sb_by_bnode(bnode), 0);
62559 +
62560 + warning("vpf-263",
62561 + "Checksum for the bitmap block %llu is incorrect",
62562 + bmap);
62563 +
62564 + return RETERR(-EIO);
62565 + }
62566 +
62567 + return 0;
62568 +}
62569 +
62570 +#define REISER4_CHECK_BMAP_CRC (0)
62571 +
62572 +#if REISER4_CHECK_BMAP_CRC
62573 +static int bnode_check_crc(const struct bitmap_node *bnode)
62574 +{
62575 + return bnode_check_adler32(bnode,
62576 + bmap_size(sb_by_bnode(bnode)->s_blocksize));
62577 +}
62578 +
62579 +/* REISER4_CHECK_BMAP_CRC */
62580 +#else
62581 +
62582 +#define bnode_check_crc(bnode) (0)
62583 +
62584 +/* REISER4_CHECK_BMAP_CRC */
62585 +#endif
62586 +
62587 +/* Recalculates the adler32 checksum after a single byte change.
62588 + adler - previous adler checksum
62589 + old_data, data - old and new byte values.
62590 + tail == (chunk - offset), where chunk is the length the checksum was
62591 + calculated over and offset is the offset of the changed byte within it.
62592 + This function can be used to optimise checksum calculation.
62593 +*/
62594 +
62595 +static __u32
62596 +adler32_recalc(__u32 adler, unsigned char old_data, unsigned char data,
62597 + __u32 tail)
62598 +{
62599 + __u32 delta = data - old_data + 2 * ADLER_BASE;
62600 + __u32 s1 = adler & 0xffff;
62601 + __u32 s2 = (adler >> 16) & 0xffff;
62602 +
62603 + s1 = (delta + s1) % ADLER_BASE;
62604 + s2 = (delta * tail + s2) % ADLER_BASE;
62605 +
62606 + return (s2 << 16) | s1;
62607 +}
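+
+/* Why this works (a sketch, not an original comment): with
+ * d = data - old_data, the running sum s1 changes by exactly d, while s2,
+ * being the sum of all prefix sums, changes by d once for every prefix
+ * that includes the changed byte, i.e. by d * tail. The 2 * ADLER_BASE
+ * added into delta merely keeps the unsigned arithmetic non-negative
+ * (d >= -255 > -ADLER_BASE) and vanishes modulo ADLER_BASE. For example,
+ * changing one byte from 0x00 to 0x01 at tail == 3 bumps s1 by 1 and s2
+ * by 3 (mod ADLER_BASE). */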
62608 +
62609 +#define LIMIT(val, boundary) ((val) > (boundary) ? (boundary) : (val))
62610 +
62611 +/**
62612 + * get_nr_bmap - calculate number of bitmap blocks
62613 + * @super: super block with initialized blocksize and block count
62614 + *
62615 + * Calculates the number of bitmap blocks of a filesystem which uses bitmaps
62616 + * to maintain free disk space. It assumes that each bitmap addresses the
62617 + * same number of blocks, which is calculated by the bmap_bit_count macro
62618 + * defined above. The number of blocks in the filesystem has to be already
62619 + * initialized in the reiser4 private data of the super block so that it can
62620 + * be obtained via reiser4_block_count(). Unfortunately, the number of blocks
62621 + * addressed by a bitmap is not a power of 2 because 4 bytes are used for the
62622 + * checksum. Therefore, we have to use do_div() to divide and modulo 64-bit
62623 + * filesystem block counters.
62624 + *
62625 + * Example: suppose the filesystem has 32768 blocks. Blocksize is 4096. Each bitmap
62626 + * block addresses (4096 - 4) * 8 = 32736 blocks. Number of bitmaps to address
62627 + * all 32768 blocks is calculated as (32768 - 1) / 32736 + 1 = 2.
62628 + */
62629 +static bmap_nr_t get_nr_bmap(const struct super_block *super)
62630 +{
62631 + u64 quotient;
62632 +
62633 + assert("zam-393", reiser4_block_count(super) != 0);
62634 +
62635 + quotient = reiser4_block_count(super) - 1;
62636 + do_div(quotient, bmap_bit_count(super->s_blocksize));
62637 + return quotient + 1;
62638 +}
62639 +
62640 +/**
62641 + * parse_blocknr - calculate bitmap number and offset in it by block number
62642 + * @block: pointer to block number to calculate location in bitmap of
62643 + * @bmap: pointer where to store bitmap block number
62644 + * @offset: pointer where to store offset within bitmap block
62645 + *
62646 + * Calculates location of bit which is responsible for allocation/freeing of
62647 + * block @*block. That location is represented by bitmap block number and offset
62648 + * within that bitmap block.
62649 + */
62650 +static void
62651 +parse_blocknr(const reiser4_block_nr *block, bmap_nr_t *bmap,
62652 + bmap_off_t *offset)
62653 +{
62654 + struct super_block *super = get_current_context()->super;
62655 + u64 quotient = *block;
62656 +
62657 + *offset = do_div(quotient, bmap_bit_count(super->s_blocksize));
62658 + *bmap = quotient;
62659 +
62660 + assert("zam-433", *bmap < get_nr_bmap(super));
62661 + assert("", *offset < bmap_bit_count(super->s_blocksize));
62662 +}
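+
+/* Worked example (illustrative, 4096-byte blocks): for *block == 40000,
+ * bmap_bit_count(4096) == 32736, so 40000 == 1 * 32736 + 7264, giving
+ * *bmap == 1 and *offset == 7264. */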
62663 +
62664 +#if REISER4_DEBUG
62665 +/* Audited by: green(2002.06.12) */
62666 +static void
62667 +check_block_range(const reiser4_block_nr * start, const reiser4_block_nr * len)
62668 +{
62669 + struct super_block *sb = reiser4_get_current_sb();
62670 +
62671 + assert("zam-436", sb != NULL);
62672 +
62673 + assert("zam-455", start != NULL);
62674 + assert("zam-437", *start != 0);
62675 + assert("zam-541", !reiser4_blocknr_is_fake(start));
62676 + assert("zam-441", *start < reiser4_block_count(sb));
62677 +
62678 + if (len != NULL) {
62679 + assert("zam-438", *len != 0);
62680 + assert("zam-442", *start + *len <= reiser4_block_count(sb));
62681 + }
62682 +}
62683 +
62684 +static void check_bnode_loaded(const struct bitmap_node *bnode)
62685 +{
62686 + assert("zam-485", bnode != NULL);
62687 + assert("zam-483", jnode_page(bnode->wjnode) != NULL);
62688 + assert("zam-484", jnode_page(bnode->cjnode) != NULL);
62689 + assert("nikita-2820", jnode_is_loaded(bnode->wjnode));
62690 + assert("nikita-2821", jnode_is_loaded(bnode->cjnode));
62691 +}
62692 +
62693 +#else
62694 +
62695 +# define check_block_range(start, len) do { /* nothing */} while(0)
62696 +# define check_bnode_loaded(bnode) do { /* nothing */} while(0)
62697 +
62698 +#endif
62699 +
62700 +/* modify bnode->first_zero_bit (if we free bits before it); the caller must
62701 + hold bnode->mutex */
62702 +static inline void
62703 +adjust_first_zero_bit(struct bitmap_node *bnode, bmap_off_t offset)
62704 +{
62705 + if (offset < bnode->first_zero_bit)
62706 + bnode->first_zero_bit = offset;
62707 +}
62708 +
62709 +/* return a physical disk address for logical bitmap number @bmap */
62710 +/* FIXME-VS: this is somehow related to disk layout? */
62711 +/* ZAM-FIXME-HANS: your answer is? Use not more than one function dereference
62712 + * per block allocation so that performance is not affected. Probably this
62713 + * whole file should be considered part of the disk layout plugin, and other
62714 + * disk layouts can use other defines and efficiency will not be significantly
62715 + * affected. */
62716 +
62717 +#define REISER4_FIRST_BITMAP_BLOCK \
62718 + ((REISER4_MASTER_OFFSET / PAGE_CACHE_SIZE) + 2)
62719 +
62720 +/* Audited by: green(2002.06.12) */
62721 +static void
62722 +get_bitmap_blocknr(struct super_block *super, bmap_nr_t bmap,
62723 + reiser4_block_nr * bnr)
62724 +{
62725 +
62726 + assert("zam-390", bmap < get_nr_bmap(super));
62727 +
62728 +#ifdef CONFIG_REISER4_BADBLOCKS
62729 +#define BITMAP_PLUGIN_DISKMAP_ID ((0xc0e1<<16) | (0xe0ff))
62730 + /* Check if the diskmap have this already, first. */
62731 + if (reiser4_get_diskmap_value(BITMAP_PLUGIN_DISKMAP_ID, bmap, bnr) == 0)
62732 + return; /* Found it in diskmap */
62733 +#endif
62734 +	/* FIXME_ZAM: before discussing disk layouts and disk format
62735 +	   plugins, I implement a bitmap location scheme which is close to the
62736 +	   scheme used in reiser 3.6 */
62737 + if (bmap == 0) {
62738 + *bnr = REISER4_FIRST_BITMAP_BLOCK;
62739 + } else {
62740 + *bnr = bmap * bmap_bit_count(super->s_blocksize);
62741 + }
62742 +}
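+
+/* Layout note (an inference from the code above, not an original comment):
+ * with this scheme bitmap block N > 0 occupies the very first block of the
+ * region it describes (block N * bmap_bit_count), so bit 0 of every such
+ * bitmap describes the bitmap block itself; for bitmap 0, bit 0 covers
+ * block 0, which holds the master record and is equally always in use.
+ * Either way, check_struct_bnode() below can insist that bit 0 reads as
+ * busy. */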
62743 +
62744 +/* construct a fake block number for shadow bitmap (WORKING BITMAP) block */
62745 +/* Audited by: green(2002.06.12) */
62746 +static void get_working_bitmap_blocknr(bmap_nr_t bmap, reiser4_block_nr * bnr)
62747 +{
62748 + *bnr =
62749 + (reiser4_block_nr) ((bmap & ~REISER4_BLOCKNR_STATUS_BIT_MASK) |
62750 + REISER4_BITMAP_BLOCKS_STATUS_VALUE);
62751 +}
62752 +
62753 +/* bnode structure initialization */
62754 +static void
62755 +init_bnode(struct bitmap_node *bnode,
62756 + struct super_block *super UNUSED_ARG, bmap_nr_t bmap UNUSED_ARG)
62757 +{
62758 + memset(bnode, 0, sizeof(struct bitmap_node));
62759 +
62760 + mutex_init(&bnode->mutex);
62761 + atomic_set(&bnode->loaded, 0);
62762 +}
62763 +
62764 +static void release(jnode * node)
62765 +{
62766 + jrelse(node);
62767 + JF_SET(node, JNODE_HEARD_BANSHEE);
62768 + jput(node);
62769 +}
62770 +
62771 +/* This function is for internal bitmap.c use because it assumes that the
62772 + jnode is under full control of this thread */
62773 +static void done_bnode(struct bitmap_node *bnode)
62774 +{
62775 + if (bnode) {
62776 + atomic_set(&bnode->loaded, 0);
62777 + if (bnode->wjnode != NULL)
62778 + release(bnode->wjnode);
62779 + if (bnode->cjnode != NULL)
62780 + release(bnode->cjnode);
62781 + bnode->wjnode = bnode->cjnode = NULL;
62782 + }
62783 +}
62784 +
62785 +/* allocate the working/commit jnode pair for a bitmap block and load the
 commit bitmap from disk; called only by load_and_lock_bnode() */
62786 +static int prepare_bnode(struct bitmap_node *bnode, jnode **cjnode_ret,
62787 + jnode **wjnode_ret)
62788 +{
62789 + struct super_block *super;
62790 + jnode *cjnode;
62791 + jnode *wjnode;
62792 + bmap_nr_t bmap;
62793 + int ret;
62794 +
62795 + super = reiser4_get_current_sb();
62796 +
62797 + *wjnode_ret = wjnode = bnew();
62798 + if (wjnode == NULL) {
62799 + *cjnode_ret = NULL;
62800 + return RETERR(-ENOMEM);
62801 + }
62802 +
62803 + *cjnode_ret = cjnode = bnew();
62804 + if (cjnode == NULL)
62805 + return RETERR(-ENOMEM);
62806 +
62807 + bmap = bnode - get_bnode(super, 0);
62808 +
62809 + get_working_bitmap_blocknr(bmap, &wjnode->blocknr);
62810 + get_bitmap_blocknr(super, bmap, &cjnode->blocknr);
62811 +
62812 + jref(cjnode);
62813 + jref(wjnode);
62814 +
62815 + /* load commit bitmap */
62816 + ret = jload_gfp(cjnode, GFP_NOFS, 1);
62817 +
62818 + if (ret)
62819 + goto error;
62820 +
62821 + /* allocate memory for working bitmap block. Note that for
62822 +	 * bitmaps jinit_new() doesn't actually modify node content,
62823 + * so parallel calls to this are ok. */
62824 + ret = jinit_new(wjnode, GFP_NOFS);
62825 +
62826 + if (ret != 0) {
62827 + jrelse(cjnode);
62828 + goto error;
62829 + }
62830 +
62831 + return 0;
62832 +
62833 + error:
62834 + jput(cjnode);
62835 + jput(wjnode);
62836 + *wjnode_ret = *cjnode_ret = NULL;
62837 + return ret;
62838 +
62839 +}
62840 +
62841 +/* Check the bnode data on read. */
62842 +static int check_struct_bnode(struct bitmap_node *bnode, __u32 blksize)
62843 +{
62844 + void *data;
62845 + int ret;
62846 +
62847 + /* Check CRC */
62848 + ret = bnode_check_adler32(bnode, blksize);
62849 +
62850 + if (ret) {
62851 + return ret;
62852 + }
62853 +
62854 + data = jdata(bnode->cjnode) + CHECKSUM_SIZE;
62855 +
62856 + /* Check the very first bit -- it must be busy. */
62857 + if (!reiser4_test_bit(0, data)) {
62858 + warning("vpf-1362", "The allocator block %llu is not marked "
62859 + "as used.", (unsigned long long)bnode->cjnode->blocknr);
62860 +
62861 + return -EINVAL;
62862 + }
62863 +
62864 + return 0;
62865 +}
62866 +
62867 +/* load bitmap blocks "on-demand" */
62868 +static int load_and_lock_bnode(struct bitmap_node *bnode)
62869 +{
62870 + int ret;
62871 +
62872 + jnode *cjnode;
62873 + jnode *wjnode;
62874 +
62875 + assert("nikita-3040", reiser4_schedulable());
62876 +
62877 +/* ZAM-FIXME-HANS: since bitmaps are never unloaded, this does not
62878 + * need to be atomic, right? Just leave a comment that if bitmaps were
62879 + * unloadable, this would need to be atomic. */
62880 + if (atomic_read(&bnode->loaded)) {
62881 + /* bitmap is already loaded, nothing to do */
62882 + check_bnode_loaded(bnode);
62883 + mutex_lock(&bnode->mutex);
62884 + assert("nikita-2827", atomic_read(&bnode->loaded));
62885 + return 0;
62886 + }
62887 +
62888 + ret = prepare_bnode(bnode, &cjnode, &wjnode);
62889 + if (ret == 0) {
62890 + mutex_lock(&bnode->mutex);
62891 +
62892 + if (!atomic_read(&bnode->loaded)) {
62893 + assert("nikita-2822", cjnode != NULL);
62894 + assert("nikita-2823", wjnode != NULL);
62895 + assert("nikita-2824", jnode_is_loaded(cjnode));
62896 + assert("nikita-2825", jnode_is_loaded(wjnode));
62897 +
62898 + bnode->wjnode = wjnode;
62899 + bnode->cjnode = cjnode;
62900 +
62901 + ret = check_struct_bnode(bnode, current_blocksize);
62902 + if (!ret) {
62903 + cjnode = wjnode = NULL;
62904 + atomic_set(&bnode->loaded, 1);
62905 + /* working bitmap is initialized by on-disk
62906 + * commit bitmap. This should be performed
62907 + * under mutex. */
62908 + memcpy(bnode_working_data(bnode),
62909 + bnode_commit_data(bnode),
62910 + bmap_size(current_blocksize));
62911 + } else
62912 + mutex_unlock(&bnode->mutex);
62913 + } else
62914 + /* race: someone already loaded bitmap while we were
62915 + * busy initializing data. */
62916 + check_bnode_loaded(bnode);
62917 + }
62918 +
62919 + if (wjnode != NULL) {
62920 + release(wjnode);
62921 + bnode->wjnode = NULL;
62922 + }
62923 + if (cjnode != NULL) {
62924 + release(cjnode);
62925 + bnode->cjnode = NULL;
62926 + }
62927 +
62928 + return ret;
62929 +}
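+
+/* Concurrency note (a sketch of the pattern above, not an original comment):
+ * load_and_lock_bnode() is double-checked locking. The loaded flag is
+ * tested once without the mutex for the fast path and again under
+ * bnode->mutex; a thread that loses the initialization race simply releases
+ * the jnode pair it prepared. The working bitmap is seeded from the commit
+ * bitmap under the mutex, so no caller can observe a half-copied working
+ * bitmap. */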
62930 +
62931 +static void release_and_unlock_bnode(struct bitmap_node *bnode)
62932 +{
62933 + check_bnode_loaded(bnode);
62934 + mutex_unlock(&bnode->mutex);
62935 +}
62936 +
62937 +/* This function does all block allocation work but only for one bitmap
62938 + block.*/
62939 +/* FIXME_ZAM: It does not allow us to allocate block ranges across bitmap
62940 + block responsibility zone boundaries. This made no sense in v3.6 but may
62941 + make sense in v4.x */
62942 +/* ZAM-FIXME-HANS: do you mean search one bitmap block forward? */
62943 +static int
62944 +search_one_bitmap_forward(bmap_nr_t bmap, bmap_off_t * offset,
62945 + bmap_off_t max_offset, int min_len, int max_len)
62946 +{
62947 + struct super_block *super = get_current_context()->super;
62948 + struct bitmap_node *bnode = get_bnode(super, bmap);
62949 +
62950 + char *data;
62951 +
62952 + bmap_off_t search_end;
62953 + bmap_off_t start;
62954 + bmap_off_t end;
62955 +
62956 + int set_first_zero_bit = 0;
62957 +
62958 + int ret;
62959 +
62960 + assert("zam-364", min_len > 0);
62961 + assert("zam-365", max_len >= min_len);
62962 + assert("zam-366", *offset <= max_offset);
62963 +
62964 + ret = load_and_lock_bnode(bnode);
62965 +
62966 + if (ret)
62967 + return ret;
62968 +
62969 + data = bnode_working_data(bnode);
62970 +
62971 + start = *offset;
62972 +
62973 + if (bnode->first_zero_bit >= start) {
62974 + start = bnode->first_zero_bit;
62975 + set_first_zero_bit = 1;
62976 + }
62977 +
62978 + while (start + min_len < max_offset) {
62979 +
62980 + start =
62981 + reiser4_find_next_zero_bit((long *)data, max_offset, start);
62982 + if (set_first_zero_bit) {
62983 + bnode->first_zero_bit = start;
62984 + set_first_zero_bit = 0;
62985 + }
62986 + if (start >= max_offset)
62987 + break;
62988 +
62989 + search_end = LIMIT(start + max_len, max_offset);
62990 + end =
62991 + reiser4_find_next_set_bit((long *)data, search_end, start);
62992 + if (end >= start + min_len) {
62993 +			/* we can't trust the find_next_set_bit result if a set
62994 +			   bit was not found; the result may be bigger than
62995 +			   max_offset */
62996 + if (end > search_end)
62997 + end = search_end;
62998 +
62999 + ret = end - start;
63000 + *offset = start;
63001 +
63002 + reiser4_set_bits(data, start, end);
63003 +
63004 + /* FIXME: we may advance first_zero_bit if [start,
63005 + end] region overlaps the first_zero_bit point */
63006 +
63007 + break;
63008 + }
63009 +
63010 + start = end + 1;
63011 + }
63012 +
63013 + release_and_unlock_bnode(bnode);
63014 +
63015 + return ret;
63016 +}
63017 +
63018 +static int
63019 +search_one_bitmap_backward(bmap_nr_t bmap, bmap_off_t * start_offset,
63020 + bmap_off_t end_offset, int min_len, int max_len)
63021 +{
63022 + struct super_block *super = get_current_context()->super;
63023 + struct bitmap_node *bnode = get_bnode(super, bmap);
63024 + char *data;
63025 + bmap_off_t start;
63026 + int ret;
63027 +
63028 + assert("zam-958", min_len > 0);
63029 + assert("zam-959", max_len >= min_len);
63030 + assert("zam-960", *start_offset >= end_offset);
63031 +
63032 + ret = load_and_lock_bnode(bnode);
63033 + if (ret)
63034 + return ret;
63035 +
63036 + data = bnode_working_data(bnode);
63037 + start = *start_offset;
63038 +
63039 + while (1) {
63040 + bmap_off_t end, search_end;
63041 +
63042 + /* Find the beginning of the zero filled region */
63043 + if (reiser4_find_last_zero_bit(&start, data, end_offset, start))
63044 + break;
63045 +		/* Are there at least `min_len' bits between `end_offset' and
63046 +		 * `start'? */
63047 + if (start < end_offset + min_len - 1)
63048 + break;
63049 +
63050 + /* Do not search to `end_offset' if we need to find less than
63051 + * `max_len' zero bits. */
63052 + if (end_offset + max_len - 1 < start)
63053 + search_end = start - max_len + 1;
63054 + else
63055 + search_end = end_offset;
63056 +
63057 + if (reiser4_find_last_set_bit(&end, data, search_end, start))
63058 + end = search_end;
63059 + else
63060 + end++;
63061 +
63062 + if (end + min_len <= start + 1) {
63063 + if (end < search_end)
63064 + end = search_end;
63065 + ret = start - end + 1;
63066 + *start_offset = end; /* `end' is lowest offset */
63067 + assert("zam-987",
63068 + reiser4_find_next_set_bit(data, start + 1,
63069 + end) >= start + 1);
63070 + reiser4_set_bits(data, end, start + 1);
63071 + break;
63072 + }
63073 +
63074 + if (end <= end_offset)
63075 + /* left search boundary reached. */
63076 + break;
63077 + start = end - 1;
63078 + }
63079 +
63080 + release_and_unlock_bnode(bnode);
63081 + return ret;
63082 +}
63083 +
63084 +/* allocate contiguous range of blocks in bitmap */
63085 +static int bitmap_alloc_forward(reiser4_block_nr * start,
63086 + const reiser4_block_nr * end, int min_len,
63087 + int max_len)
63088 +{
63089 + bmap_nr_t bmap, end_bmap;
63090 + bmap_off_t offset, end_offset;
63091 + int len;
63092 +
63093 + reiser4_block_nr tmp;
63094 +
63095 + struct super_block *super = get_current_context()->super;
63096 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
63097 +
63098 + parse_blocknr(start, &bmap, &offset);
63099 +
63100 + tmp = *end - 1;
63101 + parse_blocknr(&tmp, &end_bmap, &end_offset);
63102 + ++end_offset;
63103 +
63104 + assert("zam-358", end_bmap >= bmap);
63105 + assert("zam-359", ergo(end_bmap == bmap, end_offset >= offset));
63106 +
63107 + for (; bmap < end_bmap; bmap++, offset = 0) {
63108 + len =
63109 + search_one_bitmap_forward(bmap, &offset, max_offset,
63110 + min_len, max_len);
63111 + if (len != 0)
63112 + goto out;
63113 + }
63114 +
63115 + len =
63116 + search_one_bitmap_forward(bmap, &offset, end_offset, min_len,
63117 + max_len);
63118 + out:
63119 + *start = bmap * max_offset + offset;
63120 + return len;
63121 +}
63122 +
63123 +/* allocate contiguous range of blocks in bitmap (from @start to @end in
63124 + * backward direction) */
63125 +static int bitmap_alloc_backward(reiser4_block_nr * start,
63126 + const reiser4_block_nr * end, int min_len,
63127 + int max_len)
63128 +{
63129 + bmap_nr_t bmap, end_bmap;
63130 + bmap_off_t offset, end_offset;
63131 + int len;
63132 + struct super_block *super = get_current_context()->super;
63133 + const bmap_off_t max_offset = bmap_bit_count(super->s_blocksize);
63134 +
63135 + parse_blocknr(start, &bmap, &offset);
63136 + parse_blocknr(end, &end_bmap, &end_offset);
63137 +
63138 + assert("zam-961", end_bmap <= bmap);
63139 + assert("zam-962", ergo(end_bmap == bmap, end_offset <= offset));
63140 +
63141 + for (; bmap > end_bmap; bmap--, offset = max_offset - 1) {
63142 + len =
63143 + search_one_bitmap_backward(bmap, &offset, 0, min_len,
63144 + max_len);
63145 + if (len != 0)
63146 + goto out;
63147 + }
63148 +
63149 + len =
63150 + search_one_bitmap_backward(bmap, &offset, end_offset, min_len,
63151 + max_len);
63152 + out:
63153 + *start = bmap * max_offset + offset;
63154 + return len;
63155 +}
63156 +
63157 +/* plugin->u.space_allocator.alloc_blocks() */
63158 +static int alloc_blocks_forward(reiser4_blocknr_hint *hint, int needed,
63159 + reiser4_block_nr *start, reiser4_block_nr *len)
63160 +{
63161 + struct super_block *super = get_current_context()->super;
63162 + int actual_len;
63163 +
63164 + reiser4_block_nr search_start;
63165 + reiser4_block_nr search_end;
63166 +
63167 + assert("zam-398", super != NULL);
63168 + assert("zam-412", hint != NULL);
63169 + assert("zam-397", hint->blk <= reiser4_block_count(super));
63170 +
63171 + if (hint->max_dist == 0)
63172 + search_end = reiser4_block_count(super);
63173 + else
63174 + search_end =
63175 + LIMIT(hint->blk + hint->max_dist,
63176 + reiser4_block_count(super));
63177 +
63178 +	/* We use @hint->blk as a search start and search from it to the end
63179 +	   of the disk, or within the given region if @hint->max_dist is not zero */
63180 + search_start = hint->blk;
63181 +
63182 + actual_len =
63183 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
63184 +
63185 +	/* There is only one bitmap search if max_dist was specified or the
63186 +	   first pass was from the beginning of the bitmap. We also do only
63187 +	   one pass when scanning the bitmap in the backward direction. */
63188 + if (!(actual_len != 0 || hint->max_dist != 0 || search_start == 0)) {
63189 + /* next step is a scanning from 0 to search_start */
63190 + search_end = search_start;
63191 + search_start = 0;
63192 + actual_len =
63193 + bitmap_alloc_forward(&search_start, &search_end, 1, needed);
63194 + }
63195 + if (actual_len == 0)
63196 + return RETERR(-ENOSPC);
63197 + if (actual_len < 0)
63198 + return RETERR(actual_len);
63199 + *len = actual_len;
63200 + *start = search_start;
63201 + return 0;
63202 +}
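+
+/* The forward allocator above is a classic two-pass circular scan (an
+ * observation, not an original comment): pass one searches
+ * [hint->blk, search_end); if it finds nothing, max_dist was not given and
+ * the start was non-zero, pass two restarts from block 0 and scans up to
+ * (roughly) where the first pass left off. */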
63203 +
63204 +static int alloc_blocks_backward(reiser4_blocknr_hint * hint, int needed,
63205 + reiser4_block_nr * start,
63206 + reiser4_block_nr * len)
63207 +{
63208 + reiser4_block_nr search_start;
63209 + reiser4_block_nr search_end;
63210 + int actual_len;
63211 +
63212 + ON_DEBUG(struct super_block *super = reiser4_get_current_sb());
63213 +
63214 + assert("zam-969", super != NULL);
63215 + assert("zam-970", hint != NULL);
63216 + assert("zam-971", hint->blk <= reiser4_block_count(super));
63217 +
63218 + search_start = hint->blk;
63219 + if (hint->max_dist == 0 || search_start <= hint->max_dist)
63220 + search_end = 0;
63221 + else
63222 + search_end = search_start - hint->max_dist;
63223 +
63224 + actual_len =
63225 + bitmap_alloc_backward(&search_start, &search_end, 1, needed);
63226 + if (actual_len == 0)
63227 + return RETERR(-ENOSPC);
63228 + if (actual_len < 0)
63229 + return RETERR(actual_len);
63230 + *len = actual_len;
63231 + *start = search_start;
63232 + return 0;
63233 +}
63234 +
63235 +/* plugin->u.space_allocator.alloc_blocks() */
63236 +int reiser4_alloc_blocks_bitmap(reiser4_space_allocator * allocator,
63237 + reiser4_blocknr_hint * hint, int needed,
63238 + reiser4_block_nr * start, reiser4_block_nr * len)
63239 +{
63240 + if (hint->backward)
63241 + return alloc_blocks_backward(hint, needed, start, len);
63242 + return alloc_blocks_forward(hint, needed, start, len);
63243 +}
63244 +
63245 +/* plugin->u.space_allocator.dealloc_blocks(). */
63246 +/* It just frees blocks in the WORKING BITMAP. Usually deletion of formatted
63247 + and unformatted nodes is deferred until transaction commit. However, deallocation
63248 + of temporary objects like wandered blocks and transaction commit records
63249 + requires immediate node deletion from WORKING BITMAP.*/
63250 +void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator * allocator,
63251 + reiser4_block_nr start, reiser4_block_nr len)
63252 +{
63253 + struct super_block *super = reiser4_get_current_sb();
63254 +
63255 + bmap_nr_t bmap;
63256 + bmap_off_t offset;
63257 +
63258 + struct bitmap_node *bnode;
63259 + int ret;
63260 +
63261 + assert("zam-468", len != 0);
63262 + check_block_range(&start, &len);
63263 +
63264 + parse_blocknr(&start, &bmap, &offset);
63265 +
63266 + assert("zam-469", offset + len <= bmap_bit_count(super->s_blocksize));
63267 +
63268 + bnode = get_bnode(super, bmap);
63269 +
63270 + assert("zam-470", bnode != NULL);
63271 +
63272 + ret = load_and_lock_bnode(bnode);
63273 + assert("zam-481", ret == 0);
63274 +
63275 + reiser4_clear_bits(bnode_working_data(bnode), offset,
63276 + (bmap_off_t) (offset + len));
63277 +
63278 + adjust_first_zero_bit(bnode, offset);
63279 +
63280 + release_and_unlock_bnode(bnode);
63281 +}
63282 +
63283 +/* plugin->u.space_allocator.check_blocks(). */
63284 +void reiser4_check_blocks_bitmap(const reiser4_block_nr * start,
63285 + const reiser4_block_nr * len, int desired)
63286 +{
63287 +#if REISER4_DEBUG
63288 + struct super_block *super = reiser4_get_current_sb();
63289 +
63290 + bmap_nr_t bmap;
63291 + bmap_off_t start_offset;
63292 + bmap_off_t end_offset;
63293 +
63294 + struct bitmap_node *bnode;
63295 + int ret;
63296 +
63297 + assert("zam-622", len != NULL);
63298 + check_block_range(start, len);
63299 + parse_blocknr(start, &bmap, &start_offset);
63300 +
63301 + end_offset = start_offset + *len;
63302 + assert("nikita-2214", end_offset <= bmap_bit_count(super->s_blocksize));
63303 +
63304 + bnode = get_bnode(super, bmap);
63305 +
63306 + assert("nikita-2215", bnode != NULL);
63307 +
63308 + ret = load_and_lock_bnode(bnode);
63309 + assert("zam-626", ret == 0);
63310 +
63311 + assert("nikita-2216", jnode_is_loaded(bnode->wjnode));
63312 +
63313 + if (desired) {
63314 + assert("zam-623",
63315 + reiser4_find_next_zero_bit(bnode_working_data(bnode),
63316 + end_offset, start_offset)
63317 + >= end_offset);
63318 + } else {
63319 + assert("zam-624",
63320 + reiser4_find_next_set_bit(bnode_working_data(bnode),
63321 + end_offset, start_offset)
63322 + >= end_offset);
63323 + }
63324 +
63325 + release_and_unlock_bnode(bnode);
63326 +#endif
63327 +}
63328 +
63329 +/* conditionally insert @node into the atom's overwrite set if it is not already there */
63330 +static void cond_add_to_overwrite_set(txn_atom * atom, jnode * node)
63331 +{
63332 + assert("zam-546", atom != NULL);
63333 + assert("zam-547", atom->stage == ASTAGE_PRE_COMMIT);
63334 + assert("zam-548", node != NULL);
63335 +
63336 + spin_lock_atom(atom);
63337 + spin_lock_jnode(node);
63338 +
63339 + if (node->atom == NULL) {
63340 + JF_SET(node, JNODE_OVRWR);
63341 + insert_into_atom_ovrwr_list(atom, node);
63342 + } else {
63343 + assert("zam-549", node->atom == atom);
63344 + }
63345 +
63346 + spin_unlock_jnode(node);
63347 + spin_unlock_atom(atom);
63348 +}
63349 +
63350 +/* an actor which applies the delete set to COMMIT bitmap pages and links
63351 + modified pages into a singly-linked list */
63352 +static int
63353 +apply_dset_to_commit_bmap(txn_atom * atom, const reiser4_block_nr * start,
63354 + const reiser4_block_nr * len, void *data)
63355 +{
63356 +
63357 + bmap_nr_t bmap;
63358 + bmap_off_t offset;
63359 + int ret;
63360 +
63361 + long long *blocks_freed_p = data;
63362 +
63363 + struct bitmap_node *bnode;
63364 +
63365 + struct super_block *sb = reiser4_get_current_sb();
63366 +
63367 + check_block_range(start, len);
63368 +
63369 + parse_blocknr(start, &bmap, &offset);
63370 +
63371 + /* FIXME-ZAM: we assume that all block ranges are allocated by this
63372 + bitmap-based allocator and each block range can't go over a zone of
63373 + responsibility of one bitmap block; same assumption is used in
63374 + other journal hooks in bitmap code. */
63375 + bnode = get_bnode(sb, bmap);
63376 + assert("zam-448", bnode != NULL);
63377 +
63378 +	/* it is safe to unlock an atom which is in ASTAGE_PRE_COMMIT */
63379 + assert("zam-767", atom->stage == ASTAGE_PRE_COMMIT);
63380 + ret = load_and_lock_bnode(bnode);
63381 + if (ret)
63382 + return ret;
63383 +
63384 + /* put bnode into atom's overwrite set */
63385 + cond_add_to_overwrite_set(atom, bnode->cjnode);
63386 +
63387 + data = bnode_commit_data(bnode);
63388 +
63389 + ret = bnode_check_crc(bnode);
63390 + if (ret != 0)
63391 + return ret;
63392 +
63393 + if (len != NULL) {
63394 + /* FIXME-ZAM: a check that all bits are set should be there */
63395 + assert("zam-443",
63396 + offset + *len <= bmap_bit_count(sb->s_blocksize));
63397 + reiser4_clear_bits(data, offset, (bmap_off_t) (offset + *len));
63398 +
63399 + (*blocks_freed_p) += *len;
63400 + } else {
63401 + reiser4_clear_bit(offset, data);
63402 + (*blocks_freed_p)++;
63403 + }
63404 +
63405 + bnode_set_commit_crc(bnode, bnode_calc_crc(bnode, sb->s_blocksize));
63406 +
63407 + release_and_unlock_bnode(bnode);
63408 +
63409 + return 0;
63410 +}
63411 +
63412 +/* plugin->u.space_allocator.pre_commit_hook(). */
63413 +/* It just applies transaction changes to fs-wide COMMIT BITMAP, hoping the
63414 + rest is done by transaction manager (allocate wandered locations for COMMIT
63415 + BITMAP blocks, copy COMMIT BITMAP blocks data). */
63416 +/* Only one instance of this function can be running at any given time,
63417 + because only one transaction can be committed at a time; therefore it is
63418 + safe to access some global variables without any locking */
63419 +
63420 +int reiser4_pre_commit_hook_bitmap(void)
63421 +{
63422 + struct super_block *super = reiser4_get_current_sb();
63423 + txn_atom *atom;
63424 +
63425 + long long blocks_freed = 0;
63426 +
63427 + atom = get_current_atom_locked();
63428 + assert("zam-876", atom->stage == ASTAGE_PRE_COMMIT);
63429 + spin_unlock_atom(atom);
63430 +
63431 +	{			/* scan atom's captured list and find all freshly allocated nodes,
63432 +				 * mark the corresponding bits in the COMMIT BITMAP as used */
63433 + struct list_head *head = ATOM_CLEAN_LIST(atom);
63434 + jnode *node = list_entry(head->next, jnode, capture_link);
63435 +
63436 + while (head != &node->capture_link) {
63437 + /* we detect freshly allocated jnodes */
63438 + if (JF_ISSET(node, JNODE_RELOC)) {
63439 + int ret;
63440 + bmap_nr_t bmap;
63441 +
63442 + bmap_off_t offset;
63443 + bmap_off_t index;
63444 + struct bitmap_node *bn;
63445 + __u32 size = bmap_size(super->s_blocksize);
63446 + __u32 crc;
63447 + char byte;
63448 +
63449 + assert("zam-559", !JF_ISSET(node, JNODE_OVRWR));
63450 + assert("zam-460",
63451 + !reiser4_blocknr_is_fake(&node->blocknr));
63452 +
63453 + parse_blocknr(&node->blocknr, &bmap, &offset);
63454 + bn = get_bnode(super, bmap);
63455 +
63456 + index = offset >> 3;
63457 + assert("vpf-276", index < size);
63458 +
63459 +				ret = bnode_check_crc(bn);
63460 + if (ret != 0)
63461 + return ret;
63462 +
63463 + check_bnode_loaded(bn);
63464 + load_and_lock_bnode(bn);
63465 +
63466 + byte = *(bnode_commit_data(bn) + index);
63467 + reiser4_set_bit(offset, bnode_commit_data(bn));
63468 +
63469 + crc = adler32_recalc(bnode_commit_crc(bn), byte,
63470 + *(bnode_commit_data(bn) +
63471 + index),
63472 +						     size - index);
63473 + bnode_set_commit_crc(bn, crc);
63474 +
63475 + release_and_unlock_bnode(bn);
63476 +
63477 + ret = bnode_check_crc(bn);
63478 + if (ret != 0)
63479 + return ret;
63480 +
63481 +				/* the correctness of this depends on how the
63482 +				   new j-node is inserted into the clean list,
63483 +				   because we are scanning that same list now.
63484 +				   It is OK if insertion is done at the list front */
63485 + cond_add_to_overwrite_set(atom, bn->cjnode);
63486 + }
63487 +
63488 + node = list_entry(node->capture_link.next, jnode, capture_link);
63489 + }
63490 + }
63491 +
63492 + blocknr_set_iterator(atom, &atom->delete_set, apply_dset_to_commit_bmap,
63493 + &blocks_freed, 0);
63494 +
63495 + blocks_freed -= atom->nr_blocks_allocated;
63496 +
63497 + {
63498 + reiser4_super_info_data *sbinfo;
63499 +
63500 + sbinfo = get_super_private(super);
63501 +
63502 + spin_lock_reiser4_super(sbinfo);
63503 + sbinfo->blocks_free_committed += blocks_freed;
63504 + spin_unlock_reiser4_super(sbinfo);
63505 + }
63506 +
63507 + return 0;
63508 +}
63509 +
63510 +/* plugin->u.space_allocator.init_allocator
63511 + constructor of reiser4_space_allocator object. It is called on fs mount */
63512 +int reiser4_init_allocator_bitmap(reiser4_space_allocator * allocator,
63513 + struct super_block *super, void *arg)
63514 +{
63515 + struct bitmap_allocator_data *data = NULL;
63516 + bmap_nr_t bitmap_blocks_nr;
63517 + bmap_nr_t i;
63518 +
63519 + assert("nikita-3039", reiser4_schedulable());
63520 +
63521 + /* getting memory for bitmap allocator private data holder */
63522 + data =
63523 + kmalloc(sizeof(struct bitmap_allocator_data),
63524 + reiser4_ctx_gfp_mask_get());
63525 +
63526 + if (data == NULL)
63527 + return RETERR(-ENOMEM);
63528 +
63529 + /* allocation and initialization for the array of bnodes */
63530 + bitmap_blocks_nr = get_nr_bmap(super);
63531 +
63532 + /* FIXME-ZAM: it is not clear what to do with huge number of bitmaps
63533 + which is bigger than 2^32 (= 8 * 4096 * 4096 * 2^32 bytes = 5.76e+17,
63534 + may I never meet someone who still uses the ia32 architecture when
63535 + storage devices of that size enter the market, and wants to use ia32
63536 + with that storage device, much less reiser4. ;-) -Hans). Kmalloc is not possible and,
63537 + probably, another dynamic data structure should replace a static
63538 + array of bnodes. */
63539 + /*data->bitmap = reiser4_kmalloc((size_t) (sizeof (struct bitmap_node) * bitmap_blocks_nr), GFP_KERNEL); */
63540 + data->bitmap = reiser4_vmalloc(sizeof(struct bitmap_node) * bitmap_blocks_nr);
63541 + if (data->bitmap == NULL) {
63542 + kfree(data);
63543 + return RETERR(-ENOMEM);
63544 + }
63545 +
63546 + for (i = 0; i < bitmap_blocks_nr; i++)
63547 + init_bnode(data->bitmap + i, super, i);
63548 +
63549 + allocator->u.generic = data;
63550 +
63551 +#if REISER4_DEBUG
63552 + get_super_private(super)->min_blocks_used += bitmap_blocks_nr;
63553 +#endif
63554 +
63555 + /* Load all bitmap blocks at mount time. */
63556 + if (!test_bit
63557 + (REISER4_DONT_LOAD_BITMAP, &get_super_private(super)->fs_flags)) {
63558 + __u64 start_time, elapsed_time;
63559 + struct bitmap_node *bnode;
63560 + int ret;
63561 +
63562 + if (REISER4_DEBUG)
63563 + printk(KERN_INFO "loading reiser4 bitmap...");
63564 + start_time = jiffies;
63565 +
63566 + for (i = 0; i < bitmap_blocks_nr; i++) {
63567 + bnode = data->bitmap + i;
63568 + ret = load_and_lock_bnode(bnode);
63569 + if (ret) {
63570 + reiser4_destroy_allocator_bitmap(allocator,
63571 + super);
63572 + return ret;
63573 + }
63574 + release_and_unlock_bnode(bnode);
63575 + }
63576 +
63577 + elapsed_time = jiffies - start_time;
63578 + if (REISER4_DEBUG)
63579 + printk("...done (%llu jiffies)\n",
63580 + (unsigned long long)elapsed_time);
63581 + }
63582 +
63583 + return 0;
63584 +}
63585 +
63586 +/* plugin->u.space_allocator.destroy_allocator
63587 + destructor. It is called on fs unmount */
63588 +int reiser4_destroy_allocator_bitmap(reiser4_space_allocator * allocator,
63589 + struct super_block *super)
63590 +{
63591 + bmap_nr_t bitmap_blocks_nr;
63592 + bmap_nr_t i;
63593 +
63594 + struct bitmap_allocator_data *data = allocator->u.generic;
63595 +
63596 + assert("zam-414", data != NULL);
63597 + assert("zam-376", data->bitmap != NULL);
63598 +
63599 + bitmap_blocks_nr = get_nr_bmap(super);
63600 +
63601 + for (i = 0; i < bitmap_blocks_nr; i++) {
63602 + struct bitmap_node *bnode = data->bitmap + i;
63603 +
63604 + mutex_lock(&bnode->mutex);
63605 +
63606 +#if REISER4_DEBUG
63607 + if (atomic_read(&bnode->loaded)) {
63608 + jnode *wj = bnode->wjnode;
63609 + jnode *cj = bnode->cjnode;
63610 +
63611 + assert("zam-480", jnode_page(cj) != NULL);
63612 + assert("zam-633", jnode_page(wj) != NULL);
63613 +
63614 + assert("zam-634",
63615 +			       memcmp(jdata(wj), jdata(cj),
63616 + bmap_size(super->s_blocksize)) == 0);
63617 +
63618 + }
63619 +#endif
63620 + done_bnode(bnode);
63621 + mutex_unlock(&bnode->mutex);
63622 + }
63623 +
63624 + vfree(data->bitmap);
63625 + kfree(data);
63626 +
63627 + allocator->u.generic = NULL;
63628 +
63629 + return 0;
63630 +}
63631 +
63632 +/*
63633 + * Local variables:
63634 + * c-indentation-style: "K&R"
63635 + * mode-name: "LC"
63636 + * c-basic-offset: 8
63637 + * tab-width: 8
63638 + * fill-column: 79
63639 + * scroll-step: 1
63640 + * End:
63641 + */
63642 diff --git a/fs/reiser4/plugin/space/bitmap.h b/fs/reiser4/plugin/space/bitmap.h
63643 new file mode 100644
63644 index 0000000..be867f1
63645 --- /dev/null
63646 +++ b/fs/reiser4/plugin/space/bitmap.h
63647 @@ -0,0 +1,47 @@
63648 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63649 +
63650 +#if !defined (__REISER4_PLUGIN_SPACE_BITMAP_H__)
63651 +#define __REISER4_PLUGIN_SPACE_BITMAP_H__
63652 +
63653 +#include "../../dformat.h"
63654 +#include "../../block_alloc.h"
63655 +
63656 +#include <linux/types.h> /* for __u?? */
63657 +#include <linux/fs.h> /* for struct super_block */
63658 +/* EDWARD-FIXME-HANS: write something as informative as the below for every .h file lacking it. */
63659 +/* declarations of functions implementing methods of space allocator plugin for
63660 + bitmap based allocator. The functions themselves are in bitmap.c */
63661 +extern int reiser4_init_allocator_bitmap(reiser4_space_allocator *,
63662 + struct super_block *, void *);
63663 +extern int reiser4_destroy_allocator_bitmap(reiser4_space_allocator *,
63664 + struct super_block *);
63665 +extern int reiser4_alloc_blocks_bitmap(reiser4_space_allocator *,
63666 + reiser4_blocknr_hint *, int needed,
63667 + reiser4_block_nr * start,
63668 + reiser4_block_nr * len);
63669 +extern void reiser4_check_blocks_bitmap(const reiser4_block_nr *,
63670 + const reiser4_block_nr *, int);
63671 +extern void reiser4_dealloc_blocks_bitmap(reiser4_space_allocator *,
63672 + reiser4_block_nr,
63673 + reiser4_block_nr);
63674 +extern int reiser4_pre_commit_hook_bitmap(void);
63675 +
63676 +#define reiser4_post_commit_hook_bitmap() do{}while(0)
63677 +#define reiser4_post_write_back_hook_bitmap() do{}while(0)
63678 +#define reiser4_print_info_bitmap(pref, al) do{}while(0)
63679 +
63680 +typedef __u64 bmap_nr_t;
63681 +typedef __u32 bmap_off_t;
63682 +
63683 +#endif /* __REISER4_PLUGIN_SPACE_BITMAP_H__ */
63684 +
63685 +/* Make Linus happy.
63686 + Local variables:
63687 + c-indentation-style: "K&R"
63688 + mode-name: "LC"
63689 + c-basic-offset: 8
63690 + tab-width: 8
63691 + fill-column: 120
63692 + scroll-step: 1
63693 + End:
63694 +*/
63695 diff --git a/fs/reiser4/plugin/space/space_allocator.h b/fs/reiser4/plugin/space/space_allocator.h
63696 new file mode 100644
63697 index 0000000..5bfa9a3
63698 --- /dev/null
63699 +++ b/fs/reiser4/plugin/space/space_allocator.h
63700 @@ -0,0 +1,80 @@
63701 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
63702 +
63703 +#ifndef __SPACE_ALLOCATOR_H__
63704 +#define __SPACE_ALLOCATOR_H__
63705 +
63706 +#include "../../forward.h"
63707 +#include "bitmap.h"
63708 +/* NIKITA-FIXME-HANS: surely this could use a comment. Something about how bitmap is the only space allocator for now,
63709 + * but... */
63710 +#define DEF_SPACE_ALLOCATOR(allocator) \
63711 + \
63712 +static inline int sa_init_allocator (reiser4_space_allocator * al, struct super_block *s, void * opaque) \
63713 +{ \
63714 + return reiser4_init_allocator_##allocator (al, s, opaque); \
63715 +} \
63716 + \
63717 +static inline void sa_destroy_allocator (reiser4_space_allocator *al, struct super_block *s) \
63718 +{ \
63719 + reiser4_destroy_allocator_##allocator (al, s); \
63720 +} \
63721 + \
63722 +static inline int sa_alloc_blocks (reiser4_space_allocator *al, reiser4_blocknr_hint * hint, \
63723 + int needed, reiser4_block_nr * start, reiser4_block_nr * len) \
63724 +{ \
63725 + return reiser4_alloc_blocks_##allocator (al, hint, needed, start, len); \
63726 +} \
63727 +static inline void sa_dealloc_blocks (reiser4_space_allocator * al, reiser4_block_nr start, reiser4_block_nr len) \
63728 +{ \
63729 + reiser4_dealloc_blocks_##allocator (al, start, len); \
63730 +} \
63731 + \
63732 +static inline void sa_check_blocks (const reiser4_block_nr * start, const reiser4_block_nr * end, int desired) \
63733 +{ \
63734 + reiser4_check_blocks_##allocator (start, end, desired); \
63735 +} \
63736 + \
63737 +static inline void sa_pre_commit_hook (void) \
63738 +{ \
63739 + reiser4_pre_commit_hook_##allocator (); \
63740 +} \
63741 + \
63742 +static inline void sa_post_commit_hook (void) \
63743 +{ \
63744 + reiser4_post_commit_hook_##allocator (); \
63745 +} \
63746 + \
63747 +static inline void sa_post_write_back_hook (void) \
63748 +{ \
63749 + reiser4_post_write_back_hook_##allocator(); \
63750 +} \
63751 + \
63752 +static inline void sa_print_info(const char * prefix, reiser4_space_allocator * al) \
63753 +{ \
63754 + reiser4_print_info_##allocator (prefix, al); \
63755 +}
63756 +
63757 +DEF_SPACE_ALLOCATOR(bitmap)
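+
+/* With the instantiation above, sa_alloc_blocks(al, hint, needed, start,
+ * len) is a static inline wrapper that calls
+ * reiser4_alloc_blocks_bitmap(al, hint, needed, start, len); plugging in a
+ * different allocator would only require another DEF_SPACE_ALLOCATOR(foo)
+ * backed by matching reiser4_*_foo functions (an illustration, not an
+ * original comment). */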
63758 +
63759 +/* this object is part of reiser4 private in-core super block */
63760 +struct reiser4_space_allocator {
63761 + union {
63762 + /* space allocators might use this pointer to reference their
63763 + * data. */
63764 + void *generic;
63765 + } u;
63766 +};
63767 +
63768 +/* __SPACE_ALLOCATOR_H__ */
63769 +#endif
63770 +
63771 +/* Make Linus happy.
63772 + Local variables:
63773 + c-indentation-style: "K&R"
63774 + mode-name: "LC"
63775 + c-basic-offset: 8
63776 + tab-width: 8
63777 + fill-column: 120
63778 + scroll-step: 1
63779 + End:
63780 +*/
63781 diff --git a/fs/reiser4/plugin/tail_policy.c b/fs/reiser4/plugin/tail_policy.c
63782 new file mode 100644
63783 index 0000000..43f4ae7
63784 --- /dev/null
63785 +++ b/fs/reiser4/plugin/tail_policy.c
63786 @@ -0,0 +1,113 @@
63787 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63788 + * reiser4/README */
63789 +
63790 +/* Formatting policy plugins */
63791 +
63792 +/*
63793 + * Formatting policy plugin is used by object plugin (of regular file) to
63794 + * convert file between two representations.
63795 + *
63796 + * Currently following policies are implemented:
63797 + * never store file in formatted nodes
63798 + * always store file in formatted nodes
63799 + * store file in formatted nodes if file is smaller than 4 blocks (default)
63800 + */
63801 +
63802 +#include "../tree.h"
63803 +#include "../inode.h"
63804 +#include "../super.h"
63805 +#include "object.h"
63806 +#include "plugin.h"
63807 +#include "node/node.h"
63808 +#include "plugin_header.h"
63809 +
63810 +#include <linux/pagemap.h>
63811 +#include <linux/fs.h> /* For struct inode */
63812 +
63813 +/**
63814 + * have_formatting_never - formatting policy which never stores tails
63815 + * @inode: inode to operate on
63816 + * @size: new object size
63817 + *
63818 + * Always returns 0: the file body is never stored in formatted (tail) items.
63819 + */
63820 +/* Never store file's tail as direct item */
63821 +/* Audited by: green(2002.06.12) */
63822 +static int have_formatting_never(const struct inode *inode UNUSED_ARG
63823 + /* inode to operate on */ ,
63824 + loff_t size UNUSED_ARG /* new object size */ )
63825 +{
63826 + return 0;
63827 +}
63828 +
63829 +/* Always store file's tail as direct item */
63830 +/* Audited by: green(2002.06.12) */
63831 +static int
63832 +have_formatting_always(const struct inode *inode UNUSED_ARG
63833 + /* inode to operate on */ ,
63834 + loff_t size UNUSED_ARG /* new object size */ )
63835 +{
63836 + return 1;
63837 +}
63838 +
63839 +/* This function tests whether the file denoted by @inode should be stored
63840 + as tails only or as extents only. */
63841 +static int
63842 +have_formatting_default(const struct inode *inode UNUSED_ARG
63843 + /* inode to operate on */ ,
63844 + loff_t size /* new object size */ )
63845 +{
63846 + assert("umka-1253", inode != NULL);
63847 +
63848 + if (size > inode->i_sb->s_blocksize * 4)
63849 + return 0;
63850 +
63851 + return 1;
63852 +}
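+
+/* Concretely (an illustration, not an original comment): with a 4096-byte
+ * blocksize, have_formatting_default() keeps files of up to
+ * 4 * 4096 == 16384 bytes in tail (formatted) items and switches larger
+ * files to extents. */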
63853 +
63854 +/* tail plugins */
63855 +formatting_plugin formatting_plugins[LAST_TAIL_FORMATTING_ID] = {
63856 + [NEVER_TAILS_FORMATTING_ID] = {
63857 + .h = {
63858 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63859 + .id = NEVER_TAILS_FORMATTING_ID,
63860 + .pops = NULL,
63861 + .label = "never",
63862 + .desc = "Never store file's tail",
63863 + .linkage = {NULL, NULL}
63864 + },
63865 + .have_tail = have_formatting_never
63866 + },
63867 + [ALWAYS_TAILS_FORMATTING_ID] = {
63868 + .h = {
63869 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63870 + .id = ALWAYS_TAILS_FORMATTING_ID,
63871 + .pops = NULL,
63872 + .label = "always",
63873 + .desc = "Always store file's tail",
63874 + .linkage = {NULL, NULL}
63875 + },
63876 + .have_tail = have_formatting_always
63877 + },
63878 + [SMALL_FILE_FORMATTING_ID] = {
63879 + .h = {
63880 + .type_id = REISER4_FORMATTING_PLUGIN_TYPE,
63881 + .id = SMALL_FILE_FORMATTING_ID,
63882 + .pops = NULL,
63883 + .label = "4blocks",
63884 +			.desc = "Store files shorter than 4 blocks in tail items",
63885 + .linkage = {NULL, NULL}
63886 + },
63887 + .have_tail = have_formatting_default
63888 + }
63889 +};
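/* For illustration, a minimal sketch of how an object plugin might consult
   a formatting policy through its ->have_tail() method. This is not part of
   the original code; inode_formatting_plugin() is an assumed accessor and
   the helper name is purely illustrative. */
static int example_should_use_tails(struct inode *inode, loff_t new_size)
{
	formatting_plugin *tplug;

	/* assumed accessor returning the policy attached to @inode */
	tplug = inode_formatting_plugin(inode);
	/* non-zero means the policy wants the file stored in tail items */
	return tplug != NULL && tplug->have_tail(inode, new_size);
}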
63890 +
63891 +/*
63892 + * Local variables:
63893 + * c-indentation-style: "K&R"
63894 + * mode-name: "LC"
63895 + * c-basic-offset: 8
63896 + * tab-width: 8
63897 + * fill-column: 79
63898 + * End:
63899 + */
63900 diff --git a/fs/reiser4/pool.c b/fs/reiser4/pool.c
63901 new file mode 100644
63902 index 0000000..f4303da
63903 --- /dev/null
63904 +++ b/fs/reiser4/pool.c
63905 @@ -0,0 +1,234 @@
63906 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
63907 + * reiser4/README */
63908 +
63909 +/* Fast pool allocation.
63910 +
63911 +   There are situations when a sub-system normally asks the memory allocator
63912 +   for only a few objects, but under some circumstances could require many
63913 +   more. The typical and actually motivating example is tree balancing: it
63914 +   needs to keep track of the nodes involved in it, and it is well known
63915 +   that in a reasonably packed balanced tree most (92.938121%) of all
63916 +   balancings finish after working with only a few nodes (3.141592 on
63917 +   average). But in rare cases balancing can involve many more nodes
63918 +   (3*tree_height+1 in the extreme case).
63919 +
63920 +   On the one hand, we don't want to resort to dynamic allocation (slab,
63921 +   malloc(), etc.) for the data structures required to keep track of
63922 +   nodes during balancing. On the other hand, we cannot statically allocate
63923 +   the required amount of space on the stack: first, that would be a useless
63924 +   waste of a precious resource, and second, this amount is unknown in
63925 +   advance (tree height can change).
63926 +
63927 +   Pools, implemented in this file, are a solution to this problem:
63928 +
63929 +   - a configurable number of objects is statically preallocated on the
63930 + stack
63931 +
63932 +   - if this preallocated pool is exhausted and more objects are requested,
63933 + they are allocated dynamically.
63934 +
63935 + Pools encapsulate distinction between statically and dynamically allocated
63936 + objects. Both allocation and recycling look exactly the same.
63937 +
63938 + To keep track of dynamically allocated objects, pool adds its own linkage
63939 + to each object.
63940 +
63941 + NOTE-NIKITA This linkage also contains some balancing-specific data. This
63942 + is not perfect. On the other hand, balancing is currently the only client
63943 + of pool code.
63944 +
63945 + NOTE-NIKITA Another desirable feature is to rewrite all pool manipulation
63946 + functions in the style of tslist/tshash, i.e., make them unreadable, but
63947 + type-safe.
63948 +
63949 +*/
63950 +
63951 +#include "debug.h"
63952 +#include "pool.h"
63953 +#include "super.h"
63954 +
63955 +#include <linux/types.h>
63956 +#include <linux/err.h>
63957 +
63958 +/* initialize new pool object */
63959 +static void reiser4_init_pool_obj(reiser4_pool_header * h /* pool object to
63960 + * initialize */ )
63961 +{
63962 + INIT_LIST_HEAD(&h->usage_linkage);
63963 + INIT_LIST_HEAD(&h->level_linkage);
63964 + INIT_LIST_HEAD(&h->extra_linkage);
63965 +}
63966 +
63967 +/* initialize new pool */
63968 +void reiser4_init_pool(reiser4_pool * pool /* pool to initialize */ ,
63969 + size_t obj_size /* size of objects in @pool */ ,
63970 + int num_of_objs /* number of preallocated objects */ ,
63971 + char *data /* area for preallocated objects */ )
63972 +{
63973 + reiser4_pool_header *h;
63974 + int i;
63975 +
63976 + assert("nikita-955", pool != NULL);
63977 + assert("nikita-1044", obj_size > 0);
63978 + assert("nikita-956", num_of_objs >= 0);
63979 + assert("nikita-957", data != NULL);
63980 +
63981 + memset(pool, 0, sizeof *pool);
63982 + pool->obj_size = obj_size;
63983 + pool->data = data;
63984 + INIT_LIST_HEAD(&pool->free);
63985 + INIT_LIST_HEAD(&pool->used);
63986 + INIT_LIST_HEAD(&pool->extra);
63987 + memset(data, 0, obj_size * num_of_objs);
63988 + for (i = 0; i < num_of_objs; ++i) {
63989 + h = (reiser4_pool_header *) (data + i * obj_size);
63990 + reiser4_init_pool_obj(h);
63991 + /* add pool header to the end of pool's free list */
63992 + list_add_tail(&h->usage_linkage, &pool->free);
63993 + }
63994 +}
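/* For illustration, a minimal sketch (not part of the original code) of how
   a client might embed pool headers in its objects and set up an on-stack
   pool; all names except the reiser4_* API are hypothetical. */
struct example_obj {
	reiser4_pool_header header;	/* pool linkage must come first: the
					 * pool casts each slot in @data to a
					 * reiser4_pool_header */
	int payload;
};

static void example_pool_setup(void)
{
	reiser4_pool pool;
	char area[10 * sizeof(struct example_obj)];

	/* ten objects preallocated in @area; once they are used up,
	 * further requests fall back to kmalloc() in reiser4_pool_alloc() */
	reiser4_init_pool(&pool, sizeof(struct example_obj), 10, area);
	/* ... objects are taken via reiser4_add_obj() and returned with
	 * reiser4_pool_free() ... */
	reiser4_done_pool(&pool);
}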
63995 +
63996 +/* release pool resources
63997 +
63998 +   Release all resources acquired by this pool. Currently a no-op: dynamic
63999 +   objects are freed one by one in reiser4_pool_free() as they are returned.
64000 +
64001 +*/
64002 +void reiser4_done_pool(reiser4_pool * pool UNUSED_ARG /* pool to destroy */ )
64003 +{
64004 +}
64005 +
64006 +/* allocate carry object from pool
64007 +
64008 + First, try to get preallocated object. If this fails, resort to dynamic
64009 + allocation.
64010 +
64011 +*/
64012 +static void *reiser4_pool_alloc(reiser4_pool * pool /* pool to allocate object
64013 + * from */ )
64014 +{
64015 + reiser4_pool_header *result;
64016 +
64017 + assert("nikita-959", pool != NULL);
64018 +
64019 + if (!list_empty(&pool->free)) {
64020 + struct list_head *linkage;
64021 +
64022 + linkage = pool->free.next;
64023 + list_del(linkage);
64024 + INIT_LIST_HEAD(linkage);
64025 + result = list_entry(linkage, reiser4_pool_header, usage_linkage);
64026 + BUG_ON(!list_empty(&result->level_linkage) ||
64027 + !list_empty(&result->extra_linkage));
64028 + } else {
64029 +		/* pool is empty. Extra allocations don't deserve a dedicated
64030 +		   slab to be served from, as they are expected to be rare. */
64031 + result = kmalloc(pool->obj_size, reiser4_ctx_gfp_mask_get());
64032 +		if (result != NULL) {
64033 + reiser4_init_pool_obj(result);
64034 + list_add(&result->extra_linkage, &pool->extra);
64035 + } else
64036 + return ERR_PTR(RETERR(-ENOMEM));
64037 + BUG_ON(!list_empty(&result->usage_linkage) ||
64038 + !list_empty(&result->level_linkage));
64039 + }
64040 + ++pool->objs;
64041 + list_add(&result->usage_linkage, &pool->used);
64042 + memset(result + 1, 0, pool->obj_size - sizeof *result);
64043 + return result;
64044 +}
64045 +
64046 +/* return object back to the pool */
64047 +void reiser4_pool_free(reiser4_pool * pool /* pool to return object to */,
64048 +		       reiser4_pool_header * h /* object being returned */)
64049 +{
64050 + assert("nikita-961", h != NULL);
64051 + assert("nikita-962", pool != NULL);
64052 +
64053 + --pool->objs;
64054 + assert("nikita-963", pool->objs >= 0);
64055 +
64056 + list_del_init(&h->usage_linkage);
64057 + list_del_init(&h->level_linkage);
64058 +
64059 + if (list_empty(&h->extra_linkage))
64060 + /*
64061 + * pool header is not an extra one. Push it onto free list
64062 + * using usage_linkage
64063 + */
64064 + list_add(&h->usage_linkage, &pool->free);
64065 + else {
64066 + /* remove pool header from pool's extra list and kfree it */
64067 + list_del(&h->extra_linkage);
64068 + kfree(h);
64069 + }
64070 +}
64071 +
64072 +/* add new object to the carry level list
64073 +
64074 +   A carry level is FIFO most of the time, but not always. Complications
64075 +   arise when the make_space() function tries to go to the left neighbor,
64076 +   thus adding a carry node before existing nodes, and also when updating
64077 +   delimiting keys after moving data between two nodes, because we want the
64078 +   left node to be locked before the right node.
64079 +
64080 +   The latter case is confusing at first glance. The problem is that the
64081 +   COP_UPDATE operation that updates delimiting keys is sometimes called with
64082 +   two nodes (when data are moved between two nodes) and sometimes with only
64083 +   one node (when the leftmost item is deleted in a node). In either case the
64084 +   operation is supplied with at least the node whose left delimiting key is
64085 +   to be updated (that is, the "right" node).
64086 +
64087 +*/
64088 +reiser4_pool_header *reiser4_add_obj(reiser4_pool * pool /* pool from which to
64089 + * allocate new object
64090 + */,
64091 + struct list_head *list /* list where to add
64092 + * object */,
64093 + pool_ordering order /* where to add */,
64094 + reiser4_pool_header * reference
64095 + /* after (or before) which existing object
64096 + to add */)
64097 +{
64098 + reiser4_pool_header *result;
64099 +
64100 + assert("nikita-972", pool != NULL);
64101 +
64102 + result = reiser4_pool_alloc(pool);
64103 + if (IS_ERR(result))
64104 + return result;
64105 +
64106 + assert("nikita-973", result != NULL);
64107 +
64108 + switch (order) {
64109 + case POOLO_BEFORE:
64110 + __list_add(&result->level_linkage,
64111 + reference->level_linkage.prev,
64112 + &reference->level_linkage);
64113 + break;
64114 + case POOLO_AFTER:
64115 + __list_add(&result->level_linkage,
64116 + &reference->level_linkage,
64117 + reference->level_linkage.next);
64118 + break;
64119 + case POOLO_LAST:
64120 + list_add_tail(&result->level_linkage, list);
64121 + break;
64122 + case POOLO_FIRST:
64123 + list_add(&result->level_linkage, list);
64124 + break;
64125 + default:
64126 + wrong_return_value("nikita-927", "order");
64127 + }
64128 + return result;
64129 +}
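/* For illustration, a hypothetical caller appending an object at the tail
   of a carry level list, the common FIFO case described above (not part of
   the original code). */
static reiser4_pool_header *example_append(reiser4_pool *pool,
					   struct list_head *level_list)
{
	/* @reference is consulted only for POOLO_BEFORE/POOLO_AFTER, so
	 * NULL is passed; the result may be an ERR_PTR on -ENOMEM and
	 * should be checked with IS_ERR() */
	return reiser4_add_obj(pool, level_list, POOLO_LAST, NULL);
}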
64130 +
64131 +/* Make Linus happy.
64132 + Local variables:
64133 + c-indentation-style: "K&R"
64134 + mode-name: "LC"
64135 + c-basic-offset: 8
64136 + tab-width: 8
64137 + fill-column: 120
64138 + End:
64139 +*/
64140 diff --git a/fs/reiser4/pool.h b/fs/reiser4/pool.h
64141 new file mode 100644
64142 index 0000000..174d3c6
64143 --- /dev/null
64144 +++ b/fs/reiser4/pool.h
64145 @@ -0,0 +1,55 @@
64146 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64147 +
64148 +/* Fast pool allocation */
64149 +
64150 +#ifndef __REISER4_POOL_H__
64151 +#define __REISER4_POOL_H__
64152 +
64153 +#include <linux/types.h>
64154 +
64155 +typedef struct reiser4_pool {
64156 + size_t obj_size;
64157 + int objs;
64158 + char *data;
64159 + struct list_head free;
64160 + struct list_head used;
64161 + struct list_head extra;
64162 +} reiser4_pool;
64163 +
64164 +typedef struct reiser4_pool_header {
64165 + /* object is either on free or "used" lists */
64166 + struct list_head usage_linkage;
64167 + struct list_head level_linkage;
64168 + struct list_head extra_linkage;
64169 +} reiser4_pool_header;
64170 +
64171 +typedef enum {
64172 + POOLO_BEFORE,
64173 + POOLO_AFTER,
64174 + POOLO_LAST,
64175 + POOLO_FIRST
64176 +} pool_ordering;
64177 +
64178 +/* pool manipulation functions */
64179 +
64180 +extern void reiser4_init_pool(reiser4_pool * pool, size_t obj_size,
64181 + int num_of_objs, char *data);
64182 +extern void reiser4_done_pool(reiser4_pool * pool);
64183 +extern void reiser4_pool_free(reiser4_pool * pool, reiser4_pool_header * h);
64184 +reiser4_pool_header *reiser4_add_obj(reiser4_pool * pool,
64185 + struct list_head * list,
64186 + pool_ordering order,
64187 + reiser4_pool_header * reference);
64188 +
64189 +/* __REISER4_POOL_H__ */
64190 +#endif
64191 +
64192 +/* Make Linus happy.
64193 + Local variables:
64194 + c-indentation-style: "K&R"
64195 + mode-name: "LC"
64196 + c-basic-offset: 8
64197 + tab-width: 8
64198 + fill-column: 120
64199 + End:
64200 +*/
64201 diff --git a/fs/reiser4/readahead.c b/fs/reiser4/readahead.c
64202 new file mode 100644
64203 index 0000000..8e5a9f1
64204 --- /dev/null
64205 +++ b/fs/reiser4/readahead.c
64206 @@ -0,0 +1,138 @@
64207 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64208 + * reiser4/README */
64209 +
64210 +#include "forward.h"
64211 +#include "tree.h"
64212 +#include "tree_walk.h"
64213 +#include "super.h"
64214 +#include "inode.h"
64215 +#include "key.h"
64216 +#include "znode.h"
64217 +
64218 +#include <linux/swap.h> /* for totalram_pages */
64219 +
64220 +void reiser4_init_ra_info(ra_info_t * rai)
64221 +{
64222 + rai->key_to_stop = *reiser4_min_key();
64223 +}
64224 +
64225 +/* global formatted node readahead parameter. It can be set by mount option -o readahead:NUM:1 */
64226 +static inline int ra_adjacent_only(int flags)
64227 +{
64228 + return flags & RA_ADJACENT_ONLY;
64229 +}
64230 +
64231 +/* this is used by formatted_readahead to decide whether a read for the right neighbor of a node is to be issued. It
64232 +   returns 1 if the right neighbor's first key is less than or equal to the readahead's stop key */
64233 +static int should_readahead_neighbor(znode * node, ra_info_t * info)
64234 +{
64235 + int result;
64236 +
64237 + read_lock_dk(znode_get_tree(node));
64238 + result = keyle(znode_get_rd_key(node), &info->key_to_stop);
64239 + read_unlock_dk(znode_get_tree(node));
64240 + return result;
64241 +}
64242 +
64243 +#define LOW_MEM_PERCENTAGE (5)
64244 +
64245 +static int low_on_memory(void)
64246 +{
64247 + unsigned int freepages;
64248 +
64249 + freepages = nr_free_pages();
64250 + return freepages < (totalram_pages * LOW_MEM_PERCENTAGE / 100);
64251 +}
64252 +
64253 +/* start read for @node and for a few of its right neighbors */
64254 +void formatted_readahead(znode * node, ra_info_t * info)
64255 +{
64256 + ra_params_t *ra_params;
64257 + znode *cur;
64258 + int i;
64259 + int grn_flags;
64260 + lock_handle next_lh;
64261 +
64262 +	/* do nothing if a block number has not been assigned to the node (it exists only in memory, so there is nothing to read ahead). */
64263 + if (reiser4_blocknr_is_fake(znode_get_block(node)))
64264 + return;
64265 +
64266 + ra_params = get_current_super_ra_params();
64267 +
64268 + if (znode_page(node) == NULL)
64269 + jstartio(ZJNODE(node));
64270 +
64271 + if (znode_get_level(node) != LEAF_LEVEL)
64272 + return;
64273 +
64274 + /* don't waste memory for read-ahead when low on memory */
64275 + if (low_on_memory())
64276 + return;
64277 +
64278 + /* We can have locked nodes on upper tree levels, in this situation lock
64279 + priorities do not help to resolve deadlocks, we have to use TRY_LOCK
64280 + here. */
64281 + grn_flags = (GN_CAN_USE_UPPER_LEVELS | GN_TRY_LOCK);
64282 +
64283 + i = 0;
64284 + cur = zref(node);
64285 + init_lh(&next_lh);
64286 + while (i < ra_params->max) {
64287 + const reiser4_block_nr *nextblk;
64288 +
64289 + if (!should_readahead_neighbor(cur, info))
64290 + break;
64291 +
64292 + if (reiser4_get_right_neighbor
64293 + (&next_lh, cur, ZNODE_READ_LOCK, grn_flags))
64294 + break;
64295 +
64296 + nextblk = znode_get_block(next_lh.node);
64297 + if (reiser4_blocknr_is_fake(nextblk) ||
64298 + (ra_adjacent_only(ra_params->flags)
64299 + && *nextblk != *znode_get_block(cur) + 1)) {
64300 + break;
64301 + }
64302 +
64303 + zput(cur);
64304 + cur = zref(next_lh.node);
64305 + done_lh(&next_lh);
64306 + if (znode_page(cur) == NULL)
64307 + jstartio(ZJNODE(cur));
64308 + else
64309 + /* Do not scan read-ahead window if pages already
64310 + * allocated (and i/o already started). */
64311 + break;
64312 +
64313 + i++;
64314 + }
64315 + zput(cur);
64316 + done_lh(&next_lh);
64317 +}
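/* For illustration, a hypothetical caller priming read-ahead before a tree
   descent (not part of the original code): the stop key bounds how far to
   the right formatted_readahead() will go. */
static void example_readahead(znode *node, const reiser4_key *stop)
{
	ra_info_t info;

	reiser4_init_ra_info(&info);
	/* read ahead only while neighbors' keys do not exceed @stop */
	info.key_to_stop = *stop;
	formatted_readahead(node, &info);
}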
64318 +
64319 +void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap)
64320 +{
64321 + reiser4_key *stop_key;
64322 +
64323 + assert("nikita-3542", dir != NULL);
64324 + assert("nikita-3543", tap != NULL);
64325 +
64326 + stop_key = &tap->ra_info.key_to_stop;
64327 +	/* initialize readdir readahead information: include into the readahead
64328 +	 * window the stat data of all files of the directory */
64329 + set_key_locality(stop_key, get_inode_oid(dir));
64330 + set_key_type(stop_key, KEY_SD_MINOR);
64331 + set_key_ordering(stop_key, get_key_ordering(reiser4_max_key()));
64332 + set_key_objectid(stop_key, get_key_objectid(reiser4_max_key()));
64333 + set_key_offset(stop_key, get_key_offset(reiser4_max_key()));
64334 +}
64335 +
64336 +/*
64337 + Local variables:
64338 + c-indentation-style: "K&R"
64339 + mode-name: "LC"
64340 + c-basic-offset: 8
64341 + tab-width: 8
64342 + fill-column: 80
64343 + End:
64344 +*/
64345 diff --git a/fs/reiser4/readahead.h b/fs/reiser4/readahead.h
64346 new file mode 100644
64347 index 0000000..524c574
64348 --- /dev/null
64349 +++ b/fs/reiser4/readahead.h
64350 @@ -0,0 +1,48 @@
64351 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
64352 +
64353 +#ifndef __READAHEAD_H__
64354 +#define __READAHEAD_H__
64355 +
64356 +#include "key.h"
64357 +
64358 +typedef enum {
64359 + RA_ADJACENT_ONLY = 1, /* only requests nodes which are adjacent. Default is NO (not only adjacent) */
64360 +} ra_global_flags;
64361 +
64362 +/* reiser4 super block has a field of this type. It controls readahead during tree traversals */
64363 +typedef struct formatted_read_ahead_params {
64364 + unsigned long max; /* request not more than this amount of nodes. Default is totalram_pages / 4 */
64365 + int flags;
64366 +} ra_params_t;
64367 +
64368 +typedef struct {
64369 + reiser4_key key_to_stop;
64370 +} ra_info_t;
64371 +
64372 +void formatted_readahead(znode *, ra_info_t *);
64373 +void reiser4_init_ra_info(ra_info_t * rai);
64374 +
64375 +struct reiser4_file_ra_state {
64376 + loff_t start; /* Current window */
64377 + loff_t size;
64378 + loff_t next_size; /* Next window size */
64379 + loff_t ahead_start; /* Ahead window */
64380 + loff_t ahead_size;
64381 + loff_t max_window_size; /* Maximum readahead window */
64382 + loff_t slow_start; /* enlarging r/a size algorithm. */
64383 +};
64384 +
64385 +extern void reiser4_readdir_readahead_init(struct inode *dir, tap_t * tap);
64386 +
64387 +/* __READAHEAD_H__ */
64388 +#endif
64389 +
64390 +/*
64391 + Local variables:
64392 + c-indentation-style: "K&R"
64393 + mode-name: "LC"
64394 + c-basic-offset: 8
64395 + tab-width: 8
64396 + fill-column: 120
64397 + End:
64398 +*/
64399 diff --git a/fs/reiser4/reiser4.h b/fs/reiser4/reiser4.h
64400 new file mode 100644
64401 index 0000000..77d720e
64402 --- /dev/null
64403 +++ b/fs/reiser4/reiser4.h
64404 @@ -0,0 +1,269 @@
64405 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
64406 + * reiser4/README */
64407 +
64408 +/* definitions of common constants used by reiser4 */
64409 +
64410 +#if !defined( __REISER4_H__ )
64411 +#define __REISER4_H__
64412 +
64413 +#include <asm/param.h> /* for HZ */
64414 +#include <linux/errno.h>
64415 +#include <linux/types.h>
64416 +#include <linux/fs.h>
64417 +#include <linux/hardirq.h>
64418 +#include <linux/sched.h>
64419 +
64420 +/*
64421 + * reiser4 compilation options.
64422 + */
64423 +
64424 +#if defined(CONFIG_REISER4_DEBUG)
64425 +/* turn on assertion checks */
64426 +#define REISER4_DEBUG (1)
64427 +#else
64428 +#define REISER4_DEBUG (0)
64429 +#endif
64430 +
64431 +#if defined(CONFIG_ZLIB_INFLATE)
64432 +/* turn on zlib */
64433 +#define REISER4_ZLIB (1)
64434 +#else
64435 +#define REISER4_ZLIB (0)
64436 +#endif
64437 +
64438 +#if defined(CONFIG_CRYPTO_SHA256)
64439 +#define REISER4_SHA256 (1)
64440 +#else
64441 +#define REISER4_SHA256 (0)
64442 +#endif
64443 +
64444 +/*
64445 + * Turn on large keys mode. In this mode (which is the default), a reiser4
64446 + * key has 4 8-byte components. In the old "small key" mode, it has 3 8-byte
64447 + * components. The additional component, referred to as "ordering", is used
64448 + * to order the items of which a given object is composed. As such, ordering
64449 + * is placed between locality and objectid. For a directory item, ordering
64450 + * contains an initial prefix of the file name this item is for. This sorts
64451 + * all directory items within a given directory lexicographically (but see
64452 + * fibration.[ch]). For file body and stat-data, ordering contains an initial
64453 + * prefix of the name the file was initially created with. In the common case
64454 + * (files with a single name) this orders file bodies and stat-datas in
64455 + * the same order as their respective directory entries, thus speeding up
64456 + * readdir.
64457 + *
64458 + * Note that the kernel can only mount a file system with the same key size
64459 + * as the one it was compiled for, so flipping this option may render your
64460 + * data inaccessible.
64461 + */
64462 +#define REISER4_LARGE_KEY (1)
64463 +/*#define REISER4_LARGE_KEY (0)*/
64464 +
64465 +/*#define GUESS_EXISTS 1*/
64466 +
64467 +/*
64468 + * PLEASE update fs/reiser4/kattr.c:show_options() when adding new compilation
64469 + * option
64470 + */
64471 +
64472 +extern const char *REISER4_SUPER_MAGIC_STRING;
64473 +extern const int REISER4_MAGIC_OFFSET; /* offset to magic string from the
64474 + * beginning of device */
64475 +
64476 +/* here go tunable parameters that are not worth special entry in kernel
64477 + configuration */
64478 +
64479 +/* default number of slots in coord-by-key caches */
64480 +#define CBK_CACHE_SLOTS (16)
64481 +/* how many elementary tree operations to carry on the next level */
64482 +#define CARRIES_POOL_SIZE (5)
64483 +/* size of pool of preallocated nodes for carry process. */
64484 +#define NODES_LOCKED_POOL_SIZE (5)
64485 +
64486 +#define REISER4_NEW_NODE_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
64487 +#define REISER4_NEW_EXTENT_FLAGS (COPI_LOAD_LEFT | COPI_LOAD_RIGHT | COPI_GO_LEFT)
64488 +#define REISER4_PASTE_FLAGS (COPI_GO_LEFT)
64489 +#define REISER4_INSERT_FLAGS (COPI_GO_LEFT)
64490 +
64491 +/* we are supporting reservation of disk space on uid basis */
64492 +#define REISER4_SUPPORT_UID_SPACE_RESERVATION (0)
64493 +/* we are supporting reservation of disk space for groups */
64494 +#define REISER4_SUPPORT_GID_SPACE_RESERVATION (0)
64495 +/* we are supporting reservation of disk space for root */
64496 +#define REISER4_SUPPORT_ROOT_SPACE_RESERVATION (0)
64497 +/* we use rapid flush mode, see flush.c for comments. */
64498 +#define REISER4_USE_RAPID_FLUSH (1)
64499 +
64500 +/*
64501 + * set this to 0 if you don't want to use wait-for-flush in ->writepage().
64502 + */
64503 +#define REISER4_USE_ENTD (1)
64504 +
64505 +/* key allocation is Plan-A */
64506 +#define REISER4_PLANA_KEY_ALLOCATION (1)
64507 +/* key allocation follows good old 3.x scheme */
64508 +#define REISER4_3_5_KEY_ALLOCATION (0)
64509 +
64510 +/* size of hash-table for znodes */
64511 +#define REISER4_ZNODE_HASH_TABLE_SIZE (1 << 13)
64512 +
64513 +/* number of buckets in lnode hash-table */
64514 +#define LNODE_HTABLE_BUCKETS (1024)
64515 +
64516 +/* some ridiculously high maximal limit on the height of the znode tree. This
64517 +   is used in the declaration of various per-level arrays and
64518 +   to allocate the statistics-gathering array for per-level stats. */
64519 +#define REISER4_MAX_ZTREE_HEIGHT (8)
64520 +
64521 +#define REISER4_PANIC_MSG_BUFFER_SIZE (1024)
64522 +
64523 +/* If an array contains fewer than REISER4_SEQ_SEARCH_BREAK elements, then
64524 +   sequential search is on average faster than binary search. This is because
64525 +   of better optimization and because sequential search is more CPU
64526 +   cache friendly. This number (25) was found by experiments on a dual AMD
64527 +   Athlon(tm), 1400MHz.
64528 +
64529 +   NOTE: testing in the kernel has shown that binary search is more effective
64530 +   than implied by the results of the user-level benchmarking. Probably
64531 +   because in a node, keys are separated by other data. So the value was
64532 +   adjusted after a few tests. More thorough tuning is needed.
64533 +*/
64534 +#define REISER4_SEQ_SEARCH_BREAK (3)
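/* For illustration, a sketch of the dispatch this threshold implies (not
   part of the original code; both search helpers are hypothetical): */
static int example_lookup(const reiser4_key *keys, int nr,
			  const reiser4_key *key)
{
	if (nr < REISER4_SEQ_SEARCH_BREAK)
		return example_seq_search(keys, nr, key); /* hypothetical */
	return example_bin_search(keys, nr, key);	  /* hypothetical */
}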
64535 +
64536 +/* don't allow tree to be lower than this */
64537 +#define REISER4_MIN_TREE_HEIGHT (TWIG_LEVEL)
64538 +
64539 +/* NOTE NIKITA this is no longer used: maximal atom size is auto-adjusted to
64540 + * available memory. */
64541 +/* Default value of the maximal atom size. Can be overwritten by the
64542 +   tmgr.atom_max_size mount option. By default infinity. */
64543 +#define REISER4_ATOM_MAX_SIZE ((unsigned)(~0))
64544 +
64545 +/* Default value of maximal atom age (in jiffies). After reaching this age
64546 + atom will be forced to commit, either synchronously or asynchronously. Can
64547 + be overwritten by tmgr.atom_max_age mount option. */
64548 +#define REISER4_ATOM_MAX_AGE (600 * HZ)
64549 +
64550 +/* sleeping period for ktxnmgrd */
64551 +#define REISER4_TXNMGR_TIMEOUT (5 * HZ)
64552 +
64553 +/* timeout to wait for ent thread in writepage. Default: 3 milliseconds. */
64554 +#define REISER4_ENTD_TIMEOUT (3 * HZ / 1000)
64555 +
64556 +/* start complaining after that many restarts in coord_by_key().
64557 +
64558 + This either means incredibly heavy contention for this part of a tree, or
64559 + some corruption or bug.
64560 +*/
64561 +#define REISER4_CBK_ITERATIONS_LIMIT (100)
64562 +
64563 +/* return -EIO after that many iterations in coord_by_key().
64564 +
64565 + I have witnessed more than 800 iterations (in 30 thread test) before cbk
64566 + finished. --nikita
64567 +*/
64568 +#define REISER4_MAX_CBK_ITERATIONS 500000
64569 +
64570 +/* put a per-inode limit on maximal number of directory entries with identical
64571 + keys in hashed directory.
64572 +
64573 + Disable this until inheritance interfaces stabilize: we need some way to
64574 + set per directory limit.
64575 +*/
64576 +#define REISER4_USE_COLLISION_LIMIT (0)
64577 +
64578 +/* If flush finds more than FLUSH_RELOCATE_THRESHOLD adjacent dirty leaf-level blocks it
64579 + will force them to be relocated. */
64580 +#define FLUSH_RELOCATE_THRESHOLD 64
64581 +/* If flush can find a block allocation closer than at most FLUSH_RELOCATE_DISTANCE
64582 +   from the preceder, it will relocate to that position. */
64583 +#define FLUSH_RELOCATE_DISTANCE 64
64584 +
64585 +/* If we have written this many blocks or more before encountering a busy
64586 +   jnode in the flush list, abort flushing in the hope that next time we get
64587 +   called this jnode will already be clean, and we will save some seeks. */
64588 +#define FLUSH_WRITTEN_THRESHOLD 50
64589 +
64590 +/* The maximum number of nodes to scan left on a level during flush. */
64591 +#define FLUSH_SCAN_MAXNODES 10000
64592 +
64593 +/* per-atom limit of flushers */
64594 +#define ATOM_MAX_FLUSHERS (1)
64595 +
64596 +/* default tracing buffer size */
64597 +#define REISER4_TRACE_BUF_SIZE (1 << 15)
64598 +
64599 +/* what size units of IO we would like cp, etc., to use, in writing to
64600 + reiser4. In bytes.
64601 +
64602 + Can be overwritten by optimal_io_size mount option.
64603 +*/
64604 +#define REISER4_OPTIMAL_IO_SIZE (64 * 1024)
64605 +
64606 +/* see comments in inode.c:oid_to_uino() */
64607 +#define REISER4_UINO_SHIFT (1 << 30)
64608 +
64609 +/* Mark function argument as unused to avoid compiler warnings. */
64610 +#define UNUSED_ARG __attribute__((unused))
64611 +
64612 +#if ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3)
64613 +#define NONNULL __attribute__((nonnull))
64614 +#else
64615 +#define NONNULL
64616 +#endif
64617 +
64618 +/* master super block offset in bytes.*/
64619 +#define REISER4_MASTER_OFFSET 65536
64620 +
64621 +/* size of VFS block */
64622 +#define VFS_BLKSIZE 512
64623 +/* number of bits in size of VFS block (512==2^9) */
64624 +#define VFS_BLKSIZE_BITS 9
64625 +
64626 +#define REISER4_I reiser4_inode_data
64627 +
64628 +/* implication */
64629 +#define ergo( antecedent, consequent ) ( !( antecedent ) || ( consequent ) )
64630 +/* logical equivalence */
64631 +#define equi( p1, p2 ) ( ergo( ( p1 ), ( p2 ) ) && ergo( ( p2 ), ( p1 ) ) )
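/* For illustration (hypothetical assertions, not part of the original
   code): ergo() reads as logical implication, and both macros typically
   appear inside assertions, e.g.

	assert("example-1", ergo(node != NULL, node->count > 0));
	assert("example-2", equi(a == NULL, b == NULL));
*/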
64632 +
64633 +#define sizeof_array(x) ((int) (sizeof(x) / sizeof(x[0])))
64634 +
64635 +#define NOT_YET (0)
64636 +
64637 +/** Reiser4 specific error codes **/
64638 +
64639 +#define REISER4_ERROR_CODE_BASE 500
64640 +
64641 +/* Neighbor is not available (side neighbor or parent) */
64642 +#define E_NO_NEIGHBOR (REISER4_ERROR_CODE_BASE)
64643 +
64644 +/* Node was not found in cache */
64645 +#define E_NOT_IN_CACHE (REISER4_ERROR_CODE_BASE + 1)
64646 +
64647 +/* node does not have enough free space to complete the balancing operation */
64648 +#define E_NODE_FULL (REISER4_ERROR_CODE_BASE + 2)
64649 +
64650 +/* repeat operation */
64651 +#define E_REPEAT (REISER4_ERROR_CODE_BASE + 3)
64652 +
64653 +/* a deadlock was detected */
64654 +#define E_DEADLOCK (REISER4_ERROR_CODE_BASE + 4)
64655 +
64656 +/* operation cannot be performed, because it would block and non-blocking mode
64657 + * was requested. */
64658 +#define E_BLOCK (REISER4_ERROR_CODE_BASE + 5)
64659 +
64660 +/* wait some event (depends on context), then repeat */
64661 +#define E_WAIT (REISER4_ERROR_CODE_BASE + 6)
64662 +
64663 +#endif /* __REISER4_H__ */
64664 +
64665 +/* Make Linus happy.
64666 + Local variables:
64667 + c-indentation-style: "K&R"
64668 + mode-name: "LC"
64669 + c-basic-offset: 8
64670 + tab-width: 8
64671 + fill-column: 120
64672 + End:
64673 +*/
64674 diff --git a/fs/reiser4/safe_link.c b/fs/reiser4/safe_link.c
64675 new file mode 100644
64676 index 0000000..1253bdb
64677 --- /dev/null
64678 +++ b/fs/reiser4/safe_link.c
64679 @@ -0,0 +1,351 @@
64680 +/* Copyright 2003, 2004 by Hans Reiser, licensing governed by
64681 + * reiser4/README */
64682 +
64683 +/* Safe-links. */
64684 +
64685 +/*
64686 + * Safe-links are used to maintain file system consistency during operations
64687 + * that spawn multiple transactions. For example:
64688 + *
64689 + * 1. Unlink. UNIX supports "open-but-unlinked" files, that is, files
64690 + *    without user-visible names in the file system but still open in some
64691 + *    active process. What happens here is that unlink proper (i.e., removal
64692 + *    of the last file name) and file deletion (truncate of the file body to
64693 + *    zero and deletion of stat-data, which happens when the last file
64694 + *    descriptor is closed) may belong to different transactions T1 and T2.
64695 + *    If a crash happens after T1 commits but before T2 commits, the on-disk
64696 + *    file system has a file without a name, that is, a disk space leak.
64697 + *
64698 + * 2. Truncate. Truncate of a large file may spawn multiple transactions. If
64699 + *    the system crashes while truncate is in progress, the file is left
64700 + *    partially truncated, which violates the "atomicity guarantees" of
64701 + *    reiser4, viz. that every system call is atomic.
64702 + *
64703 + * Safe-links address both cases above. Basically, a safe-link is a way to
64704 + * post some operation to be executed during the commit of a transaction
64705 + * other than the current one. (Another way to look at the safe-link is to
64706 + * interpret it as logical logging.)
64707 + *
64708 + * Specifically, at the beginning of unlink a safe-link is inserted into the
64709 + * tree. This safe-link is normally removed by the file deletion code (during
64710 + * transaction T2 in the above terms). Truncate also inserts a safe-link that
64711 + * is normally removed when the truncate operation is finished.
64712 + *
64713 + * This means that in the case of a "clean umount" there are no safe-links in
64714 + * the tree. If safe-links are observed during mount, it means that (a) the
64715 + * system was terminated abnormally, and (b) the safe-links correspond to the
64716 + * "pending" (i.e., not finished) operations that were in progress during
64717 + * system termination. Each safe-link records enough information to complete
64718 + * the corresponding operation, and mount simply "replays" them (hence the
64719 + * analogy with logical logging).
64720 + *
64721 + * Safe-links are implemented as blackbox items (see
64722 + * plugin/item/blackbox.[ch]).
64723 + *
64724 + * For reference: ext3 also has a similar mechanism; it's called an "orphan
64725 + * list" there.
64726 + */
64727 +
64728 +#include "safe_link.h"
64729 +#include "debug.h"
64730 +#include "inode.h"
64731 +
64732 +#include "plugin/item/blackbox.h"
64733 +
64734 +#include <linux/fs.h>
64735 +
64736 +/*
64737 + * On-disk format of safe-link.
64738 + */
64739 +typedef struct safelink {
64740 + reiser4_key sdkey; /* key of stat-data for the file safe-link is
64741 + * for */
64742 + d64 size; /* size to which file should be truncated */
64743 +} safelink_t;
64744 +
64745 +/*
64746 + * locality where safe-link items are stored. Next to the objectid of root
64747 + * directory.
64748 + */
64749 +static oid_t safe_link_locality(reiser4_tree * tree)
64750 +{
64751 + return get_key_objectid(get_super_private(tree->super)->df_plug->
64752 + root_dir_key(tree->super)) + 1;
64753 +}
64754 +
64755 +/*
64756 + Construct a key for the safe-link. Key has the following format:
64757 +
64758 +| 60 | 4 | 64 | 4 | 60 | 64 |
64759 ++---------------+---+------------------+---+---------------+------------------+
64760 +| locality | 0 | 0 | 0 | objectid | link type |
64761 ++---------------+---+------------------+---+---------------+------------------+
64762 +| | | | |
64763 +| 8 bytes | 8 bytes | 8 bytes | 8 bytes |
64764 +
64765 + This is in the large keys format. In the small keys format the second 8-byte
64766 + chunk is omitted. Locality is a constant returned by safe_link_locality().
64767 + objectid is the oid of the file on which the operation protected by this
64768 + safe-link is performed. link-type is used to distinguish safe-links for
64769 + different operations.
64770 +
64771 + */
64772 +static reiser4_key *build_link_key(reiser4_tree * tree, oid_t oid,
64773 + reiser4_safe_link_t link, reiser4_key * key)
64774 +{
64775 + reiser4_key_init(key);
64776 + set_key_locality(key, safe_link_locality(tree));
64777 + set_key_objectid(key, oid);
64778 + set_key_offset(key, link);
64779 + return key;
64780 +}
64781 +
64782 +/*
64783 + * how much disk space is necessary to insert and remove (in the
64784 + * error-handling path) safe-link.
64785 + */
64786 +static __u64 safe_link_tograb(reiser4_tree * tree)
64787 +{
64788 + return
64789 + /* insert safe link */
64790 + estimate_one_insert_item(tree) +
64791 + /* remove safe link */
64792 + estimate_one_item_removal(tree) +
64793 + /* drill to the leaf level during insertion */
64794 + 1 + estimate_one_insert_item(tree) +
64795 + /*
64796 + * possible update of existing safe-link. Actually, if
64797 + * safe-link existed already (we failed to remove it), then no
64798 + * insertion is necessary, so this term is already "covered",
64799 +	 * but for simplicity let's leave it.
64800 + */
64801 + 1;
64802 +}
64803 +
64804 +/*
64805 + * grab enough disk space to insert and remove (in the error-handling path)
64806 + * safe-link.
64807 + */
64808 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags)
64809 +{
64810 + int result;
64811 +
64812 + grab_space_enable();
64813 + /* The sbinfo->delete_mutex can be taken here.
64814 + * safe_link_release() should be called before leaving reiser4
64815 + * context. */
64816 + result =
64817 + reiser4_grab_reserved(tree->super, safe_link_tograb(tree), flags);
64818 + grab_space_enable();
64819 + return result;
64820 +}
64821 +
64822 +/*
64823 + * release unused disk space reserved by safe_link_grab().
64824 + */
64825 +void safe_link_release(reiser4_tree * tree)
64826 +{
64827 + reiser4_release_reserved(tree->super);
64828 +}
64829 +
64830 +/*
64831 + * insert into tree safe-link for operation @link on inode @inode.
64832 + */
64833 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link)
64834 +{
64835 + reiser4_key key;
64836 + safelink_t sl;
64837 + int length;
64838 + int result;
64839 + reiser4_tree *tree;
64840 +
64841 + build_sd_key(inode, &sl.sdkey);
64842 + length = sizeof sl.sdkey;
64843 +
64844 + if (link == SAFE_TRUNCATE) {
64845 + /*
64846 +		 * for truncate we also have to store the final file length,
64847 +		 * so expand the item.
64848 + */
64849 + length += sizeof(sl.size);
64850 + put_unaligned(cpu_to_le64(inode->i_size), &sl.size);
64851 + }
64852 + tree = reiser4_tree_by_inode(inode);
64853 + build_link_key(tree, get_inode_oid(inode), link, &key);
64854 +
64855 + result = store_black_box(tree, &key, &sl, length);
64856 + if (result == -EEXIST)
64857 + result = update_black_box(tree, &key, &sl, length);
64858 + return result;
64859 +}
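/* For illustration, a sketch of the call order an unlink path might follow
   (not part of the original code; SAFE_UNLINK is assumed to be one of the
   reiser4_safe_link_t values defined elsewhere in this patch). */
static int example_unlink_prologue(struct inode *inode)
{
	reiser4_tree *tree = reiser4_tree_by_inode(inode);
	int result;

	/* reserve space for insertion (and later removal) of the link */
	result = safe_link_grab(tree, BA_CAN_COMMIT);
	if (result == 0)
		result = safe_link_add(inode, SAFE_UNLINK /* assumed */);
	safe_link_release(tree);
	return result;
}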
64860 +
64861 +/*
64862 + * remove safe-link corresponding to the operation @link on inode @inode from
64863 + * the tree.
64864 + */
64865 +int safe_link_del(reiser4_tree * tree, oid_t oid, reiser4_safe_link_t link)
64866 +{
64867 + reiser4_key key;
64868 +
64869 + return kill_black_box(tree, build_link_key(tree, oid, link, &key));
64870 +}
64871 +
64872 +/*
64873 + * in-memory structure to keep information extracted from safe-link. This is
64874 + * used to iterate over all safe-links.
64875 + */
64876 +typedef struct {
64877 + reiser4_tree *tree; /* internal tree */
64878 + reiser4_key key; /* safe-link key */
64879 + reiser4_key sdkey; /* key of object stat-data */
64880 + reiser4_safe_link_t link; /* safe-link type */
64881 + oid_t oid; /* object oid */
64882 + __u64 size; /* final size for truncate */
64883 +} safe_link_context;
64884 +
64885 +/*
64886 + * start iterating over all safe-links.
64887 + */
64888 +static void safe_link_iter_begin(reiser4_tree * tree, safe_link_context * ctx)
64889 +{
64890 + ctx->tree = tree;
64891 + reiser4_key_init(&ctx->key);
64892 + set_key_locality(&ctx->key, safe_link_locality(tree));
64893 + set_key_objectid(&ctx->key, get_key_objectid(reiser4_max_key()));
64894 + set_key_offset(&ctx->key, get_key_offset(reiser4_max_key()));
64895 +}
64896 +
64897 +/*
64898 + * return next safe-link.
64899 + */
64900 +static int safe_link_iter_next(safe_link_context * ctx)
64901 +{
64902 + int result;
64903 + safelink_t sl;
64904 +
64905 + result = load_black_box(ctx->tree, &ctx->key, &sl, sizeof sl, 0);
64906 + if (result == 0) {
64907 + ctx->oid = get_key_objectid(&ctx->key);
64908 + ctx->link = get_key_offset(&ctx->key);
64909 + ctx->sdkey = sl.sdkey;
64910 + if (ctx->link == SAFE_TRUNCATE)
64911 + ctx->size = le64_to_cpu(get_unaligned(&sl.size));
64912 + }
64913 + return result;
64914 +}
64915 +
64916 +/*
64917 + * check whether there are any more safe-links left in the tree.
64918 + */
64919 +static int safe_link_iter_finished(safe_link_context * ctx)
64920 +{
64921 + return get_key_locality(&ctx->key) != safe_link_locality(ctx->tree);
64922 +}
64923 +
64924 +/*
64925 + * finish safe-link iteration.
64926 + */
64927 +static void safe_link_iter_end(safe_link_context * ctx)
64928 +{
64929 + /* nothing special */
64930 +}
64931 +
64932 +/*
64933 + * process single safe-link.
64934 + */
64935 +static int process_safelink(struct super_block *super, reiser4_safe_link_t link,
64936 + reiser4_key * sdkey, oid_t oid, __u64 size)
64937 +{
64938 + struct inode *inode;
64939 + int result;
64940 +
64941 + /*
64942 + * obtain object inode by reiser4_iget(), then call object plugin
64943 + * ->safelink() method to do actual work, then delete safe-link on
64944 + * success.
64945 + */
64946 + inode = reiser4_iget(super, sdkey, 1);
64947 + if (!IS_ERR(inode)) {
64948 + file_plugin *fplug;
64949 +
64950 + fplug = inode_file_plugin(inode);
64951 + assert("nikita-3428", fplug != NULL);
64952 + assert("", oid == get_inode_oid(inode));
64953 + if (fplug->safelink != NULL) {
64954 +			/* reiser4_txn_restart_current is not necessary because
64955 +			 * mounting is single-threaded. However, without it the
64956 +			 * deadlock detection code will complain (see
64957 +			 * nikita-3361). */
64958 + reiser4_txn_restart_current();
64959 + result = fplug->safelink(inode, link, size);
64960 + } else {
64961 + warning("nikita-3430",
64962 +				"Cannot handle safelink for %llu",
64963 + (unsigned long long)oid);
64964 + reiser4_print_key("key", sdkey);
64965 + result = 0;
64966 + }
64967 + if (result != 0) {
64968 + warning("nikita-3431",
64969 +				"Error processing safelink for %llu: %i",
64970 + (unsigned long long)oid, result);
64971 + }
64972 + reiser4_iget_complete(inode);
64973 + iput(inode);
64974 + if (result == 0) {
64975 + result = safe_link_grab(reiser4_get_tree(super), BA_CAN_COMMIT);
64976 + if (result == 0)
64977 + result =
64978 + safe_link_del(reiser4_get_tree(super), oid, link);
64979 + safe_link_release(reiser4_get_tree(super));
64980 + /*
64981 + * restart transaction: if there was large number of
64982 + * safe-links, their processing may fail to fit into
64983 + * single transaction.
64984 + */
64985 + if (result == 0)
64986 + reiser4_txn_restart_current();
64987 + }
64988 + } else
64989 + result = PTR_ERR(inode);
64990 + return result;
64991 +}
64992 +
64993 +/*
64994 + * iterate over all safe-links in the file-system processing them one by one.
64995 + */
64996 +int process_safelinks(struct super_block *super)
64997 +{
64998 + safe_link_context ctx;
64999 + int result;
65000 +
65001 + if (rofs_super(super))
65002 + /* do nothing on the read-only file system */
65003 + return 0;
65004 + safe_link_iter_begin(&get_super_private(super)->tree, &ctx);
65005 + result = 0;
65006 + do {
65007 + result = safe_link_iter_next(&ctx);
65008 + if (safe_link_iter_finished(&ctx) || result == -ENOENT) {
65009 + result = 0;
65010 + break;
65011 + }
65012 + if (result == 0)
65013 + result = process_safelink(super, ctx.link,
65014 + &ctx.sdkey, ctx.oid,
65015 + ctx.size);
65016 + } while (result == 0);
65017 + safe_link_iter_end(&ctx);
65018 + return result;
65019 +}
65020 +
65021 +/* Make Linus happy.
65022 + Local variables:
65023 + c-indentation-style: "K&R"
65024 + mode-name: "LC"
65025 + c-basic-offset: 8
65026 + tab-width: 8
65027 + fill-column: 120
65028 + scroll-step: 1
65029 + End:
65030 +*/
65031 diff --git a/fs/reiser4/safe_link.h b/fs/reiser4/safe_link.h
65032 new file mode 100644
65033 index 0000000..7ae4458
65034 --- /dev/null
65035 +++ b/fs/reiser4/safe_link.h
65036 @@ -0,0 +1,29 @@
65037 +/* Copyright 2003 by Hans Reiser, licensing governed by
65038 + * reiser4/README */
65039 +
65040 +/* Safe-links. See safe_link.c for details. */
65041 +
65042 +#if !defined( __FS_SAFE_LINK_H__ )
65043 +#define __FS_SAFE_LINK_H__
65044 +
65045 +#include "tree.h"
65046 +
65047 +int safe_link_grab(reiser4_tree * tree, reiser4_ba_flags_t flags);
65048 +void safe_link_release(reiser4_tree * tree);
65049 +int safe_link_add(struct inode *inode, reiser4_safe_link_t link);
65050 +int safe_link_del(reiser4_tree *, oid_t oid, reiser4_safe_link_t link);
65051 +
65052 +int process_safelinks(struct super_block *super);
65053 +
65054 +/* __FS_SAFE_LINK_H__ */
65055 +#endif
65056 +
65057 +/* Make Linus happy.
65058 + Local variables:
65059 + c-indentation-style: "K&R"
65060 + mode-name: "LC"
65061 + c-basic-offset: 8
65062 + tab-width: 8
65063 + fill-column: 120
65064 + End:
65065 +*/
65066 diff --git a/fs/reiser4/seal.c b/fs/reiser4/seal.c
65067 new file mode 100644
65068 index 0000000..c91cf52
65069 --- /dev/null
65070 +++ b/fs/reiser4/seal.c
65071 @@ -0,0 +1,218 @@
65072 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
65073 +/* Seals implementation. */
65074 +/* Seals are "weak" tree pointers. They are analogous to tree coords in that
65075 +   they allow bypassing tree traversal. But normal usage of coords implies
65076 +   that the node pointed to by a coord is locked, whereas seals don't keep a
65077 +   lock (or even a reference) to the znode. Instead, each znode contains a
65078 +   version number, increased on each znode modification. This version number
65079 +   is copied into a seal when the seal is created. Later, one can "validate"
65080 +   a seal by calling reiser4_seal_validate(). If the znode is in cache and
65081 +   its version number is still the same, the seal is "pristine" and the coord
65082 +   associated with it can be re-used immediately.
65083 +
65084 +   If, on the other hand, the znode is out of cache, or it is obviously
65085 +   different from the znode the seal was initially attached to (for example,
65086 +   it is on a different level, or is being removed from the tree), the seal
65087 +   is irreparably invalid ("burned") and the tree traversal has to be repeated.
65088 +
65089 +   Otherwise, there is some hope that while the znode was modified (and the
65090 +   seal was "broken" as a result), the key attached to the seal is still in
65091 +   the node. This is checked by first comparing this key with the delimiting
65092 +   keys of the node and, if the key is ok, doing an intra-node lookup.
65093 +
65094 +   The znode version is maintained in the following way:
65095 +
65096 +   there is a reiser4_tree.znode_epoch counter. Whenever a new znode is
65097 +   created, znode_epoch is incremented and its new value is stored in the
65098 +   ->version field of the new znode. Whenever a znode is dirtied (which means
65099 +   it was probably modified), znode_epoch is also incremented and its new
65100 +   value is stored in znode->version. This is done because just incrementing
65101 +   znode->version on each update is not enough: it may happen that a znode
65102 +   gets deleted, and a new znode allocated for the same disk block gets the
65103 +   same version counter, tricking the seal code into a false positive.
65104 +*/
65105 +
65106 +#include "forward.h"
65107 +#include "debug.h"
65108 +#include "key.h"
65109 +#include "coord.h"
65110 +#include "seal.h"
65111 +#include "plugin/item/item.h"
65112 +#include "plugin/node/node.h"
65113 +#include "jnode.h"
65114 +#include "znode.h"
65115 +#include "super.h"
65116 +
65117 +static znode *seal_node(const seal_t * seal);
65118 +static int seal_matches(const seal_t * seal, znode * node);
65119 +
65120 +/* initialise seal. This can be called several times on the same seal. @coord
65121 + and @key can be NULL. */
65122 +void reiser4_seal_init(seal_t * seal /* seal to initialise */ ,
65123 + const coord_t * coord /* coord @seal will be
65124 + * attached to */ ,
65125 + const reiser4_key * key UNUSED_ARG /* key @seal will be
65126 + * attached to */ )
65127 +{
65128 + assert("nikita-1886", seal != NULL);
65129 + memset(seal, 0, sizeof *seal);
65130 + if (coord != NULL) {
65131 + znode *node;
65132 +
65133 + node = coord->node;
65134 + assert("nikita-1987", node != NULL);
65135 + spin_lock_znode(node);
65136 + seal->version = node->version;
65137 + assert("nikita-1988", seal->version != 0);
65138 + seal->block = *znode_get_block(node);
65139 +#if REISER4_DEBUG
65140 + seal->coord1 = *coord;
65141 + if (key != NULL)
65142 + seal->key = *key;
65143 +#endif
65144 + spin_unlock_znode(node);
65145 + }
65146 +}
65147 +
65148 +/* finish with seal */
65149 +void reiser4_seal_done(seal_t * seal /* seal to clear */ )
65150 +{
65151 + assert("nikita-1887", seal != NULL);
65152 + seal->version = 0;
65153 +}
65154 +
65155 +/* true if seal was initialised */
65156 +int reiser4_seal_is_set(const seal_t * seal /* seal to query */ )
65157 +{
65158 + assert("nikita-1890", seal != NULL);
65159 + return seal->version != 0;
65160 +}
65161 +
65162 +#if REISER4_DEBUG
65163 +/* helper function for reiser4_seal_validate(). It checks that the item at
65164 + * @coord has the expected key. This is to detect cases where a node was
65165 + * modified but wasn't marked dirty. */
65166 +static inline int check_seal_match(const coord_t * coord /* coord to check */ ,
65167 + const reiser4_key * k /* expected key */ )
65168 +{
65169 + reiser4_key ukey;
65170 +
65171 + return (coord->between != AT_UNIT) ||
65172 + /* FIXME-VS: we only can compare keys for items whose units
65173 + represent exactly one key */
65174 + ((coord_is_existing_unit(coord))
65175 + && (item_is_extent(coord)
65176 + || keyeq(k, unit_key_by_coord(coord, &ukey))))
65177 + || ((coord_is_existing_unit(coord)) && (item_is_ctail(coord))
65178 + && keyge(k, unit_key_by_coord(coord, &ukey)));
65179 +}
65180 +#endif
65181 +
65182 +/* this is used by reiser4_seal_validate. It accepts the return value of
65183 + * longterm_lock_znode and returns 1 if it can be interpreted as a seal
65184 + * validation failure. For instance, when longterm_lock_znode returns -EINVAL,
65185 + * reiser4_seal_validate returns -E_REPEAT and the caller retries the tree
65186 + * search. We cannot do this in longterm_lock_znode(), because sometimes we
65187 + * want to distinguish between -EINVAL and -E_REPEAT. */
65188 +static int should_repeat(int return_code)
65189 +{
65190 + return return_code == -EINVAL;
65191 +}
65192 +
65193 +/* (re-)validate seal.
65194 +
65195 +   Checks whether the seal is pristine, and tries to revalidate it if possible.
65196 +
65197 +   If the seal was burned, or broken irreparably, return -E_REPEAT.
65198 +
65199 +   NOTE-NIKITA currently reiser4_seal_validate() returns -E_REPEAT if the key
65200 +   we are looking for is in the range of keys covered by the sealed node, but
65201 +   the item wasn't found by the node's ->lookup() method. An alternative is to
65202 +   return -ENOENT in this case, but this would complicate the caller's logic.
65203 +
65204 +*/
65205 +int reiser4_seal_validate(seal_t * seal /* seal to validate */,
65206 + coord_t * coord /* coord to validate against */,
65207 + const reiser4_key * key /* key to validate against */,
65208 + lock_handle * lh /* resulting lock handle */,
65209 + znode_lock_mode mode /* lock node */,
65210 + znode_lock_request request /* locking priority */)
65211 +{
65212 + znode *node;
65213 + int result;
65214 +
65215 + assert("nikita-1889", seal != NULL);
65216 + assert("nikita-1881", reiser4_seal_is_set(seal));
65217 + assert("nikita-1882", key != NULL);
65218 + assert("nikita-1883", coord != NULL);
65219 + assert("nikita-1884", lh != NULL);
65220 + assert("nikita-1885", keyeq(&seal->key, key));
65221 + assert("nikita-1989", coords_equal(&seal->coord1, coord));
65222 +
65223 + /* obtain znode by block number */
65224 + node = seal_node(seal);
65225 + if (node != NULL) {
65226 + /* znode was in cache, lock it */
65227 + result = longterm_lock_znode(lh, node, mode, request);
65228 + zput(node);
65229 + if (result == 0) {
65230 + if (seal_matches(seal, node)) {
65231 + /* if seal version and znode version
65232 + coincide */
65233 + ON_DEBUG(coord_update_v(coord));
65234 + assert("nikita-1990",
65235 + node == seal->coord1.node);
65236 + assert("nikita-1898",
65237 + WITH_DATA_RET(coord->node, 1,
65238 + check_seal_match(coord,
65239 + key)));
65240 + } else
65241 + result = RETERR(-E_REPEAT);
65242 + }
65243 + if (result != 0) {
65244 + if (should_repeat(result))
65245 + result = RETERR(-E_REPEAT);
65246 + /* unlock node on failure */
65247 + done_lh(lh);
65248 + }
65249 + } else {
65250 + /* znode wasn't in cache */
65251 + result = RETERR(-E_REPEAT);
65252 + }
65253 + return result;
65254 +}
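/* For illustration, a hypothetical round trip (not part of the original
   code; ZNODE_LOCK_LOPRI is assumed to be a valid znode_lock_request): seal
   a coord after a successful lookup, then try to bypass the next traversal. */
static int example_seal_roundtrip(coord_t *coord, const reiser4_key *key,
				  lock_handle *lh)
{
	seal_t seal;

	reiser4_seal_init(&seal, coord, key);
	/* ... locks are dropped and other work happens here ... */

	/* 0 means @coord is usable again; -E_REPEAT means a full tree
	 * traversal is required */
	return reiser4_seal_validate(&seal, coord, key, lh,
				     ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI);
}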
65255 +
65256 +/* helper functions */
65257 +
65258 +/* obtain reference to znode seal points to, if in cache */
65259 +static znode *seal_node(const seal_t * seal /* seal to query */ )
65260 +{
65261 + assert("nikita-1891", seal != NULL);
65262 + return zlook(current_tree, &seal->block);
65263 +}
65264 +
65265 +/* true if @seal version and @node version coincide */
65266 +static int seal_matches(const seal_t * seal /* seal to check */ ,
65267 + znode * node /* node to check */ )
65268 +{
65269 + int result;
65270 +
65271 + assert("nikita-1991", seal != NULL);
65272 + assert("nikita-1993", node != NULL);
65273 +
65274 + spin_lock_znode(node);
65275 + result = (seal->version == node->version);
65276 + spin_unlock_znode(node);
65277 + return result;
65278 +}
65279 +
65280 +/* Make Linus happy.
65281 + Local variables:
65282 + c-indentation-style: "K&R"
65283 + mode-name: "LC"
65284 + c-basic-offset: 8
65285 + tab-width: 8
65286 + fill-column: 120
65287 + scroll-step: 1
65288 + End:
65289 +*/
65290 diff --git a/fs/reiser4/seal.h b/fs/reiser4/seal.h
65291 new file mode 100644
65292 index 0000000..5c3c5e0
65293 --- /dev/null
65294 +++ b/fs/reiser4/seal.h
65295 @@ -0,0 +1,49 @@
65296 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
65297 +
65298 +/* Declaration of seals: "weak" tree pointers. See seal.c for comments. */
65299 +
65300 +#ifndef __SEAL_H__
65301 +#define __SEAL_H__
65302 +
65303 +#include "forward.h"
65304 +#include "debug.h"
65305 +#include "dformat.h"
65306 +#include "key.h"
65307 +#include "coord.h"
65308 +
65309 +/* for __u?? types */
65310 +/*#include <linux/types.h>*/
65311 +
65312 +/* seal. See comment at the top of seal.c */
65313 +typedef struct seal_s {
65314 + /* version of znode recorder at the time of seal creation */
65315 + __u64 version;
65316 + /* block number of znode attached to this seal */
65317 + reiser4_block_nr block;
65318 +#if REISER4_DEBUG
65319 + /* coord this seal is attached to. For debugging. */
65320 + coord_t coord1;
65321 + /* key this seal is attached to. For debugging. */
65322 + reiser4_key key;
65323 +#endif
65324 +} seal_t;
65325 +
65326 +extern void reiser4_seal_init(seal_t *, const coord_t *, const reiser4_key *);
65327 +extern void reiser4_seal_done(seal_t *);
65328 +extern int reiser4_seal_is_set(const seal_t *);
65329 +extern int reiser4_seal_validate(seal_t *, coord_t *,
65330 + const reiser4_key *, lock_handle *,
65331 + znode_lock_mode mode, znode_lock_request request);
65332 +
65333 +/* __SEAL_H__ */
65334 +#endif
65335 +
65336 +/* Make Linus happy.
65337 + Local variables:
65338 + c-indentation-style: "K&R"
65339 + mode-name: "LC"
65340 + c-basic-offset: 8
65341 + tab-width: 8
65342 + fill-column: 120
65343 + End:
65344 +*/
65345 diff --git a/fs/reiser4/search.c b/fs/reiser4/search.c
65346 new file mode 100644
65347 index 0000000..9d35e11
65348 --- /dev/null
65349 +++ b/fs/reiser4/search.c
65350 @@ -0,0 +1,1611 @@
65351 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
65352 + * reiser4/README */
65353 +
65354 +#include "forward.h"
65355 +#include "debug.h"
65356 +#include "dformat.h"
65357 +#include "key.h"
65358 +#include "coord.h"
65359 +#include "seal.h"
65360 +#include "plugin/item/item.h"
65361 +#include "plugin/node/node.h"
65362 +#include "plugin/plugin.h"
65363 +#include "jnode.h"
65364 +#include "znode.h"
65365 +#include "block_alloc.h"
65366 +#include "tree_walk.h"
65367 +#include "tree.h"
65368 +#include "reiser4.h"
65369 +#include "super.h"
65370 +#include "inode.h"
65371 +
65372 +#include <linux/slab.h>
65373 +
65374 +static const char *bias_name(lookup_bias bias);
65375 +
65376 +/* tree searching algorithm, intranode searching algorithms are in
65377 + plugin/node/ */
65378 +
65379 +/* tree lookup cache
65380 + *
65381 + * The coord-by-key cache consists of a small list of recently accessed nodes
65382 + * maintained according to the LRU discipline. Before doing a real top-to-down
65383 + * tree traversal, this cache is scanned for nodes that can contain the
65384 + * requested key.
65385 + *
65386 + * The efficiency of the coord cache depends heavily on locality of reference
65387 + * for tree accesses. Our user-level simulations show reasonably good hit
65388 + * ratios for the coord cache under most loads so far.
65389 + */
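/* For illustration, a hypothetical mount-time setup of this cache (not part
   of the original code): cbk_cache_init() allocates nr_slots slots, so the
   caller is assumed to size the cache first, e.g. with CBK_CACHE_SLOTS from
   reiser4.h. */
static int example_cbk_cache_setup(reiser4_tree *tree)
{
	tree->cbk_cache.nr_slots = CBK_CACHE_SLOTS;
	return cbk_cache_init(&tree->cbk_cache);
}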
65390 +
65391 +/* Initialise coord cache slot */
65392 +static void cbk_cache_init_slot(cbk_cache_slot *slot)
65393 +{
65394 + assert("nikita-345", slot != NULL);
65395 +
65396 + INIT_LIST_HEAD(&slot->lru);
65397 + slot->node = NULL;
65398 +}
65399 +
65400 +/* Initialize coord cache */
65401 +int cbk_cache_init(cbk_cache *cache /* cache to init */ )
65402 +{
65403 + int i;
65404 +
65405 + assert("nikita-346", cache != NULL);
65406 +
65407 + cache->slot =
65408 + kmalloc(sizeof(cbk_cache_slot) * cache->nr_slots,
65409 + reiser4_ctx_gfp_mask_get());
65410 + if (cache->slot == NULL)
65411 + return RETERR(-ENOMEM);
65412 +
65413 + INIT_LIST_HEAD(&cache->lru);
65414 + for (i = 0; i < cache->nr_slots; ++i) {
65415 + cbk_cache_init_slot(cache->slot + i);
65416 + list_add_tail(&((cache->slot + i)->lru), &cache->lru);
65417 + }
65418 + rwlock_init(&cache->guard);
65419 + return 0;
65420 +}
65421 +
65422 +/* free cbk cache data */
65423 +void cbk_cache_done(cbk_cache * cache /* cache to release */ )
65424 +{
65425 + assert("nikita-2493", cache != NULL);
65426 + if (cache->slot != NULL) {
65427 + kfree(cache->slot);
65428 + cache->slot = NULL;
65429 + }
65430 +}
65431 +
65432 +/* macro to iterate over all cbk cache slots */
65433 +#define for_all_slots(cache, slot) \
65434 + for ((slot) = list_entry((cache)->lru.next, cbk_cache_slot, lru); \
65435 + &(cache)->lru != &(slot)->lru; \
65436 + (slot) = list_entry(slot->lru.next, cbk_cache_slot, lru))
65437 +
65438 +#if REISER4_DEBUG
65439 +/* this function checks that the [cbk-cache-invariant] invariant holds */
65440 +static int cbk_cache_invariant(const cbk_cache *cache)
65441 +{
65442 + cbk_cache_slot *slot;
65443 + int result;
65444 + int unused;
65445 +
65446 + if (cache->nr_slots == 0)
65447 + return 1;
65448 +
65449 + assert("nikita-2469", cache != NULL);
65450 + unused = 0;
65451 + result = 1;
65452 + read_lock(&((cbk_cache *)cache)->guard);
65453 + for_all_slots(cache, slot) {
65454 + /* in LRU first go all `used' slots followed by `unused' */
65455 + if (unused && (slot->node != NULL))
65456 + result = 0;
65457 + if (slot->node == NULL)
65458 + unused = 1;
65459 + else {
65460 + cbk_cache_slot *scan;
65461 +
65462 + /* all cached nodes are different */
65463 + scan = slot;
65464 + while (result) {
65465 + scan = list_entry(scan->lru.next, cbk_cache_slot, lru);
65466 + if (&cache->lru == &scan->lru)
65467 + break;
65468 + if (slot->node == scan->node)
65469 + result = 0;
65470 + }
65471 + }
65472 + if (!result)
65473 + break;
65474 + }
65475 + read_unlock(&((cbk_cache *)cache)->guard);
65476 + return result;
65477 +}
65478 +
65479 +#endif
65480 +
65481 +/* Remove references, if any, to @node from coord cache */
65482 +void cbk_cache_invalidate(const znode * node /* node to remove from cache */ ,
65483 + reiser4_tree * tree /* tree to remove node from */ )
65484 +{
65485 + cbk_cache_slot *slot;
65486 + cbk_cache *cache;
65487 + int i;
65488 +
65489 + assert("nikita-350", node != NULL);
65490 + assert("nikita-1479", LOCK_CNT_GTZ(rw_locked_tree));
65491 +
65492 + cache = &tree->cbk_cache;
65493 + assert("nikita-2470", cbk_cache_invariant(cache));
65494 +
65495 + write_lock(&(cache->guard));
65496 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65497 + if (slot->node == node) {
65498 + list_move_tail(&slot->lru, &cache->lru);
65499 + slot->node = NULL;
65500 + break;
65501 + }
65502 + }
65503 + write_unlock(&(cache->guard));
65504 + assert("nikita-2471", cbk_cache_invariant(cache));
65505 +}
65506 +
65507 +/* Add information about "node" to the cbk-cache of its tree. This can
65508 +   actually be an update of an existing slot in the cache. */
65509 +static void cbk_cache_add(const znode *node /* node to add to the cache */ )
65510 +{
65511 + cbk_cache *cache;
65512 + cbk_cache_slot *slot;
65513 + int i;
65514 +
65515 + assert("nikita-352", node != NULL);
65516 +
65517 + cache = &znode_get_tree(node)->cbk_cache;
65518 + assert("nikita-2472", cbk_cache_invariant(cache));
65519 +
65520 + if (cache->nr_slots == 0)
65521 + return;
65522 +
65523 + write_lock(&(cache->guard));
65524 + /* find slot to update/add */
65525 + for (i = 0, slot = cache->slot; i < cache->nr_slots; ++i, ++slot) {
65526 + /* oops, this node is already in a cache */
65527 + if (slot->node == node)
65528 + break;
65529 + }
65530 + /* if all slots are used, reuse least recently used one */
65531 + if (i == cache->nr_slots) {
65532 + slot = list_entry(cache->lru.prev, cbk_cache_slot, lru);
65533 + slot->node = (znode *) node;
65534 + }
65535 + list_move(&slot->lru, &cache->lru);
65536 + write_unlock(&(cache->guard));
65537 + assert("nikita-2473", cbk_cache_invariant(cache));
65538 +}
65539 +
65540 +static int setup_delimiting_keys(cbk_handle * h);
65541 +static lookup_result coord_by_handle(cbk_handle * handle);
65542 +static lookup_result traverse_tree(cbk_handle * h);
65543 +static int cbk_cache_search(cbk_handle * h);
65544 +
65545 +static level_lookup_result cbk_level_lookup(cbk_handle * h);
65546 +static level_lookup_result cbk_node_lookup(cbk_handle * h);
65547 +
65548 +/* helper functions */
65549 +
65550 +static void update_stale_dk(reiser4_tree * tree, znode * node);
65551 +
65552 +/* release parent node during traversal */
65553 +static void put_parent(cbk_handle * h);
65554 +/* check consistency of fields */
65555 +static int sanity_check(cbk_handle * h);
65556 +/* release resources in handle */
65557 +static void hput(cbk_handle * h);
65558 +
65559 +static level_lookup_result search_to_left(cbk_handle * h);
65560 +
65561 +/* pack numerous (numberous I should say) arguments of coord_by_key() into
65562 + * cbk_handle */
65563 +static cbk_handle *cbk_pack(cbk_handle * handle,
65564 + reiser4_tree * tree,
65565 + const reiser4_key * key,
65566 + coord_t * coord,
65567 + lock_handle * active_lh,
65568 + lock_handle * parent_lh,
65569 + znode_lock_mode lock_mode,
65570 + lookup_bias bias,
65571 + tree_level lock_level,
65572 + tree_level stop_level,
65573 + __u32 flags, ra_info_t * info)
65574 +{
65575 + memset(handle, 0, sizeof *handle);
65576 +
65577 + handle->tree = tree;
65578 + handle->key = key;
65579 + handle->lock_mode = lock_mode;
65580 + handle->bias = bias;
65581 + handle->lock_level = lock_level;
65582 + handle->stop_level = stop_level;
65583 + handle->coord = coord;
65584 + /* set flags. See comment in tree.h:cbk_flags */
65585 + handle->flags = flags | CBK_TRUST_DK | CBK_USE_CRABLOCK;
65586 +
65587 + handle->active_lh = active_lh;
65588 + handle->parent_lh = parent_lh;
65589 + handle->ra_info = info;
65590 + return handle;
65591 +}
65592 +
65593 +/* main tree lookup procedure
65594 +
65595 +   Check the coord cache. If the key we are looking for is not found there,
65596 +   call traverse_tree() to do the real tree traversal.
65597 +
65598 + As we have extents on the twig level, @lock_level and @stop_level can
65599 + be different from LEAF_LEVEL and each other.
65600 +
65601 + Thread cannot keep any reiser4 locks (tree, znode, dk spin-locks, or znode
65602 + long term locks) while calling this.
65603 +*/
65604 +lookup_result coord_by_key(reiser4_tree * tree /* tree to perform search
65605 + * in. Usually this tree is
65606 + * part of file-system
65607 + * super-block */ ,
65608 + const reiser4_key * key /* key to look for */ ,
65609 + coord_t * coord /* where to store found
65610 + * position in a tree. Fields
65611 + * in "coord" are only valid if
65612 + * coord_by_key() returned
65613 + * "CBK_COORD_FOUND" */ ,
65614 + lock_handle * lh, /* resulting lock handle */
65615 + znode_lock_mode lock_mode /* type of lookup we
65616 + * want on node. Pass
65617 + * ZNODE_READ_LOCK here
65618 + * if you only want to
65619 + * read item found and
65620 + * ZNODE_WRITE_LOCK if
65621 + * you want to modify
65622 + * it */ ,
65623 + lookup_bias bias /* what to return if coord
65624 + * with exactly the @key is
65625 + * not in the tree */ ,
65626 + tree_level lock_level /* tree level where to start
65627 + * taking @lock type of
65628 + * locks */ ,
65629 + tree_level stop_level /* tree level to stop. Pass
65630 + * LEAF_LEVEL or TWIG_LEVEL
65631 + * here Item being looked
65632 + * for has to be between
65633 + * @lock_level and
65634 + * @stop_level, inclusive */ ,
65635 + __u32 flags /* search flags */ ,
65636 + ra_info_t *
65637 + info
65638 + /* information about desired tree traversal readahead */
65639 + )
65640 +{
65641 + cbk_handle handle;
65642 + lock_handle parent_lh;
65643 + lookup_result result;
65644 +
65645 + init_lh(lh);
65646 + init_lh(&parent_lh);
65647 +
65648 + assert("nikita-3023", reiser4_schedulable());
65649 +
65650 + assert("nikita-353", tree != NULL);
65651 + assert("nikita-354", key != NULL);
65652 + assert("nikita-355", coord != NULL);
65653 + assert("nikita-356", (bias == FIND_EXACT)
65654 + || (bias == FIND_MAX_NOT_MORE_THAN));
65655 + assert("nikita-357", stop_level >= LEAF_LEVEL);
65656 + /* no locks can be held during tree traversal */
65657 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65658 +
65659 + cbk_pack(&handle,
65660 + tree,
65661 + key,
65662 + coord,
65663 + lh,
65664 + &parent_lh,
65665 + lock_mode, bias, lock_level, stop_level, flags, info);
65666 +
65667 + result = coord_by_handle(&handle);
65668 + assert("nikita-3247",
65669 + ergo(!IS_CBKERR(result), coord->node == lh->node));
65670 + return result;
65671 +}
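+
+/* Sketch of a typical caller (illustrative only; the key setup is elided,
+   and the flag and level choices are assumptions for the example): */
+#if 0
+static int example_lookup(reiser4_tree *tree, const reiser4_key *key)
+{
+	coord_t coord;
+	lock_handle lh;
+	lookup_result ret;
+
+	/* coord_by_key() init_lh()-s @lh itself; the caller releases it */
+	ret = coord_by_key(tree, key, &coord, &lh, ZNODE_READ_LOCK,
+			   FIND_EXACT, LEAF_LEVEL, LEAF_LEVEL,
+			   CBK_UNIQUE, NULL /* no readahead info */);
+	if (ret == CBK_COORD_FOUND) {
+		/* @coord points into lh.node; use it while the lock is held */
+	}
+	done_lh(&lh);
+	return ret;
+}
+#endif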
65672 +
65673 +/* like coord_by_key(), but starts traversal from vroot of @object rather than
65674 + * from tree root. */
65675 +lookup_result reiser4_object_lookup(struct inode * object,
65676 + const reiser4_key * key,
65677 + coord_t * coord,
65678 + lock_handle * lh,
65679 + znode_lock_mode lock_mode,
65680 + lookup_bias bias,
65681 + tree_level lock_level,
65682 + tree_level stop_level, __u32 flags,
65683 + ra_info_t * info)
65684 +{
65685 + cbk_handle handle;
65686 + lock_handle parent_lh;
65687 + lookup_result result;
65688 +
65689 + init_lh(lh);
65690 + init_lh(&parent_lh);
65691 +
65692 + assert("nikita-3023", reiser4_schedulable());
65693 +
65694 + assert("nikita-354", key != NULL);
65695 + assert("nikita-355", coord != NULL);
65696 + assert("nikita-356", (bias == FIND_EXACT)
65697 + || (bias == FIND_MAX_NOT_MORE_THAN));
65698 + assert("nikita-357", stop_level >= LEAF_LEVEL);
65699 + /* no locks can be held during tree search by key */
65700 + assert("nikita-2104", lock_stack_isclean(get_current_lock_stack()));
65701 +
65702 + cbk_pack(&handle,
65703 + object != NULL ? reiser4_tree_by_inode(object) : current_tree,
65704 + key,
65705 + coord,
65706 + lh,
65707 + &parent_lh,
65708 + lock_mode, bias, lock_level, stop_level, flags, info);
65709 + handle.object = object;
65710 +
65711 + result = coord_by_handle(&handle);
65712 + assert("nikita-3247",
65713 + ergo(!IS_CBKERR(result), coord->node == lh->node));
65714 + return result;
65715 +}
65716 +
65717 +/* lookup by cbk_handle. Common part of coord_by_key() and
65718 + reiser4_object_lookup(). */
65719 +static lookup_result coord_by_handle(cbk_handle * handle)
65720 +{
65721 +	/*
65722 +	 * first check the cbk_cache (which is a look-aside cache for our
65723 +	 * tree) and, if this fails, start a traversal.
65724 +	 */
65725 + /* first check whether "key" is in cache of recent lookups. */
65726 + if (cbk_cache_search(handle) == 0)
65727 + return handle->result;
65728 + else
65729 + return traverse_tree(handle);
65730 +}
65731 +
65732 +/* Execute actor for each item (or unit, depending on @through_units_p),
65733 + starting from @coord, right-ward, until either:
65734 +
65735 + - end of the tree is reached
65736 + - unformatted node is met
65737 + - error occurred
65738 + - @actor returns 0 or less
65739 +
65740 +   An error code, or the last actor return value, is returned.
65741 +
65742 +   This is used by plugin/dir/hashed_dir.c:reiser4_find_entry() to move
65743 +   through a sequence of entries with identical keys and the like.
65744 +*/
65745 +int reiser4_iterate_tree(reiser4_tree * tree /* tree to scan */ ,
65746 + coord_t * coord /* coord to start from */ ,
65747 + lock_handle * lh /* lock handle to start with and to
65748 + * update along the way */ ,
65749 + tree_iterate_actor_t actor /* function to call on each
65750 + * item/unit */ ,
65751 + void *arg /* argument to pass to @actor */ ,
65752 + znode_lock_mode mode /* lock mode on scanned nodes */ ,
65753 + int through_units_p /* call @actor on each item or on
65754 + * each unit */ )
65755 +{
65756 + int result;
65757 +
65758 + assert("nikita-1143", tree != NULL);
65759 + assert("nikita-1145", coord != NULL);
65760 + assert("nikita-1146", lh != NULL);
65761 + assert("nikita-1147", actor != NULL);
65762 +
65763 + result = zload(coord->node);
65764 + coord_clear_iplug(coord);
65765 + if (result != 0)
65766 + return result;
65767 + if (!coord_is_existing_unit(coord)) {
65768 + zrelse(coord->node);
65769 + return -ENOENT;
65770 + }
65771 + while ((result = actor(tree, coord, lh, arg)) > 0) {
65772 + /* move further */
65773 + if ((through_units_p && coord_next_unit(coord)) ||
65774 + (!through_units_p && coord_next_item(coord))) {
65775 + do {
65776 + lock_handle couple;
65777 +
65778 + /* move to the next node */
65779 + init_lh(&couple);
65780 + result =
65781 + reiser4_get_right_neighbor(&couple,
65782 + coord->node,
65783 + (int)mode,
65784 + GN_CAN_USE_UPPER_LEVELS);
65785 + zrelse(coord->node);
65786 + if (result == 0) {
65787 +
65788 + result = zload(couple.node);
65789 + if (result != 0) {
65790 + done_lh(&couple);
65791 + return result;
65792 + }
65793 +
65794 + coord_init_first_unit(coord,
65795 + couple.node);
65796 + done_lh(lh);
65797 + move_lh(lh, &couple);
65798 + } else
65799 + return result;
65800 + } while (node_is_empty(coord->node));
65801 + }
65802 +
65803 + assert("nikita-1149", coord_is_existing_unit(coord));
65804 + }
65805 + zrelse(coord->node);
65806 + return result;
65807 +}
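+
+/* Sketch of a tree_iterate_actor_t (hypothetical): a positive return value
+   means "keep going", zero or less stops the iteration, matching the loop
+   above. The counting logic is purely illustrative. */
+#if 0
+static int example_count_actor(reiser4_tree *tree, coord_t *coord,
+			       lock_handle *lh, void *arg)
+{
+	int *count = arg;
+
+	++*count;
+	return (*count < 100) ? 1 : 0;	/* stop after 100 items/units */
+}
+#endif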
65808 +
65809 +/* return locked uber znode for @tree */
65810 +int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
65811 + znode_lock_request pri, lock_handle * lh)
65812 +{
65813 + int result;
65814 +
65815 + result = longterm_lock_znode(lh, tree->uber, mode, pri);
65816 + return result;
65817 +}
65818 +
65819 +/* true if @key is strictly within @node
65820 +
65821 +   we are looking for a possibly non-unique key, and the item is at the edge
65822 +   of @node. Maybe it is in the neighbor.
65823 +*/
65824 +static int znode_contains_key_strict(znode * node /* node to check key
65825 + * against */ ,
65826 + const reiser4_key *
65827 + key /* key to check */ ,
65828 + int isunique)
65829 +{
65830 + int answer;
65831 +
65832 + assert("nikita-1760", node != NULL);
65833 + assert("nikita-1722", key != NULL);
65834 +
65835 + if (keyge(key, &node->rd_key))
65836 + return 0;
65837 +
65838 + answer = keycmp(&node->ld_key, key);
65839 +
65840 + if (isunique)
65841 + return answer != GREATER_THAN;
65842 + else
65843 + return answer == LESS_THAN;
65844 +}
65845 +
65846 +/*
65847 + * Virtual Root (vroot) code.
65848 + *
65849 + * For a given file system object (e.g., regular file or directory) let's
65850 + * define its "virtual root" as the lowest node in the tree (that is, the
65851 + * one farthest from the tree root) such that all body items of said
65852 + * object are located in a tree rooted at this node.
65853 + *
65854 + * Once the vroot of an object is found, all tree lookups for items within
65855 + * the body of this object ("object lookups") can be started from its
65856 + * vroot rather than from the real root. This has the following advantages:
65857 + *
65858 + * 1. the number of nodes traversed during lookup (and, hence, the number
65859 + * of key comparisons made) decreases, and
65860 + *
65861 + * 2. contention on the tree root is decreased. The latter was actually
65862 + * the motivating reason behind vroot, because the spin lock of the root
65863 + * node, which is taken when acquiring a long-term lock on it, is the
65864 + * hottest lock in reiser4.
65865 + *
65866 + * How to find vroot.
65867 + *
65868 + * When vroot of object F is not yet determined, all object lookups start
65869 + * from the root of the tree. At each tree level during traversal we have
65870 + * a node N such that a key we are looking for (which is the key inside
65871 + * object's body) is located within N. In function handle_vroot() called
65872 + * from cbk_level_lookup() we check whether N is possible vroot for
65873 + * F. Check is trivial---if neither leftmost nor rightmost item of N
65874 + * belongs to F (and we already have helpful ->owns_item() method of
65875 + * the object plugin for this), then N is a possible vroot of F. This, of
65876 + * course, relies on the assumption that each object occupies a contiguous
65877 + * range of keys in the tree.
65878 + *
65879 + * Thus, traversing tree downward and checking each node as we go, we can
65880 + * find lowest such node, which, by definition, is vroot.
65881 + *
65882 + * How to track vroot.
65883 + *
65884 + * Nohow. If actual vroot changes, next object lookup will just restart
65885 + * from the actual tree root, refreshing object's vroot along the way.
65886 + *
65887 + */
65888 +
65889 +/*
65890 + * Check whether @node is possible vroot of @object.
65891 + */
65892 +static void handle_vroot(struct inode *object, znode * node)
65893 +{
65894 + file_plugin *fplug;
65895 + coord_t coord;
65896 +
65897 + fplug = inode_file_plugin(object);
65898 + assert("nikita-3353", fplug != NULL);
65899 + assert("nikita-3354", fplug->owns_item != NULL);
65900 +
65901 + if (unlikely(node_is_empty(node)))
65902 + return;
65903 +
65904 + coord_init_first_unit(&coord, node);
65905 + /*
65906 + * if leftmost item of @node belongs to @object, we cannot be sure
65907 +	 * that @node is vroot of @object, because some items of @object are
65908 + * probably in the sub-tree rooted at the left neighbor of @node.
65909 + */
65910 + if (fplug->owns_item(object, &coord))
65911 + return;
65912 + coord_init_last_unit(&coord, node);
65913 + /* mutatis mutandis for the rightmost item */
65914 + if (fplug->owns_item(object, &coord))
65915 + return;
65916 + /* otherwise, @node is possible vroot of @object */
65917 + inode_set_vroot(object, node);
65918 +}
65919 +
65920 +/*
65921 + * helper function used by traverse_tree() to start tree traversal not from the
65922 + * tree root, but from @h->object's vroot, if possible.
65923 + */
65924 +static int prepare_object_lookup(cbk_handle * h)
65925 +{
65926 + znode *vroot;
65927 + int result;
65928 +
65929 + vroot = inode_get_vroot(h->object);
65930 + if (vroot == NULL) {
65931 + /*
65932 + * object doesn't have known vroot, start from real tree root.
65933 + */
65934 + return LOOKUP_CONT;
65935 + }
65936 +
65937 + h->level = znode_get_level(vroot);
65938 + /* take a long-term lock on vroot */
65939 + h->result = longterm_lock_znode(h->active_lh, vroot,
65940 + cbk_lock_mode(h->level, h),
65941 + ZNODE_LOCK_LOPRI);
65942 + result = LOOKUP_REST;
65943 + if (h->result == 0) {
65944 + int isunique;
65945 + int inside;
65946 +
65947 + isunique = h->flags & CBK_UNIQUE;
65948 + /* check that key is inside vroot */
65949 + read_lock_dk(h->tree);
65950 + inside = (znode_contains_key_strict(vroot, h->key, isunique) &&
65951 + !ZF_ISSET(vroot, JNODE_HEARD_BANSHEE));
65952 + read_unlock_dk(h->tree);
65953 + if (inside) {
65954 + h->result = zload(vroot);
65955 + if (h->result == 0) {
65956 + /* search for key in vroot. */
65957 + result = cbk_node_lookup(h);
65958 + zrelse(vroot); /*h->active_lh->node); */
65959 + if (h->active_lh->node != vroot) {
65960 + result = LOOKUP_REST;
65961 + } else if (result == LOOKUP_CONT) {
65962 + move_lh(h->parent_lh, h->active_lh);
65963 + h->flags &= ~CBK_DKSET;
65964 + }
65965 + }
65966 + }
65967 + }
65968 +
65969 + zput(vroot);
65970 +
65971 + if (IS_CBKERR(h->result) || result == LOOKUP_REST)
65972 + hput(h);
65973 + return result;
65974 +}
65975 +
65976 +/* main function that handles common parts of tree traversal: starting
65977 + (fake znode handling), restarts, error handling, completion */
65978 +static lookup_result traverse_tree(cbk_handle * h /* search handle */ )
65979 +{
65980 + int done;
65981 + int iterations;
65982 + int vroot_used;
65983 +
65984 + assert("nikita-365", h != NULL);
65985 + assert("nikita-366", h->tree != NULL);
65986 + assert("nikita-367", h->key != NULL);
65987 + assert("nikita-368", h->coord != NULL);
65988 + assert("nikita-369", (h->bias == FIND_EXACT)
65989 + || (h->bias == FIND_MAX_NOT_MORE_THAN));
65990 + assert("nikita-370", h->stop_level >= LEAF_LEVEL);
65991 + assert("nikita-2949", !(h->flags & CBK_DKSET));
65992 + assert("zam-355", lock_stack_isclean(get_current_lock_stack()));
65993 +
65994 + done = 0;
65995 + iterations = 0;
65996 + vroot_used = 0;
65997 +
65998 + /* loop for restarts */
65999 + restart:
66000 +
66001 + assert("nikita-3024", reiser4_schedulable());
66002 +
66003 + h->result = CBK_COORD_FOUND;
66004 + /* connect_znode() needs it */
66005 + h->ld_key = *reiser4_min_key();
66006 + h->rd_key = *reiser4_max_key();
66007 + h->flags |= CBK_DKSET;
66008 + h->error = NULL;
66009 +
66010 + if (!vroot_used && h->object != NULL) {
66011 + vroot_used = 1;
66012 + done = prepare_object_lookup(h);
66013 + if (done == LOOKUP_REST) {
66014 + goto restart;
66015 + } else if (done == LOOKUP_DONE)
66016 + return h->result;
66017 + }
66018 + if (h->parent_lh->node == NULL) {
66019 + done =
66020 + get_uber_znode(h->tree, ZNODE_READ_LOCK, ZNODE_LOCK_LOPRI,
66021 + h->parent_lh);
66022 +
66023 + assert("nikita-1637", done != -E_DEADLOCK);
66024 +
66025 + h->block = h->tree->root_block;
66026 + h->level = h->tree->height;
66027 + h->coord->node = h->parent_lh->node;
66028 +
66029 + if (done != 0)
66030 + return done;
66031 + }
66032 +
66033 + /* loop descending a tree */
66034 + while (!done) {
66035 +
66036 + if (unlikely((iterations > REISER4_CBK_ITERATIONS_LIMIT) &&
66037 + IS_POW(iterations))) {
66038 + warning("nikita-1481", "Too many iterations: %i",
66039 + iterations);
66040 + reiser4_print_key("key", h->key);
66041 + ++iterations;
66042 + } else if (unlikely(iterations > REISER4_MAX_CBK_ITERATIONS)) {
66043 + h->error =
66044 + "reiser-2018: Too many iterations. Tree corrupted, or (less likely) starvation occurring.";
66045 + h->result = RETERR(-EIO);
66046 + break;
66047 + }
66048 + switch (cbk_level_lookup(h)) {
66049 + case LOOKUP_CONT:
66050 + move_lh(h->parent_lh, h->active_lh);
66051 + continue;
66052 + default:
66053 + wrong_return_value("nikita-372", "cbk_level");
66054 + case LOOKUP_DONE:
66055 + done = 1;
66056 + break;
66057 + case LOOKUP_REST:
66058 + hput(h);
66059 +			/* deadlock avoidance is the normal case. */
66060 + if (h->result != -E_DEADLOCK)
66061 + ++iterations;
66062 + reiser4_preempt_point();
66063 + goto restart;
66064 + }
66065 + }
66066 + /* that's all. The rest is error handling */
66067 + if (unlikely(h->error != NULL)) {
66068 + warning("nikita-373", "%s: level: %i, "
66069 + "lock_level: %i, stop_level: %i "
66070 + "lock_mode: %s, bias: %s",
66071 + h->error, h->level, h->lock_level, h->stop_level,
66072 + lock_mode_name(h->lock_mode), bias_name(h->bias));
66073 + reiser4_print_address("block", &h->block);
66074 + reiser4_print_key("key", h->key);
66075 + print_coord_content("coord", h->coord);
66076 + }
66077 + /* `unlikely' error case */
66078 + if (unlikely(IS_CBKERR(h->result))) {
66079 + /* failure. do cleanup */
66080 + hput(h);
66081 + } else {
66082 + assert("nikita-1605", WITH_DATA_RET
66083 + (h->coord->node, 1,
66084 + ergo((h->result == CBK_COORD_FOUND) &&
66085 + (h->bias == FIND_EXACT) &&
66086 + (!node_is_empty(h->coord->node)),
66087 + coord_is_existing_item(h->coord))));
66088 + }
66089 + return h->result;
66090 +}
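+
+/* Control flow of traverse_tree(), in outline (illustrative summary only):
+
+	restart:
+		reset per-attempt state (CBK_DKSET, min/max delimiting keys);
+		try the vroot shortcut once (prepare_object_lookup());
+		lock the uber znode if there is no parent node yet;
+		while (!done)
+			descend one level via cbk_level_lookup(),
+			restarting on -E_DEADLOCK or when the key moved;
+*/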
66091 +
66092 +/* find delimiting keys of child
66093 +
66094 + Determine left and right delimiting keys for child pointed to by
66095 + @parent_coord.
66096 +
66097 +*/
66098 +static void find_child_delimiting_keys(znode * parent /* parent znode, passed
66099 + * locked */ ,
66100 + const coord_t * parent_coord /* coord where
66101 + * pointer to
66102 + * child is
66103 + * stored */ ,
66104 + reiser4_key * ld /* where to store left
66105 + * delimiting key */ ,
66106 + reiser4_key * rd /* where to store right
66107 + * delimiting key */ )
66108 +{
66109 + coord_t neighbor;
66110 +
66111 + assert("nikita-1484", parent != NULL);
66112 + assert_rw_locked(&(znode_get_tree(parent)->dk_lock));
66113 +
66114 + coord_dup(&neighbor, parent_coord);
66115 +
66116 + if (neighbor.between == AT_UNIT)
66117 + /* imitate item ->lookup() behavior. */
66118 + neighbor.between = AFTER_UNIT;
66119 +
66120 + if (coord_set_to_left(&neighbor) == 0)
66121 + unit_key_by_coord(&neighbor, ld);
66122 + else {
66123 + assert("nikita-14851", 0);
66124 + *ld = *znode_get_ld_key(parent);
66125 + }
66126 +
66127 + coord_dup(&neighbor, parent_coord);
66128 + if (neighbor.between == AT_UNIT)
66129 + neighbor.between = AFTER_UNIT;
66130 + if (coord_set_to_right(&neighbor) == 0)
66131 + unit_key_by_coord(&neighbor, rd);
66132 + else
66133 + *rd = *znode_get_rd_key(parent);
66134 +}
66135 +
66136 +/*
66137 + * setup delimiting keys for a child
66138 + *
66139 + * @parent parent node
66140 + *
66141 + * @coord location in @parent where pointer to @child is
66142 + *
66143 + * @child child node
66144 + */
66145 +int
66146 +set_child_delimiting_keys(znode * parent, const coord_t * coord, znode * child)
66147 +{
66148 + reiser4_tree *tree;
66149 +
66150 + assert("nikita-2952",
66151 + znode_get_level(parent) == znode_get_level(coord->node));
66152 +
66153 + /* fast check without taking dk lock. This is safe, because
66154 + * JNODE_DKSET is never cleared once set. */
66155 + if (!ZF_ISSET(child, JNODE_DKSET)) {
66156 + tree = znode_get_tree(parent);
66157 + write_lock_dk(tree);
66158 + if (likely(!ZF_ISSET(child, JNODE_DKSET))) {
66159 + find_child_delimiting_keys(parent, coord,
66160 + &child->ld_key,
66161 + &child->rd_key);
66162 + ON_DEBUG(child->ld_key_version =
66163 + atomic_inc_return(&delim_key_version);
66164 + child->rd_key_version =
66165 + atomic_inc_return(&delim_key_version););
66166 + ZF_SET(child, JNODE_DKSET);
66167 + }
66168 + write_unlock_dk(tree);
66169 + return 1;
66170 + }
66171 + return 0;
66172 +}
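+
+/* The check-lock-recheck sequence above is the double-checked locking
+   idiom; it is safe here only because JNODE_DKSET goes monotonically from
+   clear to set. Generic shape (illustrative):
+
+	if (!flag_set(obj)) {
+		lock(guard);
+		if (!flag_set(obj)) {
+			initialize(obj);
+			set_flag(obj);
+		}
+		unlock(guard);
+	}
+*/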
66173 +
66174 +/* Perform tree lookup at one level. This is called from traverse_tree(),
66175 +   the function that drives the lookup through the tree and calls
66176 +   cbk_node_lookup() to perform the lookup within one node.
66177 +
66178 +   See comments in the code.
66179 +*/
66180 +static level_lookup_result cbk_level_lookup(cbk_handle * h /* search handle */ )
66181 +{
66182 + int ret;
66183 + int setdk;
66184 + int ldkeyset = 0;
66185 + reiser4_key ldkey;
66186 + reiser4_key key;
66187 + znode *active;
66188 +
66189 + assert("nikita-3025", reiser4_schedulable());
66190 +
66191 + /* acquire reference to @active node */
66192 + active =
66193 + zget(h->tree, &h->block, h->parent_lh->node, h->level,
66194 + reiser4_ctx_gfp_mask_get());
66195 +
66196 + if (IS_ERR(active)) {
66197 + h->result = PTR_ERR(active);
66198 + return LOOKUP_DONE;
66199 + }
66200 +
66201 + /* lock @active */
66202 + h->result = longterm_lock_znode(h->active_lh,
66203 + active,
66204 + cbk_lock_mode(h->level, h),
66205 + ZNODE_LOCK_LOPRI);
66206 + /* longterm_lock_znode() acquires additional reference to znode (which
66207 + will be later released by longterm_unlock_znode()). Release
66208 + reference acquired by zget().
66209 + */
66210 + zput(active);
66211 + if (unlikely(h->result != 0))
66212 + goto fail_or_restart;
66213 +
66214 + setdk = 0;
66215 + /* if @active is accessed for the first time, setup delimiting keys on
66216 + it. Delimiting keys are taken from the parent node. See
66217 + setup_delimiting_keys() for details.
66218 + */
66219 + if (h->flags & CBK_DKSET) {
66220 + setdk = setup_delimiting_keys(h);
66221 + h->flags &= ~CBK_DKSET;
66222 + } else {
66223 + znode *parent;
66224 +
66225 + parent = h->parent_lh->node;
66226 + h->result = zload(parent);
66227 + if (unlikely(h->result != 0))
66228 + goto fail_or_restart;
66229 +
66230 + if (!ZF_ISSET(active, JNODE_DKSET))
66231 + setdk = set_child_delimiting_keys(parent,
66232 + h->coord, active);
66233 + else {
66234 + read_lock_dk(h->tree);
66235 + find_child_delimiting_keys(parent, h->coord, &ldkey,
66236 + &key);
66237 + read_unlock_dk(h->tree);
66238 + ldkeyset = 1;
66239 + }
66240 + zrelse(parent);
66241 + }
66242 +
66243 +	/* this is an ugly kludge. Reminder: it is necessary because the
66244 +	   ->lookup() method returns a coord with the ->between field possibly
66245 +	   set to something other than AT_UNIT.
66246 +	 */
66247 + h->coord->between = AT_UNIT;
66248 +
66249 + if (znode_just_created(active) && (h->coord->node != NULL)) {
66250 + write_lock_tree(h->tree);
66251 + /* if we are going to load znode right now, setup
66252 + ->in_parent: coord where pointer to this node is stored in
66253 + parent.
66254 + */
66255 + coord_to_parent_coord(h->coord, &active->in_parent);
66256 + write_unlock_tree(h->tree);
66257 + }
66258 +
66259 + /* check connectedness without holding tree lock---false negatives
66260 + * will be re-checked by connect_znode(), and false positives are
66261 + * impossible---@active cannot suddenly turn into unconnected
66262 + * state. */
66263 + if (!znode_is_connected(active)) {
66264 + h->result = connect_znode(h->coord, active);
66265 + if (unlikely(h->result != 0)) {
66266 + put_parent(h);
66267 + goto fail_or_restart;
66268 + }
66269 + }
66270 +
66271 + jload_prefetch(ZJNODE(active));
66272 +
66273 + if (setdk)
66274 + update_stale_dk(h->tree, active);
66275 +
66276 + /* put_parent() cannot be called earlier, because connect_znode()
66277 + assumes parent node is referenced; */
66278 + put_parent(h);
66279 +
66280 + if ((!znode_contains_key_lock(active, h->key) &&
66281 + (h->flags & CBK_TRUST_DK))
66282 + || ZF_ISSET(active, JNODE_HEARD_BANSHEE)) {
66283 + /* 1. key was moved out of this node while this thread was
66284 + waiting for the lock. Restart. More elaborate solution is
66285 + to determine where key moved (to the left, or to the right)
66286 + and try to follow it through sibling pointers.
66287 +
66288 + 2. or, node itself is going to be removed from the
66289 + tree. Release lock and restart.
66290 + */
66291 + h->result = -E_REPEAT;
66292 + }
66293 + if (h->result == -E_REPEAT)
66294 + return LOOKUP_REST;
66295 +
66296 + h->result = zload_ra(active, h->ra_info);
66297 + if (h->result) {
66298 + return LOOKUP_DONE;
66299 + }
66300 +
66301 + /* sanity checks */
66302 + if (sanity_check(h)) {
66303 + zrelse(active);
66304 + return LOOKUP_DONE;
66305 + }
66306 +
66307 + /* check that key of leftmost item in the @active is the same as in
66308 + * its parent */
66309 + if (ldkeyset && !node_is_empty(active) &&
66310 + !keyeq(leftmost_key_in_node(active, &key), &ldkey)) {
66311 + warning("vs-3533", "Keys are inconsistent. Fsck?");
66312 + reiser4_print_key("inparent", &ldkey);
66313 + reiser4_print_key("inchild", &key);
66314 + h->result = RETERR(-EIO);
66315 + zrelse(active);
66316 + return LOOKUP_DONE;
66317 + }
66318 +
66319 + if (h->object != NULL)
66320 + handle_vroot(h->object, active);
66321 +
66322 + ret = cbk_node_lookup(h);
66323 +
66324 + /* h->active_lh->node might change, but active is yet to be zrelsed */
66325 + zrelse(active);
66326 +
66327 + return ret;
66328 +
66329 + fail_or_restart:
66330 + if (h->result == -E_DEADLOCK)
66331 + return LOOKUP_REST;
66332 + return LOOKUP_DONE;
66333 +}
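+
+/* One level of descent, in outline (illustrative summary of the above):
+
+	zget() and longterm_lock_znode() the child;
+	set up or verify its delimiting keys;
+	connect_znode() if needed, then put_parent();
+	restart (-E_REPEAT) if the key moved or the node is dying;
+	zload_ra() the node and hand off to cbk_node_lookup();
+*/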
66334 +
66335 +#if REISER4_DEBUG
66336 +/* check left and right delimiting keys of a znode */
66337 +void check_dkeys(znode * node)
66338 +{
66339 + znode *left;
66340 + znode *right;
66341 +
66342 + read_lock_tree(current_tree);
66343 + read_lock_dk(current_tree);
66344 +
66345 + assert("vs-1710", znode_is_any_locked(node));
66346 + assert("vs-1197",
66347 + !keygt(znode_get_ld_key(node), znode_get_rd_key(node)));
66348 +
66349 + left = node->left;
66350 + right = node->right;
66351 +
66352 + if (ZF_ISSET(node, JNODE_LEFT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
66353 + && left != NULL && ZF_ISSET(left, JNODE_DKSET))
66354 +		/* check left neighbor. Note that the left neighbor is not
66355 +		   locked, so its delimiting keys may therefore be stale */
66356 + assert("vs-1198",
66357 + (keyeq(znode_get_rd_key(left), znode_get_ld_key(node))
66358 + || ZF_ISSET(left, JNODE_HEARD_BANSHEE)));
66359 +
66360 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) && ZF_ISSET(node, JNODE_DKSET)
66361 + && right != NULL && ZF_ISSET(right, JNODE_DKSET))
66362 +		/* check right neighbor. Note that the right neighbor is not
66363 +		   locked, so its delimiting keys may therefore be stale */
66364 + assert("vs-1199",
66365 + (keyeq(znode_get_rd_key(node), znode_get_ld_key(right))
66366 + || ZF_ISSET(right, JNODE_HEARD_BANSHEE)));
66367 +
66368 + read_unlock_dk(current_tree);
66369 + read_unlock_tree(current_tree);
66370 +}
66371 +#endif
66372 +
66373 +/* true if @key is left delimiting key of @node */
66374 +static int key_is_ld(znode * node, const reiser4_key * key)
66375 +{
66376 + int ld;
66377 +
66378 + assert("nikita-1716", node != NULL);
66379 + assert("nikita-1758", key != NULL);
66380 +
66381 + read_lock_dk(znode_get_tree(node));
66382 + assert("nikita-1759", znode_contains_key(node, key));
66383 + ld = keyeq(znode_get_ld_key(node), key);
66384 + read_unlock_dk(znode_get_tree(node));
66385 + return ld;
66386 +}
66387 +
66388 +/* Process one node during tree traversal.
66389 +
66390 + This is called by cbk_level_lookup(). */
66391 +static level_lookup_result cbk_node_lookup(cbk_handle * h /* search handle */ )
66392 +{
66393 + /* node plugin of @active */
66394 + node_plugin *nplug;
66395 + /* item plugin of item that was found */
66396 + item_plugin *iplug;
66397 + /* search bias */
66398 + lookup_bias node_bias;
66399 + /* node we are operating upon */
66400 + znode *active;
66401 + /* tree we are searching in */
66402 + reiser4_tree *tree;
66403 + /* result */
66404 + int result;
66405 +
66406 + assert("nikita-379", h != NULL);
66407 +
66408 + active = h->active_lh->node;
66409 + tree = h->tree;
66410 +
66411 + nplug = active->nplug;
66412 + assert("nikita-380", nplug != NULL);
66413 +
66414 + ON_DEBUG(check_dkeys(active));
66415 +
66416 + /* return item from "active" node with maximal key not greater than
66417 + "key" */
66418 + node_bias = h->bias;
66419 + result = nplug->lookup(active, h->key, node_bias, h->coord);
66420 + if (unlikely(result != NS_FOUND && result != NS_NOT_FOUND)) {
66421 + /* error occurred */
66422 + h->result = result;
66423 + return LOOKUP_DONE;
66424 + }
66425 + if (h->level == h->stop_level) {
66426 + /* welcome to the stop level */
66427 + assert("nikita-381", h->coord->node == active);
66428 + if (result == NS_FOUND) {
66429 + /* success of tree lookup */
66430 + if (!(h->flags & CBK_UNIQUE)
66431 + && key_is_ld(active, h->key)) {
66432 + return search_to_left(h);
66433 + } else
66434 + h->result = CBK_COORD_FOUND;
66435 + } else {
66436 + h->result = CBK_COORD_NOTFOUND;
66437 + }
66438 + if (!(h->flags & CBK_IN_CACHE))
66439 + cbk_cache_add(active);
66440 + return LOOKUP_DONE;
66441 + }
66442 +
66443 + if (h->level > TWIG_LEVEL && result == NS_NOT_FOUND) {
66444 + h->error = "not found on internal node";
66445 + h->result = result;
66446 + return LOOKUP_DONE;
66447 + }
66448 +
66449 + assert("vs-361", h->level > h->stop_level);
66450 +
66451 + if (handle_eottl(h, &result)) {
66452 + assert("vs-1674", (result == LOOKUP_DONE ||
66453 + result == LOOKUP_REST));
66454 + return result;
66455 + }
66456 +
66457 + /* go down to next level */
66458 + check_me("vs-12", zload(h->coord->node) == 0);
66459 + assert("nikita-2116", item_is_internal(h->coord));
66460 + iplug = item_plugin_by_coord(h->coord);
66461 + iplug->s.internal.down_link(h->coord, h->key, &h->block);
66462 + zrelse(h->coord->node);
66463 + --h->level;
66464 + return LOOKUP_CONT; /* continue */
66465 +}
66466 +
66467 +/* scan cbk_cache slots looking for a match for @h */
66468 +static int cbk_cache_scan_slots(cbk_handle * h /* cbk handle */ )
66469 +{
66470 + level_lookup_result llr;
66471 + znode *node;
66472 + reiser4_tree *tree;
66473 + cbk_cache_slot *slot;
66474 + cbk_cache *cache;
66475 + tree_level level;
66476 + int isunique;
66477 + const reiser4_key *key;
66478 + int result;
66479 +
66480 + assert("nikita-1317", h != NULL);
66481 + assert("nikita-1315", h->tree != NULL);
66482 + assert("nikita-1316", h->key != NULL);
66483 +
66484 + tree = h->tree;
66485 + cache = &tree->cbk_cache;
66486 + if (cache->nr_slots == 0)
66487 + /* size of cbk cache was set to 0 by mount time option. */
66488 + return RETERR(-ENOENT);
66489 +
66490 + assert("nikita-2474", cbk_cache_invariant(cache));
66491 + node = NULL; /* to keep gcc happy */
66492 + level = h->level;
66493 + key = h->key;
66494 + isunique = h->flags & CBK_UNIQUE;
66495 + result = RETERR(-ENOENT);
66496 +
66497 + /*
66498 +	 * this is a time-critical function and dragons have, hence, been
66499 +	 * settled here.
66500 + *
66501 +	 * The loop below scans the cbk cache slots trying to find a matching
66502 +	 * node with a suitable range of delimiting keys, located at h->level.
66503 + *
66504 + * Scan is done under cbk cache spin lock that protects slot->node
66505 + * pointers. If suitable node is found we want to pin it in
66506 + * memory. But slot->node can point to the node with x_count 0
66507 + * (unreferenced). Such node can be recycled at any moment, or can
66508 + * already be in the process of being recycled (within jput()).
66509 + *
66510 + * As we found node in the cbk cache, it means that jput() hasn't yet
66511 + * called cbk_cache_invalidate().
66512 + *
66513 + * We acquire reference to the node without holding tree lock, and
66514 + * later, check node's RIP bit. This avoids races with jput().
66515 + */
66516 +
66517 + rcu_read_lock();
66518 + read_lock(&((cbk_cache *)cache)->guard);
66519 +
66520 + slot = list_entry(cache->lru.next, cbk_cache_slot, lru);
66521 + slot = list_entry(slot->lru.prev, cbk_cache_slot, lru);
66522 + BUG_ON(&slot->lru != &cache->lru);/*????*/
66523 + while (1) {
66524 +
66525 + slot = list_entry(slot->lru.next, cbk_cache_slot, lru);
66526 +
66527 + if (&cache->lru != &slot->lru)
66528 + node = slot->node;
66529 + else
66530 + node = NULL;
66531 +
66532 + if (unlikely(node == NULL))
66533 + break;
66534 +
66535 + /*
66536 + * this is (hopefully) the only place in the code where we are
66537 + * working with delimiting keys without holding dk lock. This
66538 + * is fine here, because this is only "guess" anyway---keys
66539 + * are rechecked under dk lock below.
66540 + */
66541 + if (znode_get_level(node) == level &&
66542 + /* reiser4_min_key < key < reiser4_max_key */
66543 + znode_contains_key_strict(node, key, isunique)) {
66544 + zref(node);
66545 + result = 0;
66546 + spin_lock_prefetch(&tree->tree_lock);
66547 + break;
66548 + }
66549 + }
66550 + read_unlock(&((cbk_cache *)cache)->guard);
66551 +
66552 + assert("nikita-2475", cbk_cache_invariant(cache));
66553 +
66554 + if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
66555 + result = -ENOENT;
66556 +
66557 + rcu_read_unlock();
66558 +
66559 + if (result != 0) {
66560 + h->result = CBK_COORD_NOTFOUND;
66561 + return RETERR(-ENOENT);
66562 + }
66563 +
66564 + result =
66565 + longterm_lock_znode(h->active_lh, node, cbk_lock_mode(level, h),
66566 + ZNODE_LOCK_LOPRI);
66567 + zput(node);
66568 + if (result != 0)
66569 + return result;
66570 + result = zload(node);
66571 + if (result != 0)
66572 + return result;
66573 +
66574 + /* recheck keys */
66575 + read_lock_dk(tree);
66576 + result = (znode_contains_key_strict(node, key, isunique) &&
66577 + !ZF_ISSET(node, JNODE_HEARD_BANSHEE));
66578 + read_unlock_dk(tree);
66579 + if (result) {
66580 + /* do lookup inside node */
66581 + llr = cbk_node_lookup(h);
66582 + /* if cbk_node_lookup() wandered to another node (due to eottl
66583 + or non-unique keys), adjust @node */
66584 + /*node = h->active_lh->node; */
66585 +
66586 + if (llr != LOOKUP_DONE) {
66587 + /* restart or continue on the next level */
66588 + result = RETERR(-ENOENT);
66589 + } else if (IS_CBKERR(h->result))
66590 + /* io or oom */
66591 + result = RETERR(-ENOENT);
66592 + else {
66593 + /* good. Either item found or definitely not found. */
66594 + result = 0;
66595 +
66596 + write_lock(&(cache->guard));
66597 + if (slot->node == h->active_lh->node /*node */ ) {
66598 + /* if this node is still in cbk cache---move
66599 + its slot to the head of the LRU list. */
66600 + list_move(&slot->lru, &cache->lru);
66601 + }
66602 + write_unlock(&(cache->guard));
66603 + }
66604 + } else {
66605 + /* race. While this thread was waiting for the lock, node was
66606 +		/* race. While this thread was waiting for the lock, the node
66607 +		   was rebalanced and the item we are looking for was shifted
66608 +		   out of it (if it ever was here).
66609 +
66610 +		   Continuing the scan is almost hopeless: the node that the
66611 +		   key range was moved to is almost certainly at the beginning
66612 +		   of the LRU list by now, because it is hot; but restarting
66613 +		   the scan from the very beginning is complex. Just return,
66614 + important, because such races should be rare. Are they?
66615 + */
66616 + result = RETERR(-ENOENT); /* -ERAUGHT */
66617 + }
66618 + zrelse(node);
66619 + assert("nikita-2476", cbk_cache_invariant(cache));
66620 + return result;
66621 +}
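+
+/* The pin-then-recheck pattern used above, in outline (illustrative):
+
+	rcu_read_lock();
+	node = slot->node;              (read under the cache guard)
+	zref(node);                     (pin without the tree lock)
+	if (ZF_ISSET(node, JNODE_RIP))  (lost the race with jput())
+		treat it as -ENOENT;
+	rcu_read_unlock();
+*/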
66622 +
66623 +/* look for item with given key in the coord cache
66624 +
66625 + This function, called by coord_by_key(), scans "coord cache" (&cbk_cache)
66626 +   which is a small LRU list of recently accessed znodes. For each znode
66627 +   in this list, it checks whether the key we are looking for fits into
66628 +   the key range covered by that node. If so, and, in addition, the node
66629 +   lies at an allowed level (this is to handle extents on the twig
66630 +   level), the node is locked and a lookup inside it is performed.
66631 +
66632 + we need a measurement of the cost of this cache search compared to the cost
66633 + of coord_by_key.
66634 +
66635 +*/
66636 +static int cbk_cache_search(cbk_handle * h /* cbk handle */ )
66637 +{
66638 + int result = 0;
66639 + tree_level level;
66640 +
66641 +	/* add CBK_IN_CACHE to the handle flags. This tells cbk_node_lookup()
66642 +	 * that the cbk_cache itself is being scanned, so it will not re-add
66643 +	 * the found node to the cache. */
66644 + h->flags |= CBK_IN_CACHE;
66645 + for (level = h->stop_level; level <= h->lock_level; ++level) {
66646 + h->level = level;
66647 + result = cbk_cache_scan_slots(h);
66648 + if (result != 0) {
66649 + done_lh(h->active_lh);
66650 + done_lh(h->parent_lh);
66651 + } else {
66652 + assert("nikita-1319", !IS_CBKERR(h->result));
66653 + break;
66654 + }
66655 + }
66656 + h->flags &= ~CBK_IN_CACHE;
66657 + return result;
66658 +}
66659 +
66660 +/* type of lock we want to obtain during tree traversal. On the stop level
66661 +   we want the type of lock the user asked for; on upper levels, a read lock. */
66662 +znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h)
66663 +{
66664 + assert("nikita-382", h != NULL);
66665 +
66666 + return (level <= h->lock_level) ? h->lock_mode : ZNODE_READ_LOCK;
66667 +}
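+
+/* Example (illustrative): with lock_level == TWIG_LEVEL and lock_mode ==
+   ZNODE_WRITE_LOCK, the twig and leaf levels are write-locked, while all
+   levels above the twig are only read-locked. */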
66668 +
66669 +/* update outdated delimiting keys */
66670 +static void stale_dk(reiser4_tree * tree, znode * node)
66671 +{
66672 + znode *right;
66673 +
66674 + read_lock_tree(tree);
66675 + write_lock_dk(tree);
66676 + right = node->right;
66677 +
66678 + if (ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66679 + right && ZF_ISSET(right, JNODE_DKSET) &&
66680 + !keyeq(znode_get_rd_key(node), znode_get_ld_key(right)))
66681 + znode_set_rd_key(node, znode_get_ld_key(right));
66682 +
66683 + write_unlock_dk(tree);
66684 + read_unlock_tree(tree);
66685 +}
66686 +
66687 +/* check for possibly outdated delimiting keys, and update them if
66688 + * necessary. */
66689 +static void update_stale_dk(reiser4_tree * tree, znode * node)
66690 +{
66691 + znode *right;
66692 + reiser4_key rd;
66693 +
66694 + read_lock_tree(tree);
66695 + read_lock_dk(tree);
66696 + rd = *znode_get_rd_key(node);
66697 + right = node->right;
66698 + if (unlikely(ZF_ISSET(node, JNODE_RIGHT_CONNECTED) &&
66699 + right && ZF_ISSET(right, JNODE_DKSET) &&
66700 + !keyeq(&rd, znode_get_ld_key(right)))) {
66701 + assert("nikita-38211", ZF_ISSET(node, JNODE_DKSET));
66702 + read_unlock_dk(tree);
66703 + read_unlock_tree(tree);
66704 + stale_dk(tree, node);
66705 + return;
66706 + }
66707 + read_unlock_dk(tree);
66708 + read_unlock_tree(tree);
66709 +}
66710 +
66711 +/*
66712 + * handling searches for a non-unique key.
66713 + *
66714 + * Suppose that we are looking for an item with possibly non-unique key 100.
66715 + *
66716 + * Root node contains two pointers: one to a node with left delimiting key 0,
66717 + * and another to a node with left delimiting key 100. The item we are
66718 + * interested in may well be in the sub-tree rooted at the first pointer.
66719 + *
66720 + * To handle this, search_to_left() is called when the search reaches the
66721 + * stop level. This function checks whether it is _possible_ that the item
66722 + * we are looking for is in the left neighbor (by comparing delimiting keys) and,
66723 + * if so, tries to lock left neighbor (this is low priority lock, so it can
66724 + * deadlock, tree traversal is just restarted if it did) and then checks
66725 + * whether left neighbor actually contains items with our key.
66726 + *
66727 + * Note that this is done on the stop level only. It is possible to try such
66728 + * a left-check on each level, but as duplicate keys are supposed to be rare
66729 + * (it is very unlikely that more than one node is completely filled with
66730 + * items with duplicate keys), it is cheaper to scan left on the stop level once.
66731 + *
66732 + */
66733 +static level_lookup_result search_to_left(cbk_handle * h /* search handle */ )
66734 +{
66735 + level_lookup_result result;
66736 + coord_t *coord;
66737 + znode *node;
66738 + znode *neighbor;
66739 +
66740 + lock_handle lh;
66741 +
66742 + assert("nikita-1761", h != NULL);
66743 + assert("nikita-1762", h->level == h->stop_level);
66744 +
66745 + init_lh(&lh);
66746 + coord = h->coord;
66747 + node = h->active_lh->node;
66748 + assert("nikita-1763", coord_is_leftmost_unit(coord));
66749 +
66750 + h->result =
66751 + reiser4_get_left_neighbor(&lh, node, (int)h->lock_mode,
66752 + GN_CAN_USE_UPPER_LEVELS);
66753 + neighbor = NULL;
66754 + switch (h->result) {
66755 + case -E_DEADLOCK:
66756 + result = LOOKUP_REST;
66757 + break;
66758 + case 0:{
66759 + node_plugin *nplug;
66760 + coord_t crd;
66761 + lookup_bias bias;
66762 +
66763 + neighbor = lh.node;
66764 + h->result = zload(neighbor);
66765 + if (h->result != 0) {
66766 + result = LOOKUP_DONE;
66767 + break;
66768 + }
66769 +
66770 + nplug = neighbor->nplug;
66771 +
66772 + coord_init_zero(&crd);
66773 + bias = h->bias;
66774 + h->bias = FIND_EXACT;
66775 + h->result =
66776 + nplug->lookup(neighbor, h->key, h->bias, &crd);
66777 + h->bias = bias;
66778 +
66779 + if (h->result == NS_NOT_FOUND) {
66780 + case -E_NO_NEIGHBOR:
66781 + h->result = CBK_COORD_FOUND;
66782 + if (!(h->flags & CBK_IN_CACHE))
66783 + cbk_cache_add(node);
66784 + default: /* some other error */
66785 + result = LOOKUP_DONE;
66786 + } else if (h->result == NS_FOUND) {
66787 + read_lock_dk(znode_get_tree(neighbor));
66788 + h->rd_key = *znode_get_ld_key(node);
66789 + leftmost_key_in_node(neighbor, &h->ld_key);
66790 + read_unlock_dk(znode_get_tree(neighbor));
66791 + h->flags |= CBK_DKSET;
66792 +
66793 + h->block = *znode_get_block(neighbor);
66794 + /* clear coord -> node so that cbk_level_lookup()
66795 + wouldn't overwrite parent hint in neighbor.
66796 +
66797 + Parent hint was set up by
66798 + reiser4_get_left_neighbor()
66799 + */
66800 + /* FIXME: why do we have to spinlock here? */
66801 + write_lock_tree(znode_get_tree(neighbor));
66802 + h->coord->node = NULL;
66803 + write_unlock_tree(znode_get_tree(neighbor));
66804 + result = LOOKUP_CONT;
66805 + } else {
66806 + result = LOOKUP_DONE;
66807 + }
66808 + if (neighbor != NULL)
66809 + zrelse(neighbor);
66810 + }
66811 + }
66812 + done_lh(&lh);
66813 + return result;
66814 +}
66815 +
66816 +/* debugging aid: return symbolic name of search bias */
66817 +static const char *bias_name(lookup_bias bias /* bias to get name of */ )
66818 +{
66819 + if (bias == FIND_EXACT)
66820 + return "exact";
66821 + else if (bias == FIND_MAX_NOT_MORE_THAN)
66822 + return "left-slant";
66823 +/* else if( bias == RIGHT_SLANT_BIAS ) */
66824 +/* return "right-bias"; */
66825 + else {
66826 + static char buf[30];
66827 +
66828 + sprintf(buf, "unknown: %i", bias);
66829 + return buf;
66830 + }
66831 +}
66832 +
66833 +#if REISER4_DEBUG
66834 +/* debugging aid: print human readable information about @p */
66835 +void print_coord_content(const char *prefix /* prefix to print */ ,
66836 + coord_t * p /* coord to print */ )
66837 +{
66838 + reiser4_key key;
66839 +
66840 + if (p == NULL) {
66841 + printk("%s: null\n", prefix);
66842 + return;
66843 + }
66844 + if ((p->node != NULL) && znode_is_loaded(p->node)
66845 + && coord_is_existing_item(p))
66846 + printk("%s: data: %p, length: %i\n", prefix,
66847 + item_body_by_coord(p), item_length_by_coord(p));
66848 + if (znode_is_loaded(p->node)) {
66849 + item_key_by_coord(p, &key);
66850 + reiser4_print_key(prefix, &key);
66851 + }
66852 +}
66853 +
66854 +/* debugging aid: print human readable information about @block */
66855 +void reiser4_print_address(const char *prefix /* prefix to print */ ,
66856 + const reiser4_block_nr * block /* block number to print */ )
66857 +{
66858 + printk("%s: %s\n", prefix, sprint_address(block));
66859 +}
66860 +#endif
66861 +
66862 +/* return string containing human readable representation of @block */
66863 +char *sprint_address(const reiser4_block_nr *
66864 + block /* block number to print */ )
66865 +{
66866 + static char address[30];
66867 +
66868 + if (block == NULL)
66869 + sprintf(address, "null");
66870 + else if (reiser4_blocknr_is_fake(block))
66871 + sprintf(address, "%llx", (unsigned long long)(*block));
66872 + else
66873 + sprintf(address, "%llu", (unsigned long long)(*block));
66874 + return address;
66875 +}
66876 +
66877 +/* release parent node during traversal */
66878 +static void put_parent(cbk_handle * h /* search handle */ )
66879 +{
66880 + assert("nikita-383", h != NULL);
66881 + if (h->parent_lh->node != NULL) {
66882 + longterm_unlock_znode(h->parent_lh);
66883 + }
66884 +}
66885 +
66886 +/* helper function used during tree traversal: release both lock handles
66887 +   (parent and active) stored in the search handle. */
66888 +static void hput(cbk_handle * h /* search handle */ )
66889 +{
66890 + assert("nikita-385", h != NULL);
66891 + done_lh(h->parent_lh);
66892 + done_lh(h->active_lh);
66893 +}
66894 +
66895 +/* Helper function used by cbk_level_lookup(): set the delimiting keys of
66896 +   the child node (stored in h->active_lh->node) from keys saved in the handle. */
66897 +static int setup_delimiting_keys(cbk_handle * h /* search handle */ )
66898 +{
66899 + znode *active;
66900 + reiser4_tree *tree;
66901 +
66902 + assert("nikita-1088", h != NULL);
66903 +
66904 + active = h->active_lh->node;
66905 +
66906 + /* fast check without taking dk lock. This is safe, because
66907 + * JNODE_DKSET is never cleared once set. */
66908 + if (!ZF_ISSET(active, JNODE_DKSET)) {
66909 + tree = znode_get_tree(active);
66910 + write_lock_dk(tree);
66911 + if (!ZF_ISSET(active, JNODE_DKSET)) {
66912 + znode_set_ld_key(active, &h->ld_key);
66913 + znode_set_rd_key(active, &h->rd_key);
66914 + ZF_SET(active, JNODE_DKSET);
66915 + }
66916 + write_unlock_dk(tree);
66917 + return 1;
66918 + }
66919 + return 0;
66920 +}
66921 +
66922 +/* true if @block makes sense for the @tree. Used to detect corrupted node
66923 + * pointers */
66924 +static int
66925 +block_nr_is_correct(reiser4_block_nr * block /* block number to check */ ,
66926 + reiser4_tree * tree /* tree to check against */ )
66927 +{
66928 + assert("nikita-757", block != NULL);
66929 + assert("nikita-758", tree != NULL);
66930 +
66931 + /* check to see if it exceeds the size of the device. */
66932 + return reiser4_blocknr_is_sane_for(tree->super, block);
66933 +}
66934 +
66935 +/* check consistency of fields */
66936 +static int sanity_check(cbk_handle * h /* search handle */ )
66937 +{
66938 + assert("nikita-384", h != NULL);
66939 +
66940 + if (h->level < h->stop_level) {
66941 + h->error = "Buried under leaves";
66942 + h->result = RETERR(-EIO);
66943 + return LOOKUP_DONE;
66944 + } else if (!block_nr_is_correct(&h->block, h->tree)) {
66945 + h->error = "bad block number";
66946 + h->result = RETERR(-EIO);
66947 + return LOOKUP_DONE;
66948 + } else
66949 + return 0;
66950 +}
66951 +
66952 +/* Make Linus happy.
66953 + Local variables:
66954 + c-indentation-style: "K&R"
66955 + mode-name: "LC"
66956 + c-basic-offset: 8
66957 + tab-width: 8
66958 + fill-column: 120
66959 + scroll-step: 1
66960 + End:
66961 +*/
66962 diff --git a/fs/reiser4/status_flags.c b/fs/reiser4/status_flags.c
66963 new file mode 100644
66964 index 0000000..b32f89a
66965 --- /dev/null
66966 +++ b/fs/reiser4/status_flags.c
66967 @@ -0,0 +1,175 @@
66968 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
66969 + * reiser4/README */
66970 +
66971 +/* Functions that deal with reiser4 status block, query status and update it, if needed */
66972 +
66973 +#include <linux/bio.h>
66974 +#include <linux/highmem.h>
66975 +#include <linux/fs.h>
66976 +#include <linux/blkdev.h>
66977 +#include "debug.h"
66978 +#include "dformat.h"
66979 +#include "status_flags.h"
66980 +#include "super.h"
66981 +
66982 +/* This is our end-I/O handler; it marks the page up-to-date if the I/O was
66983 +   successful. It also unconditionally unlocks the page, so we can see that
66984 +   the I/O is done. We do not free the bio, because we hope to reuse it. */
66985 +static int reiser4_status_endio(struct bio *bio, unsigned int bytes_done,
66986 + int err)
66987 +{
66988 + if (bio->bi_size)
66989 + return 1;
66990 +
66991 + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) {
66992 + SetPageUptodate(bio->bi_io_vec->bv_page);
66993 + } else {
66994 + ClearPageUptodate(bio->bi_io_vec->bv_page);
66995 + SetPageError(bio->bi_io_vec->bv_page);
66996 + }
66997 + unlock_page(bio->bi_io_vec->bv_page);
66998 + return 0;
66999 +}
67000 +
67001 +/* Initialise status block handling. This is expected to be called from the
67002 +   disk format code. The "block" parameter is where the status block lives. */
67003 +int reiser4_status_init(reiser4_block_nr block)
67004 +{
67005 + struct super_block *sb = reiser4_get_current_sb();
67006 + struct reiser4_status *statuspage;
67007 + struct bio *bio;
67008 + struct page *page;
67009 +
67010 + get_super_private(sb)->status_page = NULL;
67011 + get_super_private(sb)->status_bio = NULL;
67012 +
67013 + page = alloc_pages(reiser4_ctx_gfp_mask_get(), 0);
67014 + if (!page)
67015 + return -ENOMEM;
67016 +
67017 + bio = bio_alloc(reiser4_ctx_gfp_mask_get(), 1);
67018 + if (bio != NULL) {
67019 + bio->bi_sector = block * (sb->s_blocksize >> 9);
67020 + bio->bi_bdev = sb->s_bdev;
67021 + bio->bi_io_vec[0].bv_page = page;
67022 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
67023 + bio->bi_io_vec[0].bv_offset = 0;
67024 + bio->bi_vcnt = 1;
67025 + bio->bi_size = sb->s_blocksize;
67026 + bio->bi_end_io = reiser4_status_endio;
67027 + } else {
67028 + __free_pages(page, 0);
67029 + return -ENOMEM;
67030 + }
67031 + lock_page(page);
67032 + submit_bio(READ, bio);
67033 + blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
67034 + wait_on_page_locked(page);
67035 +	if (!PageUptodate(page)) {
67036 +		warning("green-2007",
67037 +			"I/O error while trying to read status page\n");
67038 +		/* do not leak the page and bio on this error path */
67039 +		__free_pages(page, 0);
67040 +		bio_put(bio);
67041 +		return -EIO;
67042 +	}
67040 +
67041 + statuspage = (struct reiser4_status *)kmap_atomic(page, KM_USER0);
67042 + if (memcmp
67043 + (statuspage->magic, REISER4_STATUS_MAGIC,
67044 + sizeof(REISER4_STATUS_MAGIC))) {
67045 + /* Magic does not match. */
67046 + kunmap_atomic((char *)statuspage, KM_USER0);
67047 + warning("green-2008", "Wrong magic in status block\n");
67048 + __free_pages(page, 0);
67049 + bio_put(bio);
67050 + return -EINVAL;
67051 + }
67052 + kunmap_atomic((char *)statuspage, KM_USER0);
67053 +
67054 + get_super_private(sb)->status_page = page;
67055 + get_super_private(sb)->status_bio = bio;
67056 + return 0;
67057 +}
67058 +
67059 +/* Query the status of the fs. Returns whether the FS can be safely mounted.
67060 +   Also, if the "status" and "extended" parameters are given, it fills them
67061 +   with the actual status values read from disk. */
67062 +int reiser4_status_query(u64 * status, u64 * extended)
67063 +{
67064 + struct super_block *sb = reiser4_get_current_sb();
67065 + struct reiser4_status *statuspage;
67066 + int retval;
67067 +
67068 + if (!get_super_private(sb)->status_page) { // No status page?
67069 + return REISER4_STATUS_MOUNT_UNKNOWN;
67070 + }
67071 + statuspage = (struct reiser4_status *)
67072 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
67073 + switch ((long)le64_to_cpu(get_unaligned(&statuspage->status))) { // FIXME: this cast is a hack for 32 bit arches to work.
67074 + case REISER4_STATUS_OK:
67075 + retval = REISER4_STATUS_MOUNT_OK;
67076 + break;
67077 + case REISER4_STATUS_CORRUPTED:
67078 + retval = REISER4_STATUS_MOUNT_WARN;
67079 + break;
67080 + case REISER4_STATUS_DAMAGED:
67081 + case REISER4_STATUS_DESTROYED:
67082 + case REISER4_STATUS_IOERROR:
67083 + retval = REISER4_STATUS_MOUNT_RO;
67084 + break;
67085 + default:
67086 + retval = REISER4_STATUS_MOUNT_UNKNOWN;
67087 + break;
67088 + }
67089 +
67090 + if (status)
67091 + *status = le64_to_cpu(get_unaligned(&statuspage->status));
67092 + if (extended)
67093 + *extended = le64_to_cpu(get_unaligned(&statuspage->extended_status));
67094 +
67095 + kunmap_atomic((char *)statuspage, KM_USER0);
67096 + return retval;
67097 +}
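+
+/* Sketch of a caller (hypothetical; real mount code would decide what to
+   do with each verdict): */
+#if 0
+static int example_check_fs_status(void)
+{
+	u64 status, extended;
+
+	switch (reiser4_status_query(&status, &extended)) {
+	case REISER4_STATUS_MOUNT_OK:
+		return 0;		/* safe to mount read-write */
+	case REISER4_STATUS_MOUNT_WARN:
+		return 0;		/* mount, but warn the user */
+	case REISER4_STATUS_MOUNT_RO:
+		return -EROFS;		/* force a read-only mount */
+	default:
+		return -EINVAL;		/* status block missing or unknown */
+	}
+}
+#endif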
67098 +
67099 +/* This function should be called when something bad happens (e.g. from reiser4_panic).
67100 + It fills the status structure and tries to push it to disk. */
67101 +int reiser4_status_write(__u64 status, __u64 extended_status, char *message)
67102 +{
67103 + struct super_block *sb = reiser4_get_current_sb();
67104 + struct reiser4_status *statuspage;
67105 + struct bio *bio = get_super_private(sb)->status_bio;
67106 +
67107 + if (!get_super_private(sb)->status_page) { // No status page?
67108 + return -1;
67109 + }
67110 + statuspage = (struct reiser4_status *)
67111 + kmap_atomic(get_super_private(sb)->status_page, KM_USER0);
67112 +
67113 + put_unaligned(cpu_to_le64(status), &statuspage->status);
67114 + put_unaligned(cpu_to_le64(extended_status), &statuspage->extended_status);
67115 + strncpy(statuspage->texterror, message, REISER4_TEXTERROR_LEN);
67116 +
67117 + kunmap_atomic((char *)statuspage, KM_USER0);
67118 + bio->bi_bdev = sb->s_bdev;
67119 + bio->bi_io_vec[0].bv_page = get_super_private(sb)->status_page;
67120 + bio->bi_io_vec[0].bv_len = sb->s_blocksize;
67121 + bio->bi_io_vec[0].bv_offset = 0;
67122 + bio->bi_vcnt = 1;
67123 + bio->bi_size = sb->s_blocksize;
67124 + bio->bi_end_io = reiser4_status_endio;
67125 + lock_page(get_super_private(sb)->status_page); // Safe as nobody should touch our page.
67126 + /* We can block now, but we have no other choice anyway */
67127 + submit_bio(WRITE, bio);
67128 + blk_run_address_space(reiser4_get_super_fake(sb)->i_mapping);
67129 + return 0; // We do not wait for io to finish.
67130 +}
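+
+/* Sketch of a panic-path caller (illustrative; the message text and the
+   zero extended status are arbitrary examples):
+
+	reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
+			     "example: tree corruption detected");
+*/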
67131 +
67132 +/* Frees the status page and bio structure. Should be called by the disk format code at umount time */
67133 +int reiser4_status_finish(void)
67134 +{
67135 + struct super_block *sb = reiser4_get_current_sb();
67136 +
67137 + __free_pages(get_super_private(sb)->status_page, 0);
67138 + get_super_private(sb)->status_page = NULL;
67139 + bio_put(get_super_private(sb)->status_bio);
67140 + get_super_private(sb)->status_bio = NULL;
67141 + return 0;
67142 +}
67143 diff --git a/fs/reiser4/status_flags.h b/fs/reiser4/status_flags.h
67144 new file mode 100644
67145 index 0000000..6cfa5ad
67146 --- /dev/null
67147 +++ b/fs/reiser4/status_flags.h
67148 @@ -0,0 +1,43 @@
67149 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
67150 + * reiser4/README */
67151 +
67152 +/* Here we declare structures and flags that store the reiser4 status on disk.
67153 +   The status helps us to find out whether the filesystem is valid or whether
67154 +   it contains some critical, or not so critical, errors. */
67155 +
67156 +#if !defined( __REISER4_STATUS_FLAGS_H__ )
67157 +#define __REISER4_STATUS_FLAGS_H__
67158 +
67159 +#include "dformat.h"
67160 +/* These are major status flags */
67161 +#define REISER4_STATUS_OK 0
67162 +#define REISER4_STATUS_CORRUPTED 0x1
67163 +#define REISER4_STATUS_DAMAGED 0x2
67164 +#define REISER4_STATUS_DESTROYED 0x4
67165 +#define REISER4_STATUS_IOERROR 0x8
67166 +
67167 +/* Return values for reiser4_status_query() */
67168 +#define REISER4_STATUS_MOUNT_OK 0
67169 +#define REISER4_STATUS_MOUNT_WARN 1
67170 +#define REISER4_STATUS_MOUNT_RO 2
67171 +#define REISER4_STATUS_MOUNT_UNKNOWN -1
67172 +
67173 +#define REISER4_TEXTERROR_LEN 256
67174 +
67175 +#define REISER4_STATUS_MAGIC "ReiSeR4StATusBl"
67176 +/* We probably need to keep its size under the sector size, which is 512 bytes */
67177 +struct reiser4_status {
67178 + char magic[16];
67179 + d64 status; /* Current FS state */
67180 +	d64 extended_status;	/* Any additional info that makes sense in addition to "status", e.g.
67181 +				   the last sector where an I/O error happened if status is "io error encountered" */
67182 +	d64 stacktrace[10];	/* Last ten function calls made (addresses) */
67183 + char texterror[REISER4_TEXTERROR_LEN]; /* Any error message if appropriate, otherwise filled with zeroes */
67184 +};
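The layout above is 16 + 8*2 + 8*10 + 256 = 368 bytes, so the sector-size requirement holds today. A compile-time guard could pin that down; a hedged sketch, not in the patch, to be placed inside any function that includes this header (e.g. reiser4_status_init()):

	/* Illustrative only: fail the build if the status block ever
	 * outgrows a 512-byte sector. */
	BUILD_BUG_ON(sizeof(struct reiser4_status) > 512);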
67185 +
67186 +int reiser4_status_init(reiser4_block_nr block);
67187 +int reiser4_status_query(u64 * status, u64 * extended);
67188 +int reiser4_status_write(u64 status, u64 extended_status, char *message);
67189 +int reiser4_status_finish(void);
67190 +
67191 +#endif
67192 diff --git a/fs/reiser4/super.c b/fs/reiser4/super.c
67193 new file mode 100644
67194 index 0000000..bc4113e
67195 --- /dev/null
67196 +++ b/fs/reiser4/super.c
67197 @@ -0,0 +1,316 @@
67198 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
67199 + * reiser4/README */
67200 +
67201 +/* Super-block manipulations. */
67202 +
67203 +#include "debug.h"
67204 +#include "dformat.h"
67205 +#include "key.h"
67206 +#include "plugin/security/perm.h"
67207 +#include "plugin/space/space_allocator.h"
67208 +#include "plugin/plugin.h"
67209 +#include "tree.h"
67210 +#include "vfs_ops.h"
67211 +#include "super.h"
67212 +#include "reiser4.h"
67213 +
67214 +#include <linux/types.h> /* for __u?? */
67215 +#include <linux/fs.h> /* for struct super_block */
67216 +
67217 +static __u64 reserved_for_gid(const struct super_block *super, gid_t gid);
67218 +static __u64 reserved_for_uid(const struct super_block *super, uid_t uid);
67219 +static __u64 reserved_for_root(const struct super_block *super);
67220 +
67221 +/* Return reiser4-specific part of super block */
67222 +reiser4_super_info_data *get_super_private_nocheck(const struct super_block *super /* super block
67223 + * queried */ )
67224 +{
67225 + return (reiser4_super_info_data *) super->s_fs_info;
67226 +}
67227 +
67228 +/* Return reiser4 fstype: value that is returned in ->f_type field by statfs() */
67229 +long reiser4_statfs_type(const struct super_block *super UNUSED_ARG)
67230 +{
67231 + assert("nikita-448", super != NULL);
67232 + assert("nikita-449", is_reiser4_super(super));
67233 + return (long)REISER4_SUPER_MAGIC;
67234 +}
67235 +
67236 +/* functions to read/modify fields of reiser4_super_info_data */
67237 +
67238 +/* get number of blocks in file system */
67239 +__u64 reiser4_block_count(const struct super_block *super /* super block
67240 + queried */ )
67241 +{
67242 + assert("vs-494", super != NULL);
67243 + assert("vs-495", is_reiser4_super(super));
67244 + return get_super_private(super)->block_count;
67245 +}
67246 +
67247 +#if REISER4_DEBUG
67248 +/*
67249 + * number of blocks in the current file system
67250 + */
67251 +__u64 reiser4_current_block_count(void)
67252 +{
67253 + return get_current_super_private()->block_count;
67254 +}
67255 +#endif /* REISER4_DEBUG */
67256 +
67257 +/* set number of blocks in filesystem */
67258 +void reiser4_set_block_count(const struct super_block *super, __u64 nr)
67259 +{
67260 + assert("vs-501", super != NULL);
67261 + assert("vs-502", is_reiser4_super(super));
67262 + get_super_private(super)->block_count = nr;
67263 + /*
67264 +	 * For a proper calculation of the reserved space counter (5% of the
67265 +	 * device block count) we need a 64 bit division, which is missing in
67266 +	 * Linux on the i386 platform. Because we do not need a precise
67267 +	 * calculation here, we can replace the div64 operation by this
67268 +	 * combination of multiplication and shift: 51 / (2^10) == 0.0498 .
67269 +	 * FIXME: this is a bug. It comes up only for very small filesystems
67270 +	 * which probably are never used. Nevertheless, it is a bug. The number
67271 +	 * of reserved blocks must be not less than the maximal number of
67272 +	 * blocks which get grabbed with BA_RESERVED.
67273 + */
67274 + get_super_private(super)->blocks_reserved = ((nr * 51) >> 10);
67275 +}
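To see how close the shift-based approximation is, take a 1,000,000-block device: (1000000 * 51) >> 10 = 49804 reserved blocks, versus 50000 for an exact 5%, i.e. about 0.4% low. A stand-alone userspace check, illustrative only:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long nr = 1000000ULL;		/* device size in blocks */
		unsigned long long approx = (nr * 51) >> 10;	/* the kernel's formula */
		unsigned long long exact = nr * 5 / 100;	/* true 5% */

		printf("approx=%llu exact=%llu\n", approx, exact);	/* 49804 vs 50000 */
		return 0;
	}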
67276 +
67277 +/* amount of blocks used (allocated for data) in file system */
67278 +__u64 reiser4_data_blocks(const struct super_block *super /* super block
67279 + queried */ )
67280 +{
67281 + assert("nikita-452", super != NULL);
67282 + assert("nikita-453", is_reiser4_super(super));
67283 + return get_super_private(super)->blocks_used;
67284 +}
67285 +
67286 +/* set number of blocks used in filesystem */
67287 +void reiser4_set_data_blocks(const struct super_block *super, __u64 nr)
67288 +{
67289 + assert("vs-503", super != NULL);
67290 + assert("vs-504", is_reiser4_super(super));
67291 + get_super_private(super)->blocks_used = nr;
67292 +}
67293 +
67294 +/* amount of free blocks in file system */
67295 +__u64 reiser4_free_blocks(const struct super_block *super /* super block
67296 + queried */ )
67297 +{
67298 + assert("nikita-454", super != NULL);
67299 + assert("nikita-455", is_reiser4_super(super));
67300 + return get_super_private(super)->blocks_free;
67301 +}
67302 +
67303 +/* set number of blocks free in filesystem */
67304 +void reiser4_set_free_blocks(const struct super_block *super, __u64 nr)
67305 +{
67306 + assert("vs-505", super != NULL);
67307 + assert("vs-506", is_reiser4_super(super));
67308 + get_super_private(super)->blocks_free = nr;
67309 +}
67310 +
67311 +/* get mkfs unique identifier */
67312 +__u32 reiser4_mkfs_id(const struct super_block *super /* super block
67313 + queried */ )
67314 +{
67315 + assert("vpf-221", super != NULL);
67316 + assert("vpf-222", is_reiser4_super(super));
67317 + return get_super_private(super)->mkfs_id;
67318 +}
67319 +
67320 +/* amount of committed free blocks in file system */
67321 +__u64 reiser4_free_committed_blocks(const struct super_block *super)
67322 +{
67323 + assert("vs-497", super != NULL);
67324 + assert("vs-498", is_reiser4_super(super));
67325 + return get_super_private(super)->blocks_free_committed;
67326 +}
67327 +
67328 +/* amount of blocks in the file system reserved for @uid and @gid */
67329 +long reiser4_reserved_blocks(const struct super_block *super /* super block
67330 + queried */ ,
67331 + uid_t uid /* user id */ ,
67332 + gid_t gid /* group id */ )
67333 +{
67334 + long reserved;
67335 +
67336 + assert("nikita-456", super != NULL);
67337 + assert("nikita-457", is_reiser4_super(super));
67338 +
67339 + reserved = 0;
67340 + if (REISER4_SUPPORT_GID_SPACE_RESERVATION)
67341 + reserved += reserved_for_gid(super, gid);
67342 + if (REISER4_SUPPORT_UID_SPACE_RESERVATION)
67343 + reserved += reserved_for_uid(super, uid);
67344 + if (REISER4_SUPPORT_ROOT_SPACE_RESERVATION && (uid == 0))
67345 + reserved += reserved_for_root(super);
67346 + return reserved;
67347 +}
67348 +
67349 +/* get/set value of/to grabbed blocks counter */
67350 +__u64 reiser4_grabbed_blocks(const struct super_block * super)
67351 +{
67352 + assert("zam-512", super != NULL);
67353 + assert("zam-513", is_reiser4_super(super));
67354 +
67355 + return get_super_private(super)->blocks_grabbed;
67356 +}
67357 +
67358 +__u64 reiser4_flush_reserved(const struct super_block * super)
67359 +{
67360 + assert("vpf-285", super != NULL);
67361 + assert("vpf-286", is_reiser4_super(super));
67362 +
67363 + return get_super_private(super)->blocks_flush_reserved;
67364 +}
67365 +
67366 +/* get/set value of/to counter of fake allocated formatted blocks */
67367 +__u64 reiser4_fake_allocated(const struct super_block * super)
67368 +{
67369 + assert("zam-516", super != NULL);
67370 + assert("zam-517", is_reiser4_super(super));
67371 +
67372 + return get_super_private(super)->blocks_fake_allocated;
67373 +}
67374 +
67375 +/* get/set value of/to counter of fake allocated unformatted blocks */
67376 +__u64 reiser4_fake_allocated_unformatted(const struct super_block * super)
67377 +{
67378 + assert("zam-516", super != NULL);
67379 + assert("zam-517", is_reiser4_super(super));
67380 +
67381 + return get_super_private(super)->blocks_fake_allocated_unformatted;
67382 +}
67383 +
67384 +/* get/set value of/to counter of clustered blocks */
67385 +__u64 reiser4_clustered_blocks(const struct super_block * super)
67386 +{
67387 + assert("edward-601", super != NULL);
67388 + assert("edward-602", is_reiser4_super(super));
67389 +
67390 + return get_super_private(super)->blocks_clustered;
67391 +}
67392 +
67393 +/* space allocator used by this file system */
67394 +reiser4_space_allocator * reiser4_get_space_allocator(const struct super_block
67395 + *super)
67396 +{
67397 + assert("nikita-1965", super != NULL);
67398 + assert("nikita-1966", is_reiser4_super(super));
67399 + return &get_super_private(super)->space_allocator;
67400 +}
67401 +
67402 +/* return fake inode used to bind formatted nodes in the page cache */
67403 +struct inode *reiser4_get_super_fake(const struct super_block *super /* super block
67404 + queried */ )
67405 +{
67406 + assert("nikita-1757", super != NULL);
67407 + return get_super_private(super)->fake;
67408 +}
67409 +
67410 +/* return fake inode used to bind copied on capture nodes in the page cache */
67411 +struct inode *reiser4_get_cc_fake(const struct super_block *super /* super block
67412 + queried */ )
67413 +{
67414 + assert("nikita-1757", super != NULL);
67415 + return get_super_private(super)->cc;
67416 +}
67417 +
67418 +/* return fake inode used to bind bitmaps and journal heads */
67419 +struct inode *reiser4_get_bitmap_fake(const struct super_block *super)
67420 +{
67421 + assert("nikita-17571", super != NULL);
67422 + return get_super_private(super)->bitmap;
67423 +}
67424 +
67425 +/* tree used by this file system */
67426 +reiser4_tree *reiser4_get_tree(const struct super_block * super /* super block
67427 + * queried */ )
67428 +{
67429 + assert("nikita-460", super != NULL);
67430 + assert("nikita-461", is_reiser4_super(super));
67431 + return &get_super_private(super)->tree;
67432 +}
67433 +
67434 +/* Check that @super is (looks like) reiser4 super block. This is mainly for
67435 + use in assertions. */
67436 +int is_reiser4_super(const struct super_block *super /* super block
67437 + * queried */ )
67438 +{
67439 + return
67440 + super != NULL &&
67441 + get_super_private(super) != NULL &&
67442 + super->s_op == &(get_super_private(super)->ops.super);
67443 +}
67444 +
67445 +int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f)
67446 +{
67447 + return test_bit((int)f, &get_super_private(super)->fs_flags);
67448 +}
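Callers test these per-mount flags through reiser4_is_set(); a minimal sketch, assuming the REISER4_NO_WRITE_BARRIER flag declared in super.h later in this patch:

	/* Hypothetical log-writer check (illustrative only). */
	static int barriers_enabled(const struct super_block *super)
	{
		return !reiser4_is_set(super, REISER4_NO_WRITE_BARRIER);
	}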
67449 +
67450 +/* amount of blocks reserved for given group in file system */
67451 +static __u64 reserved_for_gid(const struct super_block *super UNUSED_ARG /* super
67452 + * block
67453 + * queried */ ,
67454 + gid_t gid UNUSED_ARG /* group id */ )
67455 +{
67456 + return 0;
67457 +}
67458 +
67459 +/* amount of blocks reserved for given user in file system */
67460 +static __u64 reserved_for_uid(const struct super_block *super UNUSED_ARG /* super
67461 + block
67462 + queried */ ,
67463 + uid_t uid UNUSED_ARG /* user id */ )
67464 +{
67465 + return 0;
67466 +}
67467 +
67468 +/* amount of blocks reserved for super user in file system */
67469 +static __u64 reserved_for_root(const struct super_block *super UNUSED_ARG /* super
67470 + block
67471 + queried */ )
67472 +{
67473 + return 0;
67474 +}
67475 +
67476 +/*
67477 + * true if block number @blk makes sense for the file system at @super.
67478 + */
67479 +int
67480 +reiser4_blocknr_is_sane_for(const struct super_block *super,
67481 + const reiser4_block_nr * blk)
67482 +{
67483 + reiser4_super_info_data *sbinfo;
67484 +
67485 + assert("nikita-2957", super != NULL);
67486 + assert("nikita-2958", blk != NULL);
67487 +
67488 + if (reiser4_blocknr_is_fake(blk))
67489 + return 1;
67490 +
67491 + sbinfo = get_super_private(super);
67492 + return *blk < sbinfo->block_count;
67493 +}
67494 +
67495 +#if REISER4_DEBUG
67496 +/*
67497 + * true, if block number @blk makes sense for the current file system
67498 + */
67499 +int reiser4_blocknr_is_sane(const reiser4_block_nr * blk)
67500 +{
67501 + return reiser4_blocknr_is_sane_for(reiser4_get_current_sb(), blk);
67502 +}
67503 +#endif /* REISER4_DEBUG */
67504 +
67505 +/* Make Linus happy.
67506 + Local variables:
67507 + c-indentation-style: "K&R"
67508 + mode-name: "LC"
67509 + c-basic-offset: 8
67510 + tab-width: 8
67511 + fill-column: 120
67512 + End:
67513 +*/
67514 diff --git a/fs/reiser4/super.h b/fs/reiser4/super.h
67515 new file mode 100644
67516 index 0000000..120f021
67517 --- /dev/null
67518 +++ b/fs/reiser4/super.h
67519 @@ -0,0 +1,464 @@
67520 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
67521 + * reiser4/README */
67522 +
67523 +/* Super-block functions. See super.c for details. */
67524 +
67525 +#if !defined( __REISER4_SUPER_H__ )
67526 +#define __REISER4_SUPER_H__
67527 +
67528 +#include "tree.h"
67529 +#include "entd.h"
67530 +#include "wander.h"
67531 +#include "fsdata.h"
67532 +#include "plugin/object.h"
67533 +#include "plugin/space/space_allocator.h"
67534 +
67535 +/*
67536 + * Flush algorithms parameters.
67537 + */
67538 +typedef struct {
67539 + unsigned relocate_threshold;
67540 + unsigned relocate_distance;
67541 + unsigned written_threshold;
67542 + unsigned scan_maxnodes;
67543 +} flush_params;
67544 +
67545 +typedef enum {
67546 + /*
67547 + * True if this file system doesn't support hard-links (multiple names)
67548 + * for directories: this is default UNIX behavior.
67549 + *
67550 +	 * If hard-links on directories are not allowed, the file system is an
67551 +	 * Acyclic Directed Graph (modulo dot and dotdot, of course).
67552 + *
67553 + * This is used by reiser4_link().
67554 + */
67555 + REISER4_ADG = 0,
67556 + /*
67557 +	 * set if all nodes in the internal tree have the same node layout plugin.
67558 +	 * If so, znode_guess_plugin() will return tree->node_plugin instead
67559 +	 * of guessing the plugin by the plugin id stored in the node.
67560 + */
67561 + REISER4_ONE_NODE_PLUGIN = 1,
67562 + /* if set, bsd gid assignment is supported. */
67563 + REISER4_BSD_GID = 2,
67564 + /* [mac]_time are 32 bit in inode */
67565 + REISER4_32_BIT_TIMES = 3,
67566 +	/* don't load all bitmap blocks at mount time */
67567 + REISER4_DONT_LOAD_BITMAP = 5,
67568 + /* enforce atomicity during write(2) */
67569 + REISER4_ATOMIC_WRITE = 6,
67570 + /* don't use write barriers in the log writer code. */
67571 + REISER4_NO_WRITE_BARRIER = 7
67572 +} reiser4_fs_flag;
67573 +
67574 +/*
67575 + * VFS related operation vectors.
67576 + */
67577 +typedef struct object_ops {
67578 + struct super_operations super;
67579 + struct dentry_operations dentry;
67580 + struct export_operations export;
67581 +} object_ops;
67582 +
67583 +/* reiser4-specific part of super block
67584 +
67585 + Locking
67586 +
67587 + Fields immutable after mount:
67588 +
67589 + ->oid*
67590 + ->space*
67591 + ->default_[ug]id
67592 + ->mkfs_id
67593 + ->trace_flags
67594 + ->debug_flags
67595 + ->fs_flags
67596 + ->df_plug
67597 + ->optimal_io_size
67598 + ->plug
67599 + ->flush
67600 + ->u (bad name)
67601 + ->txnmgr
67602 + ->ra_params
67603 + ->fsuid
67604 + ->journal_header
67605 + ->journal_footer
67606 +
67607 + Fields protected by ->lnode_guard
67608 +
67609 + ->lnode_htable
67610 +
67611 + Fields protected by per-super block spin lock
67612 +
67613 + ->block_count
67614 + ->blocks_used
67615 + ->blocks_free
67616 + ->blocks_free_committed
67617 + ->blocks_grabbed
67618 + ->blocks_fake_allocated_unformatted
67619 + ->blocks_fake_allocated
67620 + ->blocks_flush_reserved
67621 + ->eflushed
67622 + ->blocknr_hint_default
67623 +
67624 +   After journal replay during mount,
67625 +
67626 + ->last_committed_tx
67627 +
67628 + is protected by ->tmgr.commit_mutex
67629 +
67630 + Invariants involving this data-type:
67631 +
67632 + [sb-block-counts]
67633 + [sb-grabbed]
67634 + [sb-fake-allocated]
67635 +*/
67636 +struct reiser4_super_info_data {
67637 + /*
67638 + * guard spinlock which protects reiser4 super block fields (currently
67639 + * blocks_free, blocks_free_committed)
67640 + */
67641 + spinlock_t guard;
67642 +
67643 + /* next oid that will be returned by oid_allocate() */
67644 + oid_t next_to_use;
67645 + /* total number of used oids */
67646 + oid_t oids_in_use;
67647 +
67648 + /* space manager plugin */
67649 + reiser4_space_allocator space_allocator;
67650 +
67651 + /* reiser4 internal tree */
67652 + reiser4_tree tree;
67653 +
67654 + /*
67655 + * default user id used for light-weight files without their own
67656 + * stat-data.
67657 + */
67658 + uid_t default_uid;
67659 +
67660 + /*
67661 + * default group id used for light-weight files without their own
67662 + * stat-data.
67663 + */
67664 + gid_t default_gid;
67665 +
67666 + /* mkfs identifier generated at mkfs time. */
67667 + __u32 mkfs_id;
67668 + /* amount of blocks in a file system */
67669 + __u64 block_count;
67670 +
67671 + /* inviolable reserve */
67672 + __u64 blocks_reserved;
67673 +
67674 + /* amount of blocks used by file system data and meta-data. */
67675 + __u64 blocks_used;
67676 +
67677 + /*
67678 + * amount of free blocks. This is "working" free blocks counter. It is
67679 + * like "working" bitmap, please see block_alloc.c for description.
67680 + */
67681 + __u64 blocks_free;
67682 +
67683 + /*
67684 + * free block count for fs committed state. This is "commit" version of
67685 + * free block counter.
67686 + */
67687 + __u64 blocks_free_committed;
67688 +
67689 + /*
67690 + * number of blocks reserved for further allocation, for all
67691 + * threads.
67692 + */
67693 + __u64 blocks_grabbed;
67694 +
67695 + /* number of fake allocated unformatted blocks in tree. */
67696 + __u64 blocks_fake_allocated_unformatted;
67697 +
67698 + /* number of fake allocated formatted blocks in tree. */
67699 + __u64 blocks_fake_allocated;
67700 +
67701 + /* number of blocks reserved for flush operations. */
67702 + __u64 blocks_flush_reserved;
67703 +
67704 + /* number of blocks reserved for cluster operations. */
67705 + __u64 blocks_clustered;
67706 +
67707 + /* unique file-system identifier */
67708 + __u32 fsuid;
67709 +
67710 +	/* On-disk format version. If it does not equal the disk_format
67711 +	   plugin version, some format updates (e.g. enlarging the plugin
67712 +	   set, etc.) may take place on mount. */
67713 + int version;
67714 +
67715 + /* file-system wide flags. See reiser4_fs_flag enum */
67716 + unsigned long fs_flags;
67717 +
67718 + /* transaction manager */
67719 + txn_mgr tmgr;
67720 +
67721 + /* ent thread */
67722 + entd_context entd;
67723 +
67724 + /* fake inode used to bind formatted nodes */
67725 + struct inode *fake;
67726 + /* inode used to bind bitmaps (and journal heads) */
67727 + struct inode *bitmap;
67728 + /* inode used to bind copied on capture nodes */
67729 + struct inode *cc;
67730 +
67731 + /* disk layout plugin */
67732 + disk_format_plugin *df_plug;
67733 +
67734 + /* disk layout specific part of reiser4 super info data */
67735 + union {
67736 + format40_super_info format40;
67737 + } u;
67738 +
67739 + /* value we return in st_blksize on stat(2) */
67740 + unsigned long optimal_io_size;
67741 +
67742 + /* parameters for the flush algorithm */
67743 + flush_params flush;
67744 +
67745 + /* pointers to jnodes for journal header and footer */
67746 + jnode *journal_header;
67747 + jnode *journal_footer;
67748 +
67749 + journal_location jloc;
67750 +
67751 + /* head block number of last committed transaction */
67752 + __u64 last_committed_tx;
67753 +
67754 + /*
67755 + * we remember last written location for using as a hint for new block
67756 + * allocation
67757 + */
67758 + __u64 blocknr_hint_default;
67759 +
67760 + /* committed number of files (oid allocator state variable ) */
67761 + __u64 nr_files_committed;
67762 +
67763 + ra_params_t ra_params;
67764 +
67765 + /*
67766 + * A mutex for serializing cut tree operation if out-of-free-space:
67767 + * the only one cut_tree thread is allowed to grab space from reserved
67768 + * area (it is 5% of disk space)
67769 + */
67770 + struct mutex delete_mutex;
67771 + /* task owning ->delete_mutex */
67772 + struct task_struct *delete_mutex_owner;
67773 +
67774 + /* Diskmap's blocknumber */
67775 + __u64 diskmap_block;
67776 +
67777 + /* What to do in case of error */
67778 + int onerror;
67779 +
67780 + /* operations for objects on this file system */
67781 + object_ops ops;
67782 +
67783 + /*
67784 + * structure to maintain d_cursors. See plugin/file_ops_readdir.c for
67785 + * more details
67786 + */
67787 + d_cursor_info d_info;
67788 +
67789 +#ifdef CONFIG_REISER4_BADBLOCKS
67790 + /* Alternative master superblock offset (in bytes) */
67791 + unsigned long altsuper;
67792 +#endif
67793 + struct repacker *repacker;
67794 + struct page *status_page;
67795 + struct bio *status_bio;
67796 +
67797 +#if REISER4_DEBUG
67798 + /*
67799 + * minimum used blocks value (includes super blocks, bitmap blocks and
67800 + * other fs reserved areas), depends on fs format and fs size.
67801 + */
67802 + __u64 min_blocks_used;
67803 +
67804 + /*
67805 + * when debugging is on, all jnodes (including znodes, bitmaps, etc.)
67806 + * are kept on a list anchored at sbinfo->all_jnodes. This list is
67807 + * protected by sbinfo->all_guard spin lock. This lock should be taken
67808 + * with _irq modifier, because it is also modified from interrupt
67809 + * contexts (by RCU).
67810 + */
67811 + spinlock_t all_guard;
67812 + /* list of all jnodes */
67813 + struct list_head all_jnodes;
67814 +#endif
67815 + struct dentry *debugfs_root;
67816 +};
67817 +
67818 +extern reiser4_super_info_data *get_super_private_nocheck(const struct
67819 + super_block *super);
67820 +
67821 +/* Return reiser4-specific part of super block */
67822 +static inline reiser4_super_info_data *get_super_private(const struct
67823 + super_block *super)
67824 +{
67825 + assert("nikita-447", super != NULL);
67826 +
67827 + return (reiser4_super_info_data *) super->s_fs_info;
67828 +}
67829 +
67830 +/* get ent context for the @super */
67831 +static inline entd_context *get_entd_context(struct super_block *super)
67832 +{
67833 + return &get_super_private(super)->entd;
67834 +}
67835 +
67836 +/* "Current" super-block: main super block used during current system
67837 + call. Reference to this super block is stored in reiser4_context. */
67838 +static inline struct super_block *reiser4_get_current_sb(void)
67839 +{
67840 + return get_current_context()->super;
67841 +}
67842 +
67843 +/* Reiser4-specific part of "current" super-block: main super block used
67844 + during current system call. Reference to this super block is stored in
67845 + reiser4_context. */
67846 +static inline reiser4_super_info_data *get_current_super_private(void)
67847 +{
67848 + return get_super_private(reiser4_get_current_sb());
67849 +}
67850 +
67851 +static inline ra_params_t *get_current_super_ra_params(void)
67852 +{
67853 + return &(get_current_super_private()->ra_params);
67854 +}
67855 +
67856 +/*
67857 + * true, if file system on @super is read-only
67858 + */
67859 +static inline int rofs_super(struct super_block *super)
67860 +{
67861 + return super->s_flags & MS_RDONLY;
67862 +}
67863 +
67864 +/*
67865 + * true, if @tree represents read-only file system
67866 + */
67867 +static inline int rofs_tree(reiser4_tree * tree)
67868 +{
67869 + return rofs_super(tree->super);
67870 +}
67871 +
67872 +/*
67873 + * true, if file system where @inode lives on, is read-only
67874 + */
67875 +static inline int rofs_inode(struct inode *inode)
67876 +{
67877 + return rofs_super(inode->i_sb);
67878 +}
67879 +
67880 +/*
67881 + * true, if file system where @node lives on, is read-only
67882 + */
67883 +static inline int rofs_jnode(jnode * node)
67884 +{
67885 + return rofs_tree(jnode_get_tree(node));
67886 +}
67887 +
67888 +extern __u64 reiser4_current_block_count(void);
67889 +
67890 +extern void build_object_ops(struct super_block *super, object_ops * ops);
67891 +
67892 +#define REISER4_SUPER_MAGIC 0x52345362 /* (*(__u32 *)"R4Sb"); */
67893 +
67894 +static inline void spin_lock_reiser4_super(reiser4_super_info_data *sbinfo)
67895 +{
67896 + spin_lock(&(sbinfo->guard));
67897 +}
67898 +
67899 +static inline void spin_unlock_reiser4_super(reiser4_super_info_data *sbinfo)
67900 +{
67901 + assert_spin_locked(&(sbinfo->guard));
67902 + spin_unlock(&(sbinfo->guard));
67903 +}
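The counters listed in the locking comment above must only change under ->guard; a minimal sketch of the intended usage of these helpers (illustrative, not in the patch):

	/* Hypothetical helper: return some free blocks to the counter. */
	static void put_back_free_blocks(struct super_block *super, __u64 nr)
	{
		reiser4_super_info_data *sbinfo = get_super_private(super);

		spin_lock_reiser4_super(sbinfo);
		sbinfo->blocks_free += nr;
		spin_unlock_reiser4_super(sbinfo);
	}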
67904 +
67905 +extern __u64 reiser4_flush_reserved(const struct super_block *);
67906 +extern int reiser4_is_set(const struct super_block *super, reiser4_fs_flag f);
67907 +extern long reiser4_statfs_type(const struct super_block *super);
67908 +extern __u64 reiser4_block_count(const struct super_block *super);
67909 +extern void reiser4_set_block_count(const struct super_block *super, __u64 nr);
67910 +extern __u64 reiser4_data_blocks(const struct super_block *super);
67911 +extern void reiser4_set_data_blocks(const struct super_block *super, __u64 nr);
67912 +extern __u64 reiser4_free_blocks(const struct super_block *super);
67913 +extern void reiser4_set_free_blocks(const struct super_block *super, __u64 nr);
67914 +extern __u32 reiser4_mkfs_id(const struct super_block *super);
67915 +
67916 +extern __u64 reiser4_free_committed_blocks(const struct super_block *super);
67917 +
67918 +extern __u64 reiser4_grabbed_blocks(const struct super_block *);
67919 +extern __u64 reiser4_fake_allocated(const struct super_block *);
67920 +extern __u64 reiser4_fake_allocated_unformatted(const struct super_block *);
67921 +extern __u64 reiser4_clustered_blocks(const struct super_block *);
67922 +
67923 +extern long reiser4_reserved_blocks(const struct super_block *super, uid_t uid,
67924 + gid_t gid);
67925 +
67926 +extern reiser4_space_allocator *
67927 +reiser4_get_space_allocator(const struct super_block *super);
67928 +extern reiser4_oid_allocator *
67929 +reiser4_get_oid_allocator(const struct super_block *super);
67930 +extern struct inode *reiser4_get_super_fake(const struct super_block *super);
67931 +extern struct inode *reiser4_get_cc_fake(const struct super_block *super);
67932 +extern struct inode *reiser4_get_bitmap_fake(const struct super_block *super);
67933 +extern reiser4_tree *reiser4_get_tree(const struct super_block *super);
67934 +extern int is_reiser4_super(const struct super_block *super);
67935 +
67936 +extern int reiser4_blocknr_is_sane(const reiser4_block_nr * blk);
67937 +extern int reiser4_blocknr_is_sane_for(const struct super_block *super,
67938 + const reiser4_block_nr * blk);
67939 +extern int reiser4_fill_super(struct super_block *s, void *data, int silent);
67940 +extern int reiser4_done_super(struct super_block *s);
67941 +
67942 +/* step of fill super */
67943 +extern int reiser4_init_fs_info(struct super_block *);
67944 +extern void reiser4_done_fs_info(struct super_block *);
67945 +extern int reiser4_init_super_data(struct super_block *, char *opt_string);
67946 +extern int reiser4_init_read_super(struct super_block *, int silent);
67947 +extern int reiser4_init_root_inode(struct super_block *);
67948 +extern reiser4_plugin *get_default_plugin(pset_member memb);
67949 +
67950 +/* Maximal possible object id. */
67951 +#define ABSOLUTE_MAX_OID ((oid_t)~0)
67952 +
67953 +#define OIDS_RESERVED ( 1 << 16 )
67954 +int oid_init_allocator(struct super_block *, oid_t nr_files, oid_t next);
67955 +oid_t oid_allocate(struct super_block *);
67956 +int oid_release(struct super_block *, oid_t);
67957 +oid_t oid_next(const struct super_block *);
67958 +void oid_count_allocated(void);
67959 +void oid_count_released(void);
67960 +long oids_used(const struct super_block *);
67961 +
67962 +#if REISER4_DEBUG
67963 +void print_fs_info(const char *prefix, const struct super_block *);
67964 +#endif
67965 +
67966 +extern void destroy_reiser4_cache(struct kmem_cache **);
67967 +
67968 +extern struct super_operations reiser4_super_operations;
67969 +extern struct export_operations reiser4_export_operations;
67970 +extern struct dentry_operations reiser4_dentry_operations;
67971 +
67972 +/* __REISER4_SUPER_H__ */
67973 +#endif
67974 +
67975 +/*
67976 + * Local variables:
67977 + * c-indentation-style: "K&R"
67978 + * mode-name: "LC"
67979 + * c-basic-offset: 8
67980 + * tab-width: 8
67981 + * fill-column: 120
67982 + * End:
67983 + */
67984 diff --git a/fs/reiser4/super_ops.c b/fs/reiser4/super_ops.c
67985 new file mode 100644
67986 index 0000000..41e9c1a
67987 --- /dev/null
67988 +++ b/fs/reiser4/super_ops.c
67989 @@ -0,0 +1,730 @@
67990 +/* Copyright 2005 by Hans Reiser, licensing governed by
67991 + * reiser4/README */
67992 +
67993 +#include "inode.h"
67994 +#include "page_cache.h"
67995 +#include "ktxnmgrd.h"
67996 +#include "flush.h"
67997 +#include "safe_link.h"
67998 +
67999 +#include <linux/vfs.h>
68000 +#include <linux/writeback.h>
68001 +#include <linux/mount.h>
68002 +#include <linux/seq_file.h>
68003 +#include <linux/debugfs.h>
68004 +
68005 +/* slab cache for inodes */
68006 +static struct kmem_cache *inode_cache;
68007 +
68008 +static struct dentry *reiser4_debugfs_root = NULL;
68009 +
68010 +/**
68011 + * init_once - constructor for reiser4 inodes
68012 + * @obj: inode to be initialized
68013 + * @cache: cache @obj belongs to
68014 + * @flags: SLAB flags
68015 + *
68016 + * Initialization function to be called when a new inode is allocated by the
68017 + * reiser4 inode cache. It is set on inode cache creation.
68018 + */
68019 +static void init_once(void *obj, struct kmem_cache *cache, unsigned long flags)
68020 +{
68021 + reiser4_inode_object *info;
68022 +
68023 + info = obj;
68024 +
68025 + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
68026 + SLAB_CTOR_CONSTRUCTOR) {
68027 + /* initialize vfs inode */
68028 + inode_init_once(&info->vfs_inode);
68029 +
68030 + /*
68031 +		 * initialize reiser4 specific part of the inode.
68032 + * NOTE-NIKITA add here initializations for locks, list heads,
68033 + * etc. that will be added to our private inode part.
68034 + */
68035 + INIT_LIST_HEAD(get_readdir_list(&info->vfs_inode));
68036 + init_rwsem(&info->p.conv_sem);
68037 + /* init semaphore which is used during inode loading */
68038 + loading_init_once(&info->p);
68039 + INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p),
68040 + GFP_ATOMIC);
68041 +#if REISER4_DEBUG
68042 + info->p.nr_jnodes = 0;
68043 +#endif
68044 + }
68045 +}
68046 +
68047 +/**
68048 + * init_inodes - create inode cache
68049 + *
68050 + * Initializes slab cache of inodes. It is part of reiser4 module initialization.
68051 + */
68052 +static int init_inodes(void)
68053 +{
68054 + inode_cache = kmem_cache_create("reiser4_inode",
68055 + sizeof(reiser4_inode_object),
68056 + 0,
68057 + SLAB_HWCACHE_ALIGN |
68058 + SLAB_RECLAIM_ACCOUNT, init_once, NULL);
68059 + if (inode_cache == NULL)
68060 + return RETERR(-ENOMEM);
68061 + return 0;
68062 +}
68063 +
68064 +/**
68065 + * done_inodes - delete inode cache
68066 + *
68067 + * This is called on reiser4 module unloading or system shutdown.
68068 + */
68069 +static void done_inodes(void)
68070 +{
68071 + destroy_reiser4_cache(&inode_cache);
68072 +}
68073 +
68074 +/**
68075 + * reiser4_alloc_inode - alloc_inode of super operations
68076 + * @super: super block new inode is allocated for
68077 + *
68078 + * Allocates new inode, initializes reiser4 specific part of it.
68079 + */
68080 +static struct inode *reiser4_alloc_inode(struct super_block *super)
68081 +{
68082 + reiser4_inode_object *obj;
68083 +
68084 + assert("nikita-1696", super != NULL);
68085 + obj = kmem_cache_alloc(inode_cache, reiser4_ctx_gfp_mask_get());
68086 + if (obj != NULL) {
68087 + reiser4_inode *info;
68088 +
68089 + info = &obj->p;
68090 +
68091 + info->pset = plugin_set_get_empty();
68092 + info->hset = plugin_set_get_empty();
68093 + info->extmask = 0;
68094 + info->locality_id = 0ull;
68095 + info->plugin_mask = 0;
68096 + info->heir_mask = 0;
68097 +#if !REISER4_INO_IS_OID
68098 + info->oid_hi = 0;
68099 +#endif
68100 + reiser4_seal_init(&info->sd_seal, NULL, NULL);
68101 + coord_init_invalid(&info->sd_coord, NULL);
68102 + info->flags = 0;
68103 + spin_lock_init(&info->guard);
68104 + /* this deals with info's loading semaphore */
68105 + loading_alloc(info);
68106 + info->vroot = UBER_TREE_ADDR;
68107 + return &obj->vfs_inode;
68108 + } else
68109 + return NULL;
68110 +}
68111 +
68112 +/**
68113 + * reiser4_destroy_inode - destroy_inode of super operations
68114 + * @inode: inode being destroyed
68115 + *
68116 + * Puts reiser4 specific portion of inode, frees memory occupied by inode.
68117 + */
68118 +static void reiser4_destroy_inode(struct inode *inode)
68119 +{
68120 + reiser4_inode *info;
68121 +
68122 + info = reiser4_inode_data(inode);
68123 +
68124 + assert("vs-1220", inode_has_no_jnodes(info));
68125 +
68126 + if (!is_bad_inode(inode) && is_inode_loaded(inode)) {
68127 + file_plugin *fplug = inode_file_plugin(inode);
68128 + if (fplug->destroy_inode != NULL)
68129 + fplug->destroy_inode(inode);
68130 + }
68131 + reiser4_dispose_cursors(inode);
68132 + if (info->pset)
68133 + plugin_set_put(info->pset);
68134 + if (info->hset)
68135 + plugin_set_put(info->hset);
68136 +
68137 + /*
68138 +	 * cannot add a similar assertion about ->i_list as prune_icache returns
68139 +	 * inodes into the slab with dangling ->list.{next,prev}. This is safe,
68140 +	 * because they are re-initialized in new_inode().
68141 + */
68142 + assert("nikita-2895", list_empty(&inode->i_dentry));
68143 + assert("nikita-2896", hlist_unhashed(&inode->i_hash));
68144 + assert("nikita-2898", list_empty_careful(get_readdir_list(inode)));
68145 +
68146 + /* this deals with info's loading semaphore */
68147 + loading_destroy(info);
68148 +
68149 + kmem_cache_free(inode_cache,
68150 + container_of(info, reiser4_inode_object, p));
68151 +}
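Both functions rely on the embedded-inode idiom: the VFS inode lives inside reiser4_inode_object, so alloc hands out &obj->vfs_inode and destroy maps it back with container_of. Spelled out as a sketch (illustrative; the patch's own reiser4_inode_data() presumably plays this role):

	/* Illustrative only: recover the containing object from a VFS inode. */
	static inline reiser4_inode *inode_to_info(struct inode *inode)
	{
		reiser4_inode_object *obj =
			container_of(inode, reiser4_inode_object, vfs_inode);

		return &obj->p;
	}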
68152 +
68153 +/**
68154 + * reiser4_dirty_inode - dirty_inode of super operations
68155 + * @inode: inode being dirtied
68156 + *
68157 + * Updates stat data.
68158 + */
68159 +static void reiser4_dirty_inode(struct inode *inode)
68160 +{
68161 + int result;
68162 +
68163 + if (!is_in_reiser4_context())
68164 + return;
68165 + assert("", !IS_RDONLY(inode));
68166 + assert("", (inode_file_plugin(inode)->estimate.update(inode) <=
68167 + get_current_context()->grabbed_blocks));
68168 +
68169 + result = reiser4_update_sd(inode);
68170 + if (result)
68171 + warning("", "failed to dirty inode for %llu: %d",
68172 + get_inode_oid(inode), result);
68173 +}
68174 +
68175 +/**
68176 + * reiser4_delete_inode - delete_inode of super operations
68177 + * @inode: inode to delete
68178 + *
68179 + * Calls file plugin's delete_object method to delete object items from
68180 + * filesystem tree and calls clear_inode.
68181 + */
68182 +static void reiser4_delete_inode(struct inode *inode)
68183 +{
68184 + reiser4_context *ctx;
68185 + file_plugin *fplug;
68186 +
68187 + ctx = reiser4_init_context(inode->i_sb);
68188 + if (IS_ERR(ctx)) {
68189 + warning("vs-15", "failed to init context");
68190 + return;
68191 + }
68192 +
68193 + if (is_inode_loaded(inode)) {
68194 + fplug = inode_file_plugin(inode);
68195 + if (fplug != NULL && fplug->delete_object != NULL)
68196 + fplug->delete_object(inode);
68197 + }
68198 +
68199 + truncate_inode_pages(&inode->i_data, 0);
68200 + inode->i_blocks = 0;
68201 + clear_inode(inode);
68202 + reiser4_exit_context(ctx);
68203 +}
68204 +
68205 +/**
68206 + * reiser4_put_super - put_super of super operations
68207 + * @super: super block to free
68208 + *
68209 + * Stops daemons and releases resources; umounts, in short.
68210 + */
68211 +static void reiser4_put_super(struct super_block *super)
68212 +{
68213 + reiser4_super_info_data *sbinfo;
68214 + reiser4_context *ctx;
68215 +
68216 + sbinfo = get_super_private(super);
68217 + assert("vs-1699", sbinfo);
68218 +
68219 + debugfs_remove(sbinfo->tmgr.debugfs_atom_count);
68220 + debugfs_remove(sbinfo->tmgr.debugfs_id_count);
68221 + debugfs_remove(sbinfo->debugfs_root);
68222 +
68223 + ctx = reiser4_init_context(super);
68224 + if (IS_ERR(ctx)) {
68225 + warning("vs-17", "failed to init context");
68226 + return;
68227 + }
68228 +
68229 +	/* have the disk format plugin free its resources */
68230 + if (get_super_private(super)->df_plug->release)
68231 + get_super_private(super)->df_plug->release(super);
68232 +
68233 + reiser4_done_formatted_fake(super);
68234 +
68235 + /* stop daemons: ktxnmgr and entd */
68236 + reiser4_done_entd(super);
68237 + reiser4_done_ktxnmgrd(super);
68238 + reiser4_done_txnmgr(&sbinfo->tmgr);
68239 +
68240 + reiser4_done_fs_info(super);
68241 + reiser4_exit_context(ctx);
68242 +}
68243 +
68244 +/**
68245 + * reiser4_write_super - write_super of super operations
68246 + * @super: super block to write
68247 + *
68248 + * Captures the znode associated with the super block and commits all transactions.
68249 + */
68250 +static void reiser4_write_super(struct super_block *super)
68251 +{
68252 + int ret;
68253 + reiser4_context *ctx;
68254 +
68255 + assert("vs-1700", !rofs_super(super));
68256 +
68257 + ctx = reiser4_init_context(super);
68258 + if (IS_ERR(ctx)) {
68259 + warning("vs-16", "failed to init context");
68260 + return;
68261 + }
68262 +
68263 + ret = reiser4_capture_super_block(super);
68264 + if (ret != 0)
68265 + warning("vs-1701",
68266 + "reiser4_capture_super_block failed in write_super: %d",
68267 + ret);
68268 + ret = txnmgr_force_commit_all(super, 0);
68269 + if (ret != 0)
68270 + warning("jmacd-77113",
68271 + "txn_force failed in write_super: %d", ret);
68272 +
68273 + super->s_dirt = 0;
68274 +
68275 + reiser4_exit_context(ctx);
68276 +}
68277 +
68278 +/**
68279 + * reiser4_statfs - statfs of super operations
68280 + * @super: super block of the file system being queried
68281 + * @statfs: buffer to fill with statistics
68282 + *
68283 + * Returns information about the filesystem.
68284 + */
68285 +static int reiser4_statfs(struct dentry *dentry, struct kstatfs *statfs)
68286 +{
68287 + sector_t total;
68288 + sector_t reserved;
68289 + sector_t free;
68290 + sector_t forroot;
68291 + sector_t deleted;
68292 + reiser4_context *ctx;
68293 + struct super_block *super = dentry->d_sb;
68294 +
68295 + assert("nikita-408", super != NULL);
68296 + assert("nikita-409", statfs != NULL);
68297 +
68298 + ctx = reiser4_init_context(super);
68299 + if (IS_ERR(ctx))
68300 + return PTR_ERR(ctx);
68301 +
68302 + statfs->f_type = reiser4_statfs_type(super);
68303 + statfs->f_bsize = super->s_blocksize;
68304 +
68305 + /*
68306 + * 5% of total block space is reserved. This is needed for flush and
68307 + * for truncates (so that we are able to perform truncate/unlink even
68308 + * on the otherwise completely full file system). If this reservation
68309 + * is hidden from statfs(2), users will mistakenly guess that they
68310 + * have enough free space to complete some operation, which is
68311 + * frustrating.
68312 + *
68313 + * Another possible solution is to subtract ->blocks_reserved from
68314 + * ->f_bfree, but changing available space seems less intrusive than
68315 + * letting the user see 5% of the disk space as used directly after
68316 + * mkfs.
68317 + */
68318 + total = reiser4_block_count(super);
68319 + reserved = get_super_private(super)->blocks_reserved;
68320 + deleted = txnmgr_count_deleted_blocks();
68321 + free = reiser4_free_blocks(super) + deleted;
68322 + forroot = reiser4_reserved_blocks(super, 0, 0);
68323 +
68324 + /*
68325 + * These counters may be in inconsistent state because we take the
68326 + * values without keeping any global spinlock. Here we do a sanity
68327 + * check that free block counter does not exceed the number of all
68328 + * blocks.
68329 + */
68330 + if (free > total)
68331 + free = total;
68332 + statfs->f_blocks = total - reserved;
68333 + /* make sure statfs->f_bfree is never larger than statfs->f_blocks */
68334 + if (free > reserved)
68335 + free -= reserved;
68336 + else
68337 + free = 0;
68338 + statfs->f_bfree = free;
68339 +
68340 + if (free > forroot)
68341 + free -= forroot;
68342 + else
68343 + free = 0;
68344 + statfs->f_bavail = free;
68345 +
68346 + statfs->f_files = 0;
68347 + statfs->f_ffree = 0;
68348 +
68349 + /* maximal acceptable name length depends on directory plugin. */
68350 + assert("nikita-3351", super->s_root->d_inode != NULL);
68351 + statfs->f_namelen = reiser4_max_filename_len(super->s_root->d_inode);
68352 + reiser4_exit_context(ctx);
68353 + return 0;
68354 +}
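The clamping above is easy to trace with numbers. On a 1,000,000-block FS with 49,804 reserved blocks, 300,000 free blocks (including deleted) and no root reservation: f_blocks = 950,196 and f_bfree = f_bavail = 250,196. A userspace mirror of the same arithmetic, illustrative only:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long total = 1000000, reserved = 49804, forroot = 0;
		unsigned long long free = 300000;
		unsigned long long f_blocks, f_bfree, f_bavail;

		if (free > total)			/* sanity clamp */
			free = total;
		f_blocks = total - reserved;		/* hide the 5% reserve */
		f_bfree = free > reserved ? free - reserved : 0;
		f_bavail = f_bfree > forroot ? f_bfree - forroot : 0;

		printf("%llu %llu %llu\n", f_blocks, f_bfree, f_bavail);
		return 0;
	}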
68355 +
68356 +/**
68357 + * reiser4_clear_inode - clear_inode of super operation
68358 + * @inode: inode about to destroy
68359 + *
68360 + * Does sanity checks: the inode being destroyed should have all jnodes detached.
68361 + */
68362 +static void reiser4_clear_inode(struct inode *inode)
68363 +{
68364 +#if REISER4_DEBUG
68365 + reiser4_inode *r4_inode;
68366 +
68367 + r4_inode = reiser4_inode_data(inode);
68368 + if (!inode_has_no_jnodes(r4_inode))
68369 + warning("vs-1732", "reiser4 inode has %ld jnodes\n",
68370 + r4_inode->nr_jnodes);
68371 +#endif
68372 +}
68373 +
68374 +/**
68375 + * reiser4_sync_inodes - sync_inodes of super operations
68376 + * @super:
68377 + * @wbc:
68378 + *
68379 + * This method is called by background and non-background writeback. Reiser4's
68380 + * implementation uses generic_sync_sb_inodes to call reiser4_writepages for
68381 + * each of the dirty inodes. reiser4_writepages handles pages dirtied via a
68382 + * shared mapping: dirty pages get into atoms. Writeout is then called to
68383 + * flush some atoms.
68384 + */
68385 +static void reiser4_sync_inodes(struct super_block *super,
68386 + struct writeback_control *wbc)
68387 +{
68388 + reiser4_context *ctx;
68389 + long to_write;
68390 +
68391 + if (wbc->for_kupdate)
68392 + /* reiser4 has its own means of periodical write-out */
68393 + return;
68394 +
68395 + to_write = wbc->nr_to_write;
68396 + assert("vs-49", wbc->older_than_this == NULL);
68397 +
68398 + ctx = reiser4_init_context(super);
68399 + if (IS_ERR(ctx)) {
68400 + warning("vs-13", "failed to init context");
68401 + return;
68402 + }
68403 +
68404 + /*
68405 +	 * call reiser4_writepages for each of the dirty inodes to turn dirty
68406 +	 * pages into transactions if they are not part of one yet.
68407 + */
68408 + generic_sync_sb_inodes(super, wbc);
68409 +
68410 + /* flush goes here */
68411 + wbc->nr_to_write = to_write;
68412 + reiser4_writeout(super, wbc);
68413 +
68414 + /* avoid recursive calls to ->sync_inodes */
68415 + context_set_commit_async(ctx);
68416 + reiser4_exit_context(ctx);
68417 +}
68418 +
68419 +/**
68420 + * reiser4_show_options - show_options of super operations
68421 + * @m: file where to write information
68422 + * @mnt: mount structure
68423 + *
68424 + * Makes reiser4 mount options visible in /proc/mounts.
68425 + */
68426 +static int reiser4_show_options(struct seq_file *m, struct vfsmount *mnt)
68427 +{
68428 + struct super_block *super;
68429 + reiser4_super_info_data *sbinfo;
68430 +
68431 + super = mnt->mnt_sb;
68432 + sbinfo = get_super_private(super);
68433 +
68434 + seq_printf(m, ",atom_max_size=0x%x", sbinfo->tmgr.atom_max_size);
68435 + seq_printf(m, ",atom_max_age=0x%x", sbinfo->tmgr.atom_max_age);
68436 + seq_printf(m, ",atom_min_size=0x%x", sbinfo->tmgr.atom_min_size);
68437 + seq_printf(m, ",atom_max_flushers=0x%x",
68438 + sbinfo->tmgr.atom_max_flushers);
68439 + seq_printf(m, ",cbk_cache_slots=0x%x",
68440 + sbinfo->tree.cbk_cache.nr_slots);
68441 +
68442 + return 0;
68443 +}
68444 +
68445 +struct super_operations reiser4_super_operations = {
68446 + .alloc_inode = reiser4_alloc_inode,
68447 + .destroy_inode = reiser4_destroy_inode,
68448 + .dirty_inode = reiser4_dirty_inode,
68449 + .delete_inode = reiser4_delete_inode,
68450 + .put_super = reiser4_put_super,
68451 + .write_super = reiser4_write_super,
68452 + .statfs = reiser4_statfs,
68453 + .clear_inode = reiser4_clear_inode,
68454 + .sync_inodes = reiser4_sync_inodes,
68455 + .show_options = reiser4_show_options
68456 +};
68457 +
68458 +/**
68459 + * fill_super - initialize super block on mount
68460 + * @super: super block to fill
68461 + * @data: reiser4 specific mount options
68462 + * @silent:
68463 + *
68464 + * This is to be called by reiser4_get_sb. Mounts the filesystem.
68465 + */
68466 +static int fill_super(struct super_block *super, void *data, int silent)
68467 +{
68468 + reiser4_context ctx;
68469 + int result;
68470 + reiser4_super_info_data *sbinfo;
68471 +
68472 + assert("zam-989", super != NULL);
68473 +
68474 + super->s_op = NULL;
68475 + init_stack_context(&ctx, super);
68476 +
68477 + /* allocate reiser4 specific super block */
68478 + if ((result = reiser4_init_fs_info(super)) != 0)
68479 + goto failed_init_sinfo;
68480 +
68481 + sbinfo = get_super_private(super);
68482 + /* initialize various reiser4 parameters, parse mount options */
68483 + if ((result = reiser4_init_super_data(super, data)) != 0)
68484 + goto failed_init_super_data;
68485 +
68486 + /* read reiser4 master super block, initialize disk format plugin */
68487 + if ((result = reiser4_init_read_super(super, silent)) != 0)
68488 + goto failed_init_read_super;
68489 +
68490 + /* initialize transaction manager */
68491 + reiser4_init_txnmgr(&sbinfo->tmgr);
68492 +
68493 + /* initialize ktxnmgrd context and start kernel thread ktxnmrgd */
68494 + if ((result = reiser4_init_ktxnmgrd(super)) != 0)
68495 + goto failed_init_ktxnmgrd;
68496 +
68497 + /* initialize entd context and start kernel thread entd */
68498 + if ((result = reiser4_init_entd(super)) != 0)
68499 + goto failed_init_entd;
68500 +
68501 + /* initialize address spaces for formatted nodes and bitmaps */
68502 + if ((result = reiser4_init_formatted_fake(super)) != 0)
68503 + goto failed_init_formatted_fake;
68504 +
68505 + /* initialize disk format plugin */
68506 + if ((result = get_super_private(super)->df_plug->init_format(super, data)) != 0 )
68507 + goto failed_init_disk_format;
68508 +
68509 + /*
68510 + * There are some 'committed' versions of reiser4 super block counters,
68511 + * which correspond to reiser4 on-disk state. These counters are
68512 + * initialized here
68513 + */
68514 + sbinfo->blocks_free_committed = sbinfo->blocks_free;
68515 + sbinfo->nr_files_committed = oids_used(super);
68516 +
68517 + /* get inode of root directory */
68518 + if ((result = reiser4_init_root_inode(super)) != 0)
68519 + goto failed_init_root_inode;
68520 +
68521 + if ((result = get_super_private(super)->df_plug->version_update(super)) != 0 )
68522 + goto failed_update_format_version;
68523 +
68524 + process_safelinks(super);
68525 + reiser4_exit_context(&ctx);
68526 +
68527 + sbinfo->debugfs_root = debugfs_create_dir(super->s_id,
68528 + reiser4_debugfs_root);
68529 + if (sbinfo->debugfs_root) {
68530 + sbinfo->tmgr.debugfs_atom_count =
68531 + debugfs_create_u32("atom_count", S_IFREG|S_IRUSR,
68532 + sbinfo->debugfs_root,
68533 + &sbinfo->tmgr.atom_count);
68534 + sbinfo->tmgr.debugfs_id_count =
68535 + debugfs_create_u32("id_count", S_IFREG|S_IRUSR,
68536 + sbinfo->debugfs_root,
68537 + &sbinfo->tmgr.id_count);
68538 + }
68539 + return 0;
68540 +
68541 + failed_update_format_version:
68542 + failed_init_root_inode:
68543 + if (sbinfo->df_plug->release)
68544 + sbinfo->df_plug->release(super);
68545 + failed_init_disk_format:
68546 + reiser4_done_formatted_fake(super);
68547 + failed_init_formatted_fake:
68548 + reiser4_done_entd(super);
68549 + failed_init_entd:
68550 + reiser4_done_ktxnmgrd(super);
68551 + failed_init_ktxnmgrd:
68552 + reiser4_done_txnmgr(&sbinfo->tmgr);
68553 + failed_init_read_super:
68554 + failed_init_super_data:
68555 + reiser4_done_fs_info(super);
68556 + failed_init_sinfo:
68557 + reiser4_exit_context(&ctx);
68558 + return result;
68559 +}
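fill_super() is a textbook instance of the kernel's goto-ladder idiom: each failure label undoes exactly the initializations that succeeded before the failure, in reverse order. Reduced to two steps (step_a, step_b and undo_a are hypothetical names):

	static int two_step_init(void)
	{
		int ret;

		ret = step_a();		/* hypothetical first initialization */
		if (ret)
			goto fail_a;
		ret = step_b();		/* hypothetical second initialization */
		if (ret)
			goto fail_b;
		return 0;

	 fail_b:
		undo_a();		/* unwind only what succeeded, in reverse */
	 fail_a:
		return ret;
	}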
68560 +
68561 +/**
68562 + * reiser4_get_sb - get_sb of file_system_type operations
68563 + * @fs_type:
68564 + * @flags: mount flags MS_RDONLY, MS_VERBOSE, etc
68565 + * @dev_name: block device file name
68566 + * @data: specific mount options
68567 + *
68568 + * Reiser4 mount entry.
68569 + */
68570 +static int reiser4_get_sb(struct file_system_type *fs_type,
68571 + int flags,
68572 + const char *dev_name,
68573 + void *data,
68574 + struct vfsmount *mnt)
68575 +{
68576 + return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
68577 +}
68578 +
68579 +/* structure describing the reiser4 filesystem implementation */
68580 +static struct file_system_type reiser4_fs_type = {
68581 + .owner = THIS_MODULE,
68582 + .name = "reiser4",
68583 + .fs_flags = FS_REQUIRES_DEV,
68584 + .get_sb = reiser4_get_sb,
68585 + .kill_sb = kill_block_super,
68586 + .next = NULL
68587 +};
68588 +
68589 +void destroy_reiser4_cache(struct kmem_cache **cachep)
68590 +{
68591 + kmem_cache_destroy(*cachep);
68592 + *cachep = NULL;
68593 +}
68594 +
68595 +/**
68596 + * init_reiser4 - reiser4 initialization entry point
68597 + *
68598 + * Initializes reiser4 slabs, registers reiser4 filesystem type. It is called
68599 + * on kernel initialization or during reiser4 module load.
68600 + */
68601 +static int __init init_reiser4(void)
68602 +{
68603 + int result;
68604 +
68605 + printk(KERN_INFO
68606 + "Loading Reiser4. "
68607 + "See www.namesys.com for a description of Reiser4.\n");
68608 +
68609 + /* initialize slab cache of inodes */
68610 + if ((result = init_inodes()) != 0)
68611 + goto failed_inode_cache;
68612 +
68613 + /* initialize cache of znodes */
68614 + if ((result = init_znodes()) != 0)
68615 + goto failed_init_znodes;
68616 +
68617 + /* initialize all plugins */
68618 + if ((result = init_plugins()) != 0)
68619 + goto failed_init_plugins;
68620 +
68621 + /* initialize cache of plugin_set-s and plugin_set's hash table */
68622 + if ((result = init_plugin_set()) != 0)
68623 + goto failed_init_plugin_set;
68624 +
68625 + /* initialize caches of txn_atom-s and txn_handle-s */
68626 + if ((result = init_txnmgr_static()) != 0)
68627 + goto failed_init_txnmgr_static;
68628 +
68629 + /* initialize cache of jnodes */
68630 + if ((result = init_jnodes()) != 0)
68631 + goto failed_init_jnodes;
68632 +
68633 + /* initialize cache of flush queues */
68634 + if ((result = reiser4_init_fqs()) != 0)
68635 + goto failed_init_fqs;
68636 +
68637 + /* initialize cache of structures attached to dentry->d_fsdata */
68638 + if ((result = reiser4_init_dentry_fsdata()) != 0)
68639 + goto failed_init_dentry_fsdata;
68640 +
68641 + /* initialize cache of structures attached to file->private_data */
68642 + if ((result = reiser4_init_file_fsdata()) != 0)
68643 + goto failed_init_file_fsdata;
68644 +
68645 + /*
68646 + * initialize cache of d_cursors. See plugin/file_ops_readdir.c for
68647 + * more details
68648 + */
68649 + if ((result = reiser4_init_d_cursor()) != 0)
68650 + goto failed_init_d_cursor;
68651 +
68652 + if ((result = register_filesystem(&reiser4_fs_type)) == 0) {
68653 + reiser4_debugfs_root = debugfs_create_dir("reiser4", NULL);
68654 + return 0;
68655 + }
68656 +
68657 + reiser4_done_d_cursor();
68658 + failed_init_d_cursor:
68659 + reiser4_done_file_fsdata();
68660 + failed_init_file_fsdata:
68661 + reiser4_done_dentry_fsdata();
68662 + failed_init_dentry_fsdata:
68663 + reiser4_done_fqs();
68664 + failed_init_fqs:
68665 + done_jnodes();
68666 + failed_init_jnodes:
68667 + done_txnmgr_static();
68668 + failed_init_txnmgr_static:
68669 + done_plugin_set();
68670 + failed_init_plugin_set:
68671 + failed_init_plugins:
68672 + done_znodes();
68673 + failed_init_znodes:
68674 + done_inodes();
68675 + failed_inode_cache:
68676 + return result;
68677 +}
68678 +
68679 +/**
68680 + * done_reiser4 - reiser4 exit entry point
68681 + *
68682 + * Unregisters the reiser4 filesystem type and deletes caches. It is called on shutdown
68683 + * or at module unload.
68684 + */
68685 +static void __exit done_reiser4(void)
68686 +{
68687 + int result;
68688 +
68689 + debugfs_remove(reiser4_debugfs_root);
68690 + result = unregister_filesystem(&reiser4_fs_type);
68691 + BUG_ON(result != 0);
68692 + reiser4_done_d_cursor();
68693 + reiser4_done_file_fsdata();
68694 + reiser4_done_dentry_fsdata();
68695 + reiser4_done_fqs();
68696 + done_jnodes();
68697 + done_txnmgr_static();
68698 + done_plugin_set();
68699 + done_znodes();
68700 + destroy_reiser4_cache(&inode_cache);
68701 +}
68702 +
68703 +module_init(init_reiser4);
68704 +module_exit(done_reiser4);
68705 +
68706 +MODULE_DESCRIPTION("Reiser4 filesystem");
68707 +MODULE_AUTHOR("Hans Reiser <Reiser@Namesys.COM>");
68708 +
68709 +MODULE_LICENSE("GPL");
68710 +
68711 +/*
68712 + * Local variables:
68713 + * c-indentation-style: "K&R"
68714 + * mode-name: "LC"
68715 + * c-basic-offset: 8
68716 + * tab-width: 8
68717 + * fill-column: 79
68718 + * End:
68719 + */
68720 diff --git a/fs/reiser4/tap.c b/fs/reiser4/tap.c
68721 new file mode 100644
68722 index 0000000..cfa5179
68723 --- /dev/null
68724 +++ b/fs/reiser4/tap.c
68725 @@ -0,0 +1,377 @@
68726 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
68727 + * reiser4/README */
68728 +
68729 +/*
68730 + Tree Access Pointer (tap).
68731 +
68732 + tap is data structure combining coord and lock handle (mostly). It is
68733 + useful when one has to scan tree nodes (for example, in readdir, or flush),
68734 + for tap functions allow to move tap in either direction transparently
68735 + crossing unit/item/node borders.
68736 +
68737 + Tap doesn't provide automatic synchronization of its fields as it is
68738 + supposed to be per-thread object.
68739 +*/
68740 +
68741 +#include "forward.h"
68742 +#include "debug.h"
68743 +#include "coord.h"
68744 +#include "tree.h"
68745 +#include "context.h"
68746 +#include "tap.h"
68747 +#include "znode.h"
68748 +#include "tree_walk.h"
68749 +
68750 +#if REISER4_DEBUG
68751 +static int tap_invariant(const tap_t * tap);
68752 +static void tap_check(const tap_t * tap);
68753 +#else
68754 +#define tap_check(tap) noop
68755 +#endif
68756 +
68757 +/** load node tap is pointing to, if not loaded already */
68758 +int reiser4_tap_load(tap_t * tap)
68759 +{
68760 + tap_check(tap);
68761 + if (tap->loaded == 0) {
68762 + int result;
68763 +
68764 + result = zload_ra(tap->coord->node, &tap->ra_info);
68765 + if (result != 0)
68766 + return result;
68767 + coord_clear_iplug(tap->coord);
68768 + }
68769 + ++tap->loaded;
68770 + tap_check(tap);
68771 + return 0;
68772 +}
68773 +
68774 +/** release node tap is pointing to. Dual to tap_load() */
68775 +void reiser4_tap_relse(tap_t * tap)
68776 +{
68777 + tap_check(tap);
68778 + if (tap->loaded > 0) {
68779 + --tap->loaded;
68780 + if (tap->loaded == 0) {
68781 + zrelse(tap->coord->node);
68782 + }
68783 + }
68784 + tap_check(tap);
68785 +}
68786 +
68787 +/**
68788 + * init tap to consist of @coord and @lh. Locks on nodes will be acquired with
68789 + * @mode
68790 + */
68791 +void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
68792 + znode_lock_mode mode)
68793 +{
68794 + tap->coord = coord;
68795 + tap->lh = lh;
68796 + tap->mode = mode;
68797 + tap->loaded = 0;
68798 + INIT_LIST_HEAD(&tap->linkage);
68799 + reiser4_init_ra_info(&tap->ra_info);
68800 +}
68801 +
68802 +/** add @tap to the per-thread list of all taps */
68803 +void reiser4_tap_monitor(tap_t * tap)
68804 +{
68805 + assert("nikita-2623", tap != NULL);
68806 + tap_check(tap);
68807 + list_add(&tap->linkage, reiser4_taps_list());
68808 + tap_check(tap);
68809 +}
68810 +
68811 +/* duplicate @src into @dst. Copy lock handle. @dst is not initially
68812 + * loaded. */
68813 +void reiser4_tap_copy(tap_t * dst, tap_t * src)
68814 +{
68815 + assert("nikita-3193", src != NULL);
68816 + assert("nikita-3194", dst != NULL);
68817 +
68818 + *dst->coord = *src->coord;
68819 + if (src->lh->node)
68820 + copy_lh(dst->lh, src->lh);
68821 + dst->mode = src->mode;
68822 + dst->loaded = 0;
68823 + INIT_LIST_HEAD(&dst->linkage);
68824 + dst->ra_info = src->ra_info;
68825 +}
68826 +
68827 +/** finish with @tap */
68828 +void reiser4_tap_done(tap_t * tap)
68829 +{
68830 + assert("nikita-2565", tap != NULL);
68831 + tap_check(tap);
68832 + if (tap->loaded > 0)
68833 + zrelse(tap->coord->node);
68834 + done_lh(tap->lh);
68835 + tap->loaded = 0;
68836 + list_del_init(&tap->linkage);
68837 + tap->coord->node = NULL;
68838 +}
68839 +
68840 +/**
68841 + * move @tap to the new node, locked with @target. Load @target, if @tap was
68842 + * already loaded.
68843 + */
68844 +int reiser4_tap_move(tap_t * tap, lock_handle * target)
68845 +{
68846 + int result = 0;
68847 +
68848 + assert("nikita-2567", tap != NULL);
68849 + assert("nikita-2568", target != NULL);
68850 + assert("nikita-2570", target->node != NULL);
68851 + assert("nikita-2569", tap->coord->node == tap->lh->node);
68852 +
68853 + tap_check(tap);
68854 + if (tap->loaded > 0)
68855 + result = zload_ra(target->node, &tap->ra_info);
68856 +
68857 + if (result == 0) {
68858 + if (tap->loaded > 0)
68859 + zrelse(tap->coord->node);
68860 + done_lh(tap->lh);
68861 + copy_lh(tap->lh, target);
68862 + tap->coord->node = target->node;
68863 + coord_clear_iplug(tap->coord);
68864 + }
68865 + tap_check(tap);
68866 + return result;
68867 +}
68868 +
68869 +/**
68870 + * move @tap to @target, acquiring a lock on @target if @tap is not
68871 + * already at it.
68872 + */
68873 +static int tap_to(tap_t * tap, znode * target)
68874 +{
68875 + int result;
68876 +
68877 + assert("nikita-2624", tap != NULL);
68878 + assert("nikita-2625", target != NULL);
68879 +
68880 + tap_check(tap);
68881 + result = 0;
68882 + if (tap->coord->node != target) {
68883 + lock_handle here;
68884 +
68885 + init_lh(&here);
68886 + result = longterm_lock_znode(&here, target,
68887 + tap->mode, ZNODE_LOCK_HIPRI);
68888 + if (result == 0) {
68889 + result = reiser4_tap_move(tap, &here);
68890 + done_lh(&here);
68891 + }
68892 + }
68893 + tap_check(tap);
68894 + return result;
68895 +}
68896 +
68897 +/**
68898 + * move @tap to given @target, loading and locking @target->node if
68899 + * necessary
68900 + */
68901 +int tap_to_coord(tap_t * tap, coord_t * target)
68902 +{
68903 + int result;
68904 +
68905 + tap_check(tap);
68906 + result = tap_to(tap, target->node);
68907 + if (result == 0)
68908 + coord_dup(tap->coord, target);
68909 + tap_check(tap);
68910 + return result;
68911 +}
68912 +
68913 +/** return list of all taps */
68914 +struct list_head *reiser4_taps_list(void)
68915 +{
68916 + return &get_current_context()->taps;
68917 +}
68918 +
68919 +/** helper function for go_{next,prev}_{item,unit,node}() */
68920 +int go_dir_el(tap_t * tap, sideof dir, int units_p)
68921 +{
68922 + coord_t dup;
68923 + coord_t *coord;
68924 + int result;
68925 +
68926 + int (*coord_dir) (coord_t *);
68927 + int (*get_dir_neighbor) (lock_handle *, znode *, int, int);
68928 + void (*coord_init) (coord_t *, const znode *);
68929 + ON_DEBUG(int (*coord_check) (const coord_t *));
68930 +
68931 + assert("nikita-2556", tap != NULL);
68932 + assert("nikita-2557", tap->coord != NULL);
68933 + assert("nikita-2558", tap->lh != NULL);
68934 + assert("nikita-2559", tap->coord->node != NULL);
68935 +
68936 + tap_check(tap);
68937 + if (dir == LEFT_SIDE) {
68938 + coord_dir = units_p ? coord_prev_unit : coord_prev_item;
68939 + get_dir_neighbor = reiser4_get_left_neighbor;
68940 + coord_init = coord_init_last_unit;
68941 + } else {
68942 + coord_dir = units_p ? coord_next_unit : coord_next_item;
68943 + get_dir_neighbor = reiser4_get_right_neighbor;
68944 + coord_init = coord_init_first_unit;
68945 + }
68946 + ON_DEBUG(coord_check =
68947 + units_p ? coord_is_existing_unit : coord_is_existing_item);
68948 + assert("nikita-2560", coord_check(tap->coord));
68949 +
68950 + coord = tap->coord;
68951 + coord_dup(&dup, coord);
68952 + if (coord_dir(&dup) != 0) {
68953 + do {
68954 + /* move to the neighboring node in direction @dir */
68955 + lock_handle lh_dup;
68956 +
68957 + init_lh(&lh_dup);
68958 + result =
68959 + get_dir_neighbor(&lh_dup, coord->node, (int)tap->mode,
68960 + GN_CAN_USE_UPPER_LEVELS);
68961 + if (result == 0) {
68962 + result = reiser4_tap_move(tap, &lh_dup);
68963 + if (result == 0)
68964 + coord_init(tap->coord, lh_dup.node);
68965 + done_lh(&lh_dup);
68966 + }
68967 + /* skip empty nodes */
68968 + } while ((result == 0) && node_is_empty(coord->node));
68969 + } else {
68970 + result = 0;
68971 + coord_dup(coord, &dup);
68972 + }
68973 + assert("nikita-2564", ergo(!result, coord_check(tap->coord)));
68974 + tap_check(tap);
68975 + return result;
68976 +}
68977 +
68978 +/**
68979 + * move @tap to the next unit, transparently crossing item and node
68980 + * boundaries
68981 + */
68982 +int go_next_unit(tap_t * tap)
68983 +{
68984 + return go_dir_el(tap, RIGHT_SIDE, 1);
68985 +}
68986 +
68987 +/**
68988 + * move @tap to the previous unit, transparently crossing item and node
68989 + * boundaries
68990 + */
68991 +int go_prev_unit(tap_t * tap)
68992 +{
68993 + return go_dir_el(tap, LEFT_SIDE, 1);
68994 +}
68995 +
68996 +/**
68997 + * Apply @actor to @tap @shift times. This is used to move @tap by
68998 + * @shift units (or items, or nodes) in either direction.
68999 + */
69000 +static int rewind_to(tap_t * tap, go_actor_t actor, int shift)
69001 +{
69002 + int result;
69003 +
69004 + assert("nikita-2555", shift >= 0);
69005 + assert("nikita-2562", tap->coord->node == tap->lh->node);
69006 +
69007 + tap_check(tap);
69008 + result = reiser4_tap_load(tap);
69009 + if (result != 0)
69010 + return result;
69011 +
69012 + for (; shift > 0; --shift) {
69013 + result = actor(tap);
69014 + assert("nikita-2563", tap->coord->node == tap->lh->node);
69015 + if (result != 0)
69016 + break;
69017 + }
69018 + reiser4_tap_relse(tap);
69019 + tap_check(tap);
69020 + return result;
69021 +}
69022 +
69023 +/** move @tap @shift units rightward */
69024 +int rewind_right(tap_t * tap, int shift)
69025 +{
69026 + return rewind_to(tap, go_next_unit, shift);
69027 +}
69028 +
69029 +/** move @tap @shift units leftward */
69030 +int rewind_left(tap_t * tap, int shift)
69031 +{
69032 + return rewind_to(tap, go_prev_unit, shift);
69033 +}
69034 +
69035 +#if REISER4_DEBUG
69036 +/** debugging function: print @tap content in human readable form */
69037 +static void print_tap(const char *prefix, const tap_t * tap)
69038 +{
69039 + if (tap == NULL) {
69040 + printk("%s: null tap\n", prefix);
69041 + return;
69042 + }
69043 + printk("%s: loaded: %i, in-list: %i, node: %p, mode: %s\n", prefix,
69044 + tap->loaded, !(&tap->linkage == tap->linkage.next &&
69045 + &tap->linkage == tap->linkage.prev),
69046 + tap->lh->node,
69047 + lock_mode_name(tap->mode));
69048 + print_coord("\tcoord", tap->coord, 0);
69049 +}
69050 +
69051 +/** check [tap-sane] invariant */
69052 +static int tap_invariant(const tap_t * tap)
69053 +{
69054 + /* [tap-sane] invariant */
69055 +
69056 + if (tap == NULL)
69057 + return 1;
69058 + /* tap->mode is one of
69059 + *
69060 + * {ZNODE_NO_LOCK, ZNODE_READ_LOCK, ZNODE_WRITE_LOCK}, and
69061 + */
69062 + if (tap->mode != ZNODE_NO_LOCK &&
69063 + tap->mode != ZNODE_READ_LOCK && tap->mode != ZNODE_WRITE_LOCK)
69064 + return 2;
69065 + /* tap->coord != NULL, and */
69066 + if (tap->coord == NULL)
69067 + return 3;
69068 + /* tap->lh != NULL, and */
69069 + if (tap->lh == NULL)
69070 + return 4;
69071 + /* tap->loaded > 0 => znode_is_loaded(tap->coord->node), and */
69072 + if (!ergo(tap->loaded, znode_is_loaded(tap->coord->node)))
69073 + return 5;
69074 + /* tap->coord->node == tap->lh->node if tap->lh->node is not 0 */
69075 + if (tap->lh->node != NULL && tap->coord->node != tap->lh->node)
69076 + return 6;
69077 + return 0;
69078 +}
69079 +
69080 +/** debugging function: check internal @tap consistency */
69081 +static void tap_check(const tap_t * tap)
69082 +{
69083 + int result;
69084 +
69085 + result = tap_invariant(tap);
69086 + if (result != 0) {
69087 + print_tap("broken", tap);
69088 + reiser4_panic("nikita-2831", "tap broken: %i\n", result);
69089 + }
69090 +}
69091 +#endif
69092 +
69093 +/* Make Linus happy.
69094 + Local variables:
69095 + c-indentation-style: "K&R"
69096 + mode-name: "LC"
69097 + c-basic-offset: 8
69098 + tab-width: 8
69099 + fill-column: 120
69100 + scroll-step: 1
69101 + End:
69102 +*/
69103 diff --git a/fs/reiser4/tap.h b/fs/reiser4/tap.h
69104 new file mode 100644
69105 index 0000000..1416729
69106 --- /dev/null
69107 +++ b/fs/reiser4/tap.h
69108 @@ -0,0 +1,70 @@
69109 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
69110 +
69111 +/* Tree Access Pointers. See tap.c for more details. */
69112 +
69113 +#if !defined( __REISER4_TAP_H__ )
69114 +#define __REISER4_TAP_H__
69115 +
69116 +#include "forward.h"
69117 +#include "readahead.h"
69118 +
69119 +/**
69120 + tree_access_pointer aka tap. Data structure combining coord_t and lock
69121 + handle.
69122 + For invariants involving this data type, see doc/lock-ordering:
69123 +
69124 + [tap-sane]
69125 + */
69126 +struct tree_access_pointer {
69127 + /* coord tap is at */
69128 + coord_t *coord;
69129 + /* lock handle on ->coord->node */
69130 + lock_handle *lh;
69131 + /* mode of lock acquired by this tap */
69132 + znode_lock_mode mode;
69133 + /* incremented by reiser4_tap_load().
69134 + Decremented by reiser4_tap_relse(). */
69135 + int loaded;
69136 + /* list of taps */
69137 + struct list_head linkage;
69138 + /* read-ahead hint */
69139 + ra_info_t ra_info;
69140 +};
69141 +
69142 +typedef int (*go_actor_t) (tap_t * tap);
69143 +
69144 +extern int reiser4_tap_load(tap_t * tap);
69145 +extern void reiser4_tap_relse(tap_t * tap);
69146 +extern void reiser4_tap_init(tap_t * tap, coord_t * coord, lock_handle * lh,
69147 + znode_lock_mode mode);
69148 +extern void reiser4_tap_monitor(tap_t * tap);
69149 +extern void reiser4_tap_copy(tap_t * dst, tap_t * src);
69150 +extern void reiser4_tap_done(tap_t * tap);
69151 +extern int reiser4_tap_move(tap_t * tap, lock_handle * target);
69152 +extern int tap_to_coord(tap_t * tap, coord_t * target);
69153 +
69154 +extern int go_dir_el(tap_t * tap, sideof dir, int units_p);
69155 +extern int go_next_unit(tap_t * tap);
69156 +extern int go_prev_unit(tap_t * tap);
69157 +extern int rewind_right(tap_t * tap, int shift);
69158 +extern int rewind_left(tap_t * tap, int shift);
69159 +
69160 +extern struct list_head *reiser4_taps_list(void);
69161 +
69162 +#define for_all_taps(tap) \
69163 + for (tap = list_entry(reiser4_taps_list()->next, tap_t, linkage); \
69164 + reiser4_taps_list() != &tap->linkage; \
69165 + tap = list_entry(tap->linkage.next, tap_t, linkage))
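+
+/* A usage sketch (illustrative only): walk every tap monitored in the
+   current reiser4 context, e.g. to inspect tap->loaded or tap->mode:
+
+	tap_t *tap;
+
+	for_all_taps(tap) {
+		... inspect or update *tap ...
+	}
+*/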
69166 +
69167 +/* __REISER4_TAP_H__ */
69168 +#endif
69169 +/* Make Linus happy.
69170 + Local variables:
69171 + c-indentation-style: "K&R"
69172 + mode-name: "LC"
69173 + c-basic-offset: 8
69174 + tab-width: 8
69175 + fill-column: 120
69176 + scroll-step: 1
69177 + End:
69178 +*/
69179 diff --git a/fs/reiser4/tree.c b/fs/reiser4/tree.c
69180 new file mode 100644
69181 index 0000000..32548d2
69182 --- /dev/null
69183 +++ b/fs/reiser4/tree.c
69184 @@ -0,0 +1,1876 @@
69185 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
69186 + * reiser4/README */
69187 +
69188 +/*
69189 + * KEYS IN A TREE.
69190 + *
69191 + * The tree consists of nodes located on disk. A node in the tree is either
69192 + * formatted or unformatted. A formatted node is one whose structure is
69193 + * understood by the tree balancing and traversal code. Formatted nodes are
69194 + * further classified into leaf and internal nodes. The latter distinction
69195 + * is (almost) of only historical importance: the general structure of
69196 + * leaves and internal nodes is the same in Reiser4. Unformatted nodes
69197 + * contain raw data that are part of the bodies of ordinary files and attributes.
69198 + *
69199 + * Each node in the tree spans some interval in the key space. Key ranges for
69200 + * all nodes in the tree are disjoint. Actually, this only holds in some weak
69201 + * sense, because of non-unique keys: the intersection of key ranges for
69202 + * different nodes is either empty, or consists of exactly one key.
69203 + *
69204 + * A formatted node consists of a sequence of items. Each item spans some
69205 + * interval in key space. Key ranges for all items in a tree are disjoint,
69206 + * modulo non-unique keys again. Items within a node are ordered by the
69207 + * smallest key in each item.
69208 + *
69209 + * A particular type of item can be further split into units. A unit is a
69210 + * piece of an item that can be cut from it and moved into another item of
69211 + * the same type. Units are used by the balancing code to repack data.
69212 + *
69213 + * A unit can be further split into smaller entities (for example, an extent
69214 + * unit represents several pages, and it is natural for the extent code to
69215 + * operate on particular pages and even bytes within one unit), but this is
69216 + * of no relevance to the generic balancing and lookup code.
69217 + *
69218 + * Although an item is said to "span" a range or interval of keys, it is not
69219 + * necessary that the item contain a piece of data addressable by each and
69220 + * every key in this range. For example, a compound directory item, consisting
69221 + * of units corresponding to directory entries and keyed by hashes of file
69222 + * names, looks more like having a "discrete spectrum": only some disjoint
69223 + * keys inside the range occupied by this item really address data.
69224 + *
69225 + * Nonetheless, each item always has a well-defined least (minimal) key,
69226 + * recorded in the item header stored in the node this item is in. Also, an
69227 + * item plugin can optionally define a method ->max_key_inside() returning
69228 + * the maximal key that can _possibly_ be located within this item. This
69229 + * method is used (mainly) to determine when a given piece of data should be
69230 + * merged into an existing item, instead of creating a new one. Because of
69231 + * this, even though ->max_key_inside() can be larger than any key actually
69232 + * located in the item, the intervals
69233 + *
69234 + * [ reiser4_min_key( item ), ->max_key_inside( item ) ]
69235 + *
69236 + * are still disjoint for all items within the _same_ node.
69237 + *
69238 + * In memory, a node is represented by a znode. It plays several roles:
69239 + *
69240 + * . something locks are taken on
69241 + *
69242 + * . something tracked by transaction manager (this is going to change)
69243 + *
69244 + * . something used to access node data
69245 + *
69246 + * . something used to maintain tree structure in memory: sibling and
69247 + * parental linkage.
69248 + *
69249 + * . something used to organize nodes into "slums"
69250 + *
69251 + * For more on znodes, see znode.[ch].
69252 + *
69253 + * DELIMITING KEYS
69254 + *
69255 + * To simplify balancing, allow some flexibility in locking, and speed up an
69256 + * important coord cache optimization, we keep delimiting keys of nodes in
69257 + * memory. Depending on the disk format (implemented by the appropriate node
69258 + * plugin), a node on disk can record both left and right delimiting keys,
69259 + * only one of them, or none. Still, our balancing and tree traversal code
69260 + * keeps both delimiting keys for an in-memory node, stored in its znode.
69261 + * When a node is first brought into memory during tree traversal, its left
69262 + * delimiting key is taken from its parent, and its right delimiting key is
69263 + * either the next key in its parent, or the right delimiting key of the
69264 + * parent if the node is the rightmost child of the parent.
69265 + *
69266 + * Physical consistency of a delimiting key is protected by a special dk
69267 + * read-write lock. That is, delimiting keys can only be inspected or
69268 + * modified under this lock. But the dk lock is only sufficient for a fast
69269 + * "pessimistic" check, because to simplify code and to decrease lock
69270 + * contention, balancing (carry) only updates delimiting keys right before
69271 + * unlocking all locked nodes on the given tree level. For example, the
69272 + * coord-by-key cache scans the LRU list of recently accessed znodes. For
69273 + * each node it first does a fast check under the dk spin lock. If the key
69274 + * looked for is not between the delimiting keys for this node, the next
69275 + * node is inspected, and so on. If the key is inside the key range, a long
69276 + * term lock is taken on the node and the key range is rechecked.
69277 + *
69278 + * COORDINATES
69279 + *
69280 + * To find something in the tree, you supply a key, and the key is resolved
69281 + * by coord_by_key() into a coord (coordinate) that is valid as long as the
69282 + * node the coord points to remains locked. As mentioned above trees
69283 + * consist of nodes that consist of items that consist of units. A unit is
69284 + * the smallest and indivisible piece of tree as far as balancing and tree
69285 + * search are concerned. Each node, item, and unit can be addressed by
69286 + * giving its level in the tree and the key occupied by this entity. A node
69287 + * knows what the key ranges are of the items within it, and how to find its
69288 + * items and invoke their item handlers, but it does not know how to access
69289 + * individual units within its items except through the item handlers.
69290 + * coord is a structure containing a pointer to the node, the ordinal number
69291 + * of the item within this node (a sort of item offset), and the ordinal
69292 + * number of the unit within this item.
69293 + *
69294 + * TREE LOOKUP
69295 + *
69296 + * There are two types of access to the tree: lookup and modification.
69297 + *
69298 + * Lookup is a search for the key in the tree. Search can look for either
69299 + * exactly the key given to it, or for the largest key that is not greater
69300 + * than the key given to it. This distinction is determined by "bias"
69301 + * parameter of search routine (coord_by_key()). coord_by_key() either
69302 + * returns error (key is not in the tree, or some kind of external error
69303 + * occurred), or successfully resolves key into coord.
69304 + *
69305 + * This resolution is done by traversing the tree top-to-bottom from the root
69306 + * level to the desired level. On levels above the twig level (the level one
69307 + * above the leaf level) nodes consist exclusively of internal items. An
69308 + * internal item is nothing more than a pointer to a tree node on the child
69309 + * level. On the twig level nodes consist of internal items intermixed with
69310 + * extent items. Internal items form the normal search tree structure used
69311 + * by traversal to descend through the tree.
69312 + *
69313 + * TREE LOOKUP OPTIMIZATIONS
69314 + *
69315 + * The tree lookup described above is expensive even if all nodes traversed
69316 + * are already in memory: a binary search has to be performed within each
69317 + * node, and binary searches are CPU consuming and tend to destroy CPU
69318 + * caches.
69319 + *
69320 + * Several optimizations are used to work around this:
69321 + *
69322 + * . cbk_cache (look-aside cache for tree traversals, see search.c for
69323 + * details)
69324 + *
69325 + * . seals (see seal.[ch])
69326 + *
69327 + * . vroot (see search.c)
69328 + *
69329 + * General search-by-key is layered thusly:
69330 + *
69331 + * [check seal, if any] --ok--> done
69332 + * |
69333 + * failed
69334 + * |
69335 + * V
69336 + * [vroot defined] --no--> node = tree_root
69337 + * | |
69338 + * yes |
69339 + * | |
69340 + * V |
69341 + * node = vroot |
69342 + * | |
69343 + * | |
69344 + * | |
69345 + * V V
69346 + * [check cbk_cache for key] --ok--> done
69347 + * |
69348 + * failed
69349 + * |
69350 + * V
69351 + * [start tree traversal from node]
69352 + *
69353 + */
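+
+/*
+ * A minimal lookup sketch (illustrative only; @tree and @key are assumed to
+ * be supplied by the caller, the flags argument is 0 and ra_info is NULL
+ * here just for brevity, and error handling is elided):
+ *
+ *	coord_t coord;
+ *	lock_handle lh;
+ *
+ *	init_lh(&lh);
+ *	if (coord_by_key(tree, key, &coord, &lh, ZNODE_READ_LOCK, FIND_EXACT,
+ *			 LEAF_LEVEL, LEAF_LEVEL, 0, NULL) == CBK_COORD_FOUND) {
+ *		... use coord while lh keeps the node locked ...
+ *	}
+ *	done_lh(&lh);
+ */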
69354 +
69355 +#include "forward.h"
69356 +#include "debug.h"
69357 +#include "dformat.h"
69358 +#include "key.h"
69359 +#include "coord.h"
69360 +#include "plugin/item/static_stat.h"
69361 +#include "plugin/item/item.h"
69362 +#include "plugin/node/node.h"
69363 +#include "plugin/plugin.h"
69364 +#include "txnmgr.h"
69365 +#include "jnode.h"
69366 +#include "znode.h"
69367 +#include "block_alloc.h"
69368 +#include "tree_walk.h"
69369 +#include "carry.h"
69370 +#include "carry_ops.h"
69371 +#include "tap.h"
69372 +#include "tree.h"
69373 +#include "vfs_ops.h"
69374 +#include "page_cache.h"
69375 +#include "super.h"
69376 +#include "reiser4.h"
69377 +#include "inode.h"
69378 +
69379 +#include <linux/fs.h> /* for struct super_block */
69380 +#include <linux/spinlock.h>
69381 +
69382 +/* Disk address (block number) never used for any real tree node. It is
69383 + used as the block number of the "uber" znode.
69384 +
69385 + Invalid block addresses are 0 by tradition.
69386 +
69387 +*/
69388 +const reiser4_block_nr UBER_TREE_ADDR = 0ull;
69389 +
69390 +#define CUT_TREE_MIN_ITERATIONS 64
69391 +
69392 +static int find_child_by_addr(znode * parent, znode * child, coord_t * result);
69393 +
69394 +/* return node plugin of coord->node */
69395 +node_plugin *node_plugin_by_coord(const coord_t * coord)
69396 +{
69397 + assert("vs-1", coord != NULL);
69398 + assert("vs-2", coord->node != NULL);
69399 +
69400 + return coord->node->nplug;
69401 +}
69402 +
69403 +/* insert item into tree. Fields of @coord are updated so that they can be
69404 + * used by a subsequent insert operation. */
69405 +insert_result insert_by_key(reiser4_tree * tree /* tree to insert new item
69406 + * into */ ,
69407 + const reiser4_key * key /* key of new item */ ,
69408 + reiser4_item_data * data /* parameters for item
69409 + * creation */ ,
69410 + coord_t * coord /* resulting insertion coord */ ,
69411 + lock_handle * lh /* resulting lock
69412 + * handle */ ,
69413 + tree_level stop_level /** level where to insert */ ,
69414 + __u32 flags /* insertion flags */ )
69415 +{
69416 + int result;
69417 +
69418 + assert("nikita-358", tree != NULL);
69419 + assert("nikita-360", coord != NULL);
69420 +
69421 + result = coord_by_key(tree, key, coord, lh, ZNODE_WRITE_LOCK,
69422 + FIND_EXACT, stop_level, stop_level,
69423 + flags | CBK_FOR_INSERT, NULL /*ra_info */ );
69424 + switch (result) {
69425 + default:
69426 + break;
69427 + case CBK_COORD_FOUND:
69428 + result = IBK_ALREADY_EXISTS;
69429 + break;
69430 + case CBK_COORD_NOTFOUND:
69431 + assert("nikita-2017", coord->node != NULL);
69432 + result = insert_by_coord(coord, data, key, lh, 0 /*flags */ );
69433 + break;
69434 + }
69435 + return result;
69436 +}
69437 +
69438 +/* insert item by calling carry. Helper function called if short-cut
69439 + insertion failed */
69440 +static insert_result insert_with_carry_by_coord(coord_t * coord, /* coord where to insert */
69441 + lock_handle * lh, /* lock handle of insertion
69442 + * node */
69443 + reiser4_item_data * data, /* parameters of new
69444 + * item */
69445 + const reiser4_key * key, /* key of new item */
69446 + carry_opcode cop, /* carry operation to perform */
69447 + cop_insert_flag flags
69448 + /* carry flags */ )
69449 +{
69450 + int result;
69451 + carry_pool *pool;
69452 + carry_level *lowest_level;
69453 + carry_insert_data *cdata;
69454 + carry_op *op;
69455 +
69456 + assert("umka-314", coord != NULL);
69457 +
69458 + /* allocate carry_pool and 3 carry_level-s */
69459 + pool =
69460 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69461 + sizeof(*cdata));
69462 + if (IS_ERR(pool))
69463 + return PTR_ERR(pool);
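+ /*
+ * memory layout of the allocation above: the carry_pool is followed
+ * by three carry_level structures and then by the carry_insert_data,
+ * hence the (pool + 1) and (lowest_level + 3) pointer arithmetic
+ * below
+ */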
69464 + lowest_level = (carry_level *) (pool + 1);
69465 + init_carry_level(lowest_level, pool);
69466 +
69467 + op = reiser4_post_carry(lowest_level, cop, coord->node, 0);
69468 + if (IS_ERR(op) || (op == NULL)) {
69469 + done_carry_pool(pool);
69470 + return RETERR(op ? PTR_ERR(op) : -EIO);
69471 + }
69472 + cdata = (carry_insert_data *) (lowest_level + 3);
69473 + cdata->coord = coord;
69474 + cdata->data = data;
69475 + cdata->key = key;
69476 + op->u.insert.d = cdata;
69477 + if (flags == 0)
69478 + flags = znode_get_tree(coord->node)->carry.insert_flags;
69479 + op->u.insert.flags = flags;
69480 + op->u.insert.type = COPT_ITEM_DATA;
69481 + op->u.insert.child = NULL;
69482 + if (lh != NULL) {
69483 + assert("nikita-3245", lh->node == coord->node);
69484 + lowest_level->track_type = CARRY_TRACK_CHANGE;
69485 + lowest_level->tracked = lh;
69486 + }
69487 +
69488 + result = reiser4_carry(lowest_level, NULL);
69489 + done_carry_pool(pool);
69490 +
69491 + return result;
69492 +}
69493 +
69494 +/* form carry queue to perform paste of @data with @key at @coord, and launch
69495 + its execution by calling carry().
69496 +
69497 + Instruct carry to update @lh if, after balancing, the insertion coord
69498 + moves into a different block.
69499 +
69500 +*/
69501 +static int paste_with_carry(coord_t * coord, /* coord of paste */
69502 + lock_handle * lh, /* lock handle of node
69503 + * where item is
69504 + * pasted */
69505 + reiser4_item_data * data, /* parameters of new
69506 + * item */
69507 + const reiser4_key * key, /* key of new item */
69508 + unsigned flags /* paste flags */ )
69509 +{
69510 + int result;
69511 + carry_pool *pool;
69512 + carry_level *lowest_level;
69513 + carry_insert_data *cdata;
69514 + carry_op *op;
69515 +
69516 + assert("umka-315", coord != NULL);
69517 + assert("umka-316", key != NULL);
69518 +
69519 + pool =
69520 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69521 + sizeof(*cdata));
69522 + if (IS_ERR(pool))
69523 + return PTR_ERR(pool);
69524 + lowest_level = (carry_level *) (pool + 1);
69525 + init_carry_level(lowest_level, pool);
69526 +
69527 + op = reiser4_post_carry(lowest_level, COP_PASTE, coord->node, 0);
69528 + if (IS_ERR(op) || (op == NULL)) {
69529 + done_carry_pool(pool);
69530 + return RETERR(op ? PTR_ERR(op) : -EIO);
69531 + }
69532 + cdata = (carry_insert_data *) (lowest_level + 3);
69533 + cdata->coord = coord;
69534 + cdata->data = data;
69535 + cdata->key = key;
69536 + op->u.paste.d = cdata;
69537 + if (flags == 0)
69538 + flags = znode_get_tree(coord->node)->carry.paste_flags;
69539 + op->u.paste.flags = flags;
69540 + op->u.paste.type = COPT_ITEM_DATA;
69541 + if (lh != NULL) {
69542 + lowest_level->track_type = CARRY_TRACK_CHANGE;
69543 + lowest_level->tracked = lh;
69544 + }
69545 +
69546 + result = reiser4_carry(lowest_level, NULL);
69547 + done_carry_pool(pool);
69548 +
69549 + return result;
69550 +}
69551 +
69552 +/* insert item at the given coord.
69553 +
69554 + First try to skip carry by directly calling the ->create_item() method of
69555 + the node plugin. If this is impossible (there is not enough free space in
69556 + the node, or the item is being created at the leftmost position in the
69557 + node), call insert_with_carry_by_coord(), which will do a full carry().
69558 +
69559 +*/
69560 +insert_result insert_by_coord(coord_t * coord /* coord where to
69561 + * insert. coord->node has
69562 + * to be write locked by
69563 + * caller */ ,
69564 + reiser4_item_data * data /* data to be
69565 + * inserted */ ,
69566 + const reiser4_key * key /* key of new item */ ,
69567 + lock_handle * lh /* lock handle of write
69568 + * lock on node */ ,
69569 + __u32 flags /* insertion flags */ )
69570 +{
69571 + unsigned item_size;
69572 + int result;
69573 + znode *node;
69574 +
69575 + assert("vs-247", coord != NULL);
69576 + assert("vs-248", data != NULL);
69577 + assert("vs-249", data->length >= 0);
69578 + assert("nikita-1191", znode_is_write_locked(coord->node));
69579 +
69580 + node = coord->node;
69581 + coord_clear_iplug(coord);
69582 + result = zload(node);
69583 + if (result != 0)
69584 + return result;
69585 +
69586 + item_size = space_needed(node, NULL, data, 1);
69587 + if (item_size > znode_free_space(node) &&
69588 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69589 + && (flags & COPI_DONT_ALLOCATE)) {
69590 + /* we are forced to use free space of coord->node and new item
69591 + does not fit into it.
69592 +
69593 + Currently we get here only when we allocate and copy units
69594 + of extent item from a node to its left neighbor during
69595 + "squalloc"-ing. If @node (this is left neighbor) does not
69596 + have enough free space - we do not want to attempt any
69597 + shifting and allocations because we are in squeezing and
69598 + everything to the left of @node is tightly packed.
69599 + */
69600 + result = -E_NODE_FULL;
69601 + } else if ((item_size <= znode_free_space(node)) &&
69602 + !coord_is_before_leftmost(coord) &&
69603 + (node_plugin_by_node(node)->fast_insert != NULL)
69604 + && node_plugin_by_node(node)->fast_insert(coord)) {
69605 + /* shortcut insertion without carry() overhead.
69606 +
69607 + Only possible if:
69608 +
69609 + - there is enough free space
69610 +
69611 + - insertion is not into the leftmost position in a node
69612 + (otherwise it would require updating of delimiting key in a
69613 + parent)
69614 +
69615 + - node plugin agrees with this
69616 +
69617 + */
69618 + result =
69619 + node_plugin_by_node(node)->create_item(coord, key, data,
69620 + NULL);
69621 + znode_make_dirty(node);
69622 + } else {
69623 + /* otherwise do full-fledged carry(). */
69624 + result =
69625 + insert_with_carry_by_coord(coord, lh, data, key, COP_INSERT,
69626 + flags);
69627 + }
69628 + zrelse(node);
69629 + return result;
69630 +}
69631 +
69632 +/* @coord is set to leaf level and @data is to be inserted to twig level */
69633 +insert_result
69634 +insert_extent_by_coord(coord_t * coord /* coord where to
69635 + * insert. coord->node
69636 + * has to be write
69637 + * locked by caller */ ,
69638 + reiser4_item_data * data /* data to be inserted */ ,
69639 + const reiser4_key * key /* key of new item */ ,
69640 + lock_handle * lh /* lock handle of write
69641 + * lock on node */ )
69642 +{
69643 + assert("vs-405", coord != NULL);
69644 + assert("vs-406", data != NULL);
69645 + assert("vs-407", data->length > 0);
69646 + assert("vs-408", znode_is_write_locked(coord->node));
69647 + assert("vs-409", znode_get_level(coord->node) == LEAF_LEVEL);
69648 +
69649 + return insert_with_carry_by_coord(coord, lh, data, key, COP_EXTENT,
69650 + 0 /*flags */ );
69651 +}
69652 +
69653 +/* Insert into the item at the given coord.
69654 +
69655 + First try to skip carry by directly calling the ->paste() method of the
69656 + item plugin. If this is impossible (there is not enough free space in the
69657 + node, or we are pasting into the leftmost position in the node), call
69658 + paste_with_carry(), which will do a full carry().
69659 +
69660 +*/
69661 +/* paste_into_item */
69662 +int insert_into_item(coord_t * coord /* coord of pasting */ ,
69663 + lock_handle * lh /* lock handle on node involved */ ,
69664 + const reiser4_key * key /* key of unit being pasted */ ,
69665 + reiser4_item_data * data /* parameters for new unit */ ,
69666 + unsigned flags /* insert/paste flags */ )
69667 +{
69668 + int result;
69669 + int size_change;
69670 + node_plugin *nplug;
69671 + item_plugin *iplug;
69672 +
69673 + assert("umka-317", coord != NULL);
69674 + assert("umka-318", key != NULL);
69675 +
69676 + iplug = item_plugin_by_coord(coord);
69677 + nplug = node_plugin_by_coord(coord);
69678 +
69679 + assert("nikita-1480", iplug == data->iplug);
69680 +
69681 + size_change = space_needed(coord->node, coord, data, 0);
69682 + if (size_change > (int)znode_free_space(coord->node) &&
69683 + (flags & COPI_DONT_SHIFT_LEFT) && (flags & COPI_DONT_SHIFT_RIGHT)
69684 + && (flags & COPI_DONT_ALLOCATE)) {
69685 + /* we are forced to use free space of coord->node and new data
69686 + does not fit into it. */
69687 + return -E_NODE_FULL;
69688 + }
69689 +
69690 + /* shortcut paste without carry() overhead.
69691 +
69692 + Only possible if:
69693 +
69694 + - there is enough free space
69695 +
69696 + - paste is not into the leftmost unit in a node (otherwise
69697 + it would require updating of delimiting key in a parent)
69698 +
69699 + - node plugin agrees with this
69700 +
69701 + - item plugin agrees with us
69702 + */
69703 + if (size_change <= (int)znode_free_space(coord->node) &&
69704 + (coord->item_pos != 0 ||
69705 + coord->unit_pos != 0 || coord->between == AFTER_UNIT) &&
69706 + coord->unit_pos != 0 && nplug->fast_paste != NULL &&
69707 + nplug->fast_paste(coord) &&
69708 + iplug->b.fast_paste != NULL && iplug->b.fast_paste(coord)) {
69709 + if (size_change > 0)
69710 + nplug->change_item_size(coord, size_change);
69711 + /* NOTE-NIKITA: huh? where @key is used? */
69712 + result = iplug->b.paste(coord, data, NULL);
69713 + if (size_change < 0)
69714 + nplug->change_item_size(coord, size_change);
69715 + znode_make_dirty(coord->node);
69716 + } else
69717 + /* otherwise do full-fledged carry(). */
69718 + result = paste_with_carry(coord, lh, data, key, flags);
69719 + return result;
69720 +}
69721 +
69722 +/* this either appends or truncates item @coord */
69723 +int reiser4_resize_item(coord_t * coord /* coord of item being resized */ ,
69724 + reiser4_item_data * data /* parameters of resize */ ,
69725 + reiser4_key * key /* key of new unit */ ,
69726 + lock_handle * lh /* lock handle of node
69727 + * being modified */ ,
69728 + cop_insert_flag flags /* carry flags */ )
69729 +{
69730 + int result;
69731 + znode *node;
69732 +
69733 + assert("nikita-362", coord != NULL);
69734 + assert("nikita-363", data != NULL);
69735 + assert("vs-245", data->length != 0);
69736 +
69737 + node = coord->node;
69738 + coord_clear_iplug(coord);
69739 + result = zload(node);
69740 + if (result != 0)
69741 + return result;
69742 +
69743 + if (data->length < 0)
69744 + result = node_plugin_by_coord(coord)->shrink_item(coord,
69745 + -data->length);
69746 + else
69747 + result = insert_into_item(coord, lh, key, data, flags);
69748 +
69749 + zrelse(node);
69750 + return result;
69751 +}
69752 +
69753 +/* insert flow @f */
69754 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f)
69755 +{
69756 + int result;
69757 + carry_pool *pool;
69758 + carry_level *lowest_level;
69759 + reiser4_item_data *data;
69760 + carry_op *op;
69761 +
69762 + pool =
69763 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
69764 + sizeof(*data));
69765 + if (IS_ERR(pool))
69766 + return PTR_ERR(pool);
69767 + lowest_level = (carry_level *) (pool + 1);
69768 + init_carry_level(lowest_level, pool);
69769 +
69770 + op = reiser4_post_carry(lowest_level, COP_INSERT_FLOW, coord->node,
69771 + 0 /* operate directly on coord -> node */ );
69772 + if (IS_ERR(op) || (op == NULL)) {
69773 + done_carry_pool(pool);
69774 + return RETERR(op ? PTR_ERR(op) : -EIO);
69775 + }
69776 +
69777 + /* these are permanent during insert_flow */
69778 + data = (reiser4_item_data *) (lowest_level + 3);
69779 + data->user = 1;
69780 + data->iplug = item_plugin_by_id(FORMATTING_ID);
69781 + data->arg = NULL;
69782 + /* data.length and data.data will be set before calling paste or
69783 + insert */
69784 + data->length = 0;
69785 + data->data = NULL;
69786 +
69787 + op->u.insert_flow.flags = 0;
69788 + op->u.insert_flow.insert_point = coord;
69789 + op->u.insert_flow.flow = f;
69790 + op->u.insert_flow.data = data;
69791 + op->u.insert_flow.new_nodes = 0;
69792 +
69793 + lowest_level->track_type = CARRY_TRACK_CHANGE;
69794 + lowest_level->tracked = lh;
69795 +
69796 + result = reiser4_carry(lowest_level, NULL);
69797 + done_carry_pool(pool);
69798 +
69799 + return result;
69800 +}
69801 +
69802 +/* Given a coord in parent node, obtain a znode for the corresponding child */
69803 +znode *child_znode(const coord_t * parent_coord /* coord of pointer to
69804 + * child */ ,
69805 + znode * parent /* parent of child */ ,
69806 + int incore_p /* if !0 only return child if already in
69807 + * memory */ ,
69808 + int setup_dkeys_p /* if !0 update delimiting keys of
69809 + * child */ )
69810 +{
69811 + znode *child;
69812 +
69813 + assert("nikita-1374", parent_coord != NULL);
69814 + assert("nikita-1482", parent != NULL);
69815 +#if REISER4_DEBUG
69816 + if (setup_dkeys_p)
69817 + assert_rw_not_locked(&(znode_get_tree(parent)->dk_lock));
69818 +#endif
69819 + assert("nikita-2947", znode_is_any_locked(parent));
69820 +
69821 + if (znode_get_level(parent) <= LEAF_LEVEL) {
69822 + /* trying to get child of leaf node */
69823 + warning("nikita-1217", "Child of maize?");
69824 + return ERR_PTR(RETERR(-EIO));
69825 + }
69826 + if (item_is_internal(parent_coord)) {
69827 + reiser4_block_nr addr;
69828 + item_plugin *iplug;
69829 + reiser4_tree *tree;
69830 +
69831 + iplug = item_plugin_by_coord(parent_coord);
69832 + assert("vs-512", iplug->s.internal.down_link);
69833 + iplug->s.internal.down_link(parent_coord, NULL, &addr);
69834 +
69835 + tree = znode_get_tree(parent);
69836 + if (incore_p)
69837 + child = zlook(tree, &addr);
69838 + else
69839 + child =
69840 + zget(tree, &addr, parent,
69841 + znode_get_level(parent) - 1,
69842 + reiser4_ctx_gfp_mask_get());
69843 + if ((child != NULL) && !IS_ERR(child) && setup_dkeys_p)
69844 + set_child_delimiting_keys(parent, parent_coord, child);
69845 + } else {
69846 + warning("nikita-1483", "Internal item expected");
69847 + child = ERR_PTR(RETERR(-EIO));
69848 + }
69849 + return child;
69850 +}
69851 +
69852 +/* remove znode from transaction */
69853 +static void uncapture_znode(znode * node)
69854 +{
69855 + struct page *page;
69856 +
69857 + assert("zam-1001", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69858 +
69859 + if (!reiser4_blocknr_is_fake(znode_get_block(node))) {
69860 + int ret;
69861 +
69862 + /* An already allocated block goes right to the atom's delete set. */
69863 + ret =
69864 + reiser4_dealloc_block(znode_get_block(node), 0,
69865 + BA_DEFER | BA_FORMATTED);
69866 + if (ret)
69867 + warning("zam-942",
69868 + "can\'t add a block (%llu) number to atom's delete set\n",
69869 + (unsigned long long)(*znode_get_block(node)));
69870 +
69871 + spin_lock_znode(node);
69872 + /* Here we return flush reserved block which was reserved at the
69873 + * moment when this allocated node was marked dirty and still
69874 + * not used by flush in node relocation procedure. */
69875 + if (ZF_ISSET(node, JNODE_FLUSH_RESERVED)) {
69876 + txn_atom *atom;
69877 +
69878 + atom = jnode_get_atom(ZJNODE(node));
69879 + assert("zam-939", atom != NULL);
69880 + spin_unlock_znode(node);
69881 + flush_reserved2grabbed(atom, (__u64) 1);
69882 + spin_unlock_atom(atom);
69883 + } else
69884 + spin_unlock_znode(node);
69885 + } else {
69886 + /* znode has an assigned block which is counted as "fake
69887 + allocated". Return it back to "free blocks". */
69888 + fake_allocated2free((__u64) 1, BA_FORMATTED);
69889 + }
69890 +
69891 + /*
69892 + * uncapture page from transaction. There is a possibility of a race
69893 + * with ->releasepage(): reiser4_releasepage() detaches page from this
69894 + * jnode and we have nothing to uncapture. To avoid this, get
69895 + * reference of node->pg under jnode spin lock. reiser4_uncapture_page()
69896 + * will deal with released page itself.
69897 + */
69898 + spin_lock_znode(node);
69899 + page = znode_page(node);
69900 + if (likely(page != NULL)) {
69901 + /*
69902 + * reiser4_uncapture_page() can only be called when we are sure
69903 + * that znode is pinned in memory, which we are, because
69904 + * forget_znode() is only called from longterm_unlock_znode().
69905 + */
69906 + page_cache_get(page);
69907 + spin_unlock_znode(node);
69908 + lock_page(page);
69909 + reiser4_uncapture_page(page);
69910 + unlock_page(page);
69911 + page_cache_release(page);
69912 + } else {
69913 + txn_atom *atom;
69914 +
69915 + /* handle "flush queued" znodes */
69916 + while (1) {
69917 + atom = jnode_get_atom(ZJNODE(node));
69918 + assert("zam-943", atom != NULL);
69919 +
69920 + if (!ZF_ISSET(node, JNODE_FLUSH_QUEUED)
69921 + || !atom->nr_running_queues)
69922 + break;
69923 +
69924 + spin_unlock_znode(node);
69925 + reiser4_atom_wait_event(atom);
69926 + spin_lock_znode(node);
69927 + }
69928 +
69929 + reiser4_uncapture_block(ZJNODE(node));
69930 + spin_unlock_atom(atom);
69931 + zput(node);
69932 + }
69933 +}
69934 +
69935 +/* This is called from longterm_unlock_znode() when last lock is released from
69936 + the node that has been removed from the tree. At this point node is removed
69937 + from sibling list and its lock is invalidated. */
69938 +void forget_znode(lock_handle * handle)
69939 +{
69940 + znode *node;
69941 + reiser4_tree *tree;
69942 +
69943 + assert("umka-319", handle != NULL);
69944 +
69945 + node = handle->node;
69946 + tree = znode_get_tree(node);
69947 +
69948 + assert("vs-164", znode_is_write_locked(node));
69949 + assert("nikita-1280", ZF_ISSET(node, JNODE_HEARD_BANSHEE));
69950 + assert_rw_locked(&(node->lock.guard));
69951 +
69952 + /* We assume that this node was detached from its parent before
69953 + * unlocking, it gives no way to reach this node from parent through a
69954 + * down link. The node should have no children and, thereby, can't be
69955 + * reached from them by their parent pointers. The only way to obtain a
69956 + * reference to the node is to use sibling pointers from its left and
69957 + * right neighbors. In the next several lines we remove the node from
69958 + * the sibling list. */
69959 +
69960 + write_lock_tree(tree);
69961 + sibling_list_remove(node);
69962 + znode_remove(node, tree);
69963 + write_unlock_tree(tree);
69964 +
69965 + /* Here we set JNODE_DYING and cancel all pending lock requests. It
69966 + * forces all lock requestor threads to repeat iterations of getting
69967 + * lock on a child, neighbor or parent node. But, those threads can't
69968 + * come to this node again, because this node is no longer a child,
69969 + * neighbor or parent of any other node. This order of znode
69970 + * invalidation does not allow other threads to waste cpu time in a busy
69971 + * loop, trying to lock a dying object. The exception is in the flush
69972 + * code when we take a node directly from the atom's capture list. */
69973 + reiser4_invalidate_lock(handle);
69974 + uncapture_znode(node);
69975 +}
69976 +
69977 +/* Check that internal item at @pointer really contains pointer to @child. */
69978 +int check_tree_pointer(const coord_t * pointer /* would-be pointer to
69979 + * @child */ ,
69980 + const znode * child /* child znode */ )
69981 +{
69982 + assert("nikita-1016", pointer != NULL);
69983 + assert("nikita-1017", child != NULL);
69984 + assert("nikita-1018", pointer->node != NULL);
69985 +
69986 + assert("nikita-1325", znode_is_any_locked(pointer->node));
69987 +
69988 + assert("nikita-2985",
69989 + znode_get_level(pointer->node) == znode_get_level(child) + 1);
69990 +
69991 + coord_clear_iplug((coord_t *) pointer);
69992 +
69993 + if (coord_is_existing_unit(pointer)) {
69994 + item_plugin *iplug;
69995 + reiser4_block_nr addr;
69996 +
69997 + if (item_is_internal(pointer)) {
69998 + iplug = item_plugin_by_coord(pointer);
69999 + assert("vs-513", iplug->s.internal.down_link);
70000 + iplug->s.internal.down_link(pointer, NULL, &addr);
70001 + /* check that cached value is correct */
70002 + if (disk_addr_eq(&addr, znode_get_block(child))) {
70003 + return NS_FOUND;
70004 + }
70005 + }
70006 + }
70007 + /* warning ("jmacd-1002", "tree pointer incorrect"); */
70008 + return NS_NOT_FOUND;
70009 +}
70010 +
70011 +/* find coord of pointer to new @child in @parent.
70012 +
70013 + Find the &coord_t in the @parent where the pointer to a given @child
70014 + will be.
70015 +
70016 +*/
70017 +int find_new_child_ptr(znode * parent /* parent znode, passed locked */ ,
70018 + znode *
70019 + child UNUSED_ARG /* child znode, passed locked */ ,
70020 + znode * left /* left brother of new node */ ,
70021 + coord_t * result /* where result is stored in */ )
70022 +{
70023 + int ret;
70024 +
70025 + assert("nikita-1486", parent != NULL);
70026 + assert("nikita-1487", child != NULL);
70027 + assert("nikita-1488", result != NULL);
70028 +
70029 + ret = find_child_ptr(parent, left, result);
70030 + if (ret != NS_FOUND) {
70031 + warning("nikita-1489", "Cannot find brother position: %i", ret);
70032 + return RETERR(-EIO);
70033 + } else {
70034 + result->between = AFTER_UNIT;
70035 + return RETERR(NS_NOT_FOUND);
70036 + }
70037 +}
70038 +
70039 +/* find coord of pointer to @child in @parent.
70040 +
70041 + Find the &coord_t in the @parent where the pointer to a given @child is.
70042 +
70043 +*/
70044 +int find_child_ptr(znode * parent /* parent znode, passed locked */ ,
70045 + znode * child /* child znode, passed locked */ ,
70046 + coord_t * result /* where result is stored in */ )
70047 +{
70048 + int lookup_res;
70049 + node_plugin *nplug;
70050 + /* left delimiting key of a child */
70051 + reiser4_key ld;
70052 + reiser4_tree *tree;
70053 +
70054 + assert("nikita-934", parent != NULL);
70055 + assert("nikita-935", child != NULL);
70056 + assert("nikita-936", result != NULL);
70057 + assert("zam-356", znode_is_loaded(parent));
70058 +
70059 + coord_init_zero(result);
70060 + result->node = parent;
70061 +
70062 + nplug = parent->nplug;
70063 + assert("nikita-939", nplug != NULL);
70064 +
70065 + tree = znode_get_tree(parent);
70066 + /* NOTE-NIKITA taking read-lock on tree here assumes that @result is
70067 + * not aliased to ->in_parent of some znode. Otherwise,
70068 + * parent_coord_to_coord() below would modify data protected by tree
70069 + * lock. */
70070 + read_lock_tree(tree);
70071 + /* fast path. Try to use cached value. Lock tree to keep
70072 + node->pos_in_parent and pos->*_blocknr consistent. */
70073 + if (child->in_parent.item_pos + 1 != 0) {
70074 + parent_coord_to_coord(&child->in_parent, result);
70075 + if (check_tree_pointer(result, child) == NS_FOUND) {
70076 + read_unlock_tree(tree);
70077 + return NS_FOUND;
70078 + }
70079 +
70080 + child->in_parent.item_pos = (unsigned short)~0;
70081 + }
70082 + read_unlock_tree(tree);
70083 +
70084 + /* if the above failed, find some key from @child. We are looking for
70085 + the least key in the child. */
70086 + read_lock_dk(tree);
70087 + ld = *znode_get_ld_key(child);
70088 + read_unlock_dk(tree);
70089 + /*
70090 + * now, look up the parent with the key just found. Note that the left
70091 + * delimiting key doesn't identify a node uniquely, because (in an
70092 + * extremely rare case) two nodes can have equal left delimiting keys,
70093 + * if one of them is completely filled with directory entries that all
70094 + * happened to be hash collisions. But we check the block number in
70095 + * check_tree_pointer() and so are safe.
70096 + */
70097 + lookup_res = nplug->lookup(parent, &ld, FIND_EXACT, result);
70098 + /* update cached pos_in_node */
70099 + if (lookup_res == NS_FOUND) {
70100 + write_lock_tree(tree);
70101 + coord_to_parent_coord(result, &child->in_parent);
70102 + write_unlock_tree(tree);
70103 + lookup_res = check_tree_pointer(result, child);
70104 + }
70105 + if (lookup_res == NS_NOT_FOUND)
70106 + lookup_res = find_child_by_addr(parent, child, result);
70107 + return lookup_res;
70108 +}
70109 +
70110 +/* find coord of pointer to @child in @parent by scanning
70111 +
70112 + Find the &coord_t in the @parent where pointer to a given @child
70113 + is in by scanning all internal items in @parent and comparing block
70114 + numbers in them with that of @child.
70115 +
70116 +*/
70117 +static int find_child_by_addr(znode * parent /* parent znode, passed locked */ ,
70118 + znode * child /* child znode, passed locked */ ,
70119 + coord_t * result /* where result is stored in */ )
70120 +{
70121 + int ret;
70122 +
70123 + assert("nikita-1320", parent != NULL);
70124 + assert("nikita-1321", child != NULL);
70125 + assert("nikita-1322", result != NULL);
70126 +
70127 + ret = NS_NOT_FOUND;
70128 +
70129 + for_all_units(result, parent) {
70130 + if (check_tree_pointer(result, child) == NS_FOUND) {
70131 + write_lock_tree(znode_get_tree(parent));
70132 + coord_to_parent_coord(result, &child->in_parent);
70133 + write_unlock_tree(znode_get_tree(parent));
70134 + ret = NS_FOUND;
70135 + break;
70136 + }
70137 + }
70138 + return ret;
70139 +}
70140 +
70141 +/* true if @addr is an "unallocated block number", which is just an address
70142 + with the highest bit set. */
70143 +int is_disk_addr_unallocated(const reiser4_block_nr * addr /* address to
70144 + * check */ )
70145 +{
70146 + assert("nikita-1766", addr != NULL);
70147 + cassert(sizeof(reiser4_block_nr) == 8);
70148 + return (*addr & REISER4_BLOCKNR_STATUS_BIT_MASK) ==
70149 + REISER4_UNALLOCATED_STATUS_VALUE;
70150 +}
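+
+/* For example, assuming the status bit mask selects the highest bit of the
+ 64-bit block number (illustrative only), 0x8000000000000001ull would be an
+ unallocated address, while 0x1ull would be a real disk address. */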
70151 +
70152 +/* returns true if removing bytes in the key range [from_key, to_key]
70153 + causes the whole item @from to be removed */
70154 +static int
70155 +item_removed_completely(coord_t * from, const reiser4_key * from_key,
70156 + const reiser4_key * to_key)
70157 +{
70158 + item_plugin *iplug;
70159 + reiser4_key key_in_item;
70160 +
70161 + assert("umka-325", from != NULL);
70162 + assert("", item_is_extent(from));
70163 +
70164 + /* check the first key just in case */
70165 + item_key_by_coord(from, &key_in_item);
70166 + if (keygt(from_key, &key_in_item))
70167 + return 0;
70168 +
70169 + /* check last key */
70170 + iplug = item_plugin_by_coord(from);
70171 + assert("vs-611", iplug && iplug->s.file.append_key);
70172 +
70173 + iplug->s.file.append_key(from, &key_in_item);
70174 + set_key_offset(&key_in_item, get_key_offset(&key_in_item) - 1);
70175 +
70176 + if (keylt(to_key, &key_in_item))
70177 + /* last byte is not removed */
70178 + return 0;
70179 + return 1;
70180 +}
70181 +
70182 +/* helper function for prepare_twig_kill(): @left and @right are formatted
70183 + * neighbors of extent item being completely removed. Load and lock neighbors
70184 + * and store lock handles into @cdata for later use by kill_hook_extent() */
70185 +static int
70186 +prepare_children(znode * left, znode * right, carry_kill_data * kdata)
70187 +{
70188 + int result;
70189 + int left_loaded;
70190 + int right_loaded;
70191 +
70192 + result = 0;
70193 + left_loaded = right_loaded = 0;
70194 +
70195 + if (left != NULL) {
70196 + result = zload(left);
70197 + if (result == 0) {
70198 + left_loaded = 1;
70199 + result = longterm_lock_znode(kdata->left, left,
70200 + ZNODE_READ_LOCK,
70201 + ZNODE_LOCK_LOPRI);
70202 + }
70203 + }
70204 + if (result == 0 && right != NULL) {
70205 + result = zload(right);
70206 + if (result == 0) {
70207 + right_loaded = 1;
70208 + result = longterm_lock_znode(kdata->right, right,
70209 + ZNODE_READ_LOCK,
70210 + ZNODE_LOCK_HIPRI |
70211 + ZNODE_LOCK_NONBLOCK);
70212 + }
70213 + }
70214 + if (result != 0) {
70215 + done_lh(kdata->left);
70216 + done_lh(kdata->right);
70217 + if (left_loaded != 0)
70218 + zrelse(left);
70219 + if (right_loaded != 0)
70220 + zrelse(right);
70221 + }
70222 + return result;
70223 +}
70224 +
70225 +static void done_children(carry_kill_data * kdata)
70226 +{
70227 + if (kdata->left != NULL && kdata->left->node != NULL) {
70228 + zrelse(kdata->left->node);
70229 + done_lh(kdata->left);
70230 + }
70231 + if (kdata->right != NULL && kdata->right->node != NULL) {
70232 + zrelse(kdata->right->node);
70233 + done_lh(kdata->right);
70234 + }
70235 +}
70236 +
70237 +/* part of cut_node. It is called when cut_node is called to remove or cut
70238 + part of an extent item. When the head of that item is removed, we have to
70239 + update the right delimiting key of the extent's left neighbor. When the
70240 + item is removed completely, we have to set a sibling link between the left
70241 + and right neighbors of the removed extent. This may return -E_DEADLOCK
70242 + because of trying to lock the left neighbor; the caller should then retry.
70243 +*/
70244 +/* Audited by: umka (2002.06.16) */
70245 +static int
70246 +prepare_twig_kill(carry_kill_data * kdata, znode * locked_left_neighbor)
70247 +{
70248 + int result;
70249 + reiser4_key key;
70250 + lock_handle left_lh;
70251 + lock_handle right_lh;
70252 + coord_t left_coord;
70253 + coord_t *from;
70254 + znode *left_child;
70255 + znode *right_child;
70256 + reiser4_tree *tree;
70257 + int left_zloaded_here, right_zloaded_here;
70258 +
70259 + from = kdata->params.from;
70260 + assert("umka-326", from != NULL);
70261 + assert("umka-327", kdata->params.to != NULL);
70262 +
70263 + /* for one extent item only yet */
70264 + assert("vs-591", item_is_extent(from));
70265 + assert("vs-592", from->item_pos == kdata->params.to->item_pos);
70266 +
70267 + if ((kdata->params.from_key
70268 + && keygt(kdata->params.from_key, item_key_by_coord(from, &key)))
70269 + || from->unit_pos != 0) {
70270 + /* head of item @from is not removed, there is nothing to
70271 + worry about */
70272 + return 0;
70273 + }
70274 +
70275 + result = 0;
70276 + left_zloaded_here = 0;
70277 + right_zloaded_here = 0;
70278 +
70279 + left_child = right_child = NULL;
70280 +
70281 + coord_dup(&left_coord, from);
70282 + init_lh(&left_lh);
70283 + init_lh(&right_lh);
70284 + if (coord_prev_unit(&left_coord)) {
70285 + /* @from is leftmost item in its node */
70286 + if (!locked_left_neighbor) {
70287 + result =
70288 + reiser4_get_left_neighbor(&left_lh, from->node,
70289 + ZNODE_READ_LOCK,
70290 + GN_CAN_USE_UPPER_LEVELS);
70291 + switch (result) {
70292 + case 0:
70293 + break;
70294 + case -E_NO_NEIGHBOR:
70295 + /* there is no formatted node to the left of
70296 + from->node */
70297 + warning("vs-605",
70298 + "extent item has smallest key in "
70299 + "the tree and it is about to be removed");
70300 + return 0;
70301 + case -E_DEADLOCK:
70302 + /* need to restart */
70303 + default:
70304 + return result;
70305 + }
70306 +
70307 + /* we have acquired left neighbor of from->node */
70308 + result = zload(left_lh.node);
70309 + if (result)
70310 + goto done;
70311 +
70312 + locked_left_neighbor = left_lh.node;
70313 + } else {
70314 + /* squalloc_right_twig_cut should have supplied locked
70315 + * left neighbor */
70316 + assert("vs-834",
70317 + znode_is_write_locked(locked_left_neighbor));
70318 + result = zload(locked_left_neighbor);
70319 + if (result)
70320 + return result;
70321 + }
70322 +
70323 + left_zloaded_here = 1;
70324 + coord_init_last_unit(&left_coord, locked_left_neighbor);
70325 + }
70326 +
70327 + if (!item_is_internal(&left_coord)) {
70328 + /* what else but extent can be on twig level */
70329 + assert("vs-606", item_is_extent(&left_coord));
70330 +
70331 + /* there is no left formatted child */
70332 + if (left_zloaded_here)
70333 + zrelse(locked_left_neighbor);
70334 + done_lh(&left_lh);
70335 + return 0;
70336 + }
70337 +
70338 + tree = znode_get_tree(left_coord.node);
70339 + left_child = child_znode(&left_coord, left_coord.node, 1, 0);
70340 +
70341 + if (IS_ERR(left_child)) {
70342 + result = PTR_ERR(left_child);
70343 + goto done;
70344 + }
70345 +
70346 + /* left child is acquired, calculate new right delimiting key for it
70347 + and get right child if it is necessary */
70348 + if (item_removed_completely
70349 + (from, kdata->params.from_key, kdata->params.to_key)) {
70350 + /* try to get right child of removed item */
70351 + coord_t right_coord;
70352 +
70353 + assert("vs-607",
70354 + kdata->params.to->unit_pos ==
70355 + coord_last_unit_pos(kdata->params.to));
70356 + coord_dup(&right_coord, kdata->params.to);
70357 + if (coord_next_unit(&right_coord)) {
70358 + /* @to is rightmost unit in the node */
70359 + result =
70360 + reiser4_get_right_neighbor(&right_lh, from->node,
70361 + ZNODE_READ_LOCK,
70362 + GN_CAN_USE_UPPER_LEVELS);
70363 + switch (result) {
70364 + case 0:
70365 + result = zload(right_lh.node);
70366 + if (result)
70367 + goto done;
70368 +
70369 + right_zloaded_here = 1;
70370 + coord_init_first_unit(&right_coord,
70371 + right_lh.node);
70372 + item_key_by_coord(&right_coord, &key);
70373 + break;
70374 +
70375 + case -E_NO_NEIGHBOR:
70376 + /* there is no formatted node to the right of
70377 + from->node */
70378 + read_lock_dk(tree);
70379 + key = *znode_get_rd_key(from->node);
70380 + read_unlock_dk(tree);
70381 + right_coord.node = NULL;
70382 + result = 0;
70383 + break;
70384 + default:
70385 + /* real error */
70386 + goto done;
70387 + }
70388 + } else {
70389 + /* there is an item to the right of @from - take its key */
70390 + item_key_by_coord(&right_coord, &key);
70391 + }
70392 +
70393 + /* try to get right child of @from */
70394 + if (right_coord.node && /* there is right neighbor of @from */
70395 + item_is_internal(&right_coord)) { /* it is internal item */
70396 + right_child = child_znode(&right_coord,
70397 + right_coord.node, 1, 0);
70398 +
70399 + if (IS_ERR(right_child)) {
70400 + result = PTR_ERR(right_child);
70401 + goto done;
70402 + }
70403 +
70404 + }
70405 + /* the whole extent between znodes left_child and right_child is removed.
70406 + Prepare them for linking and for updating the right delimiting key of left_child */
70407 + result = prepare_children(left_child, right_child, kdata);
70408 + } else {
70409 + /* the head of item @to is removed. left_child has to get its right delimiting key updated. Prepare it for that */
70410 + result = prepare_children(left_child, NULL, kdata);
70411 + }
70412 +
70413 + done:
70414 + if (right_child)
70415 + zput(right_child);
70416 + if (right_zloaded_here)
70417 + zrelse(right_lh.node);
70418 + done_lh(&right_lh);
70419 +
70420 + if (left_child)
70421 + zput(left_child);
70422 + if (left_zloaded_here)
70423 + zrelse(locked_left_neighbor);
70424 + done_lh(&left_lh);
70425 + return result;
70426 +}
70427 +
70428 +/* this is used to remove part of node content between coordinates @from and @to. Units to which @from and @to are set
70429 + are to be cut completely */
70430 +/* for try_to_merge_with_left, delete_copied, reiser4_delete_node */
70431 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key, /* first key to be removed */
70432 + const reiser4_key * to_key, /* last key to be removed */
70433 + reiser4_key *
70434 + smallest_removed /* smallest key actually removed */ )
70435 +{
70436 + int result;
70437 + carry_pool *pool;
70438 + carry_level *lowest_level;
70439 + carry_cut_data *cut_data;
70440 + carry_op *op;
70441 +
70442 + assert("vs-1715", coord_compare(from, to) != COORD_CMP_ON_RIGHT);
70443 +
70444 + pool =
70445 + init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
70446 + sizeof(*cut_data));
70447 + if (IS_ERR(pool))
70448 + return PTR_ERR(pool);
70449 + lowest_level = (carry_level *) (pool + 1);
70450 + init_carry_level(lowest_level, pool);
70451 +
70452 + op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
70453 + assert("vs-1509", op != 0);
70454 + if (IS_ERR(op)) {
70455 + done_carry_pool(pool);
70456 + return PTR_ERR(op);
70457 + }
70458 +
70459 + cut_data = (carry_cut_data *) (lowest_level + 3);
70460 + cut_data->params.from = from;
70461 + cut_data->params.to = to;
70462 + cut_data->params.from_key = from_key;
70463 + cut_data->params.to_key = to_key;
70464 + cut_data->params.smallest_removed = smallest_removed;
70465 +
70466 + op->u.cut_or_kill.is_cut = 1;
70467 + op->u.cut_or_kill.u.cut = cut_data;
70468 +
70469 + result = reiser4_carry(lowest_level, NULL);
70470 + done_carry_pool(pool);
70471 +
70472 + return result;
70473 +}
70474 +
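+/* A minimal usage sketch (hypothetical caller; @from is assumed to have
+   been set up by a prior lookup): cut all units of one item. As in
+   reiser4_delete_node() below, the key arguments may be NULL when the
+   units are to be cut completely, and @smallest_removed may be NULL if
+   the caller does not care which key went away first:
+
+	coord_t from, to;
+	reiser4_key smallest;
+
+	coord_dup(&to, &from);
+	to.unit_pos = coord_last_unit_pos(&from);
+	result = cut_node_content(&from, &to, NULL, NULL, &smallest);
+*/
+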
70475 +/* cut part of the node
70476 +
70477 + Cut part or the whole content of a node.
70478 +
70479 + Cut data between @from and @to of @from->node and call carry() to make
70480 + corresponding changes in the tree. @from->node may become empty. If so,
70481 + the pointer to it will be removed. Neighboring nodes are not changed.
70482 + The smallest removed key is stored in @smallest_removed.
70483 +
70484 +*/
70485 +int kill_node_content(coord_t * from, /* coord of the first unit/item that will be eliminated */
70486 + coord_t * to, /* coord of the last unit/item that will be eliminated */
70487 + const reiser4_key * from_key, /* first key to be removed */
70488 + const reiser4_key * to_key, /* last key to be removed */
70489 + reiser4_key * smallest_removed, /* smallest key actually removed */
70490 + znode * locked_left_neighbor, /* this is set when kill_node_content is called with the left neighbor
70491 + * already locked (namely, in squalloc_right_twig_cut) */
70492 + struct inode *inode, /* inode of file whose item (or its part) is to be killed. This is necessary to
70493 + invalidate pages together with item pointing to them */
70494 + int truncate /* true when this call is made for file truncate */ )
70495 +{
70496 + int result;
70497 + carry_pool *pool;
70498 + carry_level *lowest_level;
70499 + carry_kill_data *kdata;
70500 + lock_handle *left_child;
70501 + lock_handle *right_child;
70502 + carry_op *op;
70503 +
70504 + assert("umka-328", from != NULL);
70505 + assert("vs-316", !node_is_empty(from->node));
70506 + assert("nikita-1812", coord_is_existing_unit(from)
70507 + && coord_is_existing_unit(to));
70508 +
70509 + /* allocate carry_pool, 3 carry_level-s, carry_kill_data and structures for kill_hook_extent */
70510 + pool = init_carry_pool(sizeof(*pool) + 3 * sizeof(*lowest_level) +
70511 + sizeof(carry_kill_data) +
70512 + 2 * sizeof(lock_handle) +
70513 + 5 * sizeof(reiser4_key) + 2 * sizeof(coord_t));
70514 + if (IS_ERR(pool))
70515 + return PTR_ERR(pool);
70516 +
70517 + lowest_level = (carry_level *) (pool + 1);
70518 + init_carry_level(lowest_level, pool);
70519 +
70520 + kdata = (carry_kill_data *) (lowest_level + 3);
70521 + left_child = (lock_handle *) (kdata + 1);
70522 + right_child = left_child + 1;
70523 +
70524 + init_lh(left_child);
70525 + init_lh(right_child);
70526 +
70527 + kdata->params.from = from;
70528 + kdata->params.to = to;
70529 + kdata->params.from_key = from_key;
70530 + kdata->params.to_key = to_key;
70531 + kdata->params.smallest_removed = smallest_removed;
70532 + kdata->params.truncate = truncate;
70533 + kdata->flags = 0;
70534 + kdata->inode = inode;
70535 + kdata->left = left_child;
70536 + kdata->right = right_child;
70537 + /* memory for 5 reiser4_key and 2 coord_t will be used in kill_hook_extent */
70538 + kdata->buf = (char *)(right_child + 1);
70539 +
70540 + if (znode_get_level(from->node) == TWIG_LEVEL && item_is_extent(from)) {
70541 + /* the left child of the extent item may need its right delimiting
70542 + key updated and may need to be linked with the right child of
70543 + extent @from if the extent is removed completely */
70544 + result = prepare_twig_kill(kdata, locked_left_neighbor);
70545 + if (result) {
70546 + done_children(kdata);
70547 + done_carry_pool(pool);
70548 + return result;
70549 + }
70550 + }
70551 +
70552 + op = reiser4_post_carry(lowest_level, COP_CUT, from->node, 0);
70553 + if (IS_ERR(op) || (op == NULL)) {
70554 + done_children(kdata);
70555 + done_carry_pool(pool);
70556 + return RETERR(op ? PTR_ERR(op) : -EIO);
70557 + }
70558 +
70559 + op->u.cut_or_kill.is_cut = 0;
70560 + op->u.cut_or_kill.u.kill = kdata;
70561 +
70562 + result = reiser4_carry(lowest_level, NULL);
70563 +
70564 + done_children(kdata);
70565 + done_carry_pool(pool);
70566 + return result;
70567 +}
70568 +
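+/* Usage sketch (cf. cut_tree_worker_common() below): unlike
+   cut_node_content(), the caller must pass the inode whose pages back
+   the killed items, so that they can be invalidated, and may pass an
+   already locked left neighbor (as squalloc_right_twig_cut does):
+
+	result = kill_node_content(&left_coord, tap->coord, from_key,
+				   to_key, smallest_removed,
+				   next_node_lock.node, object, truncate);
+*/
+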
70569 +void
70570 +fake_kill_hook_tail(struct inode *inode, loff_t start, loff_t end, int truncate)
70571 +{
70572 + if (reiser4_inode_get_flag(inode, REISER4_HAS_MMAP)) {
70573 + pgoff_t start_pg, end_pg;
70574 +
70575 + start_pg = start >> PAGE_CACHE_SHIFT;
70576 + end_pg = (end - 1) >> PAGE_CACHE_SHIFT;
70577 +
70578 + if ((start & (PAGE_CACHE_SIZE - 1)) == 0) {
70579 + /*
70580 + * kill up to the page boundary.
70581 + */
70582 + assert("vs-123456", start_pg == end_pg);
70583 + reiser4_invalidate_pages(inode->i_mapping, start_pg, 1,
70584 + truncate);
70585 + } else if (start_pg != end_pg) {
70586 + /*
70587 + * page boundary is within killed portion of node.
70588 + */
70589 + assert("vs-654321", end_pg - start_pg == 1);
70590 + reiser4_invalidate_pages(inode->i_mapping, end_pg,
70591 + end_pg - start_pg, 1);
70592 + }
70593 + }
70594 + inode_sub_bytes(inode, end - start);
70595 +}
70596 +
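+/* A worked example, assuming 4096-byte pages: killing bytes
+   [4096, 6000) gives start_pg == end_pg == 1 with a page-aligned
+   @start, so exactly page 1 is invalidated; killing [3000, 6000)
+   gives start_pg == 0 and end_pg == 1, so only page 1 (the fully
+   killed page) is invalidated. */
+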
70597 +/**
70598 + * Delete whole @node from the reiser4 tree without loading it.
70599 + *
70600 + * @left: locked left neighbor,
70601 + * @node: node to be deleted,
70602 + * @smallest_removed: leftmost key of deleted node,
70603 + * @object: inode pointer, if we truncate a file body.
70604 + * @truncate: true if called for file truncate.
70605 + *
70606 + * @return: 0 on success, error code otherwise.
70607 + *
70608 + * NOTE: if @object!=NULL we assume that @smallest_removed != NULL and it
70609 + * contains the right value of the smallest removed key from the previous
70610 + * cut_worker() iteration. This is needed for proper accounting of
70611 + * "i_blocks" and "i_bytes" fields of the @object.
70612 + */
70613 +int reiser4_delete_node(znode * node, reiser4_key * smallest_removed,
70614 + struct inode *object, int truncate)
70615 +{
70616 + lock_handle parent_lock;
70617 + coord_t cut_from;
70618 + coord_t cut_to;
70619 + reiser4_tree *tree;
70620 + int ret;
70621 +
70622 + assert("zam-937", node != NULL);
70623 + assert("zam-933", znode_is_write_locked(node));
70624 + assert("zam-999", smallest_removed != NULL);
70625 +
70626 + init_lh(&parent_lock);
70627 +
70628 + ret = reiser4_get_parent(&parent_lock, node, ZNODE_WRITE_LOCK);
70629 + if (ret)
70630 + return ret;
70631 +
70632 + assert("zam-934", !znode_above_root(parent_lock.node));
70633 +
70634 + ret = zload(parent_lock.node);
70635 + if (ret)
70636 + goto failed_nozrelse;
70637 +
70638 + ret = find_child_ptr(parent_lock.node, node, &cut_from);
70639 + if (ret)
70640 + goto failed;
70641 +
70642 + /* decrement the child counter and set the parent pointer to NULL before
70643 + deleting the pointer from the parent node, because of checks in
70644 + internal_kill_item_hook (we can delete the last item from the parent
70645 + node; the parent node is then going to be deleted and its c_count should
70646 + be zero). */
70647 +
70648 + tree = znode_get_tree(node);
70649 + write_lock_tree(tree);
70650 + init_parent_coord(&node->in_parent, NULL);
70651 + --parent_lock.node->c_count;
70652 + write_unlock_tree(tree);
70653 +
70654 + assert("zam-989", item_is_internal(&cut_from));
70655 +
70656 + /* @node should be deleted after unlocking. */
70657 + ZF_SET(node, JNODE_HEARD_BANSHEE);
70658 +
70659 + /* remove a pointer from the parent node to the node being deleted. */
70660 + coord_dup(&cut_to, &cut_from);
70661 + /* FIXME: shouldn't this be kill_node_content */
70662 + ret = cut_node_content(&cut_from, &cut_to, NULL, NULL, NULL);
70663 + if (ret)
70664 + /* FIXME(Zam): Should we re-connect the node to its parent if
70665 + * cut_node fails? */
70666 + goto failed;
70667 +
70668 + {
70669 + reiser4_tree *tree = current_tree;
70670 + __u64 start_offset = 0, end_offset = 0;
70671 +
70672 + read_lock_tree(tree);
70673 + write_lock_dk(tree);
70674 + if (object) {
70675 + /* We use @smallest_removed and the left delimiting key of
70676 + * the current node for @object->i_blocks, i_bytes
70677 + * calculation. We assume that the items after the
70678 + * *@smallest_removed key have been deleted from the
70679 + * file body. */
70680 + start_offset = get_key_offset(znode_get_ld_key(node));
70681 + end_offset = get_key_offset(smallest_removed);
70682 + }
70683 +
70684 + assert("zam-1021", znode_is_connected(node));
70685 + if (node->left)
70686 + znode_set_rd_key(node->left, znode_get_rd_key(node));
70687 +
70688 + *smallest_removed = *znode_get_ld_key(node);
70689 +
70690 + write_unlock_dk(tree);
70691 + read_unlock_tree(tree);
70692 +
70693 + if (object) {
70694 + /* actions performed on items upon their removal from the tree normally live
70695 + in a special item method - kill_hook. Here, for optimization, we avoid reading
70696 + the node containing the item we remove and so cannot call the item's kill hook.
70697 + Instead we call a function which does exactly what the tail kill hook does,
70698 + assuming the node we avoid reading contains only one item, and a tail one. */
70699 + fake_kill_hook_tail(object, start_offset, end_offset,
70700 + truncate);
70701 + }
70702 + }
70703 + failed:
70704 + zrelse(parent_lock.node);
70705 + failed_nozrelse:
70706 + done_lh(&parent_lock);
70707 +
70708 + return ret;
70709 +}
70710 +
70711 +static int can_delete(const reiser4_key *key, znode *node)
70712 +{
70713 + int result;
70714 +
70715 + read_lock_dk(current_tree);
70716 + result = keyle(key, znode_get_ld_key(node));
70717 + read_unlock_dk(current_tree);
70718 + return result;
70719 +}
70720 +
70721 +/**
70722 + * This subroutine is not optimal, but its implementation seems to
70723 + * be simpler.
70724 + *
70725 + * @tap: the point deletion process begins from,
70726 + * @from_key: the beginning of the deleted key range,
70727 + * @to_key: the end of the deleted key range,
70728 + * @smallest_removed: the smallest removed key,
70729 + * @truncate: true if called for file truncate.
70730 + * @progress: returns true if progress was made deleting file items; the
70731 + * @smallest_removed value is valid in that case.
70732 + *
70733 + * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
70734 + * reiser4_cut_tree operation was interrupted to allow atom commit.
70735 + */
70736 +int
70737 +cut_tree_worker_common(tap_t * tap, const reiser4_key * from_key,
70738 + const reiser4_key * to_key,
70739 + reiser4_key * smallest_removed, struct inode *object,
70740 + int truncate, int *progress)
70741 +{
70742 + lock_handle next_node_lock;
70743 + coord_t left_coord;
70744 + int result;
70745 +
70746 + assert("zam-931", tap->coord->node != NULL);
70747 + assert("zam-932", znode_is_write_locked(tap->coord->node));
70748 +
70749 + *progress = 0;
70750 + init_lh(&next_node_lock);
70751 +
70752 + while (1) {
70753 + znode *node; /* node from which items are cut */
70754 + node_plugin *nplug; /* node plugin for @node */
70755 +
70756 + node = tap->coord->node;
70757 +
70758 + /* Move next_node_lock to the next node on the left. */
70759 + result =
70760 + reiser4_get_left_neighbor(&next_node_lock, node,
70761 + ZNODE_WRITE_LOCK,
70762 + GN_CAN_USE_UPPER_LEVELS);
70763 + if (result != 0 && result != -E_NO_NEIGHBOR)
70764 + break;
70765 + /* Check whether we can delete the node as a whole. */
70766 + if (*progress && znode_get_level(node) == LEAF_LEVEL &&
70767 + can_delete(from_key, node)) {
70768 + result = reiser4_delete_node(node, smallest_removed,
70769 + object, truncate);
70770 + } else {
70771 + result = reiser4_tap_load(tap);
70772 + if (result)
70773 + return result;
70774 +
70775 + /* Prepare the second (right) point for cut_node() */
70776 + if (*progress)
70777 + coord_init_last_unit(tap->coord, node);
70778 +
70779 + else if (item_plugin_by_coord(tap->coord)->b.lookup ==
70780 + NULL)
70781 + /* set rightmost unit for the items without lookup method */
70782 + tap->coord->unit_pos =
70783 + coord_last_unit_pos(tap->coord);
70784 +
70785 + nplug = node->nplug;
70786 +
70787 + assert("vs-686", nplug);
70788 + assert("vs-687", nplug->lookup);
70789 +
70790 + /* left_coord is leftmost unit cut from @node */
70791 + result = nplug->lookup(node, from_key,
70792 + FIND_MAX_NOT_MORE_THAN,
70793 + &left_coord);
70794 +
70795 + if (IS_CBKERR(result))
70796 + break;
70797 +
70798 + /* adjust coordinates so that they are set to existing units */
70799 + if (coord_set_to_right(&left_coord)
70800 + || coord_set_to_left(tap->coord)) {
70801 + result = 0;
70802 + break;
70803 + }
70804 +
70805 + if (coord_compare(&left_coord, tap->coord) ==
70806 + COORD_CMP_ON_RIGHT) {
70807 + /* keys from @from_key to @to_key are not in the tree */
70808 + result = 0;
70809 + break;
70810 + }
70811 +
70812 + if (left_coord.item_pos != tap->coord->item_pos) {
70813 + /* do not allow cutting more than one item. This solves the problem of truncating
70814 + partially converted files: a partially converted file may have a twig node
70815 + containing both an internal item (or items) pointing to leaf nodes with
70816 + formatting items, and an extent item. We do not want to kill internal items in
70817 + a twig node here, because cut_tree_worker assumes killing them from the leaf level */
70818 + coord_dup(&left_coord, tap->coord);
70819 + assert("vs-1652",
70820 + coord_is_existing_unit(&left_coord));
70821 + left_coord.unit_pos = 0;
70822 + }
70823 +
70824 + /* cut data from one node */
70825 + // *smallest_removed = *reiser4_min_key();
70826 + result =
70827 + kill_node_content(&left_coord, tap->coord, from_key,
70828 + to_key, smallest_removed,
70829 + next_node_lock.node, object,
70830 + truncate);
70831 + reiser4_tap_relse(tap);
70832 + }
70833 + if (result)
70834 + break;
70835 +
70836 + ++(*progress);
70837 +
70838 + /* Check whether all items with keys >= from_key were removed
70839 + * from the tree. */
70840 + if (keyle(smallest_removed, from_key))
70841 + /* result = 0; */
70842 + break;
70843 +
70844 + if (next_node_lock.node == NULL)
70845 + break;
70846 +
70847 + result = reiser4_tap_move(tap, &next_node_lock);
70848 + done_lh(&next_node_lock);
70849 + if (result)
70850 + break;
70851 +
70852 + /* Break long reiser4_cut_tree operation (deletion of a large
70853 + file) if atom requires commit. */
70854 + if (*progress > CUT_TREE_MIN_ITERATIONS
70855 + && current_atom_should_commit()) {
70856 + result = -E_REPEAT;
70857 + break;
70858 + }
70859 + }
70860 + done_lh(&next_node_lock);
70861 + // assert("vs-301", !keyeq(&smallest_removed, reiser4_min_key()));
70862 + return result;
70863 +}
70864 +
70865 +/* there is a fundamental problem with optimizing deletes: VFS does it
70866 + one file at a time. Another problem is that if an item can be
70867 + anything, then deleting items must be done one at a time. It just
70868 + seems cleaner, though, to write this to take a from and a to key and cut
70869 + everything between them. */
70870 +
70871 +/* use this function with care when deleting more than what belongs to a single file. */
70872 +/* do not use this when cutting a single item, it is suboptimal for that */
70873 +
70874 +/* You are encouraged to write plugin specific versions of this. It
70875 + cannot be optimal for all plugins because it works item at a time,
70876 + and some plugins could sometimes work node at a time. Regular files
70877 + however are not optimizable to work node at a time because of
70878 + extents needing to free the blocks they point to.
70879 +
70880 + Optimizations compared to v3 code:
70881 +
70882 + It does not balance (that task is left to memory pressure code).
70883 +
70884 + Nodes are deleted only if empty.
70885 +
70886 + Uses extents.
70887 +
70888 + Performs read-ahead of formatted nodes whose contents are part of
70889 + the deletion.
70890 +*/
70891 +
70892 +/**
70893 + * Delete everything from the reiser4 tree between two keys: @from_key and
70894 + * @to_key.
70895 + *
70896 + * @from_key: the beginning of the deleted key range,
70897 + * @to_key: the end of the deleted key range,
70898 + * @smallest_removed: the smallest removed key,
70899 + * @object: owner of cutting items.
70900 + * @truncate: true if called for file truncate.
70901 + * @progress: returns true if progress was made deleting file items; the
70902 + * @smallest_removed value is valid in that case.
70903 + *
70904 + * @return: 0 on success, error code otherwise; -E_REPEAT means that a long
70905 + * cut_tree operation was interrupted to allow atom commit.
70906 + */
70907 +
70908 +int reiser4_cut_tree_object(reiser4_tree * tree, const reiser4_key * from_key,
70909 + const reiser4_key * to_key,
70910 + reiser4_key * smallest_removed_p,
70911 + struct inode *object, int truncate, int *progress)
70912 +{
70913 + lock_handle lock;
70914 + int result;
70915 + tap_t tap;
70916 + coord_t right_coord;
70917 + reiser4_key smallest_removed;
70918 + int (*cut_tree_worker) (tap_t *, const reiser4_key *,
70919 + const reiser4_key *, reiser4_key *,
70920 + struct inode *, int, int *);
70921 + STORE_COUNTERS;
70922 +
70923 + assert("umka-329", tree != NULL);
70924 + assert("umka-330", from_key != NULL);
70925 + assert("umka-331", to_key != NULL);
70926 + assert("zam-936", keyle(from_key, to_key));
70927 +
70928 + if (smallest_removed_p == NULL)
70929 + smallest_removed_p = &smallest_removed;
70930 +
70931 + init_lh(&lock);
70932 +
70933 + do {
70934 + /* Find rightmost item to cut away from the tree. */
70935 + result = reiser4_object_lookup(object, to_key, &right_coord,
70936 + &lock, ZNODE_WRITE_LOCK,
70937 + FIND_MAX_NOT_MORE_THAN,
70938 + TWIG_LEVEL, LEAF_LEVEL,
70939 + CBK_UNIQUE, NULL /*ra_info */);
70940 + if (result != CBK_COORD_FOUND)
70941 + break;
70942 + if (object == NULL
70943 + || inode_file_plugin(object)->cut_tree_worker == NULL)
70944 + cut_tree_worker = cut_tree_worker_common;
70945 + else
70946 + cut_tree_worker =
70947 + inode_file_plugin(object)->cut_tree_worker;
70948 + reiser4_tap_init(&tap, &right_coord, &lock, ZNODE_WRITE_LOCK);
70949 + result =
70950 + cut_tree_worker(&tap, from_key, to_key, smallest_removed_p,
70951 + object, truncate, progress);
70952 + reiser4_tap_done(&tap);
70953 +
70954 + reiser4_preempt_point();
70955 +
70956 + } while (0);
70957 +
70958 + done_lh(&lock);
70959 +
70960 + if (result) {
70961 + switch (result) {
70962 + case -E_NO_NEIGHBOR:
70963 + result = 0;
70964 + break;
70965 + case -E_DEADLOCK:
70966 + result = -E_REPEAT;
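+ /* fall through */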
70967 + case -E_REPEAT:
70968 + case -ENOMEM:
70969 + case -ENOENT:
70970 + break;
70971 + default:
70972 + warning("nikita-2861", "failure: %i", result);
70973 + }
70974 + }
70975 +
70976 + CHECK_COUNTERS;
70977 + return result;
70978 +}
70979 +
70980 +/* repeat reiser4_cut_tree_object until everything is deleted.
70981 + * Unlike cut_file_items, it does not end the current transaction if -E_REPEAT
70982 + * is returned by cut_tree_object. */
70983 +int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
70984 + const reiser4_key * to, struct inode *inode, int truncate)
70985 +{
70986 + int result;
70987 + int progress;
70988 +
70989 + do {
70990 + result = reiser4_cut_tree_object(tree, from, to, NULL,
70991 + inode, truncate, &progress);
70992 + } while (result == -E_REPEAT);
70993 +
70994 + return result;
70995 +}
70996 +
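+/* Typical use (a sketch; key construction elided): delete the whole body
+   of a file by cutting every key between its first and last possible
+   body keys. reiser4_cut_tree() retries internally on -E_REPEAT:
+
+	reiser4_key from, to;
+
+	(build the smallest and largest body keys of inode into from/to)
+	result = reiser4_cut_tree(current_tree, &from, &to, inode, 1);
+*/
+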
70997 +/* finishing reiser4 initialization */
70998 +int reiser4_init_tree(reiser4_tree * tree /* pointer to structure being
70999 + * initialized */ ,
71000 + const reiser4_block_nr * root_block /* address of a root block
71001 + * on a disk */ ,
71002 + tree_level height /* height of a tree */ ,
71003 + node_plugin * nplug /* default node plugin */ )
71004 +{
71005 + int result;
71006 +
71007 + assert("nikita-306", tree != NULL);
71008 + assert("nikita-307", root_block != NULL);
71009 + assert("nikita-308", height > 0);
71010 + assert("nikita-309", nplug != NULL);
71011 + assert("zam-587", tree->super != NULL);
71012 +
71013 + tree->root_block = *root_block;
71014 + tree->height = height;
71015 + tree->estimate_one_insert = calc_estimate_one_insert(height);
71016 + tree->nplug = nplug;
71017 +
71018 + tree->znode_epoch = 1ull;
71019 +
71020 + cbk_cache_init(&tree->cbk_cache);
71021 +
71022 + result = znodes_tree_init(tree);
71023 + if (result == 0)
71024 + result = jnodes_tree_init(tree);
71025 + if (result == 0) {
71026 + tree->uber = zget(tree, &UBER_TREE_ADDR, NULL, 0,
71027 + reiser4_ctx_gfp_mask_get());
71028 + if (IS_ERR(tree->uber)) {
71029 + result = PTR_ERR(tree->uber);
71030 + tree->uber = NULL;
71031 + }
71032 + }
71033 + return result;
71034 +}
71035 +
71036 +/* release resources associated with @tree */
71037 +void reiser4_done_tree(reiser4_tree * tree /* tree to release */ )
71038 +{
71039 + if (tree == NULL)
71040 + return;
71041 +
71042 + if (tree->uber != NULL) {
71043 + zput(tree->uber);
71044 + tree->uber = NULL;
71045 + }
71046 + znodes_tree_done(tree);
71047 + jnodes_tree_done(tree);
71048 + cbk_cache_done(&tree->cbk_cache);
71049 +}
71050 +
71051 +/* Make Linus happy.
71052 + Local variables:
71053 + c-indentation-style: "K&R"
71054 + mode-name: "LC"
71055 + c-basic-offset: 8
71056 + tab-width: 8
71057 + fill-column: 120
71058 + scroll-step: 1
71059 + End:
71060 +*/
71061 diff --git a/fs/reiser4/tree.h b/fs/reiser4/tree.h
71062 new file mode 100644
71063 index 0000000..73aa70a
71064 --- /dev/null
71065 +++ b/fs/reiser4/tree.h
71066 @@ -0,0 +1,577 @@
71067 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71068 + * reiser4/README */
71069 +
71070 +/* Tree operations. See fs/reiser4/tree.c for comments */
71071 +
71072 +#if !defined( __REISER4_TREE_H__ )
71073 +#define __REISER4_TREE_H__
71074 +
71075 +#include "forward.h"
71076 +#include "debug.h"
71077 +#include "dformat.h"
71078 +#include "plugin/node/node.h"
71079 +#include "plugin/plugin.h"
71080 +#include "znode.h"
71081 +#include "tap.h"
71082 +
71083 +#include <linux/types.h> /* for __u?? */
71084 +#include <linux/fs.h> /* for struct super_block */
71085 +#include <linux/spinlock.h>
71086 +#include <linux/sched.h> /* for struct task_struct */
71087 +
71088 +/* fictive block number never actually used */
71089 +extern const reiser4_block_nr UBER_TREE_ADDR;
71090 +
71091 +/* &cbk_cache_slot - entry in a coord cache.
71092 +
71093 + This is entry in a coord_by_key (cbk) cache, represented by
71094 + &cbk_cache.
71095 +
71096 +*/
71097 +typedef struct cbk_cache_slot {
71098 + /* cached node */
71099 + znode *node;
71100 + /* linkage to the next cbk cache slot in a LRU order */
71101 + struct list_head lru;
71102 +} cbk_cache_slot;
71103 +
71104 +/* &cbk_cache - coord cache. This is part of reiser4_tree.
71105 +
71106 + cbk_cache is supposed to speed up tree lookups by caching results of recent
71107 + successful lookups (we don't cache negative results as the dentry cache
71108 + does). The cache consists of a relatively small number of entries kept in
71109 + LRU order. Each entry (&cbk_cache_slot) contains a pointer to a znode, from
71110 + which we can obtain the range of keys that is covered by this znode. Before
71111 + embarking on a real tree traversal we scan the cbk_cache slot by slot and for
71112 + each slot check whether the key we are looking for is between the minimal and
71113 + maximal keys of the node pointed to by this slot. If no match is found, a real
71114 + tree traversal is performed and, if it succeeds, the appropriate entry
71115 + is inserted into the cache, possibly pushing the least recently used entry
71116 + out of it.
71117 +
71118 + The tree spin lock is used to protect the coord cache. If contention for this
71119 + lock proves to be too high, finer grained locking can be added.
71120 +
71121 + Invariants involving parts of this data-type:
71122 +
71123 + [cbk-cache-invariant]
71124 +*/
71125 +typedef struct cbk_cache {
71126 + /* serializator */
71127 + rwlock_t guard;
71128 + int nr_slots;
71129 + /* head of LRU list of cache slots */
71130 + struct list_head lru;
71131 + /* actual array of slots */
71132 + cbk_cache_slot *slot;
71133 +} cbk_cache;
71134 +
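+/* The scan described above boils down to a walk of the LRU list, roughly
+   as follows (a simplified sketch; the real lookup also validates the
+   znode and takes the appropriate locks):
+
+	cbk_cache_slot *slot;
+	znode *found = NULL;
+
+	list_for_each_entry(slot, &cache->lru, lru) {
+		znode *node = slot->node;
+		if (node != NULL &&
+		    keyle(znode_get_ld_key(node), key) &&
+		    keylt(key, znode_get_rd_key(node))) {
+			found = node;
+			break;
+		}
+	}
+*/
+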
71135 +/* level_lookup_result - possible outcome of looking up key at some level.
71136 + This is used by coord_by_key when traversing tree downward. */
71137 +typedef enum {
71138 + /* continue to the next level */
71139 + LOOKUP_CONT,
71140 + /* done. Either required item was found, or we can prove it
71141 + doesn't exist, or some error occurred. */
71142 + LOOKUP_DONE,
71143 + /* restart traversal from the root. Infamous "repetition". */
71144 + LOOKUP_REST
71145 +} level_lookup_result;
71146 +
71147 +/* This is representation of internal reiser4 tree where all file-system
71148 + data and meta-data are stored. This structure is passed to all tree
71149 + manipulation functions. It's different from the super block because
71150 + we don't want to limit ourselves to a strictly one-to-one mapping
71151 + between super blocks and trees, and because they are logically
71152 + different: there are things in a super block that have no relation to
71153 + the tree (bitmaps, journalling area, mount options, etc.) and there
71154 + are things in a tree that bear no relation to the super block, like
71155 + the tree of znodes.
71156 +
71157 + At this time, there is only one tree
71158 + per filesystem, and this struct is part of the super block. We only
71159 + call the super block the super block for historical reasons (most
71160 + other filesystems call the per filesystem metadata the super block).
71161 +*/
71162 +
71163 +struct reiser4_tree {
71164 + /* block_nr == 0 is fake znode. Write lock it, while changing
71165 + tree height. */
71166 + /* disk address of root node of a tree */
71167 + reiser4_block_nr root_block;
71168 +
71169 + /* level of the root node. If this is 1, tree consists of root
71170 + node only */
71171 + tree_level height;
71172 +
71173 + /*
71174 + * this is cached here to avoid calling plugins through a function
71175 + * dereference all the time.
71176 + */
71177 + __u64 estimate_one_insert;
71178 +
71179 + /* cache of recent tree lookup results */
71180 + cbk_cache cbk_cache;
71181 +
71182 + /* hash table to look up znodes by block number. */
71183 + z_hash_table zhash_table;
71184 + z_hash_table zfake_table;
71185 + /* hash table to look up jnodes by inode and offset. */
71186 + j_hash_table jhash_table;
71187 +
71188 + /* lock protecting:
71189 + - parent pointers,
71190 + - sibling pointers,
71191 + - znode hash table
71192 + - coord cache
71193 + */
71194 + /* NOTE: The "giant" tree lock can be replaced by more spin locks, in the
71195 + hope that they will be less contended. We could use one spin lock per
71196 + znode hash bucket. At the cost of some code complexity, sibling
71197 + pointers could be protected by both znode spin locks. However SMP
71198 + scalable that looks, we should test this locking change on n-way (n >
71199 + 4) SMP machines. A current 4-way machine test does not show that the tree
71200 + lock is contended or a bottleneck (2003.07.25). */
71201 +
71202 + rwlock_t tree_lock;
71203 +
71204 + /* lock protecting delimiting keys */
71205 + rwlock_t dk_lock;
71206 +
71207 + /* spin lock protecting znode_epoch */
71208 + spinlock_t epoch_lock;
71209 + /* version stamp used to mark znode updates. See seal.[ch] for more
71210 + * information. */
71211 + __u64 znode_epoch;
71212 +
71213 + znode *uber;
71214 + node_plugin *nplug;
71215 + struct super_block *super;
71216 + struct {
71217 + /* carry flags used for insertion of new nodes */
71218 + __u32 new_node_flags;
71219 + /* carry flags used for insertion of new extents */
71220 + __u32 new_extent_flags;
71221 + /* carry flags used for paste operations */
71222 + __u32 paste_flags;
71223 + /* carry flags used for insert operations */
71224 + __u32 insert_flags;
71225 + } carry;
71226 +};
71227 +
71228 +extern int reiser4_init_tree(reiser4_tree * tree,
71229 + const reiser4_block_nr * root_block,
71230 + tree_level height, node_plugin * default_plugin);
71231 +extern void reiser4_done_tree(reiser4_tree * tree);
71232 +
71233 +/* cbk flags: options for coord_by_key() */
71234 +typedef enum {
71235 + /* coord_by_key() is called for insertion. This is necessary because
71236 + of extents being located at the twig level. For explanation, see
71237 + comment just above is_next_item_internal().
71238 + */
71239 + CBK_FOR_INSERT = (1 << 0),
71240 + /* coord_by_key() is called with key that is known to be unique */
71241 + CBK_UNIQUE = (1 << 1),
71242 + /* coord_by_key() can trust delimiting keys. This option is not user
71243 + accessible. coord_by_key() will set it automatically. It will be
71244 + only cleared by special-case in extents-on-the-twig-level handling
71245 + where it is necessary to insert item with a key smaller than
71246 + leftmost key in a node. This is necessary because of extents being
71247 + located at the twig level. For explanation, see comment just above
71248 + is_next_item_internal().
71249 + */
71250 + CBK_TRUST_DK = (1 << 2),
71251 + CBK_READA = (1 << 3), /* original: readahead leaves which contain items of certain file */
71252 + CBK_READDIR_RA = (1 << 4), /* readdir: readahead whole directory and all its stat datas */
71253 + CBK_DKSET = (1 << 5),
71254 + CBK_EXTENDED_COORD = (1 << 6), /* coord_t is actually */
71255 + CBK_IN_CACHE = (1 << 7), /* node is already in cache */
71256 + CBK_USE_CRABLOCK = (1 << 8) /* use crab_lock instead of a long term
71257 + * lock */
71258 +} cbk_flags;
71259 +
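+/* The flags form a bitmask and are OR-ed together; e.g. a lookup for a
+   key known to be unique, done on behalf of an insertion, would pass
+   (CBK_FOR_INSERT | CBK_UNIQUE) to coord_by_key(). */
+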
71260 +/* insertion outcome. IBK = insert by key */
71261 +typedef enum {
71262 + IBK_INSERT_OK = 0,
71263 + IBK_ALREADY_EXISTS = -EEXIST,
71264 + IBK_IO_ERROR = -EIO,
71265 + IBK_NO_SPACE = -E_NODE_FULL,
71266 + IBK_OOM = -ENOMEM
71267 +} insert_result;
71268 +
71269 +#define IS_CBKERR(err) ((err) != CBK_COORD_FOUND && (err) != CBK_COORD_NOTFOUND)
71270 +
71271 +typedef int (*tree_iterate_actor_t) (reiser4_tree * tree, coord_t * coord,
71272 + lock_handle * lh, void *arg);
71273 +extern int reiser4_iterate_tree(reiser4_tree * tree, coord_t * coord,
71274 + lock_handle * lh,
71275 + tree_iterate_actor_t actor, void *arg,
71276 + znode_lock_mode mode, int through_units_p);
71277 +extern int get_uber_znode(reiser4_tree * tree, znode_lock_mode mode,
71278 + znode_lock_request pri, lock_handle * lh);
71279 +
71280 +/* return node plugin of @node */
71281 +static inline node_plugin *node_plugin_by_node(const znode *
71282 + node /* node to query */ )
71283 +{
71284 + assert("vs-213", node != NULL);
71285 + assert("vs-214", znode_is_loaded(node));
71286 +
71287 + return node->nplug;
71288 +}
71289 +
71290 +/* number of items in @node */
71291 +static inline pos_in_node_t node_num_items(const znode * node)
71292 +{
71293 + assert("nikita-2754", znode_is_loaded(node));
71294 + assert("nikita-2468",
71295 + node_plugin_by_node(node)->num_of_items(node) == node->nr_items);
71296 +
71297 + return node->nr_items;
71298 +}
71299 +
71300 +/* Return the number of items at the present node. Asserts coord->node !=
71301 + NULL. */
71302 +static inline unsigned coord_num_items(const coord_t * coord)
71303 +{
71304 + assert("jmacd-9805", coord->node != NULL);
71305 +
71306 + return node_num_items(coord->node);
71307 +}
71308 +
71309 +/* true if @node is empty */
71310 +static inline int node_is_empty(const znode * node)
71311 +{
71312 + return node_num_items(node) == 0;
71313 +}
71314 +
71315 +typedef enum {
71316 + SHIFTED_SOMETHING = 0,
71317 + SHIFT_NO_SPACE = -E_NODE_FULL,
71318 + SHIFT_IO_ERROR = -EIO,
71319 + SHIFT_OOM = -ENOMEM,
71320 +} shift_result;
71321 +
71322 +extern node_plugin *node_plugin_by_coord(const coord_t * coord);
71323 +extern int is_coord_in_node(const coord_t * coord);
71324 +extern int key_in_node(const reiser4_key *, const coord_t *);
71325 +extern void coord_item_move_to(coord_t * coord, int items);
71326 +extern void coord_unit_move_to(coord_t * coord, int units);
71327 +
71328 +/* there are two types of repetitive accesses (ra): intra-syscall
71329 + (local) and inter-syscall (global). Local ra is used when
71330 + during single syscall we add/delete several items and units in the
71331 + same place in a tree. Note that plan-A fragments local ra by
71332 + separating stat-data and file body in key-space. Global ra is
71333 + used when user does repetitive modifications in the same place in a
71334 + tree.
71335 +
71336 + Our ra implementation serves the following purposes:
71337 + 1 it affects balancing decisions so that the next operation in a row
71338 + can be performed faster;
71339 + 2 it affects lower-level read-ahead in the page-cache;
71340 + 3 it allows us to avoid unnecessary lookups by maintaining some state
71341 + across several operations (this is only for local ra);
71342 + 4 it leaves room for lazy micro-balancing: when we start a sequence of
71343 + operations they are performed without actually doing any intra-node
71344 + shifts, until we finish the sequence or its scope leaves the
71345 + current node; only then do we really pack the node (local ra only).
71346 +*/
71347 +
71348 +/* another thing that can be useful is to keep per-tree and/or
71349 + per-process cache of recent lookups. This cache can be organised as a
71350 + list of block numbers of formatted nodes sorted by starting key in
71351 + this node. Balancings should invalidate appropriate parts of this
71352 + cache.
71353 +*/
71354 +
71355 +lookup_result coord_by_key(reiser4_tree * tree, const reiser4_key * key,
71356 + coord_t * coord, lock_handle * handle,
71357 + znode_lock_mode lock, lookup_bias bias,
71358 + tree_level lock_level, tree_level stop_level,
71359 + __u32 flags, ra_info_t *);
71360 +
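+/* A minimal lookup sketch (hypothetical caller): find @key, long-term
+   lock the leaf holding it, and release the lock when done. FIND_EXACT
+   and LEAF_LEVEL are assumed to be the desired bias and levels:
+
+	coord_t coord;
+	lock_handle lh;
+	lookup_result ret;
+
+	init_lh(&lh);
+	ret = coord_by_key(current_tree, &key, &coord, &lh,
+			   ZNODE_WRITE_LOCK, FIND_EXACT,
+			   LEAF_LEVEL, LEAF_LEVEL, CBK_UNIQUE, NULL);
+	if (ret == CBK_COORD_FOUND) {
+		(use coord here)
+	}
+	done_lh(&lh);
+*/
+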
71361 +lookup_result reiser4_object_lookup(struct inode *object,
71362 + const reiser4_key * key,
71363 + coord_t * coord,
71364 + lock_handle * lh,
71365 + znode_lock_mode lock_mode,
71366 + lookup_bias bias,
71367 + tree_level lock_level,
71368 + tree_level stop_level,
71369 + __u32 flags, ra_info_t * info);
71370 +
71371 +insert_result insert_by_key(reiser4_tree * tree, const reiser4_key * key,
71372 + reiser4_item_data * data, coord_t * coord,
71373 + lock_handle * lh,
71374 + tree_level stop_level, __u32 flags);
71375 +insert_result insert_by_coord(coord_t * coord,
71376 + reiser4_item_data * data, const reiser4_key * key,
71377 + lock_handle * lh, __u32);
71378 +insert_result insert_extent_by_coord(coord_t * coord,
71379 + reiser4_item_data * data,
71380 + const reiser4_key * key, lock_handle * lh);
71381 +int cut_node_content(coord_t * from, coord_t * to, const reiser4_key * from_key,
71382 + const reiser4_key * to_key,
71383 + reiser4_key * smallest_removed);
71384 +int kill_node_content(coord_t * from, coord_t * to,
71385 + const reiser4_key * from_key, const reiser4_key * to_key,
71386 + reiser4_key * smallest_removed,
71387 + znode * locked_left_neighbor, struct inode *inode,
71388 + int truncate);
71389 +
71390 +int reiser4_resize_item(coord_t * coord, reiser4_item_data * data,
71391 + reiser4_key * key, lock_handle * lh, cop_insert_flag);
71392 +int insert_into_item(coord_t * coord, lock_handle * lh, const reiser4_key * key,
71393 + reiser4_item_data * data, unsigned);
71394 +int reiser4_insert_flow(coord_t * coord, lock_handle * lh, flow_t * f);
71395 +int find_new_child_ptr(znode * parent, znode * child, znode * left,
71396 + coord_t * result);
71397 +
71398 +int shift_right_of_but_excluding_insert_coord(coord_t * insert_coord);
71399 +int shift_left_of_and_including_insert_coord(coord_t * insert_coord);
71400 +
71401 +void fake_kill_hook_tail(struct inode *, loff_t start, loff_t end, int);
71402 +
71403 +extern int cut_tree_worker_common(tap_t *, const reiser4_key *,
71404 + const reiser4_key *, reiser4_key *,
71405 + struct inode *, int, int *);
71406 +extern int reiser4_cut_tree_object(reiser4_tree *, const reiser4_key *,
71407 + const reiser4_key *, reiser4_key *,
71408 + struct inode *, int, int *);
71409 +extern int reiser4_cut_tree(reiser4_tree * tree, const reiser4_key * from,
71410 + const reiser4_key * to, struct inode *, int);
71411 +
71412 +extern int reiser4_delete_node(znode *, reiser4_key *, struct inode *, int);
71413 +extern int check_tree_pointer(const coord_t * pointer, const znode * child);
71414 +extern int find_new_child_ptr(znode * parent, znode * child UNUSED_ARG,
71415 + znode * left, coord_t * result);
71416 +extern int find_child_ptr(znode * parent, znode * child, coord_t * result);
71417 +extern int set_child_delimiting_keys(znode * parent, const coord_t * in_parent,
71418 + znode * child);
71419 +extern znode *child_znode(const coord_t * in_parent, znode * parent,
71420 + int incore_p, int setup_dkeys_p);
71421 +
71422 +extern int cbk_cache_init(cbk_cache * cache);
71423 +extern void cbk_cache_done(cbk_cache * cache);
71424 +extern void cbk_cache_invalidate(const znode * node, reiser4_tree * tree);
71425 +
71426 +extern char *sprint_address(const reiser4_block_nr * block);
71427 +
71428 +#if REISER4_DEBUG
71429 +extern void print_coord_content(const char *prefix, coord_t * p);
71430 +extern void reiser4_print_address(const char *prefix,
71431 + const reiser4_block_nr * block);
71432 +extern void print_tree_rec(const char *prefix, reiser4_tree * tree,
71433 + __u32 flags);
71434 +extern void check_dkeys(znode *node);
71435 +#else
71436 +#define print_coord_content(p, c) noop
71437 +#define reiser4_print_address(p, b) noop
71438 +#endif
71439 +
71440 +extern void forget_znode(lock_handle * handle);
71441 +extern int deallocate_znode(znode * node);
71442 +
71443 +extern int is_disk_addr_unallocated(const reiser4_block_nr * addr);
71444 +
71445 +/* struct used internally to pack the numerous arguments of a tree lookup.
71446 + Used to avoid passing a lot of arguments to helper functions. */
71447 +typedef struct cbk_handle {
71448 + /* tree we are in */
71449 + reiser4_tree *tree;
71450 + /* key we are going after */
71451 + const reiser4_key *key;
71452 + /* coord we will store result in */
71453 + coord_t *coord;
71454 + /* type of lock to take on target node */
71455 + znode_lock_mode lock_mode;
71456 + /* lookup bias. See comments at the declaration of lookup_bias */
71457 + lookup_bias bias;
71458 + /* lock level: level starting from which tree traversal starts taking
71459 + * write locks. */
71460 + tree_level lock_level;
71461 + /* level where search will stop. Either item will be found between
71462 + lock_level and stop_level, or CBK_COORD_NOTFOUND will be
71463 + returned.
71464 + */
71465 + tree_level stop_level;
71466 + /* level we are currently at */
71467 + tree_level level;
71468 + /* block number of @active node. Tree traversal operates on two
71469 + nodes: active and parent. */
71470 + reiser4_block_nr block;
71471 + /* put here error message to be printed by caller */
71472 + const char *error;
71473 + /* result passed back to caller */
71474 + lookup_result result;
71475 + /* lock handles for active and parent */
71476 + lock_handle *parent_lh;
71477 + lock_handle *active_lh;
71478 + reiser4_key ld_key;
71479 + reiser4_key rd_key;
71480 + /* flags, passed to the cbk routine. Bits of this bitmask are defined
71481 + in tree.h:cbk_flags enum. */
71482 + __u32 flags;
71483 + ra_info_t *ra_info;
71484 + struct inode *object;
71485 +} cbk_handle;
71486 +
71487 +extern znode_lock_mode cbk_lock_mode(tree_level level, cbk_handle * h);
71488 +
71489 +/* eottl.c */
71490 +extern int handle_eottl(cbk_handle *h, int *outcome);
71491 +
71492 +int lookup_multikey(cbk_handle * handle, int nr_keys);
71493 +int lookup_couple(reiser4_tree * tree,
71494 + const reiser4_key * key1, const reiser4_key * key2,
71495 + coord_t * coord1, coord_t * coord2,
71496 + lock_handle * lh1, lock_handle * lh2,
71497 + znode_lock_mode lock_mode, lookup_bias bias,
71498 + tree_level lock_level, tree_level stop_level, __u32 flags,
71499 + int *result1, int *result2);
71500 +
71501 +static inline void read_lock_tree(reiser4_tree *tree)
71502 +{
71503 + /* check that tree is not locked */
71504 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
71505 + LOCK_CNT_NIL(read_locked_tree) &&
71506 + LOCK_CNT_NIL(write_locked_tree)));
71507 + /* check that spinlocks of lower priorities are not held */
71508 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
71509 + LOCK_CNT_NIL(rw_locked_dk) &&
71510 + LOCK_CNT_NIL(spin_locked_stack)));
71511 +
71512 + read_lock(&(tree->tree_lock));
71513 +
71514 + LOCK_CNT_INC(read_locked_tree);
71515 + LOCK_CNT_INC(rw_locked_tree);
71516 + LOCK_CNT_INC(spin_locked);
71517 +}
71518 +
71519 +static inline void read_unlock_tree(reiser4_tree *tree)
71520 +{
71521 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_tree));
71522 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
71523 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71524 +
71525 + LOCK_CNT_DEC(read_locked_tree);
71526 + LOCK_CNT_DEC(rw_locked_tree);
71527 + LOCK_CNT_DEC(spin_locked);
71528 +
71529 + read_unlock(&(tree->tree_lock));
71530 +}
71531 +
71532 +static inline void write_lock_tree(reiser4_tree *tree)
71533 +{
71534 + /* check that tree is not locked */
71535 + assert("", (LOCK_CNT_NIL(rw_locked_tree) &&
71536 + LOCK_CNT_NIL(read_locked_tree) &&
71537 + LOCK_CNT_NIL(write_locked_tree)));
71538 + /* check that spinlocks of lower priorities are not held */
71539 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
71540 + LOCK_CNT_NIL(rw_locked_dk) &&
71541 + LOCK_CNT_NIL(spin_locked_stack)));
71542 +
71543 + write_lock(&(tree->tree_lock));
71544 +
71545 + LOCK_CNT_INC(write_locked_tree);
71546 + LOCK_CNT_INC(rw_locked_tree);
71547 + LOCK_CNT_INC(spin_locked);
71548 +}
71549 +
71550 +static inline void write_unlock_tree(reiser4_tree *tree)
71551 +{
71552 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_tree));
71553 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_tree));
71554 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71555 +
71556 + LOCK_CNT_DEC(write_locked_tree);
71557 + LOCK_CNT_DEC(rw_locked_tree);
71558 + LOCK_CNT_DEC(spin_locked);
71559 +
71560 + write_unlock(&(tree->tree_lock));
71561 +}
71562 +
71563 +static inline void read_lock_dk(reiser4_tree *tree)
71564 +{
71565 + /* check that dk is not locked */
71566 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71567 + LOCK_CNT_NIL(read_locked_dk) &&
71568 + LOCK_CNT_NIL(write_locked_dk)));
71569 + /* check that spinlocks of lower priorities are not held */
71570 + assert("", LOCK_CNT_NIL(spin_locked_stack));
71571 +
71572 + read_lock(&((tree)->dk_lock));
71573 +
71574 + LOCK_CNT_INC(read_locked_dk);
71575 + LOCK_CNT_INC(rw_locked_dk);
71576 + LOCK_CNT_INC(spin_locked);
71577 +}
71578 +
71579 +static inline void read_unlock_dk(reiser4_tree *tree)
71580 +{
71581 + assert("nikita-1375", LOCK_CNT_GTZ(read_locked_dk));
71582 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71583 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71584 +
71585 + LOCK_CNT_DEC(read_locked_dk);
71586 + LOCK_CNT_DEC(rw_locked_dk);
71587 + LOCK_CNT_DEC(spin_locked);
71588 +
71589 + read_unlock(&(tree->dk_lock));
71590 +}
71591 +
71592 +static inline void write_lock_dk(reiser4_tree *tree)
71593 +{
71594 + /* check that dk is not locked */
71595 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
71596 + LOCK_CNT_NIL(read_locked_dk) &&
71597 + LOCK_CNT_NIL(write_locked_dk)));
71598 + /* check that spinlocks of lower priorities are not held */
71599 + assert("", LOCK_CNT_NIL(spin_locked_stack));
71600 +
71601 + write_lock(&((tree)->dk_lock));
71602 +
71603 + LOCK_CNT_INC(write_locked_dk);
71604 + LOCK_CNT_INC(rw_locked_dk);
71605 + LOCK_CNT_INC(spin_locked);
71606 +}
71607 +
71608 +static inline void write_unlock_dk(reiser4_tree *tree)
71609 +{
71610 + assert("nikita-1375", LOCK_CNT_GTZ(write_locked_dk));
71611 + assert("nikita-1376", LOCK_CNT_GTZ(rw_locked_dk));
71612 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
71613 +
71614 + LOCK_CNT_DEC(write_locked_dk);
71615 + LOCK_CNT_DEC(rw_locked_dk);
71616 + LOCK_CNT_DEC(spin_locked);
71617 +
71618 + write_unlock(&(tree->dk_lock));
71619 +}
71620 +
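+/* The assertions above encode the lock ordering: the tree lock may be
+   taken first, then the dk lock, then lower-priority spinlocks, as in
+   reiser4_delete_node():
+
+	read_lock_tree(tree);
+	write_lock_dk(tree);
+	(update delimiting keys)
+	write_unlock_dk(tree);
+	read_unlock_tree(tree);
+*/
+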
71621 +/* estimate api. Implementation is in estimate.c */
71622 +reiser4_block_nr estimate_one_insert_item(reiser4_tree *);
71623 +reiser4_block_nr estimate_one_insert_into_item(reiser4_tree *);
71624 +reiser4_block_nr estimate_insert_flow(tree_level);
71625 +reiser4_block_nr estimate_one_item_removal(reiser4_tree *);
71626 +reiser4_block_nr calc_estimate_one_insert(tree_level);
71627 +reiser4_block_nr estimate_dirty_cluster(struct inode *);
71628 +reiser4_block_nr estimate_insert_cluster(struct inode *);
71629 +reiser4_block_nr estimate_update_cluster(struct inode *);
71630 +
71631 +/* __REISER4_TREE_H__ */
71632 +#endif
71633 +
71634 +/* Make Linus happy.
71635 + Local variables:
71636 + c-indentation-style: "K&R"
71637 + mode-name: "LC"
71638 + c-basic-offset: 8
71639 + tab-width: 8
71640 + fill-column: 120
71641 + scroll-step: 1
71642 + End:
71643 +*/
71644 diff --git a/fs/reiser4/tree_mod.c b/fs/reiser4/tree_mod.c
71645 new file mode 100644
71646 index 0000000..bcc6548
71647 --- /dev/null
71648 +++ b/fs/reiser4/tree_mod.c
71649 @@ -0,0 +1,386 @@
71650 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
71651 + * reiser4/README */
71652 +
71653 +/*
71654 + * Functions to add/delete new nodes to/from the tree.
71655 + *
71656 + * Functions from this file are used by carry (see carry*) to handle:
71657 + *
71658 + * . insertion of new formatted node into tree
71659 + *
71660 + * . addition of new tree root, increasing tree height
71661 + *
71662 + * . removing tree root, decreasing tree height
71663 + *
71664 + */
71665 +
71666 +#include "forward.h"
71667 +#include "debug.h"
71668 +#include "dformat.h"
71669 +#include "key.h"
71670 +#include "coord.h"
71671 +#include "plugin/plugin.h"
71672 +#include "jnode.h"
71673 +#include "znode.h"
71674 +#include "tree_mod.h"
71675 +#include "block_alloc.h"
71676 +#include "tree_walk.h"
71677 +#include "tree.h"
71678 +#include "super.h"
71679 +
71680 +#include <linux/err.h>
71681 +
71682 +static int add_child_ptr(znode * parent, znode * child);
71683 +/* warning only issued if error is not -E_REPEAT */
71684 +#define ewarning( error, ... ) \
71685 + if( ( error ) != -E_REPEAT ) \
71686 + warning( __VA_ARGS__ )
71687 +
71688 +/* allocate new node on the @level and immediately on the right of @brother. */
71689 +znode * reiser4_new_node(znode * brother /* existing left neighbor
71690 + * of new node */,
71691 + tree_level level /* tree level at which new node is to
71692 + * be allocated */)
71693 +{
71694 + znode *result;
71695 + int retcode;
71696 + reiser4_block_nr blocknr;
71697 +
71698 + assert("nikita-930", brother != NULL);
71699 + assert("umka-264", level < REAL_MAX_ZTREE_HEIGHT);
71700 +
71701 + retcode = assign_fake_blocknr_formatted(&blocknr);
71702 + if (retcode == 0) {
71703 + result =
71704 + zget(znode_get_tree(brother), &blocknr, NULL, level,
71705 + reiser4_ctx_gfp_mask_get());
71706 + if (IS_ERR(result)) {
71707 + ewarning(PTR_ERR(result), "nikita-929",
71708 + "Cannot allocate znode for carry: %li",
71709 + PTR_ERR(result));
71710 + return result;
71711 + }
71712 + /* cheap test, can be executed even when debugging is off */
71713 + if (!znode_just_created(result)) {
71714 + warning("nikita-2213",
71715 + "Allocated already existing block: %llu",
71716 + (unsigned long long)blocknr);
71717 + zput(result);
71718 + return ERR_PTR(RETERR(-EIO));
71719 + }
71720 +
71721 + assert("nikita-931", result != NULL);
71722 + result->nplug = znode_get_tree(brother)->nplug;
71723 + assert("nikita-933", result->nplug != NULL);
71724 +
71725 + retcode = zinit_new(result, reiser4_ctx_gfp_mask_get());
71726 + if (retcode == 0) {
71727 + ZF_SET(result, JNODE_CREATED);
71728 + zrelse(result);
71729 + } else {
71730 + zput(result);
71731 + result = ERR_PTR(retcode);
71732 + }
71733 + } else {
71734 + /* failure to allocate new node during balancing.
71735 + This should never happen. Ever. Returning -E_REPEAT
71736 + is not a viable solution, because "out of disk space"
71737 + is not a transient error that will go away by itself.
71738 + */
71739 + ewarning(retcode, "nikita-928",
71740 + "Cannot allocate block for carry: %i", retcode);
71741 + result = ERR_PTR(retcode);
71742 + }
71743 + assert("nikita-1071", result != NULL);
71744 + return result;
71745 +}
71746 +
71747 +/* allocate new root and add it to the tree
71748 +
71749 + This helper function is called by add_new_root().
71750 +
71751 +*/
71752 +znode *reiser4_add_tree_root(znode * old_root /* existing tree root */ ,
71753 + znode * fake /* "fake" znode */ )
71754 +{
71755 + reiser4_tree *tree = znode_get_tree(old_root);
71756 + znode *new_root = NULL; /* to shut gcc up */
71757 + int result;
71758 +
71759 + assert("nikita-1069", old_root != NULL);
71760 + assert("umka-262", fake != NULL);
71761 + assert("umka-263", tree != NULL);
71762 +
71763 + /* "fake" znode---one always hanging just above current root. This
71764 + node is locked when new root is created or existing root is
71765 + deleted. Downward tree traversal takes lock on it before taking
71766 + lock on a root node. This avoids race conditions with root
71767 + manipulations.
71768 +
71769 + */
71770 + assert("nikita-1348", znode_above_root(fake));
71771 + assert("nikita-1211", znode_is_root(old_root));
71772 +
71773 + result = 0;
71774 + if (tree->height >= REAL_MAX_ZTREE_HEIGHT) {
71775 + warning("nikita-1344", "Tree is too tall: %i", tree->height);
71776 + /* ext2 returns -ENOSPC when it runs out of free inodes with a
71777 + following comment (fs/ext2/ialloc.c:441): Is it really
71778 + ENOSPC?
71779 +
71780 + -EXFULL? -EINVAL?
71781 + */
71782 + result = RETERR(-ENOSPC);
71783 + } else {
71784 + /* Allocate a block for the new root. It's not that
71785 + important where it will be allocated, as the root is
71786 + almost always in memory. Moreover, allocate-on-
71787 + flush can be going on here.
71788 + */
71789 + assert("nikita-1448", znode_is_root(old_root));
71790 + new_root = reiser4_new_node(fake, tree->height + 1);
71791 + if (!IS_ERR(new_root) && (result = zload(new_root)) == 0) {
71792 + lock_handle rlh;
71793 +
71794 + init_lh(&rlh);
71795 + result =
71796 + longterm_lock_znode(&rlh, new_root,
71797 + ZNODE_WRITE_LOCK,
71798 + ZNODE_LOCK_LOPRI);
71799 + if (result == 0) {
71800 + parent_coord_t *in_parent;
71801 +
71802 + znode_make_dirty(fake);
71803 +
71804 + /* new root is a child of "fake" node */
71805 + write_lock_tree(tree);
71806 +
71807 + ++tree->height;
71808 +
71809 + /* recalculate max balance overhead */
71810 + tree->estimate_one_insert =
71811 + estimate_one_insert_item(tree);
71812 +
71813 + tree->root_block = *znode_get_block(new_root);
71814 + in_parent = &new_root->in_parent;
71815 + init_parent_coord(in_parent, fake);
71816 + /* manually insert new root into sibling
71817 + * list. With this, all nodes involved in
71818 + * balancing are connected after balancing is
71819 + * done---useful invariant to check. */
71820 + sibling_list_insert_nolock(new_root, NULL);
71821 + write_unlock_tree(tree);
71822 +
71823 + /* insert into new root pointer to the
71824 + @old_root. */
71825 + assert("nikita-1110",
71826 + WITH_DATA(new_root,
71827 + node_is_empty(new_root)));
71828 + write_lock_dk(tree);
71829 + znode_set_ld_key(new_root, reiser4_min_key());
71830 + znode_set_rd_key(new_root, reiser4_max_key());
71831 + write_unlock_dk(tree);
71832 + if (REISER4_DEBUG) {
71833 + ZF_CLR(old_root, JNODE_LEFT_CONNECTED);
71834 + ZF_CLR(old_root, JNODE_RIGHT_CONNECTED);
71835 + ZF_SET(old_root, JNODE_ORPHAN);
71836 + }
71837 + result = add_child_ptr(new_root, old_root);
71838 + done_lh(&rlh);
71839 + }
71840 + zrelse(new_root);
71841 + }
71842 + }
71843 + if (result != 0)
71844 + new_root = ERR_PTR(result);
71845 + return new_root;
71846 +}
71847 +
71848 +/* build &reiser4_item_data for inserting child pointer
71849 +
71850 + Build &reiser4_item_data that can be later used to insert pointer to @child
71851 + in its parent.
71852 +
71853 +*/
71854 +void build_child_ptr_data(znode * child /* node pointer to which will be
71855 + * inserted */ ,
71856 + reiser4_item_data * data /* where to store result */ )
71857 +{
71858 + assert("nikita-1116", child != NULL);
71859 + assert("nikita-1117", data != NULL);
71860 +
71861 + /*
71862 + * NOTE: use address of child's blocknr as address of data to be
71863 + * inserted. As a result, the data gets into the on-disk structure in cpu
71864 + * byte order. internal's create_hook converts it to little endian byte
71865 + * order.
71866 + */
71867 + data->data = (char *)znode_get_block(child);
71868 + /* data -> data is kernel space */
71869 + data->user = 0;
71870 + data->length = sizeof(reiser4_block_nr);
71871 + /* FIXME-VS: hardcoded internal item? */
71872 +
71873 + /* AUDIT: Is it possible that "item_plugin_by_id" may find nothing? */
71874 + data->iplug = item_plugin_by_id(NODE_POINTER_ID);
71875 +}
71876 +
71877 +/* add pointer to @child into empty @parent.
71878 +
71879 + This is used when pointer to old root is inserted into new root which is
71880 + empty.
71881 +*/
71882 +static int add_child_ptr(znode * parent, znode * child)
71883 +{
71884 + coord_t coord;
71885 + reiser4_item_data data;
71886 + int result;
71887 + reiser4_key key;
71888 +
71889 + assert("nikita-1111", parent != NULL);
71890 + assert("nikita-1112", child != NULL);
71891 + assert("nikita-1115",
71892 + znode_get_level(parent) == znode_get_level(child) + 1);
71893 +
71894 + result = zload(parent);
71895 + if (result != 0)
71896 + return result;
71897 + assert("nikita-1113", node_is_empty(parent));
71898 + coord_init_first_unit(&coord, parent);
71899 +
71900 + build_child_ptr_data(child, &data);
71901 + data.arg = NULL;
71902 +
71903 + read_lock_dk(znode_get_tree(parent));
71904 + key = *znode_get_ld_key(child);
71905 + read_unlock_dk(znode_get_tree(parent));
71906 +
71907 + result = node_plugin_by_node(parent)->create_item(&coord, &key, &data,
71908 + NULL);
71909 + znode_make_dirty(parent);
71910 + zrelse(parent);
71911 + return result;
71912 +}
71913 +
71914 +/* actually remove tree root */
71915 +static int reiser4_kill_root(reiser4_tree * tree /* tree from which root is
71916 + * being removed */,
71917 + znode * old_root /* root node that is being
71918 + * removed */ ,
71919 + znode * new_root /* new root---sole child of
71920 + * @old_root */,
71921 + const reiser4_block_nr * new_root_blk /* disk address of
71922 + * @new_root */)
71923 +{
71924 + znode *uber;
71925 + int result;
71926 + lock_handle handle_for_uber;
71927 +
71928 + assert("umka-265", tree != NULL);
71929 + assert("nikita-1198", new_root != NULL);
71930 + assert("nikita-1199",
71931 + znode_get_level(new_root) + 1 == znode_get_level(old_root));
71932 +
71933 + assert("nikita-1201", znode_is_write_locked(old_root));
71934 +
71935 + assert("nikita-1203",
71936 + disk_addr_eq(new_root_blk, znode_get_block(new_root)));
71937 +
71938 + init_lh(&handle_for_uber);
71939 + /* obtain and lock "fake" znode protecting changes in tree height. */
71940 + result = get_uber_znode(tree, ZNODE_WRITE_LOCK, ZNODE_LOCK_HIPRI,
71941 + &handle_for_uber);
71942 + if (result == 0) {
71943 + uber = handle_for_uber.node;
71944 +
71945 + znode_make_dirty(uber);
71946 +
71947 +		/* don't take a long-term lock on @new_root; take the spinlock. */
71948 +
71949 + write_lock_tree(tree);
71950 +
71951 + tree->root_block = *new_root_blk;
71952 + --tree->height;
71953 +
71954 + /* recalculate max balance overhead */
71955 + tree->estimate_one_insert = estimate_one_insert_item(tree);
71956 +
71957 + assert("nikita-1202",
71958 + tree->height == znode_get_level(new_root));
71959 +
71960 + /* new root is child on "fake" node */
71961 + init_parent_coord(&new_root->in_parent, uber);
71962 + ++uber->c_count;
71963 +
71964 + /* sibling_list_insert_nolock(new_root, NULL); */
71965 + write_unlock_tree(tree);
71966 +
71967 + /* reinitialise old root. */
71968 + result = node_plugin_by_node(old_root)->init(old_root);
71969 + znode_make_dirty(old_root);
71970 + if (result == 0) {
71971 + assert("nikita-1279", node_is_empty(old_root));
71972 + ZF_SET(old_root, JNODE_HEARD_BANSHEE);
71973 + old_root->c_count = 0;
71974 + }
71975 + }
71976 + done_lh(&handle_for_uber);
71977 +
71978 + return result;
71979 +}
71980 +
71981 +/* remove tree root
71982 +
71983 +   This function removes the tree root, decreasing the tree height by one.
71984 +   The tree root and its only child (which is going to become the new tree
71985 +   root) are write locked on entry.
71986 +
71987 +   To remove the tree root we need to take a lock on the special "fake" znode
71988 +   that protects changes of the tree height. See comments in
71989 +   reiser4_add_tree_root() for more on this.
71990 +
71991 +   Also, parent pointers have to be updated in the old and the new root. To
71992 +   simplify the code, the function is split into two parts: the outer
71993 +   reiser4_kill_tree_root() collects all necessary arguments and calls
71994 +   reiser4_kill_root() to do the actual job.
71995 +
71996 +*/
71997 +int reiser4_kill_tree_root(znode * old_root /* tree root that we are
71998 + removing*/)
71999 +{
72000 + int result;
72001 + coord_t down_link;
72002 + znode *new_root;
72003 + reiser4_tree *tree;
72004 +
72005 + assert("umka-266", current_tree != NULL);
72006 + assert("nikita-1194", old_root != NULL);
72007 + assert("nikita-1196", znode_is_root(old_root));
72008 + assert("nikita-1200", node_num_items(old_root) == 1);
72009 + assert("nikita-1401", znode_is_write_locked(old_root));
72010 +
72011 + coord_init_first_unit(&down_link, old_root);
72012 +
72013 + tree = znode_get_tree(old_root);
72014 + new_root = child_znode(&down_link, old_root, 0, 1);
72015 + if (!IS_ERR(new_root)) {
72016 + result =
72017 + reiser4_kill_root(tree, old_root, new_root,
72018 + znode_get_block(new_root));
72019 + zput(new_root);
72020 + } else
72021 + result = PTR_ERR(new_root);
72022 +
72023 + return result;
72024 +}
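+
+#if 0
+/* Editor's sketch, not part of the original patch: the height-decreasing
+ * step above, reduced to a toy in-memory tree with no locking, no "fake"
+ * znode and no disk addresses. All names are hypothetical. */
+struct toy_node {
+	struct toy_node *child[16];
+	int nr_items;
+};
+
+struct toy_tree {
+	struct toy_node *root;
+	int height;
+};
+
+static struct toy_node *toy_kill_root(struct toy_tree *t)
+{
+	struct toy_node *old = t->root;
+
+	if (old == NULL || old->nr_items != 1)
+		return NULL;		/* only a single-child root dies */
+	t->root = old->child[0];	/* sole child becomes the new root */
+	t->height--;
+	return old;			/* caller reinitializes/frees it */
+}
+#endif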
72025 +
72026 +/* Make Linus happy.
72027 + Local variables:
72028 + c-indentation-style: "K&R"
72029 + mode-name: "LC"
72030 + c-basic-offset: 8
72031 + tab-width: 8
72032 + fill-column: 120
72033 + scroll-step: 1
72034 + End:
72035 +*/
72036 diff --git a/fs/reiser4/tree_mod.h b/fs/reiser4/tree_mod.h
72037 new file mode 100644
72038 index 0000000..1519641
72039 --- /dev/null
72040 +++ b/fs/reiser4/tree_mod.h
72041 @@ -0,0 +1,29 @@
72042 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
72043 + * reiser4/README */
72044 +
72045 +/* Functions to add/delete new nodes to/from the tree. See tree_mod.c for
72046 + * comments. */
72047 +
72048 +#if !defined( __REISER4_TREE_MOD_H__ )
72049 +#define __REISER4_TREE_MOD_H__
72050 +
72051 +#include "forward.h"
72052 +
72053 +znode *reiser4_new_node(znode * brother, tree_level level);
72054 +znode *reiser4_add_tree_root(znode * old_root, znode * fake);
72055 +int reiser4_kill_tree_root(znode * old_root);
72056 +void build_child_ptr_data(znode * child, reiser4_item_data * data);
72057 +
72058 +/* __REISER4_TREE_MOD_H__ */
72059 +#endif
72060 +
72061 +/* Make Linus happy.
72062 + Local variables:
72063 + c-indentation-style: "K&R"
72064 + mode-name: "LC"
72065 + c-basic-offset: 8
72066 + tab-width: 8
72067 + fill-column: 120
72068 + scroll-step: 1
72069 + End:
72070 +*/
72071 diff --git a/fs/reiser4/tree_walk.c b/fs/reiser4/tree_walk.c
72072 new file mode 100644
72073 index 0000000..cde4875
72074 --- /dev/null
72075 +++ b/fs/reiser4/tree_walk.c
72076 @@ -0,0 +1,927 @@
72077 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
72078 + * reiser4/README */
72079 +
72080 +/* Routines and macros to:
72081 +
72082 + get_left_neighbor()
72083 +
72084 + get_right_neighbor()
72085 +
72086 + get_parent()
72087 +
72088 + get_first_child()
72089 +
72090 + get_last_child()
72091 +
72092 + various routines to walk the whole tree and do things to it like
72093 + repack it, or move it to tertiary storage. Please make them as
72094 + generic as is reasonable.
72095 +
72096 +*/
72097 +
72098 +#include "forward.h"
72099 +#include "debug.h"
72100 +#include "dformat.h"
72101 +#include "coord.h"
72102 +#include "plugin/item/item.h"
72103 +#include "jnode.h"
72104 +#include "znode.h"
72105 +#include "tree_walk.h"
72106 +#include "tree.h"
72107 +#include "super.h"
72108 +
72109 +/* These macros are used internally in tree_walk.c in an attempt to make the
72110 +   lock_neighbor() code reusable for building lock_parent(),
72111 +   lock_right_neighbor() and lock_left_neighbor() */
72112 +#define GET_NODE_BY_PTR_OFFSET(node, off) (*(znode**)(((unsigned long)(node)) + (off)))
72113 +#define FIELD_OFFSET(name) offsetof(znode, name)
72114 +#define PARENT_PTR_OFFSET FIELD_OFFSET(in_parent.node)
72115 +#define LEFT_PTR_OFFSET FIELD_OFFSET(left)
72116 +#define RIGHT_PTR_OFFSET FIELD_OFFSET(right)
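+
+#if 0
+/* Editor's sketch, not part of the original patch: the offsetof()
+ * parameterization used above, shown on a standalone user-space toy
+ * structure. One accessor body serves the left, right and parent lookups.
+ * All names are hypothetical. */
+#include <stddef.h>
+
+struct toy {
+	struct toy *left;
+	struct toy *right;
+	struct toy *parent;
+};
+
+#define TOY_BY_PTR_OFFSET(obj, off) \
+	(*(struct toy **)((char *)(obj) + (off)))
+
+static struct toy *toy_neighbor(struct toy *t, size_t ptr_offset)
+{
+	return TOY_BY_PTR_OFFSET(t, ptr_offset);
+}
+
+/* toy_neighbor(t, offsetof(struct toy, left)) returns t->left, and the same
+ * function body serves offsetof(struct toy, right) and
+ * offsetof(struct toy, parent). */
+#endif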
72117 +
72118 +/* This is the generic procedure to get and lock a `generic' neighbor (left
72119 +    or right neighbor, or parent). It implements the common algorithm for all
72120 +    cases of taking a lock on a neighbor node; only the znode structure field
72121 +    differs in each case. This is parameterized by the ptr_offset argument:
72122 +    the byte offset of the pointer to the desired neighbor within the
72123 +    current node's znode structure. Call this with the tree lock held */
72124 +static int lock_neighbor(
72125 + /* resulting lock handle */
72126 + lock_handle * result,
72127 + /* znode to lock */
72128 + znode * node,
72129 + /* pointer to neighbor (or parent) znode field offset, in bytes from
72130 + the base address of znode structure */
72131 + int ptr_offset,
72132 + /* lock mode for longterm_lock_znode call */
72133 + znode_lock_mode mode,
72134 + /* lock request for longterm_lock_znode call */
72135 + znode_lock_request req,
72136 + /* GN_* flags */
72137 + int flags, int rlocked)
72138 +{
72139 + reiser4_tree *tree = znode_get_tree(node);
72140 + znode *neighbor;
72141 + int ret;
72142 +
72143 + assert("umka-236", node != NULL);
72144 + assert("umka-237", tree != NULL);
72145 + assert_rw_locked(&(tree->tree_lock));
72146 +
72147 + if (flags & GN_TRY_LOCK)
72148 + req |= ZNODE_LOCK_NONBLOCK;
72149 + if (flags & GN_SAME_ATOM)
72150 + req |= ZNODE_LOCK_DONT_FUSE;
72151 +
72152 +	/* get the neighbor's address using the sibling link; quit the while
72153 +	   loop (and return) if the link is not available. */
72154 + while (1) {
72155 + neighbor = GET_NODE_BY_PTR_OFFSET(node, ptr_offset);
72156 +
72157 + /* return -E_NO_NEIGHBOR if parent or side pointer is NULL or if
72158 + * node pointed by it is not connected.
72159 + *
72160 + * However, GN_ALLOW_NOT_CONNECTED option masks "connected"
72161 + * check and allows passing reference to not connected znode to
72162 + * subsequent longterm_lock_znode() call. This kills possible
72163 + * busy loop if we are trying to get longterm lock on locked but
72164 + * not yet connected parent node. */
72165 + if (neighbor == NULL || !((flags & GN_ALLOW_NOT_CONNECTED)
72166 + || znode_is_connected(neighbor))) {
72167 + return RETERR(-E_NO_NEIGHBOR);
72168 + }
72169 +
72170 + /* protect it from deletion. */
72171 + zref(neighbor);
72172 +
72173 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
72174 +
72175 + ret = longterm_lock_znode(result, neighbor, mode, req);
72176 +
72177 + /* The lock handle obtains its own reference, release the one from above. */
72178 + zput(neighbor);
72179 +
72180 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
72181 +
72182 +		/* restart if the node we got a reference to is being
72183 +		   invalidated. We should not get a reference to this
72184 +		   node again. */
72185 + if (ret == -EINVAL)
72186 + continue;
72187 + if (ret)
72188 + return ret;
72189 +
72190 +		/* check if the neighbor link still points to the just-locked
72191 +		   znode; the link could have changed while the process slept. */
72192 + if (neighbor == GET_NODE_BY_PTR_OFFSET(node, ptr_offset))
72193 + return 0;
72194 +
72195 +		/* the znode was locked by mistake; unlock it and restart the
72196 +		   locking process from the beginning. */
72197 + rlocked ? read_unlock_tree(tree) : write_unlock_tree(tree);
72198 + longterm_unlock_znode(result);
72199 + rlocked ? read_lock_tree(tree) : write_lock_tree(tree);
72200 + }
72201 +}
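+
+#if 0
+/* Editor's sketch, not part of the original patch: the pin / drop guard /
+ * sleep on the long-term lock / re-check retry loop of lock_neighbor()
+ * above, reduced to user-space pthreads. pthread_mutex_t stands in for both
+ * the tree spinlock and longterm_lock_znode(); the refcount stands in for
+ * zref()/zput(). All names are hypothetical. */
+#include <pthread.h>
+#include <stddef.h>
+
+struct toy_obj {
+	pthread_mutex_t long_lock;
+	int refcount;
+};
+
+struct toy_znode {
+	struct toy_obj *neighbor;	/* guarded by *guard */
+};
+
+static struct toy_obj *toy_lock_neighbor(struct toy_znode *n,
+					 pthread_mutex_t *guard)
+{
+	struct toy_obj *nb;
+
+	pthread_mutex_lock(guard);
+	for (;;) {
+		nb = n->neighbor;
+		if (nb == NULL) {
+			pthread_mutex_unlock(guard);
+			return NULL;
+		}
+		nb->refcount++;				/* pin before dropping guard */
+		pthread_mutex_unlock(guard);
+		pthread_mutex_lock(&nb->long_lock);	/* may sleep */
+		pthread_mutex_lock(guard);
+		nb->refcount--;
+		if (n->neighbor == nb)			/* still the same neighbor? */
+			break;
+		/* link changed while we slept: undo and retry */
+		pthread_mutex_unlock(&nb->long_lock);
+	}
+	pthread_mutex_unlock(guard);
+	return nb;	/* returned with nb->long_lock held */
+}
+#endif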
72202 +
72203 +/* get parent node with longterm lock, accepts GN* flags. */
72204 +int reiser4_get_parent_flags(lock_handle * lh /* resulting lock handle */ ,
72205 + znode * node /* child node */ ,
72206 + znode_lock_mode mode
72207 + /* type of lock: read or write */ ,
72208 + int flags /* GN_* flags */ )
72209 +{
72210 + int result;
72211 +
72212 + read_lock_tree(znode_get_tree(node));
72213 + result = lock_neighbor(lh, node, PARENT_PTR_OFFSET, mode,
72214 + ZNODE_LOCK_HIPRI, flags, 1);
72215 + read_unlock_tree(znode_get_tree(node));
72216 + return result;
72217 +}
72218 +
72219 +/* wrapper function to lock right or left neighbor depending on GN_GO_LEFT
72220 + bit in @flags parameter */
72221 +/* Audited by: umka (2002.06.14) */
72222 +static inline int
72223 +lock_side_neighbor(lock_handle * result,
72224 + znode * node, znode_lock_mode mode, int flags, int rlocked)
72225 +{
72226 + int ret;
72227 + int ptr_offset;
72228 + znode_lock_request req;
72229 +
72230 + if (flags & GN_GO_LEFT) {
72231 + ptr_offset = LEFT_PTR_OFFSET;
72232 + req = ZNODE_LOCK_LOPRI;
72233 + } else {
72234 + ptr_offset = RIGHT_PTR_OFFSET;
72235 + req = ZNODE_LOCK_HIPRI;
72236 + }
72237 +
72238 + ret =
72239 + lock_neighbor(result, node, ptr_offset, mode, req, flags, rlocked);
72240 +
72241 +	if (ret == -E_NO_NEIGHBOR)	/* when we walk left or right,
72242 +					 * -E_NO_NEIGHBOR does not guarantee that
72243 +					 * the neighbor is absent from the tree;
72244 +					 * in this case we return -ENOENT, which
72245 +					 * only means it was not found in cache */
72246 + return RETERR(-ENOENT);
72247 +
72248 + return ret;
72249 +}
72250 +
72251 +#if REISER4_DEBUG
72252 +
72253 +int check_sibling_list(znode * node)
72254 +{
72255 + znode *scan;
72256 + znode *next;
72257 +
72258 + assert("nikita-3283", LOCK_CNT_GTZ(write_locked_tree));
72259 +
72260 + if (node == NULL)
72261 + return 1;
72262 +
72263 + if (ZF_ISSET(node, JNODE_RIP))
72264 + return 1;
72265 +
72266 + assert("nikita-3270", node != NULL);
72267 + assert_rw_write_locked(&(znode_get_tree(node)->tree_lock));
72268 +
72269 + for (scan = node; znode_is_left_connected(scan); scan = next) {
72270 + next = scan->left;
72271 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
72272 + assert("nikita-3271", znode_is_right_connected(next));
72273 + assert("nikita-3272", next->right == scan);
72274 + } else
72275 + break;
72276 + }
72277 + for (scan = node; znode_is_right_connected(scan); scan = next) {
72278 + next = scan->right;
72279 + if (next != NULL && !ZF_ISSET(next, JNODE_RIP)) {
72280 + assert("nikita-3273", znode_is_left_connected(next));
72281 + assert("nikita-3274", next->left == scan);
72282 + } else
72283 + break;
72284 + }
72285 + return 1;
72286 +}
72287 +
72288 +#endif
72289 +
72290 +/* Znode sibling pointers maintenance. */
72291 +
72292 +/* Znode sibling pointers are established between any neighboring nodes which
72293 +   are in cache. There are two znode state bits (JNODE_LEFT_CONNECTED,
72294 +   JNODE_RIGHT_CONNECTED): if the left or right sibling pointer contains an
72295 +   actual value (even NULL), the corresponding JNODE_*_CONNECTED bit is set.
72296 +
72297 +   Reiser4 tree operations which may allocate new znodes (CBK, tree balancing)
72298 +   take care of searching for znode neighbors (a hash table lookup may be
72299 +   required), establishing sibling pointers between them and setting the
72300 +   JNODE_*_CONNECTED state bits. */
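+
+#if 0
+/* Editor's sketch, not part of the original patch: why a separate
+ * "connected" bit is kept next to each sibling pointer. It lets NULL mean
+ * "known to have no neighbor" rather than "not yet looked up". All names
+ * are hypothetical. */
+#include <stdbool.h>
+#include <stddef.h>
+
+struct toy_sibling {
+	struct toy_sibling *left, *right;
+	bool left_connected, right_connected;
+};
+
+static const char *toy_right_state(const struct toy_sibling *n)
+{
+	if (!n->right_connected)
+		return "unknown: a hash table lookup is needed";
+	return n->right != NULL ? "cached right neighbor"
+				: "no right neighbor (rightmost node)";
+}
+#endif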
72301 +
72302 +/* adjusts sibling pointers and `connected' states for two neighbors;
72303 +   works even if one neighbor is NULL (was not found). */
72304 +
72305 +/* FIXME-VS: this is unstatic-ed to use in tree.c in prepare_twig_cut */
72306 +void link_left_and_right(znode * left, znode * right)
72307 +{
72308 + assert("nikita-3275", check_sibling_list(left));
72309 + assert("nikita-3275", check_sibling_list(right));
72310 +
72311 + if (left != NULL) {
72312 + if (left->right == NULL) {
72313 + left->right = right;
72314 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
72315 +
72316 + ON_DEBUG(left->right_version =
72317 + atomic_inc_return(&delim_key_version);
72318 + );
72319 +
72320 + } else if (ZF_ISSET(left->right, JNODE_HEARD_BANSHEE)
72321 + && left->right != right) {
72322 +
72323 + ON_DEBUG(left->right->left_version =
72324 + atomic_inc_return(&delim_key_version);
72325 + left->right_version =
72326 + atomic_inc_return(&delim_key_version););
72327 +
72328 + left->right->left = NULL;
72329 + left->right = right;
72330 + ZF_SET(left, JNODE_RIGHT_CONNECTED);
72331 + } else
72332 + /*
72333 +			 * there is a race condition in renew_sibling_link()
72334 +			 * and the assertions below check that it is the only
72335 +			 * one. Thread T1 calls renew_sibling_link() without
72336 +			 * the GN_NO_ALLOC flag. zlook() doesn't find the
72337 +			 * neighbor node, but before T1 gets to
72338 +			 * link_left_and_right(), another thread T2 creates
72339 +			 * the neighbor node and connects it. The check for
72340 +			 * left->right == NULL above protects T1 from
72341 +			 * overwriting the correct left->right pointer
72342 +			 * installed by T2.
72343 + */
72344 + assert("nikita-3302",
72345 + right == NULL || left->right == right);
72346 + }
72347 + if (right != NULL) {
72348 + if (right->left == NULL) {
72349 + right->left = left;
72350 + ZF_SET(right, JNODE_LEFT_CONNECTED);
72351 +
72352 + ON_DEBUG(right->left_version =
72353 + atomic_inc_return(&delim_key_version);
72354 + );
72355 +
72356 + } else if (ZF_ISSET(right->left, JNODE_HEARD_BANSHEE)
72357 + && right->left != left) {
72358 +
72359 + ON_DEBUG(right->left->right_version =
72360 + atomic_inc_return(&delim_key_version);
72361 + right->left_version =
72362 + atomic_inc_return(&delim_key_version););
72363 +
72364 + right->left->right = NULL;
72365 + right->left = left;
72366 + ZF_SET(right, JNODE_LEFT_CONNECTED);
72367 +
72368 + } else
72369 + assert("nikita-3303",
72370 + left == NULL || right->left == left);
72371 + }
72372 + assert("nikita-3275", check_sibling_list(left));
72373 + assert("nikita-3275", check_sibling_list(right));
72374 +}
72375 +
72376 +/* Audited by: umka (2002.06.14) */
72377 +static void link_znodes(znode * first, znode * second, int to_left)
72378 +{
72379 + if (to_left)
72380 + link_left_and_right(second, first);
72381 + else
72382 + link_left_and_right(first, second);
72383 +}
72384 +
72385 +/* gets the next unit position of @coord in the horizontal direction (to the
72386 +   left or to the right, depending on the GN_GO_LEFT bit in @flags), even
72387 +   across a node boundary. Should be called under the tree lock, which keeps
72388 +   the observed absence of a sibling link on the parent level valid when
72389 +   lock_side_neighbor() fails with -ENOENT. */
72390 +static int far_next_coord(coord_t * coord, lock_handle * handle, int flags)
72391 +{
72392 + int ret;
72393 + znode *node;
72394 + reiser4_tree *tree;
72395 +
72396 + assert("umka-243", coord != NULL);
72397 + assert("umka-244", handle != NULL);
72398 + assert("zam-1069", handle->node == NULL);
72399 +
72400 + ret =
72401 + (flags & GN_GO_LEFT) ? coord_prev_unit(coord) :
72402 + coord_next_unit(coord);
72403 + if (!ret)
72404 + return 0;
72405 +
72406 + ret =
72407 + lock_side_neighbor(handle, coord->node, ZNODE_READ_LOCK, flags, 0);
72408 + if (ret)
72409 + return ret;
72410 +
72411 + node = handle->node;
72412 + tree = znode_get_tree(node);
72413 + write_unlock_tree(tree);
72414 +
72415 + coord_init_zero(coord);
72416 +
72417 +	/* We avoid a synchronous read here if the GN_ASYNC flag is specified. */
72418 + if ((flags & GN_ASYNC) && znode_page(handle->node) == NULL) {
72419 + ret = jstartio(ZJNODE(handle->node));
72420 + if (!ret)
72421 + ret = -E_REPEAT;
72422 + goto error_locked;
72423 + }
72424 +
72425 +	/* the corresponding zrelse() should be called by the clients of
72426 +	   far_next_coord() at the place where this node gets unlocked. */
72427 + ret = zload(handle->node);
72428 + if (ret)
72429 + goto error_locked;
72430 +
72431 + if (flags & GN_GO_LEFT)
72432 + coord_init_last_unit(coord, node);
72433 + else
72434 + coord_init_first_unit(coord, node);
72435 +
72436 + if (0) {
72437 + error_locked:
72438 + longterm_unlock_znode(handle);
72439 + }
72440 + write_lock_tree(tree);
72441 + return ret;
72442 +}
72443 +
72444 +/* Very significant function which performs a step in the horizontal
72445 +   direction when the sibling pointer is not available. Actually, it is the
72446 +   only function which does this.
72447 +   Note: this function does not restore the locking status at exit; the
72448 +   caller should take care of proper unlocking and zrelse-ing */
72449 +static int
72450 +renew_sibling_link(coord_t * coord, lock_handle * handle, znode * child,
72451 + tree_level level, int flags, int *nr_locked)
72452 +{
72453 + int ret;
72454 + int to_left = flags & GN_GO_LEFT;
72455 + reiser4_block_nr da;
72456 +	/* parent of the neighbor node; we set it to the child's parent until
72457 +	   we detect that the child and the neighbor do not share one parent */
72458 + znode *side_parent = coord->node;
72459 + reiser4_tree *tree = znode_get_tree(child);
72460 + znode *neighbor = NULL;
72461 +
72462 + assert("umka-245", coord != NULL);
72463 + assert("umka-246", handle != NULL);
72464 + assert("umka-247", child != NULL);
72465 + assert("umka-303", tree != NULL);
72466 +
72467 + init_lh(handle);
72468 + write_lock_tree(tree);
72469 + ret = far_next_coord(coord, handle, flags);
72470 +
72471 + if (ret) {
72472 + if (ret != -ENOENT) {
72473 + write_unlock_tree(tree);
72474 + return ret;
72475 + }
72476 + } else {
72477 + item_plugin *iplug;
72478 +
72479 + if (handle->node != NULL) {
72480 + (*nr_locked)++;
72481 + side_parent = handle->node;
72482 + }
72483 +
72484 +		/* does the coord object point to an internal item? We do not
72485 +		   support sibling pointers between znodes for formatted and
72486 +		   unformatted nodes, and return -E_NO_NEIGHBOR in that case. */
72487 + iplug = item_plugin_by_coord(coord);
72488 + if (!item_is_internal(coord)) {
72489 + link_znodes(child, NULL, to_left);
72490 + write_unlock_tree(tree);
72491 + /* we know there can't be formatted neighbor */
72492 + return RETERR(-E_NO_NEIGHBOR);
72493 + }
72494 + write_unlock_tree(tree);
72495 +
72496 + iplug->s.internal.down_link(coord, NULL, &da);
72497 +
72498 + if (flags & GN_NO_ALLOC) {
72499 + neighbor = zlook(tree, &da);
72500 + } else {
72501 + neighbor =
72502 + zget(tree, &da, side_parent, level,
72503 + reiser4_ctx_gfp_mask_get());
72504 + }
72505 +
72506 + if (IS_ERR(neighbor)) {
72507 + ret = PTR_ERR(neighbor);
72508 + return ret;
72509 + }
72510 +
72511 + if (neighbor)
72512 + /* update delimiting keys */
72513 + set_child_delimiting_keys(coord->node, coord, neighbor);
72514 +
72515 + write_lock_tree(tree);
72516 + }
72517 +
72518 + if (likely(neighbor == NULL ||
72519 + (znode_get_level(child) == znode_get_level(neighbor)
72520 + && child != neighbor)))
72521 + link_znodes(child, neighbor, to_left);
72522 + else {
72523 + warning("nikita-3532",
72524 +			"Sibling nodes on different levels: %i != %i\n",
72525 + znode_get_level(child), znode_get_level(neighbor));
72526 + ret = RETERR(-EIO);
72527 + }
72528 +
72529 + write_unlock_tree(tree);
72530 +
72531 +	/* if GN_NO_ALLOC isn't set we keep the reference to the neighbor znode */
72532 + if (neighbor != NULL && (flags & GN_NO_ALLOC))
72533 + /* atomic_dec(&ZJNODE(neighbor)->x_count); */
72534 + zput(neighbor);
72535 +
72536 + return ret;
72537 +}
72538 +
72539 +/* This function establishes a one-side sibling relation. */
72540 +/* Audited by: umka (2002.06.14) */
72541 +static int connect_one_side(coord_t * coord, znode * node, int flags)
72542 +{
72543 + coord_t local;
72544 + lock_handle handle;
72545 + int nr_locked;
72546 + int ret;
72547 +
72548 + assert("umka-248", coord != NULL);
72549 + assert("umka-249", node != NULL);
72550 +
72551 + coord_dup_nocheck(&local, coord);
72552 +
72553 + init_lh(&handle);
72554 +
72555 + ret =
72556 + renew_sibling_link(&local, &handle, node, znode_get_level(node),
72557 + flags | GN_NO_ALLOC, &nr_locked);
72558 +
72559 + if (handle.node != NULL) {
72560 + /* complementary operations for zload() and lock() in far_next_coord() */
72561 + zrelse(handle.node);
72562 + longterm_unlock_znode(&handle);
72563 + }
72564 +
72565 +	/* we swallow error codes which are not interesting to us because we
72566 +	   run renew_sibling_link() only for znode connection. */
72567 + if (ret == -ENOENT || ret == -E_NO_NEIGHBOR)
72568 + return 0;
72569 +
72570 + return ret;
72571 +}
72572 +
72573 +/* if @child is not in `connected' state, performs hash searches for left and
72574 + right neighbor nodes and establishes horizontal sibling links */
72575 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72576 +int connect_znode(coord_t * parent_coord, znode * child)
72577 +{
72578 + reiser4_tree *tree = znode_get_tree(child);
72579 + int ret = 0;
72580 +
72581 + assert("zam-330", parent_coord != NULL);
72582 + assert("zam-331", child != NULL);
72583 + assert("zam-332", parent_coord->node != NULL);
72584 + assert("umka-305", tree != NULL);
72585 +
72586 + /* it is trivial to `connect' root znode because it can't have
72587 + neighbors */
72588 + if (znode_above_root(parent_coord->node)) {
72589 + child->left = NULL;
72590 + child->right = NULL;
72591 + ZF_SET(child, JNODE_LEFT_CONNECTED);
72592 + ZF_SET(child, JNODE_RIGHT_CONNECTED);
72593 +
72594 + ON_DEBUG(child->left_version =
72595 + atomic_inc_return(&delim_key_version);
72596 + child->right_version =
72597 + atomic_inc_return(&delim_key_version););
72598 +
72599 + return 0;
72600 + }
72601 +
72602 + /* load parent node */
72603 + coord_clear_iplug(parent_coord);
72604 + ret = zload(parent_coord->node);
72605 +
72606 + if (ret != 0)
72607 + return ret;
72608 +
72609 + /* protect `connected' state check by tree_lock */
72610 + read_lock_tree(tree);
72611 +
72612 + if (!znode_is_right_connected(child)) {
72613 + read_unlock_tree(tree);
72614 + /* connect right (default is right) */
72615 + ret = connect_one_side(parent_coord, child, GN_NO_ALLOC);
72616 + if (ret)
72617 + goto zrelse_and_ret;
72618 +
72619 + read_lock_tree(tree);
72620 + }
72621 +
72622 + ret = znode_is_left_connected(child);
72623 +
72624 + read_unlock_tree(tree);
72625 +
72626 + if (!ret) {
72627 + ret =
72628 + connect_one_side(parent_coord, child,
72629 + GN_NO_ALLOC | GN_GO_LEFT);
72630 + } else
72631 + ret = 0;
72632 +
72633 + zrelse_and_ret:
72634 + zrelse(parent_coord->node);
72635 +
72636 + return ret;
72637 +}
72638 +
72639 +/* this function is like renew_sibling_link() but allocates the neighbor node
72640 +   if it doesn't exist and `connects' it. It may require making two steps in
72641 +   the horizontal direction: the first one finds/allocates the neighbor node,
72642 +   the second finds the neighbor of that neighbor in order to connect the
72643 +   freshly allocated znode. */
72644 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72645 +static int
72646 +renew_neighbor(coord_t * coord, znode * node, tree_level level, int flags)
72647 +{
72648 + coord_t local;
72649 + lock_handle empty[2];
72650 + reiser4_tree *tree = znode_get_tree(node);
72651 + znode *neighbor = NULL;
72652 + int nr_locked = 0;
72653 + int ret;
72654 +
72655 + assert("umka-250", coord != NULL);
72656 + assert("umka-251", node != NULL);
72657 + assert("umka-307", tree != NULL);
72658 + assert("umka-308", level <= tree->height);
72659 +
72660 + /* umka (2002.06.14)
72661 +	   There should probably be a check here for the validity of the given "level".
72662 + Something like assert("xxx-yyy", level < REAL_MAX_ZTREE_HEIGHT);
72663 + */
72664 +
72665 + coord_dup(&local, coord);
72666 +
72667 + ret =
72668 + renew_sibling_link(&local, &empty[0], node, level,
72669 + flags & ~GN_NO_ALLOC, &nr_locked);
72670 + if (ret)
72671 + goto out;
72672 +
72673 +	/* the tree lock is not needed here because we keep the parent node(s)
72674 +	   locked and the reference to the neighbor znode incremented */
72675 + neighbor = (flags & GN_GO_LEFT) ? node->left : node->right;
72676 +
72677 + read_lock_tree(tree);
72678 + ret = znode_is_connected(neighbor);
72679 + read_unlock_tree(tree);
72680 + if (ret) {
72681 + ret = 0;
72682 + goto out;
72683 + }
72684 +
72685 + ret =
72686 + renew_sibling_link(&local, &empty[nr_locked], neighbor, level,
72687 + flags | GN_NO_ALLOC, &nr_locked);
72688 + /* second renew_sibling_link() call is used for znode connection only,
72689 + so we can live with these errors */
72690 + if (-ENOENT == ret || -E_NO_NEIGHBOR == ret)
72691 + ret = 0;
72692 +
72693 + out:
72694 +
72695 + for (--nr_locked; nr_locked >= 0; --nr_locked) {
72696 + zrelse(empty[nr_locked].node);
72697 + longterm_unlock_znode(&empty[nr_locked]);
72698 + }
72699 +
72700 + if (neighbor != NULL)
72701 + /* decrement znode reference counter without actually
72702 + releasing it. */
72703 + atomic_dec(&ZJNODE(neighbor)->x_count);
72704 +
72705 + return ret;
72706 +}
72707 +
72708 +/*
72709 + reiser4_get_neighbor() -- lock node's neighbor.
72710 +
72711 +  reiser4_get_neighbor() locks the node's neighbor (the left or the right
72712 +  one, depending on the given parameter) using the sibling link to it. If the
72713 +  sibling link is not available (i.e. the neighbor znode is not in cache) and
72714 +  the flags allow reading blocks, we go one level up for information about
72715 +  the neighbor's disk address. We lock the node's parent; if it is the common
72716 +  parent of both 'node' and its neighbor, the neighbor's disk address is in
72717 +  the next (to the left or to the right) down link from the link that points
72718 +  to the original node. If not, we need to lock the parent's neighbor, read
72719 +  its content and take the first (last) downlink with the neighbor's disk
72720 +  address. That locking can be done via the sibling link and lock_neighbor(),
72721 +  if the sibling link exists. Otherwise we have to go a level up again until
72722 +  we find a common parent or a valid sibling link, and then go down,
72723 +  allocating/connecting/locking/reading nodes until the neighbor is locked.
72724 +
72725 + @neighbor: result lock handle,
72726 + @node: a node which we lock neighbor of,
72727 + @lock_mode: lock mode {LM_READ, LM_WRITE},
72728 + @flags: logical OR of {GN_*} (see description above) subset.
72729 +
72730 + @return: 0 if success, negative value if lock was impossible due to an error
72731 + or lack of neighbor node.
72732 +*/
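+
+#if 0
+/* Editor's sketch, not part of the original patch: the climb-up strategy
+ * described above, on a toy fully in-memory tree -- no locking, no I/O and
+ * no sibling cache, so the "go one level up, step over, come back down"
+ * shape is all that remains. All names are hypothetical. */
+#include <stddef.h>
+
+struct toy_tn {
+	struct toy_tn *parent;
+	struct toy_tn **child;	/* array of children */
+	int nr_child;
+	int pos;		/* index of this node in parent->child[] */
+};
+
+static struct toy_tn *toy_right_neighbor(struct toy_tn *n)
+{
+	struct toy_tn *p = n->parent;
+
+	if (p == NULL)
+		return NULL;			/* the root has no neighbor */
+	if (n->pos + 1 < p->nr_child)
+		return p->child[n->pos + 1];	/* common parent case */
+	/* no common parent: find the parent's right neighbor and descend to
+	 * its leftmost child */
+	p = toy_right_neighbor(p);
+	return (p != NULL && p->nr_child > 0) ? p->child[0] : NULL;
+}
+#endif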
72733 +
72734 +/* Audited by: umka (2002.06.14), umka (2002.06.15) */
72735 +int
72736 +reiser4_get_neighbor(lock_handle * neighbor, znode * node,
72737 + znode_lock_mode lock_mode, int flags)
72738 +{
72739 + reiser4_tree *tree = znode_get_tree(node);
72740 + lock_handle path[REAL_MAX_ZTREE_HEIGHT];
72741 +
72742 + coord_t coord;
72743 +
72744 + tree_level base_level;
72745 + tree_level h = 0;
72746 + int ret;
72747 +
72748 + assert("umka-252", tree != NULL);
72749 + assert("umka-253", neighbor != NULL);
72750 + assert("umka-254", node != NULL);
72751 +
72752 + base_level = znode_get_level(node);
72753 +
72754 + assert("umka-310", base_level <= tree->height);
72755 +
72756 + coord_init_zero(&coord);
72757 +
72758 + again:
72759 +	/* first, we try to use the simple lock_neighbor(), which requires the
72760 +	   existence of a sibling link */
72761 + read_lock_tree(tree);
72762 + ret = lock_side_neighbor(neighbor, node, lock_mode, flags, 1);
72763 + read_unlock_tree(tree);
72764 + if (!ret) {
72765 + /* load znode content if it was specified */
72766 + if (flags & GN_LOAD_NEIGHBOR) {
72767 + ret = zload(node);
72768 + if (ret)
72769 + longterm_unlock_znode(neighbor);
72770 + }
72771 + return ret;
72772 + }
72773 +
72774 + /* only -ENOENT means we may look upward and try to connect
72775 + @node with its neighbor (if @flags allow us to do it) */
72776 + if (ret != -ENOENT || !(flags & GN_CAN_USE_UPPER_LEVELS))
72777 + return ret;
72778 +
72779 +	/* before establishing a sibling link we lock the parent node;
72780 +	   renew_neighbor() requires this to work. */
72781 + init_lh(&path[0]);
72782 + ret = reiser4_get_parent(&path[0], node, ZNODE_READ_LOCK);
72783 + if (ret)
72784 + return ret;
72785 + if (znode_above_root(path[0].node)) {
72786 + longterm_unlock_znode(&path[0]);
72787 + return RETERR(-E_NO_NEIGHBOR);
72788 + }
72789 +
72790 + while (1) {
72791 + znode *child = (h == 0) ? node : path[h - 1].node;
72792 + znode *parent = path[h].node;
72793 +
72794 + ret = zload(parent);
72795 + if (ret)
72796 + break;
72797 +
72798 + ret = find_child_ptr(parent, child, &coord);
72799 +
72800 + if (ret) {
72801 + zrelse(parent);
72802 + break;
72803 + }
72804 +
72805 + /* try to establish missing sibling link */
72806 + ret = renew_neighbor(&coord, child, h + base_level, flags);
72807 +
72808 + zrelse(parent);
72809 +
72810 + switch (ret) {
72811 + case 0:
72812 +			/* unlocking the parent znode prevents a simple
72813 +			   deadlock situation */
72814 + done_lh(&path[h]);
72815 +
72816 +			/* depending on the tree level we are at, we repeat
72817 +			   the first locking attempt ... */
72818 + if (h == 0)
72819 + goto again;
72820 +
72821 + /* ... or repeat establishing of sibling link at
72822 + one level below. */
72823 + --h;
72824 + break;
72825 +
72826 + case -ENOENT:
72827 + /* sibling link is not available -- we go
72828 + upward. */
72829 + init_lh(&path[h + 1]);
72830 + ret =
72831 + reiser4_get_parent(&path[h + 1], parent,
72832 + ZNODE_READ_LOCK);
72833 + if (ret)
72834 + goto fail;
72835 + ++h;
72836 + if (znode_above_root(path[h].node)) {
72837 + ret = RETERR(-E_NO_NEIGHBOR);
72838 + goto fail;
72839 + }
72840 + break;
72841 +
72842 + case -E_DEADLOCK:
72843 +			/* there was a lock request from a hi-pri locker; if
72844 +			   possible we unlock the last parent node and
72845 +			   re-lock it again. */
72846 + for (; reiser4_check_deadlock(); h--) {
72847 + done_lh(&path[h]);
72848 + if (h == 0)
72849 + goto fail;
72850 + }
72851 +
72852 + break;
72853 +
72854 + default: /* other errors. */
72855 + goto fail;
72856 + }
72857 + }
72858 + fail:
72859 + ON_DEBUG(check_lock_node_data(node));
72860 + ON_DEBUG(check_lock_data());
72861 +
72862 + /* unlock path */
72863 + do {
72864 + /* FIXME-Zam: when we get here from case -E_DEADLOCK's goto
72865 + fail; path[0] is already done_lh-ed, therefore
72866 + longterm_unlock_znode(&path[h]); is not applicable */
72867 + done_lh(&path[h]);
72868 + --h;
72869 + } while (h + 1 != 0);
72870 +
72871 + return ret;
72872 +}
72873 +
72874 +/* remove node from sibling list */
72875 +/* Audited by: umka (2002.06.14) */
72876 +void sibling_list_remove(znode * node)
72877 +{
72878 + reiser4_tree *tree;
72879 +
72880 +	assert("umka-255", node != NULL);
72881 +	tree = znode_get_tree(node);
72882 + assert_rw_write_locked(&(tree->tree_lock));
72883 + assert("nikita-3275", check_sibling_list(node));
72884 +
72885 + write_lock_dk(tree);
72886 + if (znode_is_right_connected(node) && node->right != NULL &&
72887 + znode_is_left_connected(node) && node->left != NULL) {
72888 + assert("zam-32245",
72889 + keyeq(znode_get_rd_key(node),
72890 + znode_get_ld_key(node->right)));
72891 + znode_set_rd_key(node->left, znode_get_ld_key(node->right));
72892 + }
72893 + write_unlock_dk(tree);
72894 +
72895 + if (znode_is_right_connected(node) && node->right != NULL) {
72896 + assert("zam-322", znode_is_left_connected(node->right));
72897 + node->right->left = node->left;
72898 + ON_DEBUG(node->right->left_version =
72899 + atomic_inc_return(&delim_key_version);
72900 + );
72901 + }
72902 + if (znode_is_left_connected(node) && node->left != NULL) {
72903 + assert("zam-323", znode_is_right_connected(node->left));
72904 + node->left->right = node->right;
72905 + ON_DEBUG(node->left->right_version =
72906 + atomic_inc_return(&delim_key_version);
72907 + );
72908 + }
72909 +
72910 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
72911 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72912 + ON_DEBUG(node->left = node->right = NULL;
72913 + node->left_version = atomic_inc_return(&delim_key_version);
72914 + node->right_version = atomic_inc_return(&delim_key_version););
72915 + assert("nikita-3276", check_sibling_list(node));
72916 +}
72917 +
72918 +/* disconnect node from sibling list */
72919 +void sibling_list_drop(znode * node)
72920 +{
72921 + znode *right;
72922 + znode *left;
72923 +
72924 + assert("nikita-2464", node != NULL);
72925 + assert("nikita-3277", check_sibling_list(node));
72926 +
72927 + right = node->right;
72928 + if (right != NULL) {
72929 + assert("nikita-2465", znode_is_left_connected(right));
72930 + right->left = NULL;
72931 + ON_DEBUG(right->left_version =
72932 + atomic_inc_return(&delim_key_version);
72933 + );
72934 + }
72935 + left = node->left;
72936 + if (left != NULL) {
72937 + assert("zam-323", znode_is_right_connected(left));
72938 + left->right = NULL;
72939 + ON_DEBUG(left->right_version =
72940 + atomic_inc_return(&delim_key_version);
72941 + );
72942 + }
72943 + ZF_CLR(node, JNODE_LEFT_CONNECTED);
72944 + ZF_CLR(node, JNODE_RIGHT_CONNECTED);
72945 + ON_DEBUG(node->left = node->right = NULL;
72946 + node->left_version = atomic_inc_return(&delim_key_version);
72947 + node->right_version = atomic_inc_return(&delim_key_version););
72948 +}
72949 +
72950 +/* Insert a new node into the sibling list. Regular balancing inserts the
72951 +   new node after (at the right side of) an existing, locked node (@before),
72952 +   except when adding a new tree root node; @before should be NULL then. */
72953 +void sibling_list_insert_nolock(znode * new, znode * before)
72954 +{
72955 + assert("zam-334", new != NULL);
72956 + assert("nikita-3298", !znode_is_left_connected(new));
72957 + assert("nikita-3299", !znode_is_right_connected(new));
72958 + assert("nikita-3300", new->left == NULL);
72959 + assert("nikita-3301", new->right == NULL);
72960 + assert("nikita-3278", check_sibling_list(new));
72961 + assert("nikita-3279", check_sibling_list(before));
72962 +
72963 + if (before != NULL) {
72964 + assert("zam-333", znode_is_connected(before));
72965 + new->right = before->right;
72966 + new->left = before;
72967 + ON_DEBUG(new->right_version =
72968 + atomic_inc_return(&delim_key_version);
72969 + new->left_version =
72970 + atomic_inc_return(&delim_key_version););
72971 + if (before->right != NULL) {
72972 + before->right->left = new;
72973 + ON_DEBUG(before->right->left_version =
72974 + atomic_inc_return(&delim_key_version);
72975 + );
72976 + }
72977 + before->right = new;
72978 + ON_DEBUG(before->right_version =
72979 + atomic_inc_return(&delim_key_version);
72980 + );
72981 + } else {
72982 + new->right = NULL;
72983 + new->left = NULL;
72984 + ON_DEBUG(new->right_version =
72985 + atomic_inc_return(&delim_key_version);
72986 + new->left_version =
72987 + atomic_inc_return(&delim_key_version););
72988 + }
72989 + ZF_SET(new, JNODE_LEFT_CONNECTED);
72990 + ZF_SET(new, JNODE_RIGHT_CONNECTED);
72991 + assert("nikita-3280", check_sibling_list(new));
72992 + assert("nikita-3281", check_sibling_list(before));
72993 +}
72994 +
72995 +/*
72996 + Local variables:
72997 + c-indentation-style: "K&R"
72998 + mode-name: "LC"
72999 + c-basic-offset: 8
73000 + tab-width: 8
73001 + fill-column: 80
73002 + End:
73003 +*/
73004 diff --git a/fs/reiser4/tree_walk.h b/fs/reiser4/tree_walk.h
73005 new file mode 100644
73006 index 0000000..3d5f09f
73007 --- /dev/null
73008 +++ b/fs/reiser4/tree_walk.h
73009 @@ -0,0 +1,125 @@
73010 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
73011 +
73012 +/* definitions of reiser4 tree walk functions */
73013 +
73014 +#ifndef __FS_REISER4_TREE_WALK_H__
73015 +#define __FS_REISER4_TREE_WALK_H__
73016 +
73017 +#include "debug.h"
73018 +#include "forward.h"
73019 +
73020 +/* establishes horizontal links between cached znodes */
73021 +int connect_znode(coord_t * coord, znode * node);
73022 +
73023 +/* tree traversal functions (reiser4_get_parent(), reiser4_get_neighbor())
73024 + have the following common arguments:
73025 +
73026 + return codes:
73027 +
73028 + @return : 0 - OK,
73029 +
73030 +ZAM-FIXME-HANS: wrong return code name. Change them all.
73031 +   -ENOENT - neighbor is not in cache, which is detected by the absence
73032 +             of a sibling link.
73033 +
73034 + -E_NO_NEIGHBOR - we are sure that neighbor (or parent) node cannot be
73035 + found (because we are left-/right- most node of the
73036 + tree, for example). Also, this return code is for
73037 + reiser4_get_parent() when we see no parent link -- it
73038 + means that our node is root node.
73039 +
73040 +   -E_DEADLOCK - deadlock detected (a request from a high-priority process
73041 +             was received); other error codes conform to
73042 +             /usr/include/asm/errno.h .
73043 +*/
73044 +
73045 +int
73046 +reiser4_get_parent_flags(lock_handle * result, znode * node,
73047 + znode_lock_mode mode, int flags);
73048 +
73049 +/* bits definition for reiser4_get_neighbor function `flags' arg. */
73050 +typedef enum {
73051 +	/* If the sibling pointer is NULL, this flag allows get_neighbor() to
73052 +	 * try to find a not-allocated, not-connected neighbor by going
73053 +	 * through upper levels */
73054 + GN_CAN_USE_UPPER_LEVELS = 0x1,
73055 +	/* lock the left neighbor instead of the right one */
73056 + GN_GO_LEFT = 0x2,
73057 + /* automatically load neighbor node content */
73058 + GN_LOAD_NEIGHBOR = 0x4,
73059 + /* return -E_REPEAT if can't lock */
73060 + GN_TRY_LOCK = 0x8,
73061 + /* used internally in tree_walk.c, causes renew_sibling to not
73062 + allocate neighbor znode, but only search for it in znode cache */
73063 + GN_NO_ALLOC = 0x10,
73064 + /* do not go across atom boundaries */
73065 + GN_SAME_ATOM = 0x20,
73066 + /* allow to lock not connected nodes */
73067 + GN_ALLOW_NOT_CONNECTED = 0x40,
73068 + /* Avoid synchronous jload, instead, call jstartio() and return -E_REPEAT. */
73069 + GN_ASYNC = 0x80
73070 +} znode_get_neigbor_flags;
73071 +
73072 +/* A commonly used wrapper for reiser4_get_parent_flags(). */
73073 +static inline int reiser4_get_parent(lock_handle * result, znode * node,
73074 + znode_lock_mode mode)
73075 +{
73076 + return reiser4_get_parent_flags(result, node, mode,
73077 + GN_ALLOW_NOT_CONNECTED);
73078 +}
73079 +
73080 +int reiser4_get_neighbor(lock_handle * neighbor, znode * node,
73081 + znode_lock_mode lock_mode, int flags);
73082 +
73083 +/* there are wrappers for most common usages of reiser4_get_neighbor() */
73084 +static inline int
73085 +reiser4_get_left_neighbor(lock_handle * result, znode * node, int lock_mode,
73086 + int flags)
73087 +{
73088 + return reiser4_get_neighbor(result, node, lock_mode,
73089 + flags | GN_GO_LEFT);
73090 +}
73091 +
73092 +static inline int
73093 +reiser4_get_right_neighbor(lock_handle * result, znode * node, int lock_mode,
73094 + int flags)
73095 +{
73096 + ON_DEBUG(check_lock_node_data(node));
73097 + ON_DEBUG(check_lock_data());
73098 + return reiser4_get_neighbor(result, node, lock_mode,
73099 + flags & (~GN_GO_LEFT));
73100 +}
73101 +
73102 +extern void sibling_list_remove(znode * node);
73103 +extern void sibling_list_drop(znode * node);
73104 +extern void sibling_list_insert_nolock(znode * new, znode * before);
73105 +extern void link_left_and_right(znode * left, znode * right);
73106 +
73107 +/* Functions called by tree_walk() when tree_walk() ... */
73108 +struct tree_walk_actor {
73109 + /* ... meets a formatted node, */
73110 + int (*process_znode) (tap_t *, void *);
73111 + /* ... meets an extent, */
73112 + int (*process_extent) (tap_t *, void *);
73113 + /* ... begins tree traversal or repeats it after -E_REPEAT was returned by
73114 + * node or extent processing functions. */
73115 + int (*before) (void *);
73116 +};
73117 +
73118 +#if REISER4_DEBUG
73119 +int check_sibling_list(znode * node);
73120 +#else
73121 +#define check_sibling_list(n) (1)
73122 +#endif
73123 +
73124 +#endif /* __FS_REISER4_TREE_WALK_H__ */
73125 +
73126 +/*
73127 + Local variables:
73128 + c-indentation-style: "K&R"
73129 + mode-name: "LC"
73130 + c-basic-offset: 8
73131 + tab-width: 8
73132 + fill-column: 120
73133 + End:
73134 +*/
73135 diff --git a/fs/reiser4/txnmgr.c b/fs/reiser4/txnmgr.c
73136 new file mode 100644
73137 index 0000000..72d525b
73138 --- /dev/null
73139 +++ b/fs/reiser4/txnmgr.c
73140 @@ -0,0 +1,3164 @@
73141 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
73142 + * reiser4/README */
73143 +
73144 +/* Joshua MacDonald wrote the first draft of this code. */
73145 +
73146 +/* ZAM-LONGTERM-FIXME-HANS: The locking in this file is badly designed, and a
73147 +filesystem scales only as well as its worst locking design. You need to
73148 +substantially restructure this code. Josh was not as experienced a programmer
73149 +as you. Particularly review how the locking style differs from what you did
73150 +for znodes using hi-lo priority locking, and present to me an opinion on
73151 +whether the differences are well founded. */
73152 +
73153 +/* I cannot help but disagree with the sentiment above. Locking of
73154 + * transaction manager is _not_ badly designed, and, at the very least, is not
73155 + * the scaling bottleneck. Scaling bottleneck is _exactly_ hi-lo priority
73156 + * locking on znodes, especially on the root node of the tree. --nikita,
73157 + * 2003.10.13 */
73158 +
73159 +/* The txnmgr is a set of interfaces that keep track of atoms and transcrash handles. The
73160 + txnmgr processes capture_block requests and manages the relationship between jnodes and
73161 + atoms through the various stages of a transcrash, and it also oversees the fusion and
73162 + capture-on-copy processes. The main difficulty with this task is maintaining a
73163 + deadlock-free lock ordering between atoms and jnodes/handles. The reason for the
73164 + difficulty is that jnodes, handles, and atoms contain pointer circles, and the cycle
73165 + must be broken. The main requirement is that atom-fusion be deadlock free, so once you
73166 + hold the atom_lock you may then wait to acquire any jnode or handle lock. This implies
73167 + that any time you check the atom-pointer of a jnode or handle and then try to lock that
73168 + atom, you must use trylock() and possibly reverse the order.
73169 +
73170 + This code implements the design documented at:
73171 +
73172 + http://namesys.com/txn-doc.html
73173 +
73174 +ZAM-FIXME-HANS: update v4.html to contain all of the information present in the above (but updated), and then remove the
73175 +above document and reference the new. Be sure to provide some credit to Josh. I already have some writings on this
73176 +topic in v4.html, but they are lacking in details present in the above. Cure that. Remember to write for the bright 12
73177 +year old --- define all technical terms used.
73178 +
73179 +*/
73180 +
73181 +/* Thoughts on the external transaction interface:
73182 +
73183 + In the current code, a TRANSCRASH handle is created implicitly by reiser4_init_context() (which
73184 + creates state that lasts for the duration of a system call and is called at the start
73185 + of ReiserFS methods implementing VFS operations), and closed by reiser4_exit_context(),
73186 + occupying the scope of a single system call. We wish to give certain applications an
73187 + interface to begin and close (commit) transactions. Since our implementation of
73188 + transactions does not yet support isolation, allowing an application to open a
73189 + transaction implies trusting it to later close the transaction. Part of the
73190 + transaction interface will be aimed at enabling that trust, but the interface for
73191 + actually using transactions is fairly narrow.
73192 +
73193 + BEGIN_TRANSCRASH: Returns a transcrash identifier. It should be possible to translate
73194 + this identifier into a string that a shell-script could use, allowing you to start a
73195 + transaction by issuing a command. Once open, the transcrash should be set in the task
73196 + structure, and there should be options (I suppose) to allow it to be carried across
73197 + fork/exec. A transcrash has several options:
73198 +
73199 + - READ_FUSING or WRITE_FUSING: The default policy is for txn-capture to capture only
73200 + on writes (WRITE_FUSING) and allow "dirty reads". If the application wishes to
73201 + capture on reads as well, it should set READ_FUSING.
73202 +
73203 + - TIMEOUT: Since a non-isolated transcrash cannot be undone, every transcrash must
73204 + eventually close (or else the machine must crash). If the application dies an
73205 + unexpected death with an open transcrash, for example, or if it hangs for a long
73206 + duration, one solution (to avoid crashing the machine) is to simply close it anyway.
73207 + This is a dangerous option, but it is one way to solve the problem until isolated
73208 + transcrashes are available for untrusted applications.
73209 +
73210 + It seems to be what databases do, though it is unclear how one avoids a DoS attack
73211 + creating a vulnerability based on resource starvation. Guaranteeing that some
73212 + minimum amount of computational resources are made available would seem more correct
73213 + than guaranteeing some amount of time. When we again have someone to code the work,
73214 + this issue should be considered carefully. -Hans
73215 +
73216 + RESERVE_BLOCKS: A running transcrash should indicate to the transaction manager how
73217 + many dirty blocks it expects. The reserve_blocks interface should be called at a point
73218 + where it is safe for the application to fail, because the system may not be able to
73219 + grant the allocation and the application must be able to back-out. For this reason,
73220 + the number of reserve-blocks can also be passed as an argument to BEGIN_TRANSCRASH, but
73221 + the application may also wish to extend the allocation after beginning its transcrash.
73222 +
73223 + CLOSE_TRANSCRASH: The application closes the transcrash when it is finished making
73224 + modifications that require transaction protection. When isolated transactions are
73225 + supported the CLOSE operation is replaced by either COMMIT or ABORT. For example, if a
73226 + RESERVE_BLOCKS call fails for the application, it should "abort" by calling
73227 + CLOSE_TRANSCRASH, even though it really commits any changes that were made (which is
73228 + why, for safety, the application should call RESERVE_BLOCKS before making any changes).
73229 +
73230 +   For actually implementing these out-of-system-call-scoped transcrashes, the
73231 + reiser4_context has a "txn_handle *trans" pointer that may be set to an open
73232 + transcrash. Currently there are no dynamically-allocated transcrashes, but there is a
73233 + "struct kmem_cache *_txnh_slab" created for that purpose in this file.
73234 +*/
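+
+#if 0
+/* Editor's sketch, not part of the original patch: the call sequence that
+ * the BEGIN_TRANSCRASH / RESERVE_BLOCKS / CLOSE_TRANSCRASH discussion above
+ * proposes. None of these functions exist in this patch; the names simply
+ * mirror the prose and are hypothetical. */
+static int example_transcrash_user(void)
+{
+	long h = begin_transcrash(WRITE_FUSING, 30 /* timeout, seconds */);
+
+	if (reserve_blocks(h, 128) != 0) {
+		/* "abort" while it is still safe to fail: nothing written */
+		close_transcrash(h);
+		return -ENOSPC;
+	}
+	/* ... make modifications requiring transaction protection ... */
+	close_transcrash(h);
+	return 0;
+}
+#endif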
73235 +
73236 +/* Extending the other system call interfaces for future transaction features:
73237 +
73238 + Specialized applications may benefit from passing flags to the ordinary system call
73239 + interface such as read(), write(), or stat(). For example, the application specifies
73240 + WRITE_FUSING by default but wishes to add that a certain read() command should be
73241 + treated as READ_FUSING. But which read? Is it the directory-entry read, the stat-data
73242 +   read, or the file-data read? These issues are straightforward, but there are a lot of
73243 + them and adding the necessary flags-passing code will be tedious.
73244 +
73245 + When supporting isolated transactions, there is a corresponding READ_MODIFY_WRITE (RMW)
73246 + flag, which specifies that although it is a read operation being requested, a
73247 + write-lock should be taken. The reason is that read-locks are shared while write-locks
73248 + are exclusive, so taking a read-lock when a later-write is known in advance will often
73249 + leads to deadlock. If a reader knows it will write later, it should issue read
73250 +   lead to deadlock. If a reader knows it will write later, it should issue read
73251 +*/
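+
+#if 0
+/* Editor's sketch, not part of the original patch: the deadlock that the
+ * proposed RMW flag avoids, reduced to a user-space rwlock. Two threads
+ * running toy_upgrader() deadlock, because each write attempt waits for all
+ * readers -- including the caller itself and the other reader. Taking the
+ * write lock up front (the effect of the RMW flag) avoids this. All names
+ * are hypothetical. */
+#include <pthread.h>
+
+static pthread_rwlock_t toy_rw = PTHREAD_RWLOCK_INITIALIZER;
+static int toy_value;
+
+static void *toy_upgrader(void *arg)
+{
+	pthread_rwlock_rdlock(&toy_rw);
+	/* ... read toy_value, decide a write is needed ... */
+	pthread_rwlock_wrlock(&toy_rw);	/* DEADLOCK: never "upgrade" */
+	pthread_rwlock_unlock(&toy_rw);
+	pthread_rwlock_unlock(&toy_rw);
+	return arg;
+}
+
+static void *toy_rmw_writer(void *arg)
+{
+	pthread_rwlock_wrlock(&toy_rw);	/* write-lock up front instead */
+	toy_value++;
+	pthread_rwlock_unlock(&toy_rw);
+	return arg;
+}
+#endif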
73252 +
73253 +/*
73254 + The znode/atom deadlock avoidance.
73255 +
73256 + FIXME(Zam): writing of this comment is in progress.
73257 +
73258 +  The atom's special stage ASTAGE_CAPTURE_WAIT introduces a kind of long-term
73259 +  locking of atoms, which makes the reiser4 locking scheme more complex. It
73260 +  had deadlocks until we implemented deadlock avoidance algorithms. Those
73261 +  deadlocks looked like the following: one stopped thread waits for a
73262 +  long-term lock on a znode, while the thread who owns that lock waits until
73263 +  fusion with another atom is allowed.
73264 +
73265 +  The source of the deadlocks is the optimization of not capturing index
73266 +  nodes for read. Let's prove it. Suppose we have a dumb node capturing
73267 +  scheme which unconditionally captures each block before locking it.
73268 +
73269 +  That scheme has no deadlocks. Let's begin with a thread whose stage is
73270 +  ASTAGE_CAPTURE_WAIT and which waits for a znode lock. The thread can't wait
73271 +  for a capture because its stage allows fusion with any atom except those
73272 +  which are currently being committed. The process of atom commit can't
73273 +  deadlock because the atom commit procedure does not acquire locks and does
73274 +  not fuse with other atoms. Reiser4 does capturing right before going to
73275 +  sleep inside the longterm_lock_znode() function, which means the znode we
73276 +  want to lock is already captured and its atom is in the ASTAGE_CAPTURE_WAIT
73277 +  stage. Continuing the analysis, we see that no process in the sequence may
73278 +  wait for atom fusion. Thereby there are no deadlocks of the described kind.
73279 +
73280 +  The capturing optimization makes the deadlocks possible. A thread can wait
73281 +  for a lock whose owner did not capture that node. The lock owner's current
73282 +  atom is not fused with the first atom and it does not get into the
73283 +  ASTAGE_CAPTURE_WAIT state. A deadlock is possible when that atom meets
73284 +  another one which is already in ASTAGE_CAPTURE_WAIT.
73285 +
73286 + The deadlock avoidance scheme includes two algorithms:
73287 +
73288 +  The first algorithm is used when a thread captures a node which is locked
73289 +  but not captured by another thread. Such nodes are marked MISSED_IN_CAPTURE
73290 +  at the moment we skip their capturing. If a node marked MISSED_IN_CAPTURE
73291 +  is being captured by a thread whose current atom is in ASTAGE_CAPTURE_WAIT,
73292 +  a routine which forces all lock owners to join the current atom is executed.
73293 +
73294 +  The second algorithm never skips the capturing of already captured nodes.
73295 +
73296 +  Both algorithms together prevent waiting for a long-term lock without
73297 +  fusing with the atoms of all lock owners, which is the key ingredient of
73298 +  atom/znode locking deadlocks.
73299 +*/
73300 +
73301 +/*
73302 + * Transactions and mmap(2).
73303 + *
73304 + * 1. Transactions are not supported for accesses through mmap(2), because
73305 + * this would effectively amount to user-level transactions whose duration
73306 + * is beyond control of the kernel.
73307 + *
73308 + * 2. That said, we still want to preserve some decency with regard to
73309 + * mmap(2). During normal write(2) call, following sequence of events
73310 + * happens:
73311 + *
73312 + * 1. page is created;
73313 + *
73314 + * 2. jnode is created, dirtied and captured into current atom.
73315 + *
73316 + * 3. extent is inserted and modified.
73317 + *
73318 + * Steps (2) and (3) take place under long term lock on the twig node.
73319 + *
73320 +	 * When a file is accessed through mmap(2) the page is always created
73321 +	 * during a page fault.
73322 +	 * After this (in reiser4_readpage()->reiser4_readpage_extent()):
73323 + *
73324 + * 1. if access is made to non-hole page new jnode is created, (if
73325 + * necessary)
73326 + *
73327 + * 2. if access is made to the hole page, jnode is not created (XXX
73328 + * not clear why).
73329 + *
73330 + * Also, even if page is created by write page fault it is not marked
73331 + * dirty immediately by handle_mm_fault(). Probably this is to avoid races
73332 + * with page write-out.
73333 + *
73334 + * Dirty bit installed by hardware is only transferred to the struct page
73335 + * later, when page is unmapped (in zap_pte_range(), or
73336 + * try_to_unmap_one()).
73337 + *
73338 + * So, with mmap(2) we have to handle following irksome situations:
73339 + *
73340 + * 1. there exists modified page (clean or dirty) without jnode
73341 + *
73342 + * 2. there exists modified page (clean or dirty) with clean jnode
73343 + *
73344 + * 3. clean page which is a part of atom can be transparently modified
73345 + * at any moment through mapping without becoming dirty.
73346 + *
73347 + * (1) and (2) can lead to the out-of-memory situation: ->writepage()
73348 + * doesn't know what to do with such pages and ->sync_sb()/->writepages()
73349 + * don't see them, because these methods operate on atoms.
73350 + *
73351 +	 * (3) can lead to the loss of data: suppose we have a dirty page whose
73352 +	 * dirty, captured jnode belongs to some atom. As part of early flush
73353 +	 * (for example) the page was written out, and the dirty bit was
73354 +	 * cleared on both the page and the jnode. After this the page is
73355 +	 * modified through the mapping, but the kernel doesn't notice and just
73356 +	 * discards the page and the jnode as part of commit. (XXX actually it
73357 +	 * doesn't, because to reclaim a page ->releasepage() has to be called,
73358 +	 * and before that the dirty bit will be transferred to the struct page).
73359 + *
73360 + */
73361 +
73362 +#include "debug.h"
73363 +#include "txnmgr.h"
73364 +#include "jnode.h"
73365 +#include "znode.h"
73366 +#include "block_alloc.h"
73367 +#include "tree.h"
73368 +#include "wander.h"
73369 +#include "ktxnmgrd.h"
73370 +#include "super.h"
73371 +#include "page_cache.h"
73372 +#include "reiser4.h"
73373 +#include "vfs_ops.h"
73374 +#include "inode.h"
73375 +#include "flush.h"
73376 +
73377 +#include <asm/atomic.h>
73378 +#include <linux/types.h>
73379 +#include <linux/fs.h>
73380 +#include <linux/mm.h>
73381 +#include <linux/slab.h>
73382 +#include <linux/pagemap.h>
73383 +#include <linux/writeback.h>
73384 +#include <linux/swap.h> /* for totalram_pages */
73385 +
73386 +static void atom_free(txn_atom * atom);
73387 +
73388 +static int commit_txnh(txn_handle * txnh);
73389 +
73390 +static void wakeup_atom_waitfor_list(txn_atom * atom);
73391 +static void wakeup_atom_waiting_list(txn_atom * atom);
73392 +
73393 +static void capture_assign_txnh_nolock(txn_atom * atom, txn_handle * txnh);
73394 +
73395 +static void capture_assign_block_nolock(txn_atom * atom, jnode * node);
73396 +
73397 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node);
73398 +
73399 +static int capture_init_fusion(jnode * node, txn_handle * txnh,
73400 + txn_capture mode);
73401 +
73402 +static int capture_fuse_wait(txn_handle *, txn_atom *, txn_atom *, txn_capture);
73403 +
73404 +static void capture_fuse_into(txn_atom * small, txn_atom * large);
73405 +
73406 +void reiser4_invalidate_list(struct list_head *);
73407 +
73408 +/* GENERIC STRUCTURES */
73409 +
73410 +typedef struct _txn_wait_links txn_wait_links;
73411 +
73412 +struct _txn_wait_links {
73413 + lock_stack *_lock_stack;
73414 + struct list_head _fwaitfor_link;
73415 + struct list_head _fwaiting_link;
73416 + int (*waitfor_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
73417 + int (*waiting_cb) (txn_atom * atom, struct _txn_wait_links * wlinks);
73418 +};
73419 +
73420 +/* FIXME: In theory, we should be using the slab cache init & destructor
73421 + methods instead of, e.g., jnode_init, etc. */
73422 +static struct kmem_cache *_atom_slab = NULL;
73423 +/* this is for user-visible, cross system-call transactions. */
73424 +static struct kmem_cache *_txnh_slab = NULL;
73425 +
73426 +/**
73427 + * init_txnmgr_static - create transaction manager slab caches
73428 + *
73429 + * Initializes the txn_atom and txn_handle caches. This is part of reiser4
73430 + * module initialization.
73431 + */
73432 +int init_txnmgr_static(void)
73433 +{
73434 + assert("jmacd-600", _atom_slab == NULL);
73435 + assert("jmacd-601", _txnh_slab == NULL);
73436 +
73437 + ON_DEBUG(atomic_set(&flush_cnt, 0));
73438 +
73439 + _atom_slab = kmem_cache_create("txn_atom", sizeof(txn_atom), 0,
73440 + SLAB_HWCACHE_ALIGN |
73441 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
73442 + if (_atom_slab == NULL)
73443 + return RETERR(-ENOMEM);
73444 +
73445 + _txnh_slab = kmem_cache_create("txn_handle", sizeof(txn_handle), 0,
73446 + SLAB_HWCACHE_ALIGN, NULL, NULL);
73447 + if (_txnh_slab == NULL) {
73448 + kmem_cache_destroy(_atom_slab);
73449 + _atom_slab = NULL;
73450 + return RETERR(-ENOMEM);
73451 + }
73452 +
73453 + return 0;
73454 +}
73455 +
73456 +/**
73457 + * done_txnmgr_static - delete txn_atom and txn_handle caches
73458 + *
73459 + * This is called on reiser4 module unloading or system shutdown.
73460 + */
73461 +void done_txnmgr_static(void)
73462 +{
73463 + destroy_reiser4_cache(&_atom_slab);
73464 + destroy_reiser4_cache(&_txnh_slab);
73465 +}
73466 +
73467 +/**
73468 + * reiser4_init_txnmgr - initialize a new transaction manager
73469 + * @mgr: pointer to transaction manager embedded in reiser4 super block
73470 + *
73471 + * This is called on mount. Makes necessary initializations.
73472 + */
73473 +void reiser4_init_txnmgr(txn_mgr *mgr)
73474 +{
73475 + assert("umka-169", mgr != NULL);
73476 +
73477 + mgr->atom_count = 0;
73478 + mgr->id_count = 1;
73479 + INIT_LIST_HEAD(&mgr->atoms_list);
73480 + spin_lock_init(&mgr->tmgr_lock);
73481 + mutex_init(&mgr->commit_mutex);
73482 +}
73483 +
73484 +/**
73485 + * reiser4_done_txnmgr - stop transaction manager
73486 + * @mgr: pointer to transaction manager embedded in reiser4 super block
73487 + *
73488 + * This is called on umount. Does sanity checks.
73489 + */
73490 +void reiser4_done_txnmgr(txn_mgr *mgr)
73491 +{
73492 + assert("umka-170", mgr != NULL);
73493 + assert("umka-1701", list_empty_careful(&mgr->atoms_list));
73494 + assert("umka-1702", mgr->atom_count == 0);
73495 +}
73496 +
73497 +/* Initialize a transaction handle. */
73498 +/* Audited by: umka (2002.06.13) */
73499 +static void txnh_init(txn_handle * txnh, txn_mode mode)
73500 +{
73501 + assert("umka-171", txnh != NULL);
73502 +
73503 + txnh->mode = mode;
73504 + txnh->atom = NULL;
73505 + reiser4_ctx_gfp_mask_set();
73506 + txnh->flags = 0;
73507 + spin_lock_init(&txnh->hlock);
73508 + INIT_LIST_HEAD(&txnh->txnh_link);
73509 +}
73510 +
73511 +#if REISER4_DEBUG
73512 +/* Check if a transaction handle is clean. */
73513 +static int txnh_isclean(txn_handle * txnh)
73514 +{
73515 + assert("umka-172", txnh != NULL);
73516 + return txnh->atom == NULL &&
73517 + LOCK_CNT_NIL(spin_locked_txnh);
73518 +}
73519 +#endif
73520 +
73521 +/* Initialize an atom. */
73522 +static void atom_init(txn_atom * atom)
73523 +{
73524 + int level;
73525 +
73526 + assert("umka-173", atom != NULL);
73527 +
73528 + memset(atom, 0, sizeof(txn_atom));
73529 +
73530 + atom->stage = ASTAGE_FREE;
73531 + atom->start_time = jiffies;
73532 +
73533 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1)
73534 + INIT_LIST_HEAD(ATOM_DIRTY_LIST(atom, level));
73535 +
73536 + INIT_LIST_HEAD(ATOM_CLEAN_LIST(atom));
73537 + INIT_LIST_HEAD(ATOM_OVRWR_LIST(atom));
73538 + INIT_LIST_HEAD(ATOM_WB_LIST(atom));
73539 + INIT_LIST_HEAD(&atom->inodes);
73540 + spin_lock_init(&(atom->alock));
73541 + /* list of transaction handles */
73542 + INIT_LIST_HEAD(&atom->txnh_list);
73543 + /* link to transaction manager's list of atoms */
73544 + INIT_LIST_HEAD(&atom->atom_link);
73545 + INIT_LIST_HEAD(&atom->fwaitfor_list);
73546 + INIT_LIST_HEAD(&atom->fwaiting_list);
73547 + blocknr_set_init(&atom->delete_set);
73548 + blocknr_set_init(&atom->wandered_map);
73549 +
73550 + init_atom_fq_parts(atom);
73551 +}
73552 +
73553 +#if REISER4_DEBUG
73554 +/* Check if an atom is clean. */
73555 +static int atom_isclean(txn_atom * atom)
73556 +{
73557 + int level;
73558 +
73559 + assert("umka-174", atom != NULL);
73560 +
73561 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
73562 + if (!list_empty_careful(ATOM_DIRTY_LIST(atom, level))) {
73563 + return 0;
73564 + }
73565 + }
73566 +
73567 + return atom->stage == ASTAGE_FREE &&
73568 + atom->txnh_count == 0 &&
73569 + atom->capture_count == 0 &&
73570 + atomic_read(&atom->refcount) == 0 &&
73571 + (&atom->atom_link == atom->atom_link.next &&
73572 + &atom->atom_link == atom->atom_link.prev) &&
73573 + list_empty_careful(&atom->txnh_list) &&
73574 + list_empty_careful(ATOM_CLEAN_LIST(atom)) &&
73575 + list_empty_careful(ATOM_OVRWR_LIST(atom)) &&
73576 + list_empty_careful(ATOM_WB_LIST(atom)) &&
73577 + list_empty_careful(&atom->fwaitfor_list) &&
73578 + list_empty_careful(&atom->fwaiting_list) &&
73579 + atom_fq_parts_are_clean(atom);
73580 +}
73581 +#endif
73582 +
73583 +/* Begin a transaction in this context. Currently this uses the reiser4_context's
73584 + trans_in_ctx, which means that transaction handles are stack-allocated. Eventually
73585 + this will be extended to allow transaction handles to span several contexts. */
73586 +/* Audited by: umka (2002.06.13) */
73587 +void reiser4_txn_begin(reiser4_context * context)
73588 +{
73589 + assert("jmacd-544", context->trans == NULL);
73590 +
73591 + context->trans = &context->trans_in_ctx;
73592 +
73593 + /* FIXME_LATER_JMACD Currently there's no way to begin a TXN_READ_FUSING
73594 + transcrash. Default should be TXN_WRITE_FUSING. Also, the _trans variable is
73595 + stack allocated right now, but we would like to allow for dynamically allocated
73596 + transcrashes that span multiple system calls.
73597 + */
73598 + txnh_init(context->trans, TXN_WRITE_FUSING);
73599 +}
73600 +
73601 +/* Finish a transaction handle context. */
73602 +int reiser4_txn_end(reiser4_context * context)
73603 +{
73604 + long ret = 0;
73605 + txn_handle *txnh;
73606 +
73607 + assert("umka-283", context != NULL);
73608 + assert("nikita-3012", reiser4_schedulable());
73609 + assert("vs-24", context == get_current_context());
73610 + assert("nikita-2967", lock_stack_isclean(get_current_lock_stack()));
73611 +
73612 + txnh = context->trans;
73613 + if (txnh != NULL) {
73614 + if (txnh->atom != NULL)
73615 + ret = commit_txnh(txnh);
73616 + assert("jmacd-633", txnh_isclean(txnh));
73617 + context->trans = NULL;
73618 + }
73619 + return ret;
73620 +}
73621 +
73622 +void reiser4_txn_restart(reiser4_context * context)
73623 +{
73624 + reiser4_txn_end(context);
73625 + reiser4_preempt_point();
73626 + reiser4_txn_begin(context);
73627 +}
73628 +
73629 +void reiser4_txn_restart_current(void)
73630 +{
73631 + reiser4_txn_restart(get_current_context());
73632 +}
73633 +
73634 +/* TXN_ATOM */
73635 +
73636 +/* Get the atom belonging to a txnh, which is not locked on entry. Returns with the txnh
73637 +   locked, and with the atom locked if it is not NULL. This performs the necessary
73638 +   spin_trylock to break the lock-ordering cycle. May return NULL. */
73639 +static txn_atom *txnh_get_atom(txn_handle * txnh)
73640 +{
73641 + txn_atom *atom;
73642 +
73643 + assert("umka-180", txnh != NULL);
73644 + assert_spin_not_locked(&(txnh->hlock));
73645 +
73646 + while (1) {
73647 + spin_lock_txnh(txnh);
73648 + atom = txnh->atom;
73649 +
73650 + if (atom == NULL)
73651 + break;
73652 +
73653 + if (spin_trylock_atom(atom))
73654 + break;
73655 +
73656 + atomic_inc(&atom->refcount);
73657 +
73658 + spin_unlock_txnh(txnh);
73659 + spin_lock_atom(atom);
73660 + spin_lock_txnh(txnh);
73661 +
73662 + if (txnh->atom == atom) {
73663 + atomic_dec(&atom->refcount);
73664 + break;
73665 + }
73666 +
73667 + spin_unlock_txnh(txnh);
73668 + atom_dec_and_unlock(atom);
73669 + }
73670 +
73671 + return atom;
73672 +}
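+/*
+ * Editorial note (not part of the original patch): txnh_get_atom() above and
+ * jnode_get_atom() below use the same idiom for taking two spinlocks whose
+ * canonical order is "atom first, txnh/jnode second" while starting from the
+ * inner lock. A minimal sketch of the idiom, with hypothetical names:
+ */
+#if 0
+	for (;;) {
+		spin_lock(&inner->lock);
+		atom = inner->atom;		/* pointer guarded by inner */
+		if (atom == NULL || spin_trylock(&atom->lock))
+			break;			/* no atom, or no inversion */
+		atomic_inc(&atom->refcount);	/* pin atom across unlock */
+		spin_unlock(&inner->lock);
+		spin_lock(&atom->lock);		/* proper order: atom, inner */
+		spin_lock(&inner->lock);
+		if (inner->atom == atom) {	/* pointer survived the race */
+			atomic_dec(&atom->refcount);
+			break;
+		}
+		spin_unlock(&inner->lock);
+		atom_dec_and_unlock(atom);	/* drop pin; may free atom */
+	}
+#endif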
73673 +
73674 +/* Get the current atom and spinlock it if the current atom is present. May return NULL. */
73675 +txn_atom *get_current_atom_locked_nocheck(void)
73676 +{
73677 + reiser4_context *cx;
73678 + txn_atom *atom;
73679 + txn_handle *txnh;
73680 +
73681 + cx = get_current_context();
73682 + assert("zam-437", cx != NULL);
73683 +
73684 + txnh = cx->trans;
73685 + assert("zam-435", txnh != NULL);
73686 +
73687 + atom = txnh_get_atom(txnh);
73688 +
73689 + spin_unlock_txnh(txnh);
73690 + return atom;
73691 +}
73692 +
73693 +/* Get the atom belonging to a jnode, which is initially locked. Return with
73694 + both jnode and atom locked. This performs the necessary spin_trylock to
73695 + break the lock-ordering cycle. Assumes the jnode is already locked, and
73696 + returns NULL if atom is not set. */
73697 +txn_atom *jnode_get_atom(jnode * node)
73698 +{
73699 + txn_atom *atom;
73700 +
73701 + assert("umka-181", node != NULL);
73702 +
73703 + while (1) {
73704 + assert_spin_locked(&(node->guard));
73705 +
73706 + atom = node->atom;
73707 + /* node is not in any atom */
73708 + if (atom == NULL)
73709 + break;
73710 +
73711 + /* If atom is not locked, grab the lock and return */
73712 + if (spin_trylock_atom(atom))
73713 + break;
73714 +
73715 +		/* At least one jnode belongs to this atom; this guarantees
73716 +		 * that atom->refcount > 0, so we can safely increment it. */
73717 + atomic_inc(&atom->refcount);
73718 + spin_unlock_jnode(node);
73719 +
73720 + /* re-acquire spin locks in the right order */
73721 + spin_lock_atom(atom);
73722 + spin_lock_jnode(node);
73723 +
73724 + /* check if node still points to the same atom. */
73725 + if (node->atom == atom) {
73726 + atomic_dec(&atom->refcount);
73727 + break;
73728 + }
73729 +
73730 + /* releasing of atom lock and reference requires not holding
73731 + * locks on jnodes. */
73732 + spin_unlock_jnode(node);
73733 +
73734 +		/* We are not sure that this atom has any references other
73735 +		 * than our own, so we must call the proper function, which may
73736 +		 * free the atom if the last reference is released. */
73737 + atom_dec_and_unlock(atom);
73738 +
73739 +		/* lock the jnode again to get a valid node->atom pointer
73740 +		 * value. */
73741 + spin_lock_jnode(node);
73742 + }
73743 +
73744 + return atom;
73745 +}
73746 +
73747 +/* Returns true if @node is dirty and part of the same atom as one of its neighbors. Used
73748 + by flush code to indicate whether the next node (in some direction) is suitable for
73749 + flushing. */
73750 +int
73751 +same_slum_check(jnode * node, jnode * check, int alloc_check, int alloc_value)
73752 +{
73753 + int compat;
73754 + txn_atom *atom;
73755 +
73756 + assert("umka-182", node != NULL);
73757 + assert("umka-183", check != NULL);
73758 +
73759 + /* Not sure what this function is supposed to do if supplied with @check that is
73760 + neither formatted nor unformatted (bitmap or so). */
73761 + assert("nikita-2373", jnode_is_znode(check)
73762 + || jnode_is_unformatted(check));
73763 +
73764 + /* Need a lock on CHECK to get its atom and to check various state bits.
73765 + Don't need a lock on NODE once we get the atom lock. */
73766 +	/* It is not enough to lock the two nodes and check (node->atom ==
73767 +	   check->atom), because an atom could be locked and in the middle of
73768 +	   being fused at that moment; jnodes of an atom in that state can
73769 +	   point to different objects, but the atom is the same. */
73770 + spin_lock_jnode(check);
73771 +
73772 + atom = jnode_get_atom(check);
73773 +
73774 + if (atom == NULL) {
73775 + compat = 0;
73776 + } else {
73777 + compat = (node->atom == atom && JF_ISSET(check, JNODE_DIRTY));
73778 +
73779 + if (compat && jnode_is_znode(check)) {
73780 + compat &= znode_is_connected(JZNODE(check));
73781 + }
73782 +
73783 + if (compat && alloc_check) {
73784 + compat &= (alloc_value == jnode_is_flushprepped(check));
73785 + }
73786 +
73787 + spin_unlock_atom(atom);
73788 + }
73789 +
73790 + spin_unlock_jnode(check);
73791 +
73792 + return compat;
73793 +}
73794 +
73795 +/* Decrement the atom's reference count and if it falls to zero, free it. */
73796 +void atom_dec_and_unlock(txn_atom * atom)
73797 +{
73798 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73799 +
73800 + assert("umka-186", atom != NULL);
73801 + assert_spin_locked(&(atom->alock));
73802 + assert("zam-1039", atomic_read(&atom->refcount) > 0);
73803 +
73804 + if (atomic_dec_and_test(&atom->refcount)) {
73805 + /* take txnmgr lock and atom lock in proper order. */
73806 + if (!spin_trylock_txnmgr(mgr)) {
73807 + /* This atom should exist after we re-acquire its
73808 + * spinlock, so we increment its reference counter. */
73809 + atomic_inc(&atom->refcount);
73810 + spin_unlock_atom(atom);
73811 + spin_lock_txnmgr(mgr);
73812 + spin_lock_atom(atom);
73813 +
73814 + if (!atomic_dec_and_test(&atom->refcount)) {
73815 + spin_unlock_atom(atom);
73816 + spin_unlock_txnmgr(mgr);
73817 + return;
73818 + }
73819 + }
73820 + assert_spin_locked(&(mgr->tmgr_lock));
73821 + atom_free(atom);
73822 + spin_unlock_txnmgr(mgr);
73823 + } else
73824 + spin_unlock_atom(atom);
73825 +}
73826 +
73827 +/* Create new atom and connect it to given transaction handle. This adds the
73828 + atom to the transaction manager's list and sets its reference count to 1, an
73829 + artificial reference which is kept until it commits. We play strange games
73830 + to avoid allocation under jnode & txnh spinlocks.*/
73831 +
73832 +static int atom_begin_and_assign_to_txnh(txn_atom ** atom_alloc, txn_handle * txnh)
73833 +{
73834 + txn_atom *atom;
73835 + txn_mgr *mgr;
73836 +
73837 + if (REISER4_DEBUG && rofs_tree(current_tree)) {
73838 + warning("nikita-3366", "Creating atom on rofs");
73839 + dump_stack();
73840 + }
73841 +
73842 + if (*atom_alloc == NULL) {
73843 + (*atom_alloc) = kmem_cache_alloc(_atom_slab,
73844 + reiser4_ctx_gfp_mask_get());
73845 +
73846 + if (*atom_alloc == NULL)
73847 + return RETERR(-ENOMEM);
73848 + }
73849 +
73850 + /* and, also, txnmgr spin lock should be taken before jnode and txnh
73851 + locks. */
73852 + mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73853 + spin_lock_txnmgr(mgr);
73854 + spin_lock_txnh(txnh);
73855 +
73856 +	/* Check whether a new atom is still needed */
73857 +	if (txnh->atom != NULL) {
73858 +		/* NOTE-NIKITA it would probably be better to free
73859 +		 * atom_alloc here than to thread it up to reiser4_try_capture() */
73860 +
73861 + spin_unlock_txnh(txnh);
73862 + spin_unlock_txnmgr(mgr);
73863 +
73864 + return -E_REPEAT;
73865 + }
73866 +
73867 + atom = *atom_alloc;
73868 + *atom_alloc = NULL;
73869 +
73870 + atom_init(atom);
73871 +
73872 + assert("jmacd-17", atom_isclean(atom));
73873 +
73874 + /*
73875 +	 * Lock ordering is broken here. That is ok, as long as @atom is new
73876 +	 * and inaccessible to others. We can't use spin_lock_atom() or
73877 +	 * spin_lock(&atom->alock) because they care about locking
73878 +	 * dependencies; spin_trylock_atom() doesn't.
73879 + */
73880 + check_me("", spin_trylock_atom(atom));
73881 +
73882 + /* add atom to the end of transaction manager's list of atoms */
73883 + list_add_tail(&atom->atom_link, &mgr->atoms_list);
73884 + atom->atom_id = mgr->id_count++;
73885 + mgr->atom_count += 1;
73886 +
73887 + /* Release txnmgr lock */
73888 + spin_unlock_txnmgr(mgr);
73889 +
73890 + /* One reference until it commits. */
73891 + atomic_inc(&atom->refcount);
73892 + atom->stage = ASTAGE_CAPTURE_FUSE;
73893 + atom->super = reiser4_get_current_sb();
73894 + capture_assign_txnh_nolock(atom, txnh);
73895 +
73896 + spin_unlock_atom(atom);
73897 + spin_unlock_txnh(txnh);
73898 +
73899 + return -E_REPEAT;
73900 +}
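+/*
+ * Editorial note (not part of the original patch): this helper returns
+ * -E_REPEAT even when it succeeds, so its callers are structured as retry
+ * loops around a preallocated atom. A hedged sketch of the calling
+ * convention (cleanup of an unused preallocation is hypothetical here):
+ */
+#if 0
+	txn_atom *atom_alloc = NULL;
+	int ret;
+
+	do {
+		/* either consumes *atom_alloc or leaves it for a later pass */
+		ret = try_capture_block(txnh, node, mode, &atom_alloc);
+	} while (ret == -E_REPEAT);
+	if (atom_alloc != NULL)
+		kmem_cache_free(_atom_slab, atom_alloc);
+#endif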
73901 +
73902 +/* Return true if an atom is currently "open". */
73903 +static int atom_isopen(const txn_atom * atom)
73904 +{
73905 + assert("umka-185", atom != NULL);
73906 +
73907 + return atom->stage > 0 && atom->stage < ASTAGE_PRE_COMMIT;
73908 +}
73909 +
73910 +/* Return the number of pointers to this atom that must be updated during fusion. This
73911 + approximates the amount of work to be done. Fusion chooses the atom with fewer
73912 + pointers to fuse into the atom with more pointers. */
73913 +static int atom_pointer_count(const txn_atom * atom)
73914 +{
73915 + assert("umka-187", atom != NULL);
73916 +
73917 + /* This is a measure of the amount of work needed to fuse this atom
73918 + * into another. */
73919 + return atom->txnh_count + atom->capture_count;
73920 +}
73921 +
73922 +/* Called holding the atom lock, this removes the atom from the transaction manager list
73923 + and frees it. */
73924 +static void atom_free(txn_atom * atom)
73925 +{
73926 + txn_mgr *mgr = &get_super_private(reiser4_get_current_sb())->tmgr;
73927 +
73928 + assert("umka-188", atom != NULL);
73929 + assert_spin_locked(&(atom->alock));
73930 +
73931 + /* Remove from the txn_mgr's atom list */
73932 + assert_spin_locked(&(mgr->tmgr_lock));
73933 + mgr->atom_count -= 1;
73934 + list_del_init(&atom->atom_link);
73935 +
73936 + /* Clean the atom */
73937 + assert("jmacd-16",
73938 + (atom->stage == ASTAGE_INVALID || atom->stage == ASTAGE_DONE));
73939 + atom->stage = ASTAGE_FREE;
73940 +
73941 + blocknr_set_destroy(&atom->delete_set);
73942 + blocknr_set_destroy(&atom->wandered_map);
73943 +
73944 + assert("jmacd-16", atom_isclean(atom));
73945 +
73946 + spin_unlock_atom(atom);
73947 +
73948 + kmem_cache_free(_atom_slab, atom);
73949 +}
73950 +
73951 +static int atom_is_dotard(const txn_atom * atom)
73952 +{
73953 + return time_after(jiffies, atom->start_time +
73954 + get_current_super_private()->tmgr.atom_max_age);
73955 +}
73956 +
73957 +static int atom_can_be_committed(txn_atom * atom)
73958 +{
73959 + assert_spin_locked(&(atom->alock));
73960 + assert("zam-885", atom->txnh_count > atom->nr_waiters);
73961 + return atom->txnh_count == atom->nr_waiters + 1;
73962 +}
73963 +
73964 +/* Return true if an atom should commit now. This is determined by aging, atom
73965 + size or atom flags. */
73966 +static int atom_should_commit(const txn_atom * atom)
73967 +{
73968 + assert("umka-189", atom != NULL);
73969 + return
73970 + (atom->flags & ATOM_FORCE_COMMIT) ||
73971 + ((unsigned)atom_pointer_count(atom) >
73972 + get_current_super_private()->tmgr.atom_max_size)
73973 + || atom_is_dotard(atom);
73974 +}
73975 +
73976 +/* return 1 if current atom exists and requires commit. */
73977 +int current_atom_should_commit(void)
73978 +{
73979 + txn_atom *atom;
73980 + int result = 0;
73981 +
73982 + atom = get_current_atom_locked_nocheck();
73983 + if (atom) {
73984 + result = atom_should_commit(atom);
73985 + spin_unlock_atom(atom);
73986 + }
73987 + return result;
73988 +}
73989 +
73990 +static int atom_should_commit_asap(const txn_atom * atom)
73991 +{
73992 + unsigned int captured;
73993 + unsigned int pinnedpages;
73994 +
73995 + assert("nikita-3309", atom != NULL);
73996 +
73997 + captured = (unsigned)atom->capture_count;
73998 + pinnedpages = (captured >> PAGE_CACHE_SHIFT) * sizeof(znode);
73999 +
74000 + return (pinnedpages > (totalram_pages >> 3)) || (atom->flushed > 100);
74001 +}
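+/*
+ * Editorial note (not part of the original patch): the heuristic above asks
+ * for an immediate commit once the memory pinned by captured nodes --
+ * roughly capture_count * sizeof(znode), expressed in pages -- exceeds one
+ * eighth of total RAM (totalram_pages >> 3), or once the atom's ->flushed
+ * counter has passed 100.
+ */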
74002 +
74003 +static jnode *find_first_dirty_in_list(struct list_head *head, int flags)
74004 +{
74005 + jnode *first_dirty;
74006 +
74007 + list_for_each_entry(first_dirty, head, capture_link) {
74008 + if (!(flags & JNODE_FLUSH_COMMIT)) {
74009 + /*
74010 +			 * skip jnodes which "heard banshee" or have active
74011 +			 * I/O
74012 + */
74013 + if (JF_ISSET(first_dirty, JNODE_HEARD_BANSHEE) ||
74014 + JF_ISSET(first_dirty, JNODE_WRITEBACK))
74015 + continue;
74016 + }
74017 + return first_dirty;
74018 + }
74019 + return NULL;
74020 +}
74021 +
74022 +/* Get the first dirty node from the atom's dirty_nodes[n] lists; return NULL if the atom
74023 +   has no dirty nodes on its lists */
74024 +jnode *find_first_dirty_jnode(txn_atom * atom, int flags)
74025 +{
74026 + jnode *first_dirty;
74027 + tree_level level;
74028 +
74029 + assert_spin_locked(&(atom->alock));
74030 +
74031 + /* The flush starts from LEAF_LEVEL (=1). */
74032 + for (level = 1; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
74033 + if (list_empty_careful(ATOM_DIRTY_LIST(atom, level)))
74034 + continue;
74035 +
74036 + first_dirty =
74037 + find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, level),
74038 + flags);
74039 + if (first_dirty)
74040 + return first_dirty;
74041 + }
74042 +
74043 + /* znode-above-root is on the list #0. */
74044 + return find_first_dirty_in_list(ATOM_DIRTY_LIST(atom, 0), flags);
74045 +}
74046 +
74047 +static void dispatch_wb_list(txn_atom * atom, flush_queue_t * fq)
74048 +{
74049 + jnode *cur;
74050 +
74051 + assert("zam-905", atom_is_protected(atom));
74052 +
74053 + cur = list_entry(ATOM_WB_LIST(atom)->next, jnode, capture_link);
74054 + while (ATOM_WB_LIST(atom) != &cur->capture_link) {
74055 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
74056 +
74057 + spin_lock_jnode(cur);
74058 + if (!JF_ISSET(cur, JNODE_WRITEBACK)) {
74059 + if (JF_ISSET(cur, JNODE_DIRTY)) {
74060 + queue_jnode(fq, cur);
74061 + } else {
74062 + /* move jnode to atom's clean list */
74063 + list_move_tail(&cur->capture_link,
74064 + ATOM_CLEAN_LIST(atom));
74065 + }
74066 + }
74067 + spin_unlock_jnode(cur);
74068 +
74069 + cur = next;
74070 + }
74071 +}
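+/*
+ * Editorial note (not part of the original patch): the open-coded walk in
+ * dispatch_wb_list() is the "safe" iteration pattern -- the next element is
+ * read before the current one may be moved to another list. An equivalent
+ * sketch using the standard list helper:
+ */
+#if 0
+	jnode *cur, *next;
+
+	list_for_each_entry_safe(cur, next, ATOM_WB_LIST(atom), capture_link) {
+		spin_lock_jnode(cur);
+		/* ... requeue or move cur exactly as above ... */
+		spin_unlock_jnode(cur);
+	}
+#endif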
74072 +
74073 +/* Scan current atom->writeback_nodes list, re-submit dirty and !writeback
74074 + * jnodes to disk. */
74075 +static int submit_wb_list(void)
74076 +{
74077 + int ret;
74078 + flush_queue_t *fq;
74079 +
74080 + fq = get_fq_for_current_atom();
74081 + if (IS_ERR(fq))
74082 + return PTR_ERR(fq);
74083 +
74084 + dispatch_wb_list(fq->atom, fq);
74085 + spin_unlock_atom(fq->atom);
74086 +
74087 + ret = reiser4_write_fq(fq, NULL, 1);
74088 + reiser4_fq_put(fq);
74089 +
74090 + return ret;
74091 +}
74092 +
74093 +/* Wait for completion of all writes; re-submit the atom's writeback list if needed. */
74094 +static int current_atom_complete_writes(void)
74095 +{
74096 + int ret;
74097 +
74098 +	/* Each jnode on that list was modified and dirtied while it already
74099 +	 * had an i/o request running. After i/o completion we have to
74100 +	 * resubmit them to disk again. */
74101 + ret = submit_wb_list();
74102 + if (ret < 0)
74103 + return ret;
74104 +
74105 +	/* Wait for all i/o to complete */
74106 + ret = current_atom_finish_all_fq();
74107 + if (ret)
74108 + return ret;
74109 +
74110 +	/* Scan the wb list again; all i/o should be completed by now, so we
74111 +	 * re-submit dirty nodes to disk */
74112 + ret = submit_wb_list();
74113 + if (ret < 0)
74114 + return ret;
74115 +
74116 +	/* Wait for all nodes we just submitted */
74117 + return current_atom_finish_all_fq();
74118 +}
74119 +
74120 +#if REISER4_DEBUG
74121 +
74122 +static void reiser4_info_atom(const char *prefix, const txn_atom * atom)
74123 +{
74124 + if (atom == NULL) {
74125 + printk("%s: no atom\n", prefix);
74126 + return;
74127 + }
74128 +
74129 + printk("%s: refcount: %i id: %i flags: %x txnh_count: %i"
74130 + " capture_count: %i stage: %x start: %lu, flushed: %i\n", prefix,
74131 + atomic_read(&atom->refcount), atom->atom_id, atom->flags,
74132 + atom->txnh_count, atom->capture_count, atom->stage,
74133 + atom->start_time, atom->flushed);
74134 +}
74135 +
74136 +#else /* REISER4_DEBUG */
74137 +
74138 +static inline void reiser4_info_atom(const char *prefix, const txn_atom * atom) {}
74139 +
74140 +#endif /* REISER4_DEBUG */
74141 +
74142 +#define TOOMANYFLUSHES (1 << 13)
74143 +
74144 +/* Called with the atom locked and with no open "active" transaction handles except
74145 +   ours, this function calls flush_current_atom() until all dirty nodes are
74146 +   processed. Then it initiates commit processing.
74147 +
74148 +   Called by the single remaining open "active" txnh, which is closing. Other
74149 +   open txnhs belong to processes which wait for atom commit in the commit_txnh()
74150 +   routine. They are counted as "waiters" in atom->nr_waiters. Therefore, as
74151 +   long as we hold the atom lock, none of the jnodes can be captured and/or
74152 +   locked.
74153 +
74154 + Return value is an error code if commit fails.
74155 +*/
74156 +static int commit_current_atom(long *nr_submitted, txn_atom ** atom)
74157 +{
74158 + reiser4_super_info_data *sbinfo = get_current_super_private();
74159 + long ret = 0;
74160 + /* how many times jnode_flush() was called as a part of attempt to
74161 + * commit this atom. */
74162 + int flushiters;
74163 +
74164 + assert("zam-888", atom != NULL && *atom != NULL);
74165 + assert_spin_locked(&((*atom)->alock));
74166 + assert("zam-887", get_current_context()->trans->atom == *atom);
74167 + assert("jmacd-151", atom_isopen(*atom));
74168 +
74169 + assert("nikita-3184",
74170 + get_current_super_private()->delete_mutex_owner != current);
74171 +
74172 + for (flushiters = 0;; ++flushiters) {
74173 + ret =
74174 + flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS |
74175 + JNODE_FLUSH_COMMIT,
74176 + LONG_MAX /* nr_to_write */ ,
74177 + nr_submitted, atom, NULL);
74178 + if (ret != -E_REPEAT)
74179 + break;
74180 +
74181 +		/* if the atom's dirty list contains a znode which is
74182 +		   HEARD_BANSHEE and is locked, we have to allow the lock owner
74183 +		   to continue and uncapture that znode */
74184 + reiser4_preempt_point();
74185 +
74186 + *atom = get_current_atom_locked();
74187 + if (flushiters > TOOMANYFLUSHES && IS_POW(flushiters)) {
74188 + warning("nikita-3176",
74189 + "Flushing like mad: %i", flushiters);
74190 + reiser4_info_atom("atom", *atom);
74191 + DEBUGON(flushiters > (1 << 20));
74192 + }
74193 + }
74194 +
74195 + if (ret)
74196 + return ret;
74197 +
74198 + assert_spin_locked(&((*atom)->alock));
74199 +
74200 + if (!atom_can_be_committed(*atom)) {
74201 + spin_unlock_atom(*atom);
74202 + return RETERR(-E_REPEAT);
74203 + }
74204 +
74205 + if ((*atom)->capture_count == 0)
74206 + goto done;
74207 +
74208 +	/* Up to this point we have been flushing, and after flush is called
74209 +	   we return -E_REPEAT. Now we can commit. We cannot return -E_REPEAT
74210 +	   at this point; the commit should be successful. */
74211 + reiser4_atom_set_stage(*atom, ASTAGE_PRE_COMMIT);
74212 + ON_DEBUG(((*atom)->committer = current));
74213 + spin_unlock_atom(*atom);
74214 +
74215 + ret = current_atom_complete_writes();
74216 + if (ret)
74217 + return ret;
74218 +
74219 + assert("zam-906", list_empty(ATOM_WB_LIST(*atom)));
74220 +
74221 +	/* isolate the critical code path which should be executed by only one
74222 +	 * thread, using the tmgr mutex */
74223 + mutex_lock(&sbinfo->tmgr.commit_mutex);
74224 +
74225 + ret = reiser4_write_logs(nr_submitted);
74226 + if (ret < 0)
74227 + reiser4_panic("zam-597", "write log failed (%ld)\n", ret);
74228 +
74229 +	/* The atom->ovrwr_nodes list is processed with the commit mutex held
74230 +	   because of bitmap nodes, which are captured in a special way in
74231 +	   reiser4_pre_commit_hook_bitmap(); that way does not include
74232 +	   capture_fuse_wait() as the capturing of other nodes does -- the
74233 +	   commit mutex is used for transaction isolation instead. */
74234 + reiser4_invalidate_list(ATOM_OVRWR_LIST(*atom));
74235 + mutex_unlock(&sbinfo->tmgr.commit_mutex);
74236 +
74237 + reiser4_invalidate_list(ATOM_CLEAN_LIST(*atom));
74238 + reiser4_invalidate_list(ATOM_WB_LIST(*atom));
74239 + assert("zam-927", list_empty(&(*atom)->inodes));
74240 +
74241 + spin_lock_atom(*atom);
74242 + done:
74243 + reiser4_atom_set_stage(*atom, ASTAGE_DONE);
74244 + ON_DEBUG((*atom)->committer = NULL);
74245 +
74246 + /* Atom's state changes, so wake up everybody waiting for this
74247 + event. */
74248 + wakeup_atom_waiting_list(*atom);
74249 +
74250 + /* Decrement the "until commit" reference, at least one txnh (the caller) is
74251 + still open. */
74252 + atomic_dec(&(*atom)->refcount);
74253 +
74254 + assert("jmacd-1070", atomic_read(&(*atom)->refcount) > 0);
74255 + assert("jmacd-1062", (*atom)->capture_count == 0);
74256 + BUG_ON((*atom)->capture_count != 0);
74257 + assert_spin_locked(&((*atom)->alock));
74258 +
74259 + return ret;
74260 +}
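+/*
+ * Editorial summary (not part of the original patch) of the commit path
+ * implemented by commit_current_atom():
+ *
+ *   1. call flush_current_atom() until it stops returning -E_REPEAT;
+ *   2. verify atom_can_be_committed() (all other handles are waiters);
+ *   3. advance the atom to ASTAGE_PRE_COMMIT;
+ *   4. current_atom_complete_writes() to drain the writeback list;
+ *   5. reiser4_write_logs() under the tmgr commit mutex;
+ *   6. invalidate the ovrwr, clean and wb lists (uncapture all jnodes);
+ *   7. advance the atom to ASTAGE_DONE and wake all waiters.
+ */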
74261 +
74262 +/* TXN_TXNH */
74263 +
74264 +/**
74265 + * force_commit_atom - commit current atom and wait for commit completion
74266 + * @txnh: transaction handle attached to the atom
74267 + *
74268 + * Commits the current atom and waits for commit completion; the current atom
74269 + * and @txnh have to be spinlocked before the call, and are unlocked on exit.
74270 + */
74271 +int force_commit_atom(txn_handle *txnh)
74272 +{
74273 + txn_atom *atom;
74274 +
74275 + assert("zam-837", txnh != NULL);
74276 + assert_spin_locked(&(txnh->hlock));
74277 + assert("nikita-2966", lock_stack_isclean(get_current_lock_stack()));
74278 +
74279 + atom = txnh->atom;
74280 +
74281 + assert("zam-834", atom != NULL);
74282 + assert_spin_locked(&(atom->alock));
74283 +
74284 + /*
74285 + * Set flags for atom and txnh: forcing atom commit and waiting for
74286 + * commit completion
74287 + */
74288 + txnh->flags |= TXNH_WAIT_COMMIT;
74289 + atom->flags |= ATOM_FORCE_COMMIT;
74290 +
74291 + spin_unlock_txnh(txnh);
74292 + spin_unlock_atom(atom);
74293 +
74294 + /* commit is here */
74295 + reiser4_txn_restart_current();
74296 + return 0;
74297 +}
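+/*
+ * Editorial note (not part of the original patch): a hedged sketch of how a
+ * sync-style caller drives force_commit_atom() -- both spinlocks are taken
+ * first, and the commit itself happens inside
+ * reiser4_txn_restart_current():
+ */
+#if 0
+	atom = get_current_atom_locked();	/* atom spinlocked */
+	spin_lock_txnh(txnh);			/* txnh spinlocked */
+	ret = force_commit_atom(txnh);		/* unlocks both, commits and
+						 * waits for completion */
+#endif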
74298 +
74299 +/* Called to force commit of any outstanding atoms. @commit_all_atoms controls
74300 + * whether we commit all atoms including new ones which are created after this
74301 + * function is called. */
74302 +int txnmgr_force_commit_all(struct super_block *super, int commit_all_atoms)
74303 +{
74304 + int ret;
74305 + txn_atom *atom;
74306 + txn_mgr *mgr;
74307 + txn_handle *txnh;
74308 + unsigned long start_time = jiffies;
74309 + reiser4_context *ctx = get_current_context();
74310 +
74311 + assert("nikita-2965", lock_stack_isclean(get_current_lock_stack()));
74312 + assert("nikita-3058", reiser4_commit_check_locks());
74313 +
74314 + reiser4_txn_restart_current();
74315 +
74316 + mgr = &get_super_private(super)->tmgr;
74317 +
74318 + txnh = ctx->trans;
74319 +
74320 + again:
74321 +
74322 + spin_lock_txnmgr(mgr);
74323 +
74324 + list_for_each_entry(atom, &mgr->atoms_list, atom_link) {
74325 + spin_lock_atom(atom);
74326 +
74327 +		/* Commit any atom which can be committed. If @commit_all_atoms
74328 +		 * is not set we commit only atoms which were created before
74329 +		 * this call was started. */
74330 + if (commit_all_atoms
74331 + || time_before_eq(atom->start_time, start_time)) {
74332 + if (atom->stage <= ASTAGE_POST_COMMIT) {
74333 + spin_unlock_txnmgr(mgr);
74334 +
74335 + if (atom->stage < ASTAGE_PRE_COMMIT) {
74336 + spin_lock_txnh(txnh);
74337 + /* Add force-context txnh */
74338 + capture_assign_txnh_nolock(atom, txnh);
74339 + ret = force_commit_atom(txnh);
74340 + if (ret)
74341 + return ret;
74342 + } else
74343 + /* wait atom commit */
74344 + reiser4_atom_wait_event(atom);
74345 +
74346 + goto again;
74347 + }
74348 + }
74349 +
74350 + spin_unlock_atom(atom);
74351 + }
74352 +
74353 +#if REISER4_DEBUG
74354 + if (commit_all_atoms) {
74355 + reiser4_super_info_data *sbinfo = get_super_private(super);
74356 + spin_lock_reiser4_super(sbinfo);
74357 + assert("zam-813",
74358 + sbinfo->blocks_fake_allocated_unformatted == 0);
74359 + assert("zam-812", sbinfo->blocks_fake_allocated == 0);
74360 + spin_unlock_reiser4_super(sbinfo);
74361 + }
74362 +#endif
74363 +
74364 + spin_unlock_txnmgr(mgr);
74365 +
74366 + return 0;
74367 +}
74368 +
74369 +/* check whether commit_some_atoms() can commit @atom. Locking is up to the
74370 + * caller */
74371 +static int atom_is_committable(txn_atom * atom)
74372 +{
74373 + return
74374 + atom->stage < ASTAGE_PRE_COMMIT &&
74375 + atom->txnh_count == atom->nr_waiters && atom_should_commit(atom);
74376 +}
74377 +
74378 +/* called periodically from ktxnmgrd to commit old atoms. Releases the ktxnmgrd
74379 + * spin lock at exit */
74380 +int commit_some_atoms(txn_mgr * mgr)
74381 +{
74382 + int ret = 0;
74383 + txn_atom *atom;
74384 + txn_handle *txnh;
74385 + reiser4_context *ctx;
74386 + struct list_head *pos, *tmp;
74387 +
74388 + ctx = get_current_context();
74389 + assert("nikita-2444", ctx != NULL);
74390 +
74391 + txnh = ctx->trans;
74392 + spin_lock_txnmgr(mgr);
74393 +
74394 + /*
74395 +	 * this is to avoid a gcc warning that atom might be used
74396 +	 * uninitialized
74397 + */
74398 + atom = NULL;
74399 +
74400 + /* look for atom to commit */
74401 + list_for_each_safe(pos, tmp, &mgr->atoms_list) {
74402 + atom = list_entry(pos, txn_atom, atom_link);
74403 + /*
74404 + * first test without taking atom spin lock, whether it is
74405 + * eligible for committing at all
74406 + */
74407 + if (atom_is_committable(atom)) {
74408 + /* now, take spin lock and re-check */
74409 + spin_lock_atom(atom);
74410 + if (atom_is_committable(atom))
74411 + break;
74412 + spin_unlock_atom(atom);
74413 + }
74414 + }
74415 +
74416 + ret = (&mgr->atoms_list == pos);
74417 + spin_unlock_txnmgr(mgr);
74418 +
74419 + if (ret) {
74420 + /* nothing found */
74421 + spin_unlock(&mgr->daemon->guard);
74422 + return 0;
74423 + }
74424 +
74425 + spin_lock_txnh(txnh);
74426 +
74427 + BUG_ON(atom == NULL);
74428 + /* Set the atom to force committing */
74429 + atom->flags |= ATOM_FORCE_COMMIT;
74430 +
74431 + /* Add force-context txnh */
74432 + capture_assign_txnh_nolock(atom, txnh);
74433 +
74434 + spin_unlock_txnh(txnh);
74435 + spin_unlock_atom(atom);
74436 +
74437 +	/* we are about to release the daemon spin lock; notify the daemon
74438 +	   that it has to rescan atoms */
74439 + mgr->daemon->rescan = 1;
74440 + spin_unlock(&mgr->daemon->guard);
74441 + reiser4_txn_restart_current();
74442 + return 0;
74443 +}
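+/*
+ * Editorial note (not part of the original patch): the atom selection loop
+ * above uses the usual lockless-check / locked-recheck idiom; abstractly,
+ * with hypothetical names:
+ */
+#if 0
+	list_for_each_entry(obj, &head, link) {
+		if (cheap_check(obj)) {		/* unlocked peek, may race */
+			spin_lock(&obj->lock);
+			if (cheap_check(obj))	/* authoritative under lock */
+				break;
+			spin_unlock(&obj->lock);
+		}
+	}
+#endif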
74444 +
74445 +static int txn_try_to_fuse_small_atom(txn_mgr * tmgr, txn_atom * atom)
74446 +{
74447 + int atom_stage;
74448 + txn_atom *atom_2;
74449 + int repeat;
74450 +
74451 + assert("zam-1051", atom->stage < ASTAGE_PRE_COMMIT);
74452 +
74453 + atom_stage = atom->stage;
74454 + repeat = 0;
74455 +
74456 + if (!spin_trylock_txnmgr(tmgr)) {
74457 + atomic_inc(&atom->refcount);
74458 + spin_unlock_atom(atom);
74459 + spin_lock_txnmgr(tmgr);
74460 + spin_lock_atom(atom);
74461 + repeat = 1;
74462 + if (atom->stage != atom_stage) {
74463 + spin_unlock_txnmgr(tmgr);
74464 + atom_dec_and_unlock(atom);
74465 + return -E_REPEAT;
74466 + }
74467 + atomic_dec(&atom->refcount);
74468 + }
74469 +
74470 + list_for_each_entry(atom_2, &tmgr->atoms_list, atom_link) {
74471 + if (atom == atom_2)
74472 + continue;
74473 + /*
74474 + * if trylock does not succeed we just do not fuse with that
74475 + * atom.
74476 + */
74477 + if (spin_trylock_atom(atom_2)) {
74478 + if (atom_2->stage < ASTAGE_PRE_COMMIT) {
74479 + spin_unlock_txnmgr(tmgr);
74480 + capture_fuse_into(atom_2, atom);
74481 + /* all locks are lost we can only repeat here */
74482 + return -E_REPEAT;
74483 + }
74484 + spin_unlock_atom(atom_2);
74485 + }
74486 + }
74487 + atom->flags |= ATOM_CANCEL_FUSION;
74488 + spin_unlock_txnmgr(tmgr);
74489 + if (repeat) {
74490 + spin_unlock_atom(atom);
74491 + return -E_REPEAT;
74492 + }
74493 + return 0;
74494 +}
74495 +
74496 +/* Calls jnode_flush() for the current atom if it exists; if not, just takes
74497 +   another atom and calls jnode_flush() for it. If the current transaction
74498 +   handle already has an atom assigned (the current atom), we have to close
74499 +   the current transaction before switching to another atom, or do something
74500 +   with the current atom. This code tries to flush the current atom.
74501 +
74502 +   flush_some_atom() is called as part of the memory reclaim process. It is
74503 +   invoked from balance_dirty_pages(), pdflush, and entd.
74504 +
74505 +   If we can flush no nodes, the atom is committed, because this frees memory.
74506 +
74507 +   If the atom is too large or too old, it is committed as well.
74508 +*/
74509 +int
74510 +flush_some_atom(jnode * start, long *nr_submitted, const struct writeback_control *wbc,
74511 + int flags)
74512 +{
74513 + reiser4_context *ctx = get_current_context();
74514 + txn_mgr *tmgr = &get_super_private(ctx->super)->tmgr;
74515 + txn_handle *txnh = ctx->trans;
74516 + txn_atom *atom;
74517 + int ret;
74518 +
74519 + BUG_ON(wbc->nr_to_write == 0);
74520 + BUG_ON(*nr_submitted != 0);
74521 + assert("zam-1042", txnh != NULL);
74522 + repeat:
74523 + if (txnh->atom == NULL) {
74524 + /* current atom is not available, take first from txnmgr */
74525 + spin_lock_txnmgr(tmgr);
74526 +
74527 + /* traverse the list of all atoms */
74528 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
74529 + /* lock atom before checking its state */
74530 + spin_lock_atom(atom);
74531 +
74532 + /*
74533 + * we need an atom which is not being committed and
74534 +			 * which has no flushers (jnode_flush() adds one flusher
74535 +			 * at the beginning and subtracts one at the end).
74536 + */
74537 + if (atom->stage < ASTAGE_PRE_COMMIT &&
74538 + atom->nr_flushers == 0) {
74539 + spin_lock_txnh(txnh);
74540 + capture_assign_txnh_nolock(atom, txnh);
74541 + spin_unlock_txnh(txnh);
74542 +
74543 + goto found;
74544 + }
74545 +
74546 + spin_unlock_atom(atom);
74547 + }
74548 +
74549 + /*
74550 +		 * Write throttling happens when no atom can be
74551 +		 * flushed/committed.
74552 + */
74553 + if (!current_is_pdflush() && !wbc->nonblocking) {
74554 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
74555 + spin_lock_atom(atom);
74556 +				/* Repeat the check from above. */
74557 + if (atom->stage < ASTAGE_PRE_COMMIT
74558 + && atom->nr_flushers == 0) {
74559 + spin_lock_txnh(txnh);
74560 + capture_assign_txnh_nolock(atom, txnh);
74561 + spin_unlock_txnh(txnh);
74562 +
74563 + goto found;
74564 + }
74565 + if (atom->stage <= ASTAGE_POST_COMMIT) {
74566 + spin_unlock_txnmgr(tmgr);
74567 +					 * we just wait until the atom's flusher
74568 +					 * makes progress in flushing or
74569 +					 * committing the atom
74570 + * committing the atom
74571 + */
74572 + reiser4_atom_wait_event(atom);
74573 + goto repeat;
74574 + }
74575 + spin_unlock_atom(atom);
74576 + }
74577 + }
74578 + spin_unlock_txnmgr(tmgr);
74579 + return 0;
74580 + found:
74581 + spin_unlock_txnmgr(tmgr);
74582 + } else
74583 + atom = get_current_atom_locked();
74584 +
74585 + BUG_ON(atom->super != ctx->super);
74586 + assert("vs-35", atom->super == ctx->super);
74587 + if (start) {
74588 + spin_lock_jnode(start);
74589 + ret = (atom == start->atom) ? 1 : 0;
74590 + spin_unlock_jnode(start);
74591 + if (ret == 0)
74592 + start = NULL;
74593 + }
74594 + ret = flush_current_atom(flags, wbc->nr_to_write, nr_submitted, &atom, start);
74595 + if (ret == 0) {
74596 +		/* flush_current_atom returns 0 only if it submitted nothing
74597 +		   for write */
74598 + BUG_ON(*nr_submitted != 0);
74599 + if (*nr_submitted == 0 || atom_should_commit_asap(atom)) {
74600 + if (atom->capture_count < tmgr->atom_min_size &&
74601 + !(atom->flags & ATOM_CANCEL_FUSION)) {
74602 + ret = txn_try_to_fuse_small_atom(tmgr, atom);
74603 + if (ret == -E_REPEAT) {
74604 + reiser4_preempt_point();
74605 + goto repeat;
74606 + }
74607 + }
74608 + /* if early flushing could not make more nodes clean,
74609 + * or atom is too old/large,
74610 + * we force current atom to commit */
74611 +			/* wait for commit completion, but only if this
74612 +			 * wouldn't stall pdflush and the ent thread. */
74613 + if (!wbc->nonblocking && !ctx->entd)
74614 + txnh->flags |= TXNH_WAIT_COMMIT;
74615 + atom->flags |= ATOM_FORCE_COMMIT;
74616 + }
74617 + spin_unlock_atom(atom);
74618 + } else if (ret == -E_REPEAT) {
74619 + if (*nr_submitted == 0) {
74620 +			/* let others who hamper flushing (hold long-term locks,
74621 +			   for instance) free the way for flush */
74622 + reiser4_preempt_point();
74623 + goto repeat;
74624 + }
74625 + ret = 0;
74626 + }
74627 +/*
74628 + if (*nr_submitted > wbc->nr_to_write)
74629 + warning("", "asked for %ld, written %ld\n", wbc->nr_to_write, *nr_submitted);
74630 +*/
74631 + reiser4_txn_restart(ctx);
74632 +
74633 + return ret;
74634 +}
74635 +
74636 +/* Remove processed nodes from the atom's clean list (thereby removing them from the transaction). */
74637 +void reiser4_invalidate_list(struct list_head *head)
74638 +{
74639 + while (!list_empty(head)) {
74640 + jnode *node;
74641 +
74642 + node = list_entry(head->next, jnode, capture_link);
74643 + spin_lock_jnode(node);
74644 + reiser4_uncapture_block(node);
74645 + jput(node);
74646 + }
74647 +}
74648 +
74649 +static void init_wlinks(txn_wait_links * wlinks)
74650 +{
74651 + wlinks->_lock_stack = get_current_lock_stack();
74652 + INIT_LIST_HEAD(&wlinks->_fwaitfor_link);
74653 + INIT_LIST_HEAD(&wlinks->_fwaiting_link);
74654 + wlinks->waitfor_cb = NULL;
74655 + wlinks->waiting_cb = NULL;
74656 +}
74657 +
74658 +/* Add the current thread to the atom's waitfor list and wait for somebody to wake us up. */
74659 +void reiser4_atom_wait_event(txn_atom * atom)
74660 +{
74661 + txn_wait_links _wlinks;
74662 +
74663 + assert_spin_locked(&(atom->alock));
74664 + assert("nikita-3156",
74665 + lock_stack_isclean(get_current_lock_stack()) ||
74666 + atom->nr_running_queues > 0);
74667 +
74668 + init_wlinks(&_wlinks);
74669 + list_add_tail(&_wlinks._fwaitfor_link, &atom->fwaitfor_list);
74670 + atomic_inc(&atom->refcount);
74671 + spin_unlock_atom(atom);
74672 +
74673 + reiser4_prepare_to_sleep(_wlinks._lock_stack);
74674 + reiser4_go_to_sleep(_wlinks._lock_stack);
74675 +
74676 + spin_lock_atom(atom);
74677 + list_del(&_wlinks._fwaitfor_link);
74678 + atom_dec_and_unlock(atom);
74679 +}
74680 +
74681 +void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage)
74682 +{
74683 + assert("nikita-3535", atom != NULL);
74684 + assert_spin_locked(&(atom->alock));
74685 + assert("nikita-3536", stage <= ASTAGE_INVALID);
74686 + /* Excelsior! */
74687 + assert("nikita-3537", stage >= atom->stage);
74688 + if (atom->stage != stage) {
74689 + atom->stage = stage;
74690 + reiser4_atom_send_event(atom);
74691 + }
74692 +}
74693 +
74694 +/* wake all threads which wait for an event */
74695 +void reiser4_atom_send_event(txn_atom * atom)
74696 +{
74697 + assert_spin_locked(&(atom->alock));
74698 + wakeup_atom_waitfor_list(atom);
74699 +}
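+/*
+ * Editorial note (not part of the original patch):
+ * reiser4_atom_wait_event() and reiser4_atom_send_event() implement a small
+ * condition-variable-like protocol over the atom spinlock: a waiter links
+ * itself onto atom->fwaitfor_list and pins the atom with a reference before
+ * sleeping; a waker, holding the atom lock, walks that list and wakes every
+ * registered lock stack. Missed wakeups are avoided because list insertion
+ * and the state changes made by reiser4_atom_set_stage() happen under the
+ * same atom lock.
+ */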
74700 +
74701 +/* Informs the txn manager code that the owner of this txn_handle should wait for atom
74702 +   commit completion (for example, because it is doing fsync(2)) */
74703 +static int should_wait_commit(txn_handle * h)
74704 +{
74705 + return h->flags & TXNH_WAIT_COMMIT;
74706 +}
74707 +
74708 +typedef struct commit_data {
74709 + txn_atom *atom;
74710 + txn_handle *txnh;
74711 + long nr_written;
74712 +	/* as an optimization we start committing the atom by first trying to
74713 +	 * flush it a few times without switching into ASTAGE_CAPTURE_WAIT. This
74714 +	 * reduces stalls due to other threads waiting for the atom in the
74715 +	 * ASTAGE_CAPTURE_WAIT stage. ->preflush is a counter of these
74716 +	 * preliminary flushes. */
74717 +	int preflush;
74718 +	/* have we waited on the atom. */
74719 + int wait;
74720 + int failed;
74721 + int wake_ktxnmgrd_up;
74722 +} commit_data;
74723 +
74724 +/*
74725 + * Called from commit_txnh() repeatedly, until either error happens, or atom
74726 + * commits successfully.
74727 + */
74728 +static int try_commit_txnh(commit_data * cd)
74729 +{
74730 + int result;
74731 +
74732 + assert("nikita-2968", lock_stack_isclean(get_current_lock_stack()));
74733 +
74734 + /* Get the atom and txnh locked. */
74735 + cd->atom = txnh_get_atom(cd->txnh);
74736 + assert("jmacd-309", cd->atom != NULL);
74737 + spin_unlock_txnh(cd->txnh);
74738 +
74739 + if (cd->wait) {
74740 + cd->atom->nr_waiters--;
74741 + cd->wait = 0;
74742 + }
74743 +
74744 + if (cd->atom->stage == ASTAGE_DONE)
74745 + return 0;
74746 +
74747 + if (cd->failed)
74748 + return 0;
74749 +
74750 + if (atom_should_commit(cd->atom)) {
74751 + /* if atom is _very_ large schedule it for commit as soon as
74752 + * possible. */
74753 + if (atom_should_commit_asap(cd->atom)) {
74754 +			/*
74755 +			 * When the atom is in PRE_COMMIT or a later stage, the
74756 +			 * following invariant (encoded in
74757 +			 * atom_can_be_committed()) holds: there is exactly one
74758 +			 * non-waiter transaction handle opened on this atom.
74759 +			 * When a thread wants to wait until the atom commits
74760 +			 * (for example sync()), it waits on the atom event
74761 +			 * after increasing atom->nr_waiters (see below in this
74762 +			 * function). It cannot be guaranteed that the atom has
74763 +			 * already committed when the event arrives, so the loop
74764 +			 * has to be restarted. But if the atom switched into
74765 +			 * the PRE_COMMIT stage and became too large, we cannot
74766 +			 * change its state back to CAPTURE_WAIT (the atom stage
74767 +			 * can only increase monotonically), hence this check.
74768 +			 */
74769 + if (cd->atom->stage < ASTAGE_CAPTURE_WAIT)
74770 + reiser4_atom_set_stage(cd->atom,
74771 + ASTAGE_CAPTURE_WAIT);
74772 + cd->atom->flags |= ATOM_FORCE_COMMIT;
74773 + }
74774 + if (cd->txnh->flags & TXNH_DONT_COMMIT) {
74775 + /*
74776 +			 * this thread (the transaction handle, that is) doesn't
74777 +			 * want to commit the atom. Notify waiters that the handle
74778 +			 * is closed. This can happen, for example, when we are
74779 +			 * under a VFS directory lock and don't want to commit the
74780 +			 * atom right now to avoid stalling other threads
74781 +			 * working in the same directory.
74782 + */
74783 +
74784 + /* Wake the ktxnmgrd up if the ktxnmgrd is needed to
74785 + * commit this atom: no atom waiters and only one
74786 + * (our) open transaction handle. */
74787 + cd->wake_ktxnmgrd_up =
74788 + cd->atom->txnh_count == 1 &&
74789 + cd->atom->nr_waiters == 0;
74790 + reiser4_atom_send_event(cd->atom);
74791 + result = 0;
74792 + } else if (!atom_can_be_committed(cd->atom)) {
74793 + if (should_wait_commit(cd->txnh)) {
74794 + /* sync(): wait for commit */
74795 + cd->atom->nr_waiters++;
74796 + cd->wait = 1;
74797 + reiser4_atom_wait_event(cd->atom);
74798 + result = RETERR(-E_REPEAT);
74799 + } else {
74800 + result = 0;
74801 + }
74802 + } else if (cd->preflush > 0 && !is_current_ktxnmgrd()) {
74803 + /*
74804 + * optimization: flush atom without switching it into
74805 + * ASTAGE_CAPTURE_WAIT.
74806 + *
74807 + * But don't do this for ktxnmgrd, because ktxnmgrd
74808 + * should never block on atom fusion.
74809 + */
74810 + result = flush_current_atom(JNODE_FLUSH_WRITE_BLOCKS,
74811 + LONG_MAX, &cd->nr_written,
74812 + &cd->atom, NULL);
74813 + if (result == 0) {
74814 + spin_unlock_atom(cd->atom);
74815 + cd->preflush = 0;
74816 + result = RETERR(-E_REPEAT);
74817 +			} else	/* Atom wasn't flushed
74818 +				 * completely. Rinse. Repeat. */
74819 + --cd->preflush;
74820 + } else {
74821 + /* We change atom state to ASTAGE_CAPTURE_WAIT to
74822 + prevent atom fusion and count ourself as an active
74823 + flusher */
74824 + reiser4_atom_set_stage(cd->atom, ASTAGE_CAPTURE_WAIT);
74825 + cd->atom->flags |= ATOM_FORCE_COMMIT;
74826 +
74827 + result =
74828 + commit_current_atom(&cd->nr_written, &cd->atom);
74829 + if (result != 0 && result != -E_REPEAT)
74830 + cd->failed = 1;
74831 + }
74832 + } else
74833 + result = 0;
74834 +
74835 +#if REISER4_DEBUG
74836 + if (result == 0)
74837 + assert_spin_locked(&(cd->atom->alock));
74838 +#endif
74839 +
74840 + /* perfectly valid assertion, except that when atom/txnh is not locked
74841 + * fusion can take place, and cd->atom points nowhere. */
74842 + /*
74843 + assert("jmacd-1028", ergo(result != 0, spin_atom_is_not_locked(cd->atom)));
74844 + */
74845 + return result;
74846 +}
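+/*
+ * Editorial summary (not part of the original patch) of the decision tree
+ * in try_commit_txnh() once atom_should_commit() is true:
+ *
+ *   - TXNH_DONT_COMMIT set:     leave the commit to ktxnmgrd, just notify;
+ *   - atom not yet committable: wait (sync case) or return (async case);
+ *   - preflush budget left:     flush without raising the atom stage;
+ *   - otherwise:                raise to ASTAGE_CAPTURE_WAIT and call
+ *                               commit_current_atom().
+ */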
74847 +
74848 +/* Called to commit a transaction handle. This decrements the atom's number of open
74849 +   handles and, if it is the last handle to commit and the atom should commit,
74850 +   initiates atom commit. If commit does not fail, returns the number of written blocks. */
74851 +static int commit_txnh(txn_handle * txnh)
74852 +{
74853 + commit_data cd;
74854 + assert("umka-192", txnh != NULL);
74855 +
74856 + memset(&cd, 0, sizeof cd);
74857 + cd.txnh = txnh;
74858 + cd.preflush = 10;
74859 +
74860 + /* calls try_commit_txnh() until either atom commits, or error
74861 + * happens */
74862 + while (try_commit_txnh(&cd) != 0)
74863 + reiser4_preempt_point();
74864 +
74865 + spin_lock_txnh(txnh);
74866 +
74867 + cd.atom->txnh_count -= 1;
74868 + txnh->atom = NULL;
74869 + /* remove transaction handle from atom's list of transaction handles */
74870 + list_del_init(&txnh->txnh_link);
74871 +
74872 + spin_unlock_txnh(txnh);
74873 + atom_dec_and_unlock(cd.atom);
74874 +	/* if we don't want to do the commit in the current thread
74875 +	 * (TXNH_DONT_COMMIT is set, probably because it takes time), we do
74876 +	 * that work asynchronously via the ktxnmgrd daemon. */
74877 + if (cd.wake_ktxnmgrd_up)
74878 + ktxnmgrd_kick(&get_current_super_private()->tmgr);
74879 +
74880 + return 0;
74881 +}
74882 +
74883 +/* TRY_CAPTURE */
74884 +
74885 +/* This routine attempts a single block-capture request. It may return -E_REPEAT if some
74886 + condition indicates that the request should be retried, and it may block if the
74887 + txn_capture mode does not include the TXN_CAPTURE_NONBLOCKING request flag.
74888 +
74889 + This routine encodes the basic logic of block capturing described by:
74890 +
74891 + http://namesys.com/v4/v4.html
74892 +
74893 +   Our goal here is to ensure that any two blocks that contain dependent modifications
74894 +   commit at the same time. This function enforces this discipline by initiating
74895 +   fusion whenever a transaction handle belonging to one atom requests to read or write a
74896 +   block belonging to another atom (TXN_CAPTURE_WRITE or TXN_CAPTURE_READ_ATOMIC).
74897 +
74898 + In addition, this routine handles the initial assignment of atoms to blocks and
74899 + transaction handles. These are possible outcomes of this function:
74900 +
74901 + 1. The block and handle are already part of the same atom: return immediate success
74902 +
74903 + 2. The block is assigned but the handle is not: call capture_assign_txnh to assign
74904 + the handle to the block's atom.
74905 +
74906 + 3. The handle is assigned but the block is not: call capture_assign_block to assign
74907 + the block to the handle's atom.
74908 +
74909 + 4. Both handle and block are assigned, but to different atoms: call capture_init_fusion
74910 + to fuse atoms.
74911 +
74912 + 5. Neither block nor handle are assigned: create a new atom and assign them both.
74913 +
74914 + 6. A read request for a non-captured block: return immediate success.
74915 +
74916 + This function acquires and releases the handle's spinlock. This function is called
74917 + under the jnode lock and if the return value is 0, it returns with the jnode lock still
74918 + held. If the return is -E_REPEAT or some other error condition, the jnode lock is
74919 +   released. The external interface (reiser4_try_capture) manages re-acquiring the jnode
74920 + lock in the failure case.
74921 +*/
74922 +static int try_capture_block(
74923 + txn_handle * txnh, jnode * node, txn_capture mode,
74924 + txn_atom ** atom_alloc)
74925 +{
74926 + txn_atom *block_atom;
74927 + txn_atom *txnh_atom;
74928 +
74929 + /* Should not call capture for READ_NONCOM requests, handled in reiser4_try_capture. */
74930 + assert("jmacd-567", CAPTURE_TYPE(mode) != TXN_CAPTURE_READ_NONCOM);
74931 +
74932 + /* FIXME-ZAM-HANS: FIXME_LATER_JMACD Should assert that atom->tree ==
74933 + * node->tree somewhere. */
74934 + assert("umka-194", txnh != NULL);
74935 + assert("umka-195", node != NULL);
74936 +
74937 + /* The jnode is already locked! Being called from reiser4_try_capture(). */
74938 + assert_spin_locked(&(node->guard));
74939 + block_atom = node->atom;
74940 +
74941 +	/* Get the txnh spinlock; this allows us to compare txn_atom pointers, but it
74942 +	   doesn't let us touch the atoms themselves. */
74943 + spin_lock_txnh(txnh);
74944 + txnh_atom = txnh->atom;
74945 +	/* The capture process continues into one of four branches, depending on
74946 +	   which of the two atoms (the block atom, node->atom, and the current
74947 +	   atom, txnh->atom) exist. */
74948 + if (txnh_atom == NULL) {
74949 + if (block_atom == NULL) {
74950 + spin_unlock_txnh(txnh);
74951 + spin_unlock_jnode(node);
74952 + /* assign empty atom to the txnh and repeat */
74953 + return atom_begin_and_assign_to_txnh(atom_alloc, txnh);
74954 + } else {
74955 + atomic_inc(&block_atom->refcount);
74956 + /* node spin-lock isn't needed anymore */
74957 + spin_unlock_jnode(node);
74958 + if (!spin_trylock_atom(block_atom)) {
74959 + spin_unlock_txnh(txnh);
74960 + spin_lock_atom(block_atom);
74961 + spin_lock_txnh(txnh);
74962 + }
74963 + /* re-check state after getting txnh and the node
74964 + * atom spin-locked */
74965 + if (node->atom != block_atom || txnh->atom != NULL) {
74966 + spin_unlock_txnh(txnh);
74967 + atom_dec_and_unlock(block_atom);
74968 + return RETERR(-E_REPEAT);
74969 + }
74970 + atomic_dec(&block_atom->refcount);
74971 + if (block_atom->stage > ASTAGE_CAPTURE_WAIT ||
74972 + (block_atom->stage == ASTAGE_CAPTURE_WAIT &&
74973 + block_atom->txnh_count != 0))
74974 + return capture_fuse_wait(txnh, block_atom, NULL, mode);
74975 + capture_assign_txnh_nolock(block_atom, txnh);
74976 + spin_unlock_txnh(txnh);
74977 + spin_unlock_atom(block_atom);
74978 + return RETERR(-E_REPEAT);
74979 + }
74980 + } else {
74981 +		/* It is time to perform a deadlock prevention check over the
74982 +		   node we want to capture. It is possible this node was locked
74983 +		   for read without capturing it. The optimization which allows
74984 +		   this helps us keep atoms independent as long as
74985 +		   possible, but it may cause lock/fuse deadlock problems.
74986 +
74987 +		   A number of similar deadlock situations with locked but not
74988 +		   captured nodes were found. In each situation there are two
74989 +		   or more threads: one of them does flushing while another one
74990 +		   does routine balancing or tree lookup. The flushing thread
74991 +		   (F) sleeps in a long term locking request for node (N); another
74992 +		   thread (A) sleeps trying to capture some node already
74993 +		   belonging to F's atom, and that atom is in a state which
74994 +		   prevents immediate fusion.
74995 +
74996 +		   Deadlocks of this kind cannot happen if node N was properly
74997 +		   captured by thread A. Thread F fuses atoms before locking,
74998 +		   therefore the current atoms of threads F and A
74999 +		   become the same atom and thread A may proceed. This does
75000 +		   not work if node N was not captured, because the atom
75001 +		   fusion does not happen.
75002 +
75003 +		   The following scheme solves the deadlock: if
75004 +		   longterm_lock_znode() locks and does not capture a znode, that
75005 +		   znode is marked as MISSED_IN_CAPTURE. A node marked this way
75006 +		   is processed by the code below, which restores the missed
75007 +		   capture and fuses the current atoms of all the node's lock
75008 +		   owners by calling the fuse_not_fused_lock_owners() function. */
75009 + if (JF_ISSET(node, JNODE_MISSED_IN_CAPTURE)) {
75010 + JF_CLR(node, JNODE_MISSED_IN_CAPTURE);
75011 + if (jnode_is_znode(node) && znode_is_locked(JZNODE(node))) {
75012 + spin_unlock_txnh(txnh);
75013 + spin_unlock_jnode(node);
75014 + fuse_not_fused_lock_owners(txnh, JZNODE(node));
75015 + return RETERR(-E_REPEAT);
75016 + }
75017 + }
75018 + if (block_atom == NULL) {
75019 + atomic_inc(&txnh_atom->refcount);
75020 + spin_unlock_txnh(txnh);
75021 + if (!spin_trylock_atom(txnh_atom)) {
75022 + spin_unlock_jnode(node);
75023 + spin_lock_atom(txnh_atom);
75024 + spin_lock_jnode(node);
75025 + }
75026 + if (txnh->atom != txnh_atom || node->atom != NULL
75027 + || JF_ISSET(node, JNODE_IS_DYING)) {
75028 + spin_unlock_jnode(node);
75029 + atom_dec_and_unlock(txnh_atom);
75030 + return RETERR(-E_REPEAT);
75031 + }
75032 + atomic_dec(&txnh_atom->refcount);
75033 + capture_assign_block_nolock(txnh_atom, node);
75034 + spin_unlock_atom(txnh_atom);
75035 + } else {
75036 + if (txnh_atom != block_atom) {
75037 + if (mode & TXN_CAPTURE_DONT_FUSE) {
75038 + spin_unlock_txnh(txnh);
75039 + spin_unlock_jnode(node);
75040 + /* we are in a "no-fusion" mode and @node is
75041 + * already part of transaction. */
75042 + return RETERR(-E_NO_NEIGHBOR);
75043 + }
75044 + return capture_init_fusion(node, txnh, mode);
75045 + }
75046 + spin_unlock_txnh(txnh);
75047 + }
75048 + }
75049 + return 0;
75050 +}
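+/*
+ * Editorial summary (not part of the original patch) of the branches of
+ * try_capture_block():
+ *
+ *   txnh has no atom, node has none: create a fresh atom (-E_REPEAT);
+ *   txnh has no atom, node has one:  assign the txnh to the node's atom, or
+ *                                    wait in capture_fuse_wait() if that
+ *                                    atom is already committing;
+ *   txnh has an atom, node has none: capture the node into the txnh's atom;
+ *   both have atoms and they differ: fuse via capture_init_fusion(), unless
+ *                                    TXN_CAPTURE_DONT_FUSE is set.
+ */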
75051 +
75052 +static txn_capture
75053 +build_capture_mode(jnode * node, znode_lock_mode lock_mode, txn_capture flags)
75054 +{
75055 + txn_capture cap_mode;
75056 +
75057 + assert_spin_locked(&(node->guard));
75058 +
75059 + /* FIXME_JMACD No way to set TXN_CAPTURE_READ_MODIFY yet. */
75060 +
75061 + if (lock_mode == ZNODE_WRITE_LOCK) {
75062 + cap_mode = TXN_CAPTURE_WRITE;
75063 + } else if (node->atom != NULL) {
75064 + cap_mode = TXN_CAPTURE_WRITE;
75065 + } else if (0 && /* txnh->mode == TXN_READ_FUSING && */
75066 + jnode_get_level(node) == LEAF_LEVEL) {
75067 + /* NOTE-NIKITA TXN_READ_FUSING is not currently used */
75068 + /* We only need a READ_FUSING capture at the leaf level. This
75069 + is because the internal levels of the tree (twigs included)
75070 +		   are redundant from the point of view of the user that asked for a
75071 + read-fusing transcrash. The user only wants to read-fuse
75072 + atoms due to reading uncommitted data that another user has
75073 + written. It is the file system that reads/writes the
75074 + internal tree levels, the user only reads/writes leaves. */
75075 + cap_mode = TXN_CAPTURE_READ_ATOMIC;
75076 + } else {
75077 + /* In this case (read lock at a non-leaf) there's no reason to
75078 + * capture. */
75079 + /* cap_mode = TXN_CAPTURE_READ_NONCOM; */
75080 + return 0;
75081 + }
75082 +
75083 + cap_mode |= (flags & (TXN_CAPTURE_NONBLOCKING | TXN_CAPTURE_DONT_FUSE));
75084 + assert("nikita-3186", cap_mode != 0);
75085 + return cap_mode;
75086 +}
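+
+/* Editorial sketch (not part of the original patch) of how the computed
+ * mode combines with option bits; values are from the txn_capture
+ * enumeration in txnmgr.h:
+ *
+ *   cap_mode = build_capture_mode(node, ZNODE_WRITE_LOCK,
+ *                                 TXN_CAPTURE_NONBLOCKING);
+ *   // == TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING
+ *
+ * A read-lock request for a node that is not yet captured falls through
+ * to the final branch and yields 0: no capture is performed at all.
+ */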
75087 +
75088 +/* This is an external interface to try_capture_block(); it calls
75089 +   try_capture_block() repeatedly as long as -E_REPEAT is returned.
75090 +
75091 +   @node:         node to capture,
75092 +   @lock_mode:    read or write lock is used in capture mode calculation,
75093 +   @flags:        see txn_capture flags enumeration,
75095 +
75096 +   @return: 0 - node was successfully captured, -E_REPEAT - capture request
75097 +            cannot be processed immediately as it was requested in flags,
75098 +	     < 0 - other errors.
75099 +*/
75100 +int reiser4_try_capture(jnode *node, znode_lock_mode lock_mode,
75101 + txn_capture flags)
75102 +{
75103 + txn_atom *atom_alloc = NULL;
75104 + txn_capture cap_mode;
75105 + txn_handle *txnh = get_current_context()->trans;
75106 + int ret;
75107 +
75108 + assert_spin_locked(&(node->guard));
75109 +
75110 + repeat:
75111 + if (JF_ISSET(node, JNODE_IS_DYING))
75112 + return RETERR(-EINVAL);
75113 + if (node->atom != NULL && txnh->atom == node->atom)
75114 + return 0;
75115 + cap_mode = build_capture_mode(node, lock_mode, flags);
75116 + if (cap_mode == 0 ||
75117 + (!(cap_mode & TXN_CAPTURE_WTYPES) && node->atom == NULL)) {
75118 + /* Mark this node as "MISSED". It helps in further deadlock
75119 + * analysis */
75120 + if (jnode_is_znode(node))
75121 + JF_SET(node, JNODE_MISSED_IN_CAPTURE);
75122 + return 0;
75123 + }
75124 + /* Repeat try_capture as long as -E_REPEAT is returned. */
75125 + ret = try_capture_block(txnh, node, cap_mode, &atom_alloc);
75126 + /* Regardless of non_blocking:
75127 +
75128 + If ret == 0 then jnode is still locked.
75129 + If ret != 0 then jnode is unlocked.
75130 + */
75131 +#if REISER4_DEBUG
75132 + if (ret == 0)
75133 + assert_spin_locked(&(node->guard));
75134 + else
75135 + assert_spin_not_locked(&(node->guard));
75136 +#endif
75137 + assert_spin_not_locked(&(txnh->guard));
75138 +
75139 + if (ret == -E_REPEAT) {
75140 + /* E_REPEAT implies all locks were released, therefore we need
75141 + to take the jnode's lock again. */
75142 + spin_lock_jnode(node);
75143 +
75144 + /* Although this may appear to be a busy loop, it is not.
75145 + There are several conditions that cause E_REPEAT to be
75146 + returned by the call to try_capture_block, all cases
75147 + indicating some kind of state change that means you should
75148 + retry the request and will get a different result. In some
75149 + cases this could be avoided with some extra code, but
75150 + generally it is done because the necessary locks were
75151 + released as a result of the operation and repeating is the
75152 + simplest thing to do (less bug potential). The cases are:
75153 + atom fusion returns E_REPEAT after it completes (jnode and
75154 + txnh were unlocked); race conditions in assign_block,
75155 + assign_txnh, and init_fusion return E_REPEAT (trylock
75156 + failure); after going to sleep in capture_fuse_wait
75157 + (request was blocked but may now succeed). I'm not quite
75158 + sure how capture_copy works yet, but it may also return
75159 + E_REPEAT. When the request is legitimately blocked, the
75160 + requestor goes to sleep in fuse_wait, so this is not a busy
75161 + loop. */
75162 + /* NOTE-NIKITA: still don't understand:
75163 +
75164 + try_capture_block->capture_assign_txnh->spin_trylock_atom->E_REPEAT
75165 +
75166 + looks like busy loop?
75167 + */
75168 + goto repeat;
75169 + }
75170 +
75171 + /* free extra atom object that was possibly allocated by
75172 + try_capture_block().
75173 +
75174 + Do this before acquiring jnode spin lock to
75175 + minimize time spent under lock. --nikita */
75176 + if (atom_alloc != NULL) {
75177 + kmem_cache_free(_atom_slab, atom_alloc);
75178 + }
75179 +
75180 + if (ret != 0) {
75181 + if (ret == -E_BLOCK) {
75182 + assert("nikita-3360",
75183 + cap_mode & TXN_CAPTURE_NONBLOCKING);
75184 + ret = -E_REPEAT;
75185 + }
75186 +
75187 +		/* Failure means jnode is not locked.  FIXME_LATER_JMACD May
75188 +		   want to fix the above code to avoid releasing the lock and
75189 +		   re-acquiring it, but there are cases where failure occurs
75190 +		   when the lock is not held, and those cases would need to be
75191 +		   modified to re-take the lock. */
75192 + spin_lock_jnode(node);
75193 + }
75194 +
75195 + /* Jnode is still locked. */
75196 + assert_spin_locked(&(node->guard));
75197 + return ret;
75198 +}
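+
+/* Editorial usage sketch (not part of the original patch): the jnode spin
+ * lock is held on entry and is held again on return whatever the result,
+ * so a typical call site looks like:
+ *
+ *   spin_lock_jnode(node);
+ *   ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
+ *   // node->guard is locked here whether ret is 0 or an error
+ *   spin_unlock_jnode(node);
+ *
+ * try_capture_page_to_invalidate() below is a real caller of this shape.
+ */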
75199 +
75200 +static void release_two_atoms(txn_atom *one, txn_atom *two)
75201 +{
75202 + spin_unlock_atom(one);
75203 + atom_dec_and_unlock(two);
75204 + spin_lock_atom(one);
75205 + atom_dec_and_unlock(one);
75206 +}
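+
+/* Editorial note (hedged): both atoms enter release_two_atoms() locked,
+ * each with an extra reference taken by the caller; the sequence above
+ * drops both locks and both references. @one is unlocked before @two is
+ * released and only then relocked, presumably because atom_dec_and_unlock()
+ * may free an atom and should not run while another atom lock is held. */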
75207 +
75208 +/* Note: this comment describes reiser4_try_capture() above. It sets up a call to
75209 +   try_capture_block and repeats as long as -E_REPEAT is returned by that routine.
75210 +   The txn_capture request mode is computed there depending on the transaction
75211 +   handle's type and the lock request. It is called from the depths of the lock
75212 +   manager with the jnode lock held and it always returns with the jnode lock
75213 +   held. */
75214 +
75215 +/* fuse all 'active' atoms of lock owners of given node. */
75216 +static void fuse_not_fused_lock_owners(txn_handle * txnh, znode * node)
75217 +{
75218 + lock_handle *lh;
75219 + int repeat;
75220 + txn_atom *atomh, *atomf;
75221 + reiser4_context *me = get_current_context();
75222 + reiser4_context *ctx = NULL;
75223 +
75224 + assert_spin_not_locked(&(ZJNODE(node)->guard));
75225 + assert_spin_not_locked(&(txnh->hlock));
75226 +
75227 + repeat:
75228 + repeat = 0;
75229 + atomh = txnh_get_atom(txnh);
75230 + spin_unlock_txnh(txnh);
75231 + assert("zam-692", atomh != NULL);
75232 +
75233 + spin_lock_zlock(&node->lock);
75234 + /* inspect list of lock owners */
75235 + list_for_each_entry(lh, &node->lock.owners, owners_link) {
75236 + ctx = get_context_by_lock_stack(lh->owner);
75237 + if (ctx == me)
75238 + continue;
75239 +		/* below we use two assumptions to avoid additional spin-locks
75240 +		   for checking the condition:
75241 +
75242 +		   1) if the lock stack has a lock, the transaction should be
75243 +		   opened, i.e. ctx->trans != NULL;
75244 +
75245 +		   2) reading of well-aligned ctx->trans->atom is atomic; if it
75246 +		   equals the address of the spin-locked atomh, we take it that
75247 +		   the atoms are the same and nothing has to be captured. */
75248 + if (atomh != ctx->trans->atom) {
75249 + reiser4_wake_up(lh->owner);
75250 + repeat = 1;
75251 + break;
75252 + }
75253 + }
75254 + if (repeat) {
75255 + if (!spin_trylock_txnh(ctx->trans)) {
75256 + spin_unlock_zlock(&node->lock);
75257 + spin_unlock_atom(atomh);
75258 + goto repeat;
75259 + }
75260 + atomf = ctx->trans->atom;
75261 + if (atomf == NULL) {
75262 + capture_assign_txnh_nolock(atomh, ctx->trans);
75263 + /* release zlock lock _after_ assigning the atom to the
75264 + * transaction handle, otherwise the lock owner thread
75265 + * may unlock all znodes, exit kernel context and here
75266 + * we would access an invalid transaction handle. */
75267 + spin_unlock_zlock(&node->lock);
75268 + spin_unlock_atom(atomh);
75269 + spin_unlock_txnh(ctx->trans);
75270 + goto repeat;
75271 + }
75272 + assert("zam-1059", atomf != atomh);
75273 + spin_unlock_zlock(&node->lock);
75274 + atomic_inc(&atomh->refcount);
75275 + atomic_inc(&atomf->refcount);
75276 + spin_unlock_txnh(ctx->trans);
75277 + if (atomf > atomh) {
75278 + spin_lock_atom_nested(atomf);
75279 + } else {
75280 + spin_unlock_atom(atomh);
75281 + spin_lock_atom(atomf);
75282 + spin_lock_atom_nested(atomh);
75283 + }
75284 + if (atomh == atomf || !atom_isopen(atomh) || !atom_isopen(atomf)) {
75285 + release_two_atoms(atomf, atomh);
75286 + goto repeat;
75287 + }
75288 + atomic_dec(&atomh->refcount);
75289 + atomic_dec(&atomf->refcount);
75290 + capture_fuse_into(atomf, atomh);
75291 + goto repeat;
75292 + }
75293 + spin_unlock_zlock(&node->lock);
75294 + spin_unlock_atom(atomh);
75295 +}
75296 +
75297 +/* This is the interface to capture unformatted nodes via their struct page
75298 +   reference. Currently it is only used in reiser4_invalidatepage. */
75299 +int try_capture_page_to_invalidate(struct page *pg)
75300 +{
75301 + int ret;
75302 + jnode *node;
75303 +
75304 + assert("umka-292", pg != NULL);
75305 + assert("nikita-2597", PageLocked(pg));
75306 +
75307 + if (IS_ERR(node = jnode_of_page(pg))) {
75308 + return PTR_ERR(node);
75309 + }
75310 +
75311 + spin_lock_jnode(node);
75312 + unlock_page(pg);
75313 +
75314 + ret = reiser4_try_capture(node, ZNODE_WRITE_LOCK, 0);
75315 + spin_unlock_jnode(node);
75316 + jput(node);
75317 + lock_page(pg);
75318 + return ret;
75319 +}
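+
+/* Editorial note (hedged): the page lock is dropped for the duration of
+ * the capture because a capture request may sleep (atom fusion,
+ * capture_fuse_wait), and sleeping with the page locked risks deadlocking
+ * against writeback; jnode_of_page() has already taken a jnode reference
+ * (released by jput() above), so the node cannot disappear meanwhile. */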
75320 +
75321 +/* This informs the transaction manager when a node is deleted. Add the block to the
75322 + atom's delete set and uncapture the block.
75323 +
75324 +VS-FIXME-HANS: this E_REPEAT paradigm clutters the code and creates a need for
75325 +explanations. Find all the functions that use it, and unless there is some very
75326 +good reason to use it (I have not noticed one so far and I doubt it exists, but maybe somewhere somehow....),
75327 +move the loop to inside the function.
75328 +
75329 +VS-FIXME-HANS: can this code be at all streamlined? In particular, can you lock and unlock the jnode fewer times?
75330 + */
75331 +void reiser4_uncapture_page(struct page *pg)
75332 +{
75333 + jnode *node;
75334 + txn_atom *atom;
75335 +
75336 + assert("umka-199", pg != NULL);
75337 + assert("nikita-3155", PageLocked(pg));
75338 +
75339 + clear_page_dirty_for_io(pg);
75340 +
75341 + reiser4_wait_page_writeback(pg);
75342 +
75343 + node = jprivate(pg);
75344 + BUG_ON(node == NULL);
75345 +
75346 + spin_lock_jnode(node);
75347 +
75348 + atom = jnode_get_atom(node);
75349 + if (atom == NULL) {
75350 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
75351 + spin_unlock_jnode(node);
75352 + return;
75353 + }
75354 +
75355 +	/* We can remove jnode from transaction even if it is on the flush
75356 +	 * queue prepped list; we only need to be sure that the flush queue is
75357 +	 * not being written by reiser4_write_fq(). reiser4_write_fq() does not
75358 +	 * use the atom spin lock to protect the prepped nodes list; instead it
75359 +	 * increments the atom's nr_running_queues counter for the time when the
75360 +	 * prepped list is not protected by the spin lock. Here we check this
75361 +	 * counter if we want to remove the jnode from the flush queue and, if
75362 +	 * the counter is not zero, wait for all reiser4_write_fq() calls for
75363 +	 * this atom to complete. This is not a significant overhead. */
75364 + while (JF_ISSET(node, JNODE_FLUSH_QUEUED) && atom->nr_running_queues) {
75365 + spin_unlock_jnode(node);
75366 + /*
75367 + * at this moment we want to wait for "atom event", viz. wait
75368 + * until @node can be removed from flush queue. But
75369 + * reiser4_atom_wait_event() cannot be called with page locked,
75370 + * because it deadlocks with jnode_extent_write(). Unlock page,
75371 + * after making sure (through page_cache_get()) that it cannot
75372 + * be released from memory.
75373 + */
75374 + page_cache_get(pg);
75375 + unlock_page(pg);
75376 + reiser4_atom_wait_event(atom);
75377 + lock_page(pg);
75378 + /*
75379 +		 * page may have been detached by ->writepage()->releasepage().
75380 + */
75381 + reiser4_wait_page_writeback(pg);
75382 + spin_lock_jnode(node);
75383 + page_cache_release(pg);
75384 + atom = jnode_get_atom(node);
75385 +/* VS-FIXME-HANS: improve the commenting in this function */
75386 + if (atom == NULL) {
75387 + spin_unlock_jnode(node);
75388 + return;
75389 + }
75390 + }
75391 + reiser4_uncapture_block(node);
75392 + spin_unlock_atom(atom);
75393 + jput(node);
75394 +}
75395 +
75396 +/* this is used in extent's kill hook to uncapture and unhash jnodes attached to
75397 + * inode's tree of jnodes */
75398 +void reiser4_uncapture_jnode(jnode * node)
75399 +{
75400 + txn_atom *atom;
75401 +
75402 + assert_spin_locked(&(node->guard));
75403 + assert("", node->pg == 0);
75404 +
75405 + atom = jnode_get_atom(node);
75406 + if (atom == NULL) {
75407 + assert("jmacd-7111", !JF_ISSET(node, JNODE_DIRTY));
75408 + spin_unlock_jnode(node);
75409 + return;
75410 + }
75411 +
75412 + reiser4_uncapture_block(node);
75413 + spin_unlock_atom(atom);
75414 + jput(node);
75415 +}
75416 +
75417 +/* No-locking version of assign_txnh. Sets the transaction handle's atom pointer,
75418 + increases atom refcount and txnh_count, adds to txnh_list. */
75419 +static void capture_assign_txnh_nolock(txn_atom *atom, txn_handle *txnh)
75420 +{
75421 + assert("umka-200", atom != NULL);
75422 + assert("umka-201", txnh != NULL);
75423 +
75424 + assert_spin_locked(&(txnh->hlock));
75425 + assert_spin_locked(&(atom->alock));
75426 + assert("jmacd-824", txnh->atom == NULL);
75427 + assert("nikita-3540", atom_isopen(atom));
75428 + BUG_ON(txnh->atom != NULL);
75429 +
75430 + atomic_inc(&atom->refcount);
75431 + txnh->atom = atom;
75432 + reiser4_ctx_gfp_mask_set();
75433 + list_add_tail(&txnh->txnh_link, &atom->txnh_list);
75434 + atom->txnh_count += 1;
75435 +}
75436 +
75437 +/* No-locking version of assign_block. Sets the block's atom pointer, references the
75438 + block, adds it to the clean or dirty capture_jnode list, increments capture_count. */
75439 +static void capture_assign_block_nolock(txn_atom *atom, jnode *node)
75440 +{
75441 + assert("umka-202", atom != NULL);
75442 + assert("umka-203", node != NULL);
75443 + assert_spin_locked(&(node->guard));
75444 + assert_spin_locked(&(atom->alock));
75445 + assert("jmacd-323", node->atom == NULL);
75446 + BUG_ON(!list_empty_careful(&node->capture_link));
75447 + assert("nikita-3470", !JF_ISSET(node, JNODE_DIRTY));
75448 +
75449 + /* Pointer from jnode to atom is not counted in atom->refcount. */
75450 + node->atom = atom;
75451 +
75452 + list_add_tail(&node->capture_link, ATOM_CLEAN_LIST(atom));
75453 + atom->capture_count += 1;
75454 + /* reference to jnode is acquired by atom. */
75455 + jref(node);
75456 +
75457 + ON_DEBUG(count_jnode(atom, node, NOT_CAPTURED, CLEAN_LIST, 1));
75458 +
75459 + LOCK_CNT_INC(t_refs);
75460 +}
75461 +
75462 +/* common code for dirtying both unformatted jnodes and formatted znodes. */
75463 +static void do_jnode_make_dirty(jnode * node, txn_atom * atom)
75464 +{
75465 + assert_spin_locked(&(node->guard));
75466 + assert_spin_locked(&(atom->alock));
75467 + assert("jmacd-3981", !JF_ISSET(node, JNODE_DIRTY));
75468 +
75469 + JF_SET(node, JNODE_DIRTY);
75470 +
75471 + get_current_context()->nr_marked_dirty++;
75472 +
75473 +	/* We grab2flush_reserve one additional block only if node was
75474 +	   not CREATED and jnode_flush did not sort it into either the
75475 +	   relocate set or the overwrite set. If node is in the overwrite
75476 +	   or relocate set we assume that atom's flush reserved counter
75477 +	   was already adjusted. */
75478 + if (!JF_ISSET(node, JNODE_CREATED) && !JF_ISSET(node, JNODE_RELOC)
75479 + && !JF_ISSET(node, JNODE_OVRWR) && jnode_is_leaf(node)
75480 + && !jnode_is_cluster_page(node)) {
75481 + assert("vs-1093", !reiser4_blocknr_is_fake(&node->blocknr));
75482 + assert("vs-1506", *jnode_get_block(node) != 0);
75483 + grabbed2flush_reserved_nolock(atom, (__u64) 1);
75484 + JF_SET(node, JNODE_FLUSH_RESERVED);
75485 + }
75486 +
75487 + if (!JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
75488 + /* If the atom is not set yet, it will be added to the appropriate list in
75489 + capture_assign_block_nolock. */
75490 + /* Sometimes a node is set dirty before being captured -- the case for new
75491 + jnodes. In that case the jnode will be added to the appropriate list
75492 + in capture_assign_block_nolock. Another reason not to re-link jnode is
75493 + that jnode is on a flush queue (see flush.c for details) */
75494 +
75495 + int level = jnode_get_level(node);
75496 +
75497 + assert("nikita-3152", !JF_ISSET(node, JNODE_OVRWR));
75498 + assert("zam-654", atom->stage < ASTAGE_PRE_COMMIT);
75499 + assert("nikita-2607", 0 <= level);
75500 + assert("nikita-2606", level <= REAL_MAX_ZTREE_HEIGHT);
75501 +
75502 + /* move node to atom's dirty list */
75503 + list_move_tail(&node->capture_link, ATOM_DIRTY_LIST(atom, level));
75504 + ON_DEBUG(count_jnode
75505 + (atom, node, NODE_LIST(node), DIRTY_LIST, 1));
75506 + }
75507 +}
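+
+/* Editorial sketch (hedged) of the reservation rule above: dirtying an
+ * already-allocated leaf that flush has not yet sorted into the relocate
+ * or overwrite set moves exactly one block from the context's grabbed
+ * counter to atom->flush_reserved and tags the node JNODE_FLUSH_RESERVED,
+ * so that writing this node at commit time cannot fail for lack of
+ * reserved space. See block_alloc.c for the full scheme. */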
75508 +
75509 +/* Set the dirty status for this (spin locked) jnode. */
75510 +void jnode_make_dirty_locked(jnode * node)
75511 +{
75512 + assert("umka-204", node != NULL);
75513 + assert_spin_locked(&(node->guard));
75514 +
75515 + if (REISER4_DEBUG && rofs_jnode(node)) {
75516 + warning("nikita-3365", "Dirtying jnode on rofs");
75517 + dump_stack();
75518 + }
75519 +
75520 + /* Fast check for already dirty node */
75521 + if (!JF_ISSET(node, JNODE_DIRTY)) {
75522 + txn_atom *atom;
75523 +
75524 + atom = jnode_get_atom(node);
75525 + assert("vs-1094", atom);
75526 + /* Check jnode dirty status again because node spin lock might
75527 + * be released inside jnode_get_atom(). */
75528 + if (likely(!JF_ISSET(node, JNODE_DIRTY)))
75529 + do_jnode_make_dirty(node, atom);
75530 + spin_unlock_atom(atom);
75531 + }
75532 +}
75533 +
75534 +/* Set the dirty status for this znode. */
75535 +void znode_make_dirty(znode * z)
75536 +{
75537 + jnode *node;
75538 + struct page *page;
75539 +
75540 + assert("umka-204", z != NULL);
75541 + assert("nikita-3290", znode_above_root(z) || znode_is_loaded(z));
75542 + assert("nikita-3560", znode_is_write_locked(z));
75543 +
75544 + node = ZJNODE(z);
75545 + /* znode is longterm locked, we can check dirty bit without spinlock */
75546 + if (JF_ISSET(node, JNODE_DIRTY)) {
75547 + /* znode is dirty already. All we have to do is to change znode version */
75548 + z->version = znode_build_version(jnode_get_tree(node));
75549 + return;
75550 + }
75551 +
75552 + spin_lock_jnode(node);
75553 + jnode_make_dirty_locked(node);
75554 + page = jnode_page(node);
75555 + if (page != NULL) {
75556 + /* this is useful assertion (allows one to check that no
75557 + * modifications are lost due to update of in-flight page),
75558 + * but it requires locking on page to check PG_writeback
75559 + * bit. */
75560 + /* assert("nikita-3292",
75561 + !PageWriteback(page) || ZF_ISSET(z, JNODE_WRITEBACK)); */
75562 + page_cache_get(page);
75563 +
75564 + /* jnode lock is not needed for the rest of
75565 + * znode_set_dirty(). */
75566 + spin_unlock_jnode(node);
75567 + /* reiser4 file write code calls set_page_dirty for
75568 + * unformatted nodes, for formatted nodes we do it here. */
75569 + reiser4_set_page_dirty_internal(page);
75570 + page_cache_release(page);
75571 + /* bump version counter in znode */
75572 + z->version = znode_build_version(jnode_get_tree(node));
75573 + } else {
75574 + assert("zam-596", znode_above_root(JZNODE(node)));
75575 + spin_unlock_jnode(node);
75576 + }
75577 +
75578 + assert("nikita-1900", znode_is_write_locked(z));
75579 + assert("jmacd-9777", node->atom != NULL);
75580 +}
75581 +
75582 +int reiser4_sync_atom(txn_atom * atom)
75583 +{
75584 + int result;
75585 + txn_handle *txnh;
75586 +
75587 + txnh = get_current_context()->trans;
75588 +
75589 + result = 0;
75590 + if (atom != NULL) {
75591 + if (atom->stage < ASTAGE_PRE_COMMIT) {
75592 + spin_lock_txnh(txnh);
75593 + capture_assign_txnh_nolock(atom, txnh);
75594 + result = force_commit_atom(txnh);
75595 + } else if (atom->stage < ASTAGE_POST_COMMIT) {
75596 + /* wait atom commit */
75597 + reiser4_atom_wait_event(atom);
75598 + /* try once more */
75599 + result = RETERR(-E_REPEAT);
75600 + } else
75601 + spin_unlock_atom(atom);
75602 + }
75603 + return result;
75604 +}
75605 +
75606 +#if REISER4_DEBUG
75607 +
75608 +/* move jnode from one list to another;
75609 +   call this after atom->capture_count is updated */
75610 +void
75611 +count_jnode(txn_atom * atom, jnode * node, atom_list old_list,
75612 + atom_list new_list, int check_lists)
75613 +{
75614 + struct list_head *pos;
75615 +
75616 + assert("zam-1018", atom_is_protected(atom));
75617 + assert_spin_locked(&(node->guard));
75618 + assert("", NODE_LIST(node) == old_list);
75619 +
75620 + switch (NODE_LIST(node)) {
75621 + case NOT_CAPTURED:
75622 + break;
75623 + case DIRTY_LIST:
75624 + assert("", atom->dirty > 0);
75625 + atom->dirty--;
75626 + break;
75627 + case CLEAN_LIST:
75628 + assert("", atom->clean > 0);
75629 + atom->clean--;
75630 + break;
75631 + case FQ_LIST:
75632 + assert("", atom->fq > 0);
75633 + atom->fq--;
75634 + break;
75635 + case WB_LIST:
75636 + assert("", atom->wb > 0);
75637 + atom->wb--;
75638 + break;
75639 + case OVRWR_LIST:
75640 + assert("", atom->ovrwr > 0);
75641 + atom->ovrwr--;
75642 + break;
75643 + default:
75644 + impossible("", "");
75645 + }
75646 +
75647 + switch (new_list) {
75648 + case NOT_CAPTURED:
75649 + break;
75650 + case DIRTY_LIST:
75651 + atom->dirty++;
75652 + break;
75653 + case CLEAN_LIST:
75654 + atom->clean++;
75655 + break;
75656 + case FQ_LIST:
75657 + atom->fq++;
75658 + break;
75659 + case WB_LIST:
75660 + atom->wb++;
75661 + break;
75662 + case OVRWR_LIST:
75663 + atom->ovrwr++;
75664 + break;
75665 + default:
75666 + impossible("", "");
75667 + }
75668 + ASSIGN_NODE_LIST(node, new_list);
75669 + if (0 && check_lists) {
75670 + int count;
75671 + tree_level level;
75672 +
75673 + count = 0;
75674 +
75675 + /* flush queue list */
75676 + /* reiser4_check_fq(atom); */
75677 +
75678 + /* dirty list */
75679 + count = 0;
75680 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
75681 + list_for_each(pos, ATOM_DIRTY_LIST(atom, level))
75682 + count++;
75683 + }
75684 + if (count != atom->dirty)
75685 + warning("", "dirty counter %d, real %d\n", atom->dirty,
75686 + count);
75687 +
75688 + /* clean list */
75689 + count = 0;
75690 + list_for_each(pos, ATOM_CLEAN_LIST(atom))
75691 + count++;
75692 + if (count != atom->clean)
75693 + warning("", "clean counter %d, real %d\n", atom->clean,
75694 + count);
75695 +
75696 + /* wb list */
75697 + count = 0;
75698 + list_for_each(pos, ATOM_WB_LIST(atom))
75699 + count++;
75700 + if (count != atom->wb)
75701 + warning("", "wb counter %d, real %d\n", atom->wb,
75702 + count);
75703 +
75704 + /* overwrite list */
75705 + count = 0;
75706 + list_for_each(pos, ATOM_OVRWR_LIST(atom))
75707 + count++;
75708 +
75709 + if (count != atom->ovrwr)
75710 + warning("", "ovrwr counter %d, real %d\n", atom->ovrwr,
75711 + count);
75712 + }
75713 + assert("vs-1624", atom->num_queued == atom->fq);
75714 + if (atom->capture_count !=
75715 + atom->dirty + atom->clean + atom->ovrwr + atom->wb + atom->fq) {
75716 + printk
75717 + ("count %d, dirty %d clean %d ovrwr %d wb %d fq %d\n",
75718 + atom->capture_count, atom->dirty, atom->clean, atom->ovrwr,
75719 + atom->wb, atom->fq);
75720 + assert("vs-1622",
75721 + atom->capture_count ==
75722 + atom->dirty + atom->clean + atom->ovrwr + atom->wb +
75723 + atom->fq);
75724 + }
75725 +}
75726 +
75727 +#endif
75728 +
75729 +/* Make node OVRWR and put it on the atom->overwrite_nodes list; atom lock and
75730 + * jnode lock should be taken before calling this function. */
75731 +void jnode_make_wander_nolock(jnode * node)
75732 +{
75733 + txn_atom *atom;
75734 +
75735 + assert("nikita-2431", node != NULL);
75736 + assert("nikita-2432", !JF_ISSET(node, JNODE_RELOC));
75737 + assert("nikita-3153", JF_ISSET(node, JNODE_DIRTY));
75738 + assert("zam-897", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75739 + assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
75740 +
75741 + atom = node->atom;
75742 +
75743 + assert("zam-895", atom != NULL);
75744 + assert("zam-894", atom_is_protected(atom));
75745 +
75746 + JF_SET(node, JNODE_OVRWR);
75747 + /* move node to atom's overwrite list */
75748 + list_move_tail(&node->capture_link, ATOM_OVRWR_LIST(atom));
75749 + ON_DEBUG(count_jnode(atom, node, DIRTY_LIST, OVRWR_LIST, 1));
75750 +}
75751 +
75752 +/* Same as jnode_make_wander_nolock, but all necessary locks are taken inside
75753 + * this function. */
75754 +void jnode_make_wander(jnode * node)
75755 +{
75756 + txn_atom *atom;
75757 +
75758 + spin_lock_jnode(node);
75759 + atom = jnode_get_atom(node);
75760 + assert("zam-913", atom != NULL);
75761 + assert("zam-914", !JF_ISSET(node, JNODE_RELOC));
75762 +
75763 + jnode_make_wander_nolock(node);
75764 + spin_unlock_atom(atom);
75765 + spin_unlock_jnode(node);
75766 +}
75767 +
75768 +/* this just sets RELOC bit */
75769 +static void jnode_make_reloc_nolock(flush_queue_t * fq, jnode * node)
75770 +{
75771 + assert_spin_locked(&(node->guard));
75772 + assert("zam-916", JF_ISSET(node, JNODE_DIRTY));
75773 + assert("zam-917", !JF_ISSET(node, JNODE_RELOC));
75774 + assert("zam-918", !JF_ISSET(node, JNODE_OVRWR));
75775 + assert("zam-920", !JF_ISSET(node, JNODE_FLUSH_QUEUED));
75776 + assert("nikita-3367", !reiser4_blocknr_is_fake(jnode_get_block(node)));
75777 + jnode_set_reloc(node);
75778 +}
75779 +
75780 +/* Make znode RELOC and put it on flush queue */
75781 +void znode_make_reloc(znode * z, flush_queue_t * fq)
75782 +{
75783 + jnode *node;
75784 + txn_atom *atom;
75785 +
75786 + node = ZJNODE(z);
75787 + spin_lock_jnode(node);
75788 +
75789 + atom = jnode_get_atom(node);
75790 + assert("zam-919", atom != NULL);
75791 +
75792 + jnode_make_reloc_nolock(fq, node);
75793 + queue_jnode(fq, node);
75794 +
75795 + spin_unlock_atom(atom);
75796 + spin_unlock_jnode(node);
75797 +
75798 +}
75799 +
75800 +/* Make unformatted node RELOC and put it on flush queue */
75801 +void unformatted_make_reloc(jnode *node, flush_queue_t *fq)
75802 +{
75803 + assert("vs-1479", jnode_is_unformatted(node));
75804 +
75805 + jnode_make_reloc_nolock(fq, node);
75806 + queue_jnode(fq, node);
75807 +}
75808 +
75809 +int reiser4_capture_super_block(struct super_block *s)
75810 +{
75811 + int result;
75812 + znode *uber;
75813 + lock_handle lh;
75814 +
75815 + init_lh(&lh);
75816 + result = get_uber_znode(reiser4_get_tree(s),
75817 + ZNODE_WRITE_LOCK, ZNODE_LOCK_LOPRI, &lh);
75818 + if (result)
75819 + return result;
75820 +
75821 + uber = lh.node;
75822 + /* Grabbing one block for superblock */
75823 + result = reiser4_grab_space_force((__u64) 1, BA_RESERVED);
75824 +	if (result != 0) {
+		done_lh(&lh); /* editorial fix: release the uber znode lock on the error path */
75825 +		return result;
+	}
75826 +
75827 + znode_make_dirty(uber);
75828 +
75829 + done_lh(&lh);
75830 + return 0;
75831 +}
75832 +
75833 +/* Wakeup every handle on the atom's WAITFOR list */
75834 +static void wakeup_atom_waitfor_list(txn_atom * atom)
75835 +{
75836 + txn_wait_links *wlinks;
75837 +
75838 + assert("umka-210", atom != NULL);
75839 +
75840 + /* atom is locked */
75841 + list_for_each_entry(wlinks, &atom->fwaitfor_list, _fwaitfor_link) {
75842 + if (wlinks->waitfor_cb == NULL ||
75843 + wlinks->waitfor_cb(atom, wlinks))
75844 + /* Wake up. */
75845 + reiser4_wake_up(wlinks->_lock_stack);
75846 + }
75847 +}
75848 +
75849 +/* Wakeup every handle on the atom's WAITING list */
75850 +static void wakeup_atom_waiting_list(txn_atom * atom)
75851 +{
75852 + txn_wait_links *wlinks;
75853 +
75854 + assert("umka-211", atom != NULL);
75855 +
75856 + /* atom is locked */
75857 + list_for_each_entry(wlinks, &atom->fwaiting_list, _fwaiting_link) {
75858 + if (wlinks->waiting_cb == NULL ||
75859 + wlinks->waiting_cb(atom, wlinks))
75860 + /* Wake up. */
75861 + reiser4_wake_up(wlinks->_lock_stack);
75862 + }
75863 +}
75864 +
75865 +/* helper function used by capture_fuse_wait() to avoid "spurious wake-ups" */
75866 +static int wait_for_fusion(txn_atom * atom, txn_wait_links * wlinks)
75867 +{
75868 + assert("nikita-3330", atom != NULL);
75869 + assert_spin_locked(&(atom->alock));
75870 +
75871 +	/* atom->txnh_count == 1 is for waking waiters up if we are releasing
75872 +	 * the last transaction handle. */
75873 + return atom->stage != ASTAGE_CAPTURE_WAIT || atom->txnh_count == 1;
75874 +}
75875 +
75876 +/* The general purpose of this function is to wait on the first of two possible events.
75877 + The situation is that a handle (and its atom atomh) is blocked trying to capture a
75878 + block (i.e., node) but the node's atom (atomf) is in the CAPTURE_WAIT state. The
75879 + handle's atom (atomh) is not in the CAPTURE_WAIT state. However, atomh could fuse with
75880 + another atom or, due to age, enter the CAPTURE_WAIT state itself, at which point it
75881 + needs to unblock the handle to avoid deadlock. When the txnh is unblocked it will
75882 + proceed and fuse the two atoms in the CAPTURE_WAIT state.
75883 +
75884 + In other words, if either atomh or atomf change state, the handle will be awakened,
75885 + thus there are two lists per atom: WAITING and WAITFOR.
75886 +
75887 +   This is also called by capture_assign_txnh with (atomh == NULL) to wait for atomf to
75888 +   close when the handle is not yet assigned to an atom of its own.
75889 +
75890 + Lock ordering in this method: all four locks are held: JNODE_LOCK, TXNH_LOCK,
75891 + BOTH_ATOM_LOCKS. Result: all four locks are released.
75892 +*/
75893 +static int capture_fuse_wait(txn_handle * txnh, txn_atom * atomf,
75894 + txn_atom * atomh, txn_capture mode)
75895 +{
75896 + int ret;
75897 + txn_wait_links wlinks;
75898 +
75899 + assert("umka-213", txnh != NULL);
75900 + assert("umka-214", atomf != NULL);
75901 +
75902 + if ((mode & TXN_CAPTURE_NONBLOCKING) != 0) {
75903 + spin_unlock_txnh(txnh);
75904 + spin_unlock_atom(atomf);
75905 +
75906 + if (atomh) {
75907 + spin_unlock_atom(atomh);
75908 + }
75909 +
75910 + return RETERR(-E_BLOCK);
75911 + }
75912 +
75913 + /* Initialize the waiting list links. */
75914 + init_wlinks(&wlinks);
75915 +
75916 + /* Add txnh to atomf's waitfor list, unlock atomf. */
75917 + list_add_tail(&wlinks._fwaitfor_link, &atomf->fwaitfor_list);
75918 + wlinks.waitfor_cb = wait_for_fusion;
75919 + atomic_inc(&atomf->refcount);
75920 + spin_unlock_atom(atomf);
75921 +
75922 + if (atomh) {
75923 + /* Add txnh to atomh's waiting list, unlock atomh. */
75924 + list_add_tail(&wlinks._fwaiting_link, &atomh->fwaiting_list);
75925 + atomic_inc(&atomh->refcount);
75926 + spin_unlock_atom(atomh);
75927 + }
75928 +
75929 + /* Go to sleep. */
75930 + spin_unlock_txnh(txnh);
75931 +
75932 + ret = reiser4_prepare_to_sleep(wlinks._lock_stack);
75933 + if (ret == 0) {
75934 + reiser4_go_to_sleep(wlinks._lock_stack);
75935 + ret = RETERR(-E_REPEAT);
75936 + }
75937 +
75938 + /* Remove from the waitfor list. */
75939 + spin_lock_atom(atomf);
75940 +
75941 + list_del(&wlinks._fwaitfor_link);
75942 + atom_dec_and_unlock(atomf);
75943 +
75944 + if (atomh) {
75945 + /* Remove from the waiting list. */
75946 + spin_lock_atom(atomh);
75947 + list_del(&wlinks._fwaiting_link);
75948 + atom_dec_and_unlock(atomh);
75949 + }
75950 + return ret;
75951 +}
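+
+/* Editorial sketch (not part of the original patch) of the two-list wait
+ * protocol above:
+ *
+ *   atomf->fwaitfor_list : "wake me when atomf changes state"
+ *                          (filtered through wait_for_fusion())
+ *   atomh->fwaiting_list : "wake me when my own atom changes state"
+ *
+ * Whichever event fires first wakes the lock stack via reiser4_wake_up();
+ * the sleeper removes itself from both lists and returns -E_REPEAT so the
+ * whole capture request is retried from scratch.
+ */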
75952 +
75953 +static void lock_two_atoms(txn_atom * one, txn_atom * two)
75954 +{
75955 + assert("zam-1067", one != two);
75956 +
75957 + /* lock the atom with lesser address first */
75958 + if (one < two) {
75959 + spin_lock_atom(one);
75960 + spin_lock_atom_nested(two);
75961 + } else {
75962 + spin_lock_atom(two);
75963 + spin_lock_atom_nested(one);
75964 + }
75965 +}
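+
+/* Editorial note (not part of the original patch): ordering the two
+ * acquisitions by address rules out ABBA deadlock. If one thread did
+ * spin_lock_atom(a); spin_lock_atom(b); while another did
+ * spin_lock_atom(b); spin_lock_atom(a); both could block forever; with
+ * lock_two_atoms() every thread locks the lower-addressed atom first, so
+ * the two orders can never cross. */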
75966 +
75967 +/* Perform the necessary work to prepare for fusing two atoms, which involves
75968 + * acquiring two atom locks in the proper order. If the node's atom is
75969 + * blocking fusion (i.e., it is in the CAPTURE_WAIT stage) and the handle's
75970 + * atom is not, then the handle's request is put to sleep. If the node's atom
75971 + * is committing, then the node can be copy-on-captured. Otherwise, pick the
75972 + * atom with fewer pointers to be fused into the atom with more pointers and
75973 + * call capture_fuse_into.
75974 + */
75975 +static int capture_init_fusion(jnode *node, txn_handle *txnh, txn_capture mode)
75976 +{
75977 + txn_atom * txnh_atom = txnh->atom;
75978 + txn_atom * block_atom = node->atom;
75979 +
75980 + atomic_inc(&txnh_atom->refcount);
75981 + atomic_inc(&block_atom->refcount);
75982 +
75983 + spin_unlock_txnh(txnh);
75984 + spin_unlock_jnode(node);
75985 +
75986 + lock_two_atoms(txnh_atom, block_atom);
75987 +
75988 + if (txnh->atom != txnh_atom || node->atom != block_atom ) {
75989 + release_two_atoms(txnh_atom, block_atom);
75990 + return RETERR(-E_REPEAT);
75991 + }
75992 +
75993 + atomic_dec(&txnh_atom->refcount);
75994 + atomic_dec(&block_atom->refcount);
75995 +
75996 + assert ("zam-1066", atom_isopen(txnh_atom));
75997 +
75998 + if (txnh_atom->stage >= block_atom->stage ||
75999 + (block_atom->stage == ASTAGE_CAPTURE_WAIT && block_atom->txnh_count == 0)) {
76000 + capture_fuse_into(txnh_atom, block_atom);
76001 + return RETERR(-E_REPEAT);
76002 + }
76003 + spin_lock_txnh(txnh);
76004 + return capture_fuse_wait(txnh, block_atom, txnh_atom, mode);
76005 +}
76006 +
76007 +/* This function splices together two jnode lists (small and large) and sets all jnodes in
76008 + the small list to point to the large atom. Returns the length of the list. */
76009 +static int
76010 +capture_fuse_jnode_lists(txn_atom *large, struct list_head *large_head,
76011 + struct list_head *small_head)
76012 +{
76013 + int count = 0;
76014 + jnode *node;
76015 +
76016 + assert("umka-218", large != NULL);
76017 + assert("umka-219", large_head != NULL);
76018 + assert("umka-220", small_head != NULL);
76019 + /* small atom should be locked also. */
76020 + assert_spin_locked(&(large->alock));
76021 +
76022 + /* For every jnode on small's capture list... */
76023 + list_for_each_entry(node, small_head, capture_link) {
76024 + count += 1;
76025 +
76026 + /* With the jnode lock held, update atom pointer. */
76027 + spin_lock_jnode(node);
76028 + node->atom = large;
76029 + spin_unlock_jnode(node);
76030 + }
76031 +
76032 + /* Splice the lists. */
76033 + list_splice_init(small_head, large_head->prev);
76034 +
76035 + return count;
76036 +}
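+
+/* Editorial note (hedged): list_splice_init(list, head) inserts @list
+ * immediately after @head and reinitializes @list; passing
+ * large_head->prev as @head therefore appends the small atom's nodes at
+ * the tail of the large atom's list, preserving their relative order. */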
76037 +
76038 +/* This function splices together two txnh lists (small and large) and sets all txn handles in
76039 + the small list to point to the large atom. Returns the length of the list. */
76040 +static int
76041 +capture_fuse_txnh_lists(txn_atom *large, struct list_head *large_head,
76042 + struct list_head *small_head)
76043 +{
76044 + int count = 0;
76045 + txn_handle *txnh;
76046 +
76047 + assert("umka-221", large != NULL);
76048 + assert("umka-222", large_head != NULL);
76049 + assert("umka-223", small_head != NULL);
76050 +
76051 + /* Adjust every txnh to the new atom. */
76052 + list_for_each_entry(txnh, small_head, txnh_link) {
76053 + count += 1;
76054 +
76055 + /* With the txnh lock held, update atom pointer. */
76056 + spin_lock_txnh(txnh);
76057 + txnh->atom = large;
76058 + spin_unlock_txnh(txnh);
76059 + }
76060 +
76061 + /* Splice the txn_handle list. */
76062 + list_splice_init(small_head, large_head->prev);
76063 +
76064 + return count;
76065 +}
76066 +
76067 +/* This function fuses two atoms. The captured nodes and handles belonging to SMALL are
76068 + added to LARGE and their ->atom pointers are all updated. The associated counts are
76069 + updated as well, and any waiting handles belonging to either are awakened. Finally the
76070 + smaller atom's refcount is decremented.
76071 +*/
76072 +static void capture_fuse_into(txn_atom * small, txn_atom * large)
76073 +{
76074 + int level;
76075 + unsigned zcount = 0;
76076 + unsigned tcount = 0;
76077 +
76078 + assert("umka-224", small != NULL);
76079 +	assert("umka-225", large != NULL);
76080 +
76081 + assert_spin_locked(&(large->alock));
76082 + assert_spin_locked(&(small->alock));
76083 +
76084 + assert("jmacd-201", atom_isopen(small));
76085 + assert("jmacd-202", atom_isopen(large));
76086 +
76087 + /* Splice and update the per-level dirty jnode lists */
76088 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; level += 1) {
76089 + zcount +=
76090 + capture_fuse_jnode_lists(large,
76091 + ATOM_DIRTY_LIST(large, level),
76092 + ATOM_DIRTY_LIST(small, level));
76093 + }
76094 +
76095 + /* Splice and update the [clean,dirty] jnode and txnh lists */
76096 + zcount +=
76097 + capture_fuse_jnode_lists(large, ATOM_CLEAN_LIST(large),
76098 + ATOM_CLEAN_LIST(small));
76099 + zcount +=
76100 + capture_fuse_jnode_lists(large, ATOM_OVRWR_LIST(large),
76101 + ATOM_OVRWR_LIST(small));
76102 + zcount +=
76103 + capture_fuse_jnode_lists(large, ATOM_WB_LIST(large),
76104 + ATOM_WB_LIST(small));
76105 + zcount +=
76106 + capture_fuse_jnode_lists(large, &large->inodes, &small->inodes);
76107 + tcount +=
76108 + capture_fuse_txnh_lists(large, &large->txnh_list,
76109 + &small->txnh_list);
76110 +
76111 + /* Check our accounting. */
76112 + assert("jmacd-1063",
76113 + zcount + small->num_queued == small->capture_count);
76114 + assert("jmacd-1065", tcount == small->txnh_count);
76115 +
76116 +	/* sum the numbers of waiting threads */
76117 + large->nr_waiters += small->nr_waiters;
76118 + small->nr_waiters = 0;
76119 +
76120 + /* splice flush queues */
76121 + reiser4_fuse_fq(large, small);
76122 +
76123 +	/* update the counters of jnodes on each of the atom's lists */
76124 + ON_DEBUG(large->dirty += small->dirty;
76125 + small->dirty = 0;
76126 + large->clean += small->clean;
76127 + small->clean = 0;
76128 + large->ovrwr += small->ovrwr;
76129 + small->ovrwr = 0;
76130 + large->wb += small->wb;
76131 + small->wb = 0;
76132 + large->fq += small->fq;
76133 + small->fq = 0;);
76134 +
76135 + /* count flushers in result atom */
76136 + large->nr_flushers += small->nr_flushers;
76137 + small->nr_flushers = 0;
76138 +
76139 + /* update counts of flushed nodes */
76140 + large->flushed += small->flushed;
76141 + small->flushed = 0;
76142 +
76143 + /* Transfer list counts to large. */
76144 + large->txnh_count += small->txnh_count;
76145 + large->capture_count += small->capture_count;
76146 +
76147 + /* Add all txnh references to large. */
76148 + atomic_add(small->txnh_count, &large->refcount);
76149 + atomic_sub(small->txnh_count, &small->refcount);
76150 +
76151 + /* Reset small counts */
76152 + small->txnh_count = 0;
76153 + small->capture_count = 0;
76154 +
76155 + /* Assign the oldest start_time, merge flags. */
76156 + large->start_time = min(large->start_time, small->start_time);
76157 + large->flags |= small->flags;
76158 +
76159 + /* Merge blocknr sets. */
76160 + blocknr_set_merge(&small->delete_set, &large->delete_set);
76161 + blocknr_set_merge(&small->wandered_map, &large->wandered_map);
76162 +
76163 + /* Merge allocated/deleted file counts */
76164 + large->nr_objects_deleted += small->nr_objects_deleted;
76165 + large->nr_objects_created += small->nr_objects_created;
76166 +
76167 + small->nr_objects_deleted = 0;
76168 + small->nr_objects_created = 0;
76169 +
76170 + /* Merge allocated blocks counts */
76171 + large->nr_blocks_allocated += small->nr_blocks_allocated;
76172 +
76173 + large->nr_running_queues += small->nr_running_queues;
76174 + small->nr_running_queues = 0;
76175 +
76176 + /* Merge blocks reserved for overwrite set. */
76177 + large->flush_reserved += small->flush_reserved;
76178 + small->flush_reserved = 0;
76179 +
76180 + if (large->stage < small->stage) {
76181 + /* Large only needs to notify if it has changed state. */
76182 + reiser4_atom_set_stage(large, small->stage);
76183 + wakeup_atom_waiting_list(large);
76184 + }
76185 +
76186 + reiser4_atom_set_stage(small, ASTAGE_INVALID);
76187 +
76188 + /* Notify any waiters--small needs to unload its wait lists. Waiters
76189 + actually remove themselves from the list before returning from the
76190 + fuse_wait function. */
76191 + wakeup_atom_waiting_list(small);
76192 +
76193 + /* Unlock atoms */
76194 + spin_unlock_atom(large);
76195 + atom_dec_and_unlock(small);
76196 +}
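+
+/* Editorial sketch (not part of the original patch) of the post-condition
+ * of capture_fuse_into(), with "old_*" denoting counts taken before the
+ * call:
+ *
+ *   large->capture_count == old_large_captures + old_small_captures
+ *   large->txnh_count    == old_large_txnhs + old_small_txnhs
+ *   small->stage         == ASTAGE_INVALID
+ *
+ * and every captured jnode and open txn_handle that pointed at @small now
+ * points at @large; @small survives only until its last reference drops.
+ */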
76197 +
76198 +/* TXNMGR STUFF */
76199 +
76200 +/* Release a block from the atom, reversing the effects of being captured;
76201 +   do not release atom's reference to jnode due to holding spin-locks.
76202 +   Currently this is only called when the atom commits.
76203 +
76204 +   NOTE: this function does not release a (journal) reference to jnode
76205 +   due to locking optimizations; you should call jput() somewhere after
76206 +   calling reiser4_uncapture_block(). */
76207 +void reiser4_uncapture_block(jnode * node)
76208 +{
76209 + txn_atom *atom;
76210 +
76211 + assert("umka-226", node != NULL);
76212 + atom = node->atom;
76213 + assert("umka-228", atom != NULL);
76214 +
76215 + assert("jmacd-1021", node->atom == atom);
76216 + assert_spin_locked(&(node->guard));
76217 + assert("jmacd-1023", atom_is_protected(atom));
76218 +
76219 + JF_CLR(node, JNODE_DIRTY);
76220 + JF_CLR(node, JNODE_RELOC);
76221 + JF_CLR(node, JNODE_OVRWR);
76222 + JF_CLR(node, JNODE_CREATED);
76223 + JF_CLR(node, JNODE_WRITEBACK);
76224 + JF_CLR(node, JNODE_REPACK);
76225 +
76226 + list_del_init(&node->capture_link);
76227 + if (JF_ISSET(node, JNODE_FLUSH_QUEUED)) {
76228 + assert("zam-925", atom_isopen(atom));
76229 + assert("vs-1623", NODE_LIST(node) == FQ_LIST);
76230 + ON_DEBUG(atom->num_queued--);
76231 + JF_CLR(node, JNODE_FLUSH_QUEUED);
76232 + }
76233 + atom->capture_count -= 1;
76234 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), NOT_CAPTURED, 1));
76235 + node->atom = NULL;
76236 +
76237 + spin_unlock_jnode(node);
76238 + LOCK_CNT_DEC(t_refs);
76239 +}
76240 +
76241 +/* Unconditional insert of jnode into atom's overwrite list. Currently used in
76242 +   bitmap-based allocator code for adding modified bitmap blocks to the
76243 +   transaction. @atom and @node are spin locked */
76244 +void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node)
76245 +{
76246 + assert("zam-538", atom_is_protected(atom));
76247 + assert_spin_locked(&(node->guard));
76248 + assert("zam-899", JF_ISSET(node, JNODE_OVRWR));
76249 + assert("zam-543", node->atom == NULL);
76250 + assert("vs-1433", !jnode_is_unformatted(node) && !jnode_is_znode(node));
76251 +
76252 + list_add(&node->capture_link, ATOM_OVRWR_LIST(atom));
76253 + jref(node);
76254 + node->atom = atom;
76255 + atom->capture_count++;
76256 + ON_DEBUG(count_jnode(atom, node, NODE_LIST(node), OVRWR_LIST, 1));
76257 +}
76258 +
76259 +static int count_deleted_blocks_actor(txn_atom * atom,
76260 + const reiser4_block_nr * a,
76261 + const reiser4_block_nr * b, void *data)
76262 +{
76263 + reiser4_block_nr *counter = data;
76264 +
76265 + assert("zam-995", data != NULL);
76266 + assert("zam-996", a != NULL);
76267 + if (b == NULL)
76268 + *counter += 1;
76269 + else
76270 + *counter += *b;
76271 + return 0;
76272 +}
76273 +
76274 +reiser4_block_nr txnmgr_count_deleted_blocks(void)
76275 +{
76276 + reiser4_block_nr result;
76277 + txn_mgr *tmgr = &get_super_private(reiser4_get_current_sb())->tmgr;
76278 + txn_atom *atom;
76279 +
76280 + result = 0;
76281 +
76282 + spin_lock_txnmgr(tmgr);
76283 + list_for_each_entry(atom, &tmgr->atoms_list, atom_link) {
76284 + spin_lock_atom(atom);
76285 + if (atom_isopen(atom))
76286 + blocknr_set_iterator(
76287 + atom, &atom->delete_set,
76288 + count_deleted_blocks_actor, &result, 0);
76289 + spin_unlock_atom(atom);
76290 + }
76291 + spin_unlock_txnmgr(tmgr);
76292 +
76293 + return result;
76294 +}
76295 +
76296 +/*
76297 + * Local variables:
76298 + * c-indentation-style: "K&R"
76299 + * mode-name: "LC"
76300 + * c-basic-offset: 8
76301 + * tab-width: 8
76302 + * fill-column: 79
76303 + * End:
76304 + */
76305 diff --git a/fs/reiser4/txnmgr.h b/fs/reiser4/txnmgr.h
76306 new file mode 100644
76307 index 0000000..6ad4b5a
76308 --- /dev/null
76309 +++ b/fs/reiser4/txnmgr.h
76310 @@ -0,0 +1,708 @@
76311 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
76312 + * reiser4/README */
76313 +
76314 +/* data-types and function declarations for transaction manager. See txnmgr.c
76315 + * for details. */
76316 +
76317 +#ifndef __REISER4_TXNMGR_H__
76318 +#define __REISER4_TXNMGR_H__
76319 +
76320 +#include "forward.h"
76321 +#include "dformat.h"
76322 +
76323 +#include <linux/fs.h>
76324 +#include <linux/mm.h>
76325 +#include <linux/types.h>
76326 +#include <linux/spinlock.h>
76327 +#include <asm/atomic.h>
76328 +#include <linux/wait.h>
76329 +
76330 +/* TYPE DECLARATIONS */
76331 +
76332 +/* This enumeration describes the possible types of a capture request (reiser4_try_capture).
76333 + A capture request dynamically assigns a block to the calling thread's transaction
76334 + handle. */
76335 +typedef enum {
76336 + /* A READ_ATOMIC request indicates that a block will be read and that the caller's
76337 + atom should fuse in order to ensure that the block commits atomically with the
76338 + caller. */
76339 + TXN_CAPTURE_READ_ATOMIC = (1 << 0),
76340 +
76341 + /* A READ_NONCOM request indicates that a block will be read and that the caller is
76342 + willing to read a non-committed block without causing atoms to fuse. */
76343 + TXN_CAPTURE_READ_NONCOM = (1 << 1),
76344 +
76345 + /* A READ_MODIFY request indicates that a block will be read but that the caller
76346 + wishes for the block to be captured as it will be written. This capture request
76347 + mode is not currently used, but eventually it will be useful for preventing
76348 + deadlock in read-modify-write cycles. */
76349 + TXN_CAPTURE_READ_MODIFY = (1 << 2),
76350 +
76351 + /* A WRITE capture request indicates that a block will be modified and that atoms
76352 + should fuse to make the commit atomic. */
76353 + TXN_CAPTURE_WRITE = (1 << 3),
76354 +
76355 + /* CAPTURE_TYPES is a mask of the four above capture types, used to separate the
76356 + exclusive type designation from extra bits that may be supplied -- see
76357 + below. */
76358 + TXN_CAPTURE_TYPES = (TXN_CAPTURE_READ_ATOMIC |
76359 + TXN_CAPTURE_READ_NONCOM | TXN_CAPTURE_READ_MODIFY |
76360 + TXN_CAPTURE_WRITE),
76361 +
76362 + /* A subset of CAPTURE_TYPES, CAPTURE_WTYPES is a mask of request types that
76363 + indicate modification will occur. */
76364 + TXN_CAPTURE_WTYPES = (TXN_CAPTURE_READ_MODIFY | TXN_CAPTURE_WRITE),
76365 +
76366 + /* An option to reiser4_try_capture, NONBLOCKING indicates that the caller would
76367 + prefer not to sleep waiting for an aging atom to commit. */
76368 + TXN_CAPTURE_NONBLOCKING = (1 << 4),
76369 +
76370 +	/* An option to reiser4_try_capture to prevent atom fusion; only simple
76371 +	   capturing is allowed */
76372 + TXN_CAPTURE_DONT_FUSE = (1 << 5)
76373 +
76374 + /* This macro selects only the exclusive capture request types, stripping out any
76375 + options that were supplied (i.e., NONBLOCKING). */
76376 +#define CAPTURE_TYPE(x) ((x) & TXN_CAPTURE_TYPES)
76377 +} txn_capture;
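+
+/* Editorial usage sketch (not part of the original patch): a request
+ * combines exactly one exclusive type with optional bits, and
+ * CAPTURE_TYPE() strips the options back off:
+ *
+ *   txn_capture m = TXN_CAPTURE_WRITE | TXN_CAPTURE_NONBLOCKING;
+ *   CAPTURE_TYPE(m); // == TXN_CAPTURE_WRITE
+ */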
76378 +
76379 +/* There are two kinds of transaction handle: WRITE_FUSING and READ_FUSING; the only
76380 +   difference is in the handling of read requests. A WRITE_FUSING transaction handle
76381 +   defaults read capture requests to TXN_CAPTURE_READ_NONCOM whereas a READ_FUSING
76382 +   transaction handle defaults to TXN_CAPTURE_READ_ATOMIC. */
76383 +typedef enum {
76384 + TXN_WRITE_FUSING = (1 << 0),
76385 + TXN_READ_FUSING = (1 << 1) | TXN_WRITE_FUSING, /* READ implies WRITE */
76386 +} txn_mode;
76387 +
76388 +/* Every atom has a stage, which is one of these exclusive values: */
76389 +typedef enum {
76390 + /* Initially an atom is free. */
76391 + ASTAGE_FREE = 0,
76392 +
76393 + /* An atom begins by entering the CAPTURE_FUSE stage, where it proceeds to capture
76394 + blocks and fuse with other atoms. */
76395 + ASTAGE_CAPTURE_FUSE = 1,
76396 +
76397 +	/* We need to have an ASTAGE_CAPTURE_SLOW in which an atom fuses with one node for every X nodes it flushes to disk, where X > 1. */
76398 +
76399 + /* When an atom reaches a certain age it must do all it can to commit. An atom in
76400 + the CAPTURE_WAIT stage refuses new transaction handles and prevents fusion from
76401 + atoms in the CAPTURE_FUSE stage. */
76402 + ASTAGE_CAPTURE_WAIT = 2,
76403 +
76404 + /* Waiting for I/O before commit. Copy-on-capture (see
76405 + http://namesys.com/v4/v4.html). */
76406 + ASTAGE_PRE_COMMIT = 3,
76407 +
76408 + /* Post-commit overwrite I/O. Steal-on-capture. */
76409 + ASTAGE_POST_COMMIT = 4,
76410 +
76411 +	/* Atom which waits for the removal of the last reference to it
76412 +	 * before being deleted from memory */
76413 + ASTAGE_DONE = 5,
76414 +
76415 + /* invalid atom. */
76416 + ASTAGE_INVALID = 6,
76417 +
76418 +} txn_stage;
76419 +
76420 +/* Certain flags may be set in the txn_atom->flags field. */
76421 +typedef enum {
76422 + /* Indicates that the atom should commit as soon as possible. */
76423 + ATOM_FORCE_COMMIT = (1 << 0),
76424 + /* to avoid endless loop, mark the atom (which was considered as too
76425 + * small) after failed attempt to fuse it. */
76426 + ATOM_CANCEL_FUSION = (1 << 1)
76427 +} txn_flags;
76428 +
76429 +/* Flags for controlling commit_txnh */
76430 +typedef enum {
76431 + /* Wait commit atom completion in commit_txnh */
76432 + TXNH_WAIT_COMMIT = 0x2,
76433 + /* Don't commit atom when this handle is closed */
76434 + TXNH_DONT_COMMIT = 0x4
76435 +} txn_handle_flags_t;
76436 +
76437 +/* TYPE DEFINITIONS */
76438 +
76439 +/* A note on lock ordering: the handle and jnode spinlocks protect reading of their ->atom
76440 +   fields, so typically an operation on the atom through either of these objects must (1)
76441 +   lock the object, (2) read the atom pointer, (3) lock the atom.
76442 +
76443 + During atom fusion, the process holds locks on both atoms at once. Then, it iterates
76444 + through the list of handles and pages held by the smaller of the two atoms. For each
76445 + handle and page referencing the smaller atom, the fusing process must: (1) lock the
76446 + object, and (2) update the atom pointer.
76447 +
76448 + You can see that there is a conflict of lock ordering here, so the more-complex
76449 + procedure should have priority, i.e., the fusing process has priority so that it is
76450 + guaranteed to make progress and to avoid restarts.
76451 +
76452 +   This decision, however, means additional complexity for acquiring the atom lock in the
76453 + first place.
76454 +
76455 + The general original procedure followed in the code was:
76456 +
76457 + TXN_OBJECT *obj = ...;
76458 + TXN_ATOM *atom;
76459 +
76460 + spin_lock (& obj->_lock);
76461 +
76462 + atom = obj->_atom;
76463 +
76464 + if (! spin_trylock_atom (atom))
76465 + {
76466 + spin_unlock (& obj->_lock);
76467 + RESTART OPERATION, THERE WAS A RACE;
76468 + }
76469 +
76470 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
76471 +
76472 +   It has however been found that this wastes CPU a lot in a manner that is
76473 +   hard to profile. So, proper refcounting was added to atoms, and the new
76474 +   standard locking sequence is as follows:
76475 +
76476 + TXN_OBJECT *obj = ...;
76477 + TXN_ATOM *atom;
76478 +
76479 + spin_lock (& obj->_lock);
76480 +
76481 + atom = obj->_atom;
76482 +
76483 + if (! spin_trylock_atom (atom))
76484 + {
76485 + atomic_inc (& atom->refcount);
76486 + spin_unlock (& obj->_lock);
76487 + spin_lock (&atom->_lock);
76488 + atomic_dec (& atom->refcount);
76489 + // HERE atom is locked
76490 + spin_unlock (&atom->_lock);
76491 + RESTART OPERATION, THERE WAS A RACE;
76492 + }
76493 +
76494 + ELSE YOU HAVE BOTH ATOM AND OBJ LOCKED
76495 +
76496 + (core of this is implemented in trylock_throttle() function)
76497 +
76498 + See the jnode_get_atom() function for a common case.
76499 +
76500 +   As an additional (and important) optimization that avoids restarts,
76501 +   it is possible to re-check the required pre-conditions at the HERE point in
76502 +   the code above and proceed without restarting if they are still satisfied.
76503 +*/
76504 +
76505 +/* An atomic transaction: this is the underlying system representation
76506 + of a transaction, not the one seen by clients.
76507 +
76508 + Invariants involving this data-type:
76509 +
76510 + [sb-fake-allocated]
76511 +*/
76512 +struct txn_atom {
76513 + /* The spinlock protecting the atom, held during fusion and various other state
76514 + changes. */
76515 + spinlock_t alock;
76516 +
76517 +	/* The atom's reference counter. Increasing it (in case of a duplication
76518 +	   of an existing reference or when we are sure that some other
76519 +	   reference exists) may be done without taking the spinlock;
76520 +	   decrementing the ref. counter requires the spinlock to be held.
76521 +
76522 + Each transaction handle counts in ->refcount. All jnodes count as
76523 + one reference acquired in atom_begin_andlock(), released in
76524 + commit_current_atom().
76525 + */
76526 + atomic_t refcount;
76527 +
76528 + /* The atom_id identifies the atom in persistent records such as the log. */
76529 + __u32 atom_id;
76530 +
76531 + /* Flags holding any of the txn_flags enumerated values (e.g.,
76532 + ATOM_FORCE_COMMIT). */
76533 + __u32 flags;
76534 +
76535 + /* Number of open handles. */
76536 + __u32 txnh_count;
76537 +
76538 + /* The number of znodes captured by this atom. Equal to the sum of lengths of the
76539 + dirty_nodes[level] and clean_nodes lists. */
76540 + __u32 capture_count;
76541 +
76542 +#if REISER4_DEBUG
76543 + int clean;
76544 + int dirty;
76545 + int ovrwr;
76546 + int wb;
76547 + int fq;
76548 +#endif
76549 +
76550 + __u32 flushed;
76551 +
76552 + /* Current transaction stage. */
76553 + txn_stage stage;
76554 +
76555 + /* Start time. */
76556 + unsigned long start_time;
76557 +
76558 + /* The atom's delete set. It collects block numbers of the nodes
76559 + which were deleted during the transaction. */
76560 + struct list_head delete_set;
76561 +
76562 + /* The atom's wandered_block mapping. */
76563 + struct list_head wandered_map;
76564 +
76565 + /* The transaction's list of dirty captured nodes--per level. Index
76566 + by (level). dirty_nodes[0] is for znode-above-root */
76567 + struct list_head dirty_nodes[REAL_MAX_ZTREE_HEIGHT + 1];
76568 +
76569 + /* The transaction's list of clean captured nodes. */
76570 + struct list_head clean_nodes;
76571 +
76572 + /* The atom's overwrite set */
76573 + struct list_head ovrwr_nodes;
76574 +
76575 + /* nodes which are being written to disk */
76576 + struct list_head writeback_nodes;
76577 +
76578 + /* list of inodes */
76579 + struct list_head inodes;
76580 +
76581 + /* List of handles associated with this atom. */
76582 + struct list_head txnh_list;
76583 +
76584 + /* Transaction list link: list of atoms in the transaction manager. */
76585 + struct list_head atom_link;
76586 +
76587 + /* List of handles waiting FOR this atom: see 'capture_fuse_wait' comment. */
76588 + struct list_head fwaitfor_list;
76589 +
76590 + /* List of this atom's handles that are waiting: see 'capture_fuse_wait' comment. */
76591 + struct list_head fwaiting_list;
76592 +
76593 +	/* Numbers of objects which were deleted/created in this transaction,
76594 +	   and thereby the numbers of object IDs which were released/deallocated. */
76595 + int nr_objects_deleted;
76596 + int nr_objects_created;
76597 + /* number of blocks allocated during the transaction */
76598 + __u64 nr_blocks_allocated;
76599 + /* All atom's flush queue objects are on this list */
76600 + struct list_head flush_queues;
76601 +#if REISER4_DEBUG
76602 + /* number of flush queues for this atom. */
76603 + int nr_flush_queues;
76604 + /* Number of jnodes which were removed from atom's lists and put
76605 + on flush_queue */
76606 + int num_queued;
76607 +#endif
76608 + /* number of threads who wait for this atom to complete commit */
76609 + int nr_waiters;
76610 + /* number of threads which do jnode_flush() over this atom */
76611 + int nr_flushers;
76612 +	/* number of flush queues which are IN_USE and whose jnodes from
76613 +	   fq->prepped are being submitted to disk by the reiser4_write_fq() routine. */
76614 + int nr_running_queues;
76615 +	/* A counter of grabbed unformatted nodes; see the description of the
76616 +	 * reiser4 space reservation scheme in block_alloc.c */
76617 + reiser4_block_nr flush_reserved;
76618 +#if REISER4_DEBUG
76619 + void *committer;
76620 +#endif
76621 + struct super_block *super;
76622 +};
76623 +
76624 +#define ATOM_DIRTY_LIST(atom, level) (&(atom)->dirty_nodes[level])
76625 +#define ATOM_CLEAN_LIST(atom) (&(atom)->clean_nodes)
76626 +#define ATOM_OVRWR_LIST(atom) (&(atom)->ovrwr_nodes)
76627 +#define ATOM_WB_LIST(atom) (&(atom)->writeback_nodes)
76628 +#define ATOM_FQ_LIST(fq) (&(fq)->prepped)
76629 +
76630 +#define NODE_LIST(node) (node)->list
76631 +#define ASSIGN_NODE_LIST(node, list) ON_DEBUG(NODE_LIST(node) = list)
76632 +ON_DEBUG(void
76633 + count_jnode(txn_atom *, jnode *, atom_list old_list,
76634 + atom_list new_list, int check_lists));
76635 +
76636 +typedef struct protected_jnodes {
76637 +	struct list_head inatom; /* link to the atom's list of these structures */
76638 + struct list_head nodes; /* head of list of protected nodes */
76639 +} protected_jnodes;
76640 +
76641 +/* A transaction handle: the client obtains and commits this handle which is assigned by
76642 + the system to a txn_atom. */
76643 +struct txn_handle {
76644 + /* Spinlock protecting ->atom pointer */
76645 + spinlock_t hlock;
76646 +
76647 + /* Flags for controlling commit_txnh() behavior */
76648 + /* from txn_handle_flags_t */
76649 + txn_handle_flags_t flags;
76650 +
76651 + /* Whether it is READ_FUSING or WRITE_FUSING. */
76652 + txn_mode mode;
76653 +
76654 + /* If assigned, the atom it is part of. */
76655 + txn_atom *atom;
76656 +
76657 + /* Transaction list link. Head is in txn_atom. */
76658 + struct list_head txnh_link;
76659 +};
76660 +
76661 +/* The transaction manager: one is contained in the reiser4_super_info_data */
76662 +struct txn_mgr {
76663 + /* A spinlock protecting the atom list, id_count, flush_control */
76664 + spinlock_t tmgr_lock;
76665 +
76666 + /* List of atoms. */
76667 + struct list_head atoms_list;
76668 +
76669 + /* Number of atoms. */
76670 + int atom_count;
76671 +
76672 + /* A counter used to assign atom->atom_id values. */
76673 + __u32 id_count;
76674 +
76675 + /* a mutex object for commit serialization */
76676 + struct mutex commit_mutex;
76677 +
76678 +	/* a list of all txnmgrs served by a particular daemon. */
76679 + struct list_head linkage;
76680 +
76681 + /* description of daemon for this txnmgr */
76682 + ktxnmgrd_context *daemon;
76683 +
76684 + /* parameters. Adjustable through mount options. */
76685 + unsigned int atom_max_size;
76686 + unsigned int atom_max_age;
76687 + unsigned int atom_min_size;
76688 + /* max number of concurrent flushers for one atom, 0 - unlimited. */
76689 + unsigned int atom_max_flushers;
76690 + struct dentry *debugfs_atom_count;
76691 + struct dentry *debugfs_id_count;
76692 +};
76693 +
76694 +/* FUNCTION DECLARATIONS */
76695 +
76696 +/* These are the externally (within Reiser4) visible transaction functions;
76697 +   therefore they are prefixed with "txn_". For comments, see txnmgr.c. */
76698 +
76699 +extern int init_txnmgr_static(void);
76700 +extern void done_txnmgr_static(void);
76701 +
76702 +extern void reiser4_init_txnmgr(txn_mgr *);
76703 +extern void reiser4_done_txnmgr(txn_mgr *);
76704 +
76705 +extern int reiser4_txn_reserve(int reserved);
76706 +
76707 +extern void reiser4_txn_begin(reiser4_context * context);
76708 +extern int reiser4_txn_end(reiser4_context * context);
76709 +
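/* For orientation only: a hypothetical caller sketch grounded in the
   declarations above and the txn_handle comment; context setup is elided and
   "ctx" is assumed to come from context initialization (see context.c). */

	reiser4_context *ctx;	/* assumed: an already initialized context */
	int ret;

	reiser4_txn_begin(ctx);	/* open a handle inside the context */
	/* ... the operation captures nodes; the handle gets assigned to an atom ... */
	ret = reiser4_txn_end(ctx);	/* close the handle; may trigger commit */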
76710 +extern void reiser4_txn_restart(reiser4_context * context);
76711 +extern void reiser4_txn_restart_current(void);
76712 +
76713 +extern int txnmgr_force_commit_all(struct super_block *, int);
76714 +extern int current_atom_should_commit(void);
76715 +
76716 +extern jnode *find_first_dirty_jnode(txn_atom *, int);
76717 +
76718 +extern int commit_some_atoms(txn_mgr *);
76719 +extern int force_commit_atom(txn_handle *);
76720 +extern int flush_current_atom(int, long, long *, txn_atom **, jnode *);
76721 +
76722 +extern int flush_some_atom(jnode *, long *, const struct writeback_control *, int);
76723 +
76724 +extern void reiser4_atom_set_stage(txn_atom * atom, txn_stage stage);
76725 +
76726 +extern int same_slum_check(jnode * base, jnode * check, int alloc_check,
76727 + int alloc_value);
76728 +extern void atom_dec_and_unlock(txn_atom * atom);
76729 +
76730 +extern int reiser4_try_capture(jnode * node, znode_lock_mode mode, txn_capture flags);
76731 +extern int try_capture_page_to_invalidate(struct page *pg);
76732 +
76733 +extern void reiser4_uncapture_page(struct page *pg);
76734 +extern void reiser4_uncapture_block(jnode *);
76735 +extern void reiser4_uncapture_jnode(jnode *);
76736 +
76737 +extern int reiser4_capture_inode(struct inode *);
76738 +extern int reiser4_uncapture_inode(struct inode *);
76739 +
76740 +extern txn_atom *get_current_atom_locked_nocheck(void);
76741 +
76742 +#if REISER4_DEBUG
76743 +
76744 +/**
76745 + * atom_is_protected - make sure that nobody but us can do anything with atom
76746 + * @atom: atom to be checked
76747 + *
76748 + * This is used to assert that the atom has either entered the commit stages
76749 + * or is spin-locked.
76750 + */
76751 +static inline int atom_is_protected(txn_atom *atom)
76752 +{
76753 + if (atom->stage >= ASTAGE_PRE_COMMIT)
76754 + return 1;
76755 + assert_spin_locked(&(atom->alock));
76756 + return 1;
76757 +}
76758 +
76759 +#endif
76760 +
76761 +/* Get the current atom and spin-lock it; the current atom must be present. Never returns NULL */
76762 +static inline txn_atom *get_current_atom_locked(void)
76763 +{
76764 + txn_atom *atom;
76765 +
76766 + atom = get_current_atom_locked_nocheck();
76767 + assert("zam-761", atom != NULL);
76768 +
76769 + return atom;
76770 +}
76771 +
76772 +extern txn_atom *jnode_get_atom(jnode *);
76773 +
76774 +extern void reiser4_atom_wait_event(txn_atom *);
76775 +extern void reiser4_atom_send_event(txn_atom *);
76776 +
76777 +extern void insert_into_atom_ovrwr_list(txn_atom * atom, jnode * node);
76778 +extern int reiser4_capture_super_block(struct super_block *s);
76779 +int capture_bulk(jnode **, int count);
76780 +
76781 +/* See the comment on the function blocknrset.c:blocknr_set_add for the
76782 + calling convention of these three routines. */
76783 +extern void blocknr_set_init(struct list_head * bset);
76784 +extern void blocknr_set_destroy(struct list_head * bset);
76785 +extern void blocknr_set_merge(struct list_head * from, struct list_head * into);
76786 +extern int blocknr_set_add_extent(txn_atom * atom,
76787 + struct list_head * bset,
76788 + blocknr_set_entry ** new_bsep,
76789 + const reiser4_block_nr * start,
76790 + const reiser4_block_nr * len);
76791 +extern int blocknr_set_add_pair(txn_atom * atom, struct list_head * bset,
76792 + blocknr_set_entry ** new_bsep,
76793 + const reiser4_block_nr * a,
76794 + const reiser4_block_nr * b);
76795 +
76796 +typedef int (*blocknr_set_actor_f) (txn_atom *, const reiser4_block_nr *,
76797 + const reiser4_block_nr *, void *);
76798 +
76799 +extern int blocknr_set_iterator(txn_atom * atom, struct list_head * bset,
76800 + blocknr_set_actor_f actor, void *data,
76801 + int delete);
76802 +
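/* Illustration of the actor convention with a hypothetical count_actor()
   (invented here; dealloc_wmap_actor() in wander.c is a real in-tree user).
   A zero return is assumed to continue the iteration, and a final argument
   of 0 is assumed to leave the set intact rather than consume it. */

	static int count_actor(txn_atom *atom, const reiser4_block_nr *a,
			       const reiser4_block_nr *b, void *data)
	{
		(*(int *)data)++;	/* one more (original, wandered) pair */
		return 0;		/* zero: keep iterating */
	}

	int nr = 0;
	blocknr_set_iterator(atom, &atom->wandered_map, count_actor, &nr, 0);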
76803 +/* flush code takes care about how to fuse flush queues */
76804 +extern void flush_init_atom(txn_atom * atom);
76805 +extern void flush_fuse_queues(txn_atom * large, txn_atom * small);
76806 +
76807 +static inline void spin_lock_atom(txn_atom *atom)
76808 +{
76809 + /* check that spinlocks of lower priorities are not held */
76810 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76811 + LOCK_CNT_NIL(spin_locked_atom) &&
76812 + LOCK_CNT_NIL(spin_locked_jnode) &&
76813 + LOCK_CNT_NIL(spin_locked_zlock) &&
76814 + LOCK_CNT_NIL(rw_locked_dk) &&
76815 + LOCK_CNT_NIL(rw_locked_tree)));
76816 +
76817 + spin_lock(&(atom->alock));
76818 +
76819 + LOCK_CNT_INC(spin_locked_atom);
76820 + LOCK_CNT_INC(spin_locked);
76821 +}
76822 +
76823 +static inline void spin_lock_atom_nested(txn_atom *atom)
76824 +{
76825 + assert("", (LOCK_CNT_NIL(spin_locked_txnh) &&
76826 + LOCK_CNT_NIL(spin_locked_jnode) &&
76827 + LOCK_CNT_NIL(spin_locked_zlock) &&
76828 + LOCK_CNT_NIL(rw_locked_dk) &&
76829 + LOCK_CNT_NIL(rw_locked_tree)));
76830 +
76831 + spin_lock_nested(&(atom->alock), SINGLE_DEPTH_NESTING);
76832 +
76833 + LOCK_CNT_INC(spin_locked_atom);
76834 + LOCK_CNT_INC(spin_locked);
76835 +}
76836 +
76837 +static inline int spin_trylock_atom(txn_atom *atom)
76838 +{
76839 + if (spin_trylock(&(atom->alock))) {
76840 + LOCK_CNT_INC(spin_locked_atom);
76841 + LOCK_CNT_INC(spin_locked);
76842 + return 1;
76843 + }
76844 + return 0;
76845 +}
76846 +
76847 +static inline void spin_unlock_atom(txn_atom *atom)
76848 +{
76849 + assert_spin_locked(&(atom->alock));
76850 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_atom));
76851 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76852 +
76853 + LOCK_CNT_DEC(spin_locked_atom);
76854 + LOCK_CNT_DEC(spin_locked);
76855 +
76856 + spin_unlock(&(atom->alock));
76857 +}
76858 +
76859 +static inline void spin_lock_txnh(txn_handle *txnh)
76860 +{
76861 + /* check that spinlocks of lower priorities are not held */
76862 + assert("", (LOCK_CNT_NIL(rw_locked_dk) &&
76863 + LOCK_CNT_NIL(spin_locked_zlock) &&
76864 + LOCK_CNT_NIL(rw_locked_tree)));
76865 +
76866 + spin_lock(&(txnh->hlock));
76867 +
76868 + LOCK_CNT_INC(spin_locked_txnh);
76869 + LOCK_CNT_INC(spin_locked);
76870 +}
76871 +
76872 +static inline int spin_trylock_txnh(txn_handle *txnh)
76873 +{
76874 + if (spin_trylock(&(txnh->hlock))) {
76875 + LOCK_CNT_INC(spin_locked_txnh);
76876 + LOCK_CNT_INC(spin_locked);
76877 + return 1;
76878 + }
76879 + return 0;
76880 +}
76881 +
76882 +static inline void spin_unlock_txnh(txn_handle *txnh)
76883 +{
76884 + assert_spin_locked(&(txnh->hlock));
76885 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnh));
76886 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76887 +
76888 + LOCK_CNT_DEC(spin_locked_txnh);
76889 + LOCK_CNT_DEC(spin_locked);
76890 +
76891 + spin_unlock(&(txnh->hlock));
76892 +}
76893 +
76894 +#define spin_ordering_pred_txnmgr(tmgr) \
76895 + ( LOCK_CNT_NIL(spin_locked_atom) && \
76896 + LOCK_CNT_NIL(spin_locked_txnh) && \
76897 + LOCK_CNT_NIL(spin_locked_jnode) && \
76898 + LOCK_CNT_NIL(rw_locked_zlock) && \
76899 + LOCK_CNT_NIL(rw_locked_dk) && \
76900 + LOCK_CNT_NIL(rw_locked_tree) )
76901 +
76902 +static inline void spin_lock_txnmgr(txn_mgr *mgr)
76903 +{
76904 + /* check that spinlocks of lower priorities are not held */
76905 + assert("", (LOCK_CNT_NIL(spin_locked_atom) &&
76906 + LOCK_CNT_NIL(spin_locked_txnh) &&
76907 + LOCK_CNT_NIL(spin_locked_jnode) &&
76908 + LOCK_CNT_NIL(spin_locked_zlock) &&
76909 + LOCK_CNT_NIL(rw_locked_dk) &&
76910 + LOCK_CNT_NIL(rw_locked_tree)));
76911 +
76912 + spin_lock(&(mgr->tmgr_lock));
76913 +
76914 + LOCK_CNT_INC(spin_locked_txnmgr);
76915 + LOCK_CNT_INC(spin_locked);
76916 +}
76917 +
76918 +static inline int spin_trylock_txnmgr(txn_mgr *mgr)
76919 +{
76920 + if (spin_trylock(&(mgr->tmgr_lock))) {
76921 + LOCK_CNT_INC(spin_locked_txnmgr);
76922 + LOCK_CNT_INC(spin_locked);
76923 + return 1;
76924 + }
76925 + return 0;
76926 +}
76927 +
76928 +static inline void spin_unlock_txnmgr(txn_mgr *mgr)
76929 +{
76930 + assert_spin_locked(&(mgr->tmgr_lock));
76931 + assert("nikita-1375", LOCK_CNT_GTZ(spin_locked_txnmgr));
76932 + assert("nikita-1376", LOCK_CNT_GTZ(spin_locked));
76933 +
76934 + LOCK_CNT_DEC(spin_locked_txnmgr);
76935 + LOCK_CNT_DEC(spin_locked);
76936 +
76937 + spin_unlock(&(mgr->tmgr_lock));
76938 +}
76939 +
76940 +typedef enum {
76941 + FQ_IN_USE = 0x1
76942 +} flush_queue_state_t;
76943 +
76944 +typedef struct flush_queue flush_queue_t;
76945 +
76946 +/* This is an accumulator for jnodes prepared for writing to disk. A flush queue
76947 + is filled by the jnode_flush() routine, and written to disk under memory
76948 + pressure or at atom commit time. */
76949 +/* LOCKING: fq state and fq->atom are protected by the guard spinlock; the
76950 +   fq->nr_queued field and the fq->prepped list can be modified only if the
76951 +   atom is spin-locked and the fq object is in the "in-use" state. For
76952 +   read-only traversal of the fq->prepped list and reading of fq->nr_queued
76953 +   it is enough to keep the fq "in-use" or just to have the atom spin-locked. */
76954 +struct flush_queue {
76955 + /* linkage element is the first in this structure to make debugging
76956 + easier. See field in atom struct for description of list. */
76957 + struct list_head alink;
76958 + /* A spinlock to protect changes of fq state and fq->atom pointer */
76959 + spinlock_t guard;
76960 + /* flush_queue state: [in_use | ready] */
76961 + flush_queue_state_t state;
76962 + /* A list which contains queued nodes, queued nodes are removed from any
76963 + * atom's list and put on this ->prepped one. */
76964 + struct list_head prepped;
76965 + /* number of submitted i/o requests */
76966 + atomic_t nr_submitted;
76967 + /* number of i/o errors */
76968 + atomic_t nr_errors;
76969 + /* An atom this flush queue is attached to */
76970 + txn_atom *atom;
76971 + /* A wait queue head to wait on i/o completion */
76972 + wait_queue_head_t wait;
76973 +#if REISER4_DEBUG
76974 + /* A thread which took this fq in exclusive use, NULL if fq is free,
76975 + * used for debugging. */
76976 + struct task_struct *owner;
76977 +#endif
76978 +};
76979 +
76980 +extern int reiser4_fq_by_atom(txn_atom *, flush_queue_t **);
76981 +extern void reiser4_fq_put_nolock(flush_queue_t *);
76982 +extern void reiser4_fq_put(flush_queue_t *);
76983 +extern void reiser4_fuse_fq(txn_atom * to, txn_atom * from);
76984 +extern void queue_jnode(flush_queue_t *, jnode *);
76985 +
76986 +extern int reiser4_write_fq(flush_queue_t *, long *, int);
76987 +extern int current_atom_finish_all_fq(void);
76988 +extern void init_atom_fq_parts(txn_atom *);
76989 +
76990 +extern reiser4_block_nr txnmgr_count_deleted_blocks(void);
76991 +
76992 +extern void znode_make_dirty(znode * node);
76993 +extern void jnode_make_dirty_locked(jnode * node);
76994 +
76995 +extern int reiser4_sync_atom(txn_atom * atom);
76996 +
76997 +#if REISER4_DEBUG
76998 +extern int atom_fq_parts_are_clean(txn_atom *);
76999 +#endif
77000 +
77001 +extern void add_fq_to_bio(flush_queue_t *, struct bio *);
77002 +extern flush_queue_t *get_fq_for_current_atom(void);
77003 +
77004 +void protected_jnodes_init(protected_jnodes * list);
77005 +void protected_jnodes_done(protected_jnodes * list);
77006 +void reiser4_invalidate_list(struct list_head * head);
77007 +
77008 +# endif /* __REISER4_TXNMGR_H__ */
77009 +
77010 +/* Make Linus happy.
77011 + Local variables:
77012 + c-indentation-style: "K&R"
77013 + mode-name: "LC"
77014 + c-basic-offset: 8
77015 + tab-width: 8
77016 + fill-column: 120
77017 + End:
77018 +*/
77019 diff --git a/fs/reiser4/type_safe_hash.h b/fs/reiser4/type_safe_hash.h
77020 new file mode 100644
77021 index 0000000..b2fdacd
77022 --- /dev/null
77023 +++ b/fs/reiser4/type_safe_hash.h
77024 @@ -0,0 +1,320 @@
77025 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77026 + * reiser4/README */
77027 +
77028 +/* A hash table class that uses hash chains (singly-linked) and is
77029 + parametrized to provide type safety. */
77030 +
77031 +#ifndef __REISER4_TYPE_SAFE_HASH_H__
77032 +#define __REISER4_TYPE_SAFE_HASH_H__
77033 +
77034 +#include "debug.h"
77035 +
77036 +#include <asm/errno.h>
77037 +/* Step 1: Use TYPE_SAFE_HASH_DECLARE() to define the TABLE and LINK objects
77038 + based on the object type. You need to declare the item type before
77039 + this definition, define it after this definition. */
77040 +#define TYPE_SAFE_HASH_DECLARE(PREFIX,ITEM_TYPE) \
77041 + \
77042 +typedef struct PREFIX##_hash_table_ PREFIX##_hash_table; \
77043 +typedef struct PREFIX##_hash_link_ PREFIX##_hash_link; \
77044 + \
77045 +struct PREFIX##_hash_table_ \
77046 +{ \
77047 + ITEM_TYPE **_table; \
77048 + __u32 _buckets; \
77049 +}; \
77050 + \
77051 +struct PREFIX##_hash_link_ \
77052 +{ \
77053 + ITEM_TYPE *_next; \
77054 +}
77055 +
77056 +/* Step 2: Define the object type of the hash: give it field of type
77057 + PREFIX_hash_link. */
77058 +
77059 +/* Step 3: Use TYPE_SAFE_HASH_DEFINE to define the hash table interface using
77060 +   the type and field name from step 2.  The arguments are:
77061 +
77062 + ITEM_TYPE The item type being hashed
77063 + KEY_TYPE The type of key being hashed
77064 + KEY_NAME The name of the key field within the item
77065 +   LINK_NAME    The name of the link field within the item, which you must give the type PREFIX_hash_link
77066 +   HASH_FUNC    The name of the hash function (or macro; takes the table and a const pointer to the key)
77067 + EQ_FUNC The name of the equality function (or macro, takes const pointer to two keys)
77068 +
77069 + It implements these functions:
77070 +
77071 + prefix_hash_init Initialize the table given its size.
77072 + prefix_hash_insert Insert an item
77073 + prefix_hash_insert_index Insert an item w/ precomputed hash_index
77074 + prefix_hash_find Find an item by key
77075 + prefix_hash_find_index Find an item w/ precomputed hash_index
77076 + prefix_hash_remove Remove an item, returns 1 if found, 0 if not found
77077 + prefix_hash_remove_index Remove an item w/ precomputed hash_index
77078 +
77079 + If you'd like something to be done differently, feel free to ask me
77080 + for modifications. Additional features that could be added but
77081 + have not been:
77082 +
77083 + prefix_hash_remove_key Find and remove an item by key
77084 + prefix_hash_remove_key_index Find and remove an item by key w/ precomputed hash_index
77085 +
77086 +   The hash function receives both the table and a pointer to the key
77087 +   as arguments, so it can derive the number of buckets from the
77088 +   table itself.
77089 +
77090 +   This hash table uses singly-linked hash chains.  This means
77091 + insertion is fast but deletion requires searching the chain.
77092 +
77093 + There is also the doubly-linked hash chain approach, under which
77094 + deletion requires no search but the code is longer and it takes two
77095 + pointers per item.
77096 +
77097 + The circularly-linked approach has the shortest code but requires
77098 + two pointers per bucket, doubling the size of the bucket array (in
77099 + addition to two pointers per item).
77100 +*/
77101 +#define TYPE_SAFE_HASH_DEFINE(PREFIX,ITEM_TYPE,KEY_TYPE,KEY_NAME,LINK_NAME,HASH_FUNC,EQ_FUNC) \
77102 + \
77103 +static __inline__ void \
77104 +PREFIX##_check_hash (PREFIX##_hash_table *table UNUSED_ARG, \
77105 + __u32 hash UNUSED_ARG) \
77106 +{ \
77107 + assert("nikita-2780", hash < table->_buckets); \
77108 +} \
77109 + \
77110 +static __inline__ int \
77111 +PREFIX##_hash_init (PREFIX##_hash_table *hash, \
77112 + __u32 buckets) \
77113 +{ \
77114 + hash->_table = (ITEM_TYPE**) KMALLOC (sizeof (ITEM_TYPE*) * buckets); \
77115 + hash->_buckets = buckets; \
77116 + if (hash->_table == NULL) \
77117 + { \
77118 + return RETERR(-ENOMEM); \
77119 + } \
77120 + memset (hash->_table, 0, sizeof (ITEM_TYPE*) * buckets); \
77121 + ON_DEBUG(printk(#PREFIX "_hash_table: %i buckets\n", buckets)); \
77122 + return 0; \
77123 +} \
77124 + \
77125 +static __inline__ void \
77126 +PREFIX##_hash_done (PREFIX##_hash_table *hash) \
77127 +{ \
77128 + if (REISER4_DEBUG && hash->_table != NULL) { \
77129 + __u32 i; \
77130 + for (i = 0 ; i < hash->_buckets ; ++ i) \
77131 + assert("nikita-2905", hash->_table[i] == NULL); \
77132 + } \
77133 + if (hash->_table != NULL) \
77134 + KFREE (hash->_table, sizeof (ITEM_TYPE*) * hash->_buckets); \
77135 + hash->_table = NULL; \
77136 +} \
77137 + \
77138 +static __inline__ void \
77139 +PREFIX##_hash_prefetch_next (ITEM_TYPE *item) \
77140 +{ \
77141 + prefetch(item->LINK_NAME._next); \
77142 +} \
77143 + \
77144 +static __inline__ void \
77145 +PREFIX##_hash_prefetch_bucket (PREFIX##_hash_table *hash, \
77146 + __u32 index) \
77147 +{ \
77148 + prefetch(hash->_table[index]); \
77149 +} \
77150 + \
77151 +static __inline__ ITEM_TYPE* \
77152 +PREFIX##_hash_find_index (PREFIX##_hash_table *hash, \
77153 + __u32 hash_index, \
77154 + KEY_TYPE const *find_key) \
77155 +{ \
77156 + ITEM_TYPE *item; \
77157 + \
77158 + PREFIX##_check_hash(hash, hash_index); \
77159 + \
77160 + for (item = hash->_table[hash_index]; \
77161 + item != NULL; \
77162 + item = item->LINK_NAME._next) \
77163 + { \
77164 + prefetch(item->LINK_NAME._next); \
77165 +		prefetch((char *)item->LINK_NAME._next + offsetof(ITEM_TYPE, KEY_NAME)); \
77166 + if (EQ_FUNC (& item->KEY_NAME, find_key)) \
77167 + { \
77168 + return item; \
77169 + } \
77170 + } \
77171 + \
77172 + return NULL; \
77173 +} \
77174 + \
77175 +static __inline__ ITEM_TYPE* \
77176 +PREFIX##_hash_find_index_lru (PREFIX##_hash_table *hash, \
77177 + __u32 hash_index, \
77178 + KEY_TYPE const *find_key) \
77179 +{ \
77180 + ITEM_TYPE ** item = &hash->_table[hash_index]; \
77181 + \
77182 + PREFIX##_check_hash(hash, hash_index); \
77183 + \
77184 + while (*item != NULL) { \
77185 + prefetch(&(*item)->LINK_NAME._next); \
77186 + if (EQ_FUNC (&(*item)->KEY_NAME, find_key)) { \
77187 + ITEM_TYPE *found; \
77188 + \
77189 + found = *item; \
77190 + *item = found->LINK_NAME._next; \
77191 + found->LINK_NAME._next = hash->_table[hash_index]; \
77192 + hash->_table[hash_index] = found; \
77193 + return found; \
77194 + } \
77195 + item = &(*item)->LINK_NAME._next; \
77196 + } \
77197 + return NULL; \
77198 +} \
77199 + \
77200 +static __inline__ int \
77201 +PREFIX##_hash_remove_index (PREFIX##_hash_table *hash, \
77202 + __u32 hash_index, \
77203 + ITEM_TYPE *del_item) \
77204 +{ \
77205 + ITEM_TYPE ** hash_item_p = &hash->_table[hash_index]; \
77206 + \
77207 + PREFIX##_check_hash(hash, hash_index); \
77208 + \
77209 + while (*hash_item_p != NULL) { \
77210 + prefetch(&(*hash_item_p)->LINK_NAME._next); \
77211 + if (*hash_item_p == del_item) { \
77212 + *hash_item_p = (*hash_item_p)->LINK_NAME._next; \
77213 + return 1; \
77214 + } \
77215 + hash_item_p = &(*hash_item_p)->LINK_NAME._next; \
77216 + } \
77217 + return 0; \
77218 +} \
77219 + \
77220 +static __inline__ void \
77221 +PREFIX##_hash_insert_index (PREFIX##_hash_table *hash, \
77222 + __u32 hash_index, \
77223 + ITEM_TYPE *ins_item) \
77224 +{ \
77225 + PREFIX##_check_hash(hash, hash_index); \
77226 + \
77227 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
77228 + hash->_table[hash_index] = ins_item; \
77229 +} \
77230 + \
77231 +static __inline__ void \
77232 +PREFIX##_hash_insert_index_rcu (PREFIX##_hash_table *hash, \
77233 + __u32 hash_index, \
77234 + ITEM_TYPE *ins_item) \
77235 +{ \
77236 + PREFIX##_check_hash(hash, hash_index); \
77237 + \
77238 + ins_item->LINK_NAME._next = hash->_table[hash_index]; \
77239 + smp_wmb(); \
77240 + hash->_table[hash_index] = ins_item; \
77241 +} \
77242 + \
77243 +static __inline__ ITEM_TYPE* \
77244 +PREFIX##_hash_find (PREFIX##_hash_table *hash, \
77245 + KEY_TYPE const *find_key) \
77246 +{ \
77247 + return PREFIX##_hash_find_index (hash, HASH_FUNC(hash, find_key), find_key); \
77248 +} \
77249 + \
77250 +static __inline__ ITEM_TYPE* \
77251 +PREFIX##_hash_find_lru (PREFIX##_hash_table *hash, \
77252 + KEY_TYPE const *find_key) \
77253 +{ \
77254 + return PREFIX##_hash_find_index_lru (hash, HASH_FUNC(hash, find_key), find_key); \
77255 +} \
77256 + \
77257 +static __inline__ int \
77258 +PREFIX##_hash_remove (PREFIX##_hash_table *hash, \
77259 + ITEM_TYPE *del_item) \
77260 +{ \
77261 + return PREFIX##_hash_remove_index (hash, \
77262 + HASH_FUNC(hash, &del_item->KEY_NAME), del_item); \
77263 +} \
77264 + \
77265 +static __inline__ int \
77266 +PREFIX##_hash_remove_rcu (PREFIX##_hash_table *hash, \
77267 + ITEM_TYPE *del_item) \
77268 +{ \
77269 + return PREFIX##_hash_remove (hash, del_item); \
77270 +} \
77271 + \
77272 +static __inline__ void \
77273 +PREFIX##_hash_insert (PREFIX##_hash_table *hash, \
77274 + ITEM_TYPE *ins_item) \
77275 +{ \
77276 + return PREFIX##_hash_insert_index (hash, \
77277 + HASH_FUNC(hash, &ins_item->KEY_NAME), ins_item); \
77278 +} \
77279 + \
77280 +static __inline__ void \
77281 +PREFIX##_hash_insert_rcu (PREFIX##_hash_table *hash, \
77282 + ITEM_TYPE *ins_item) \
77283 +{ \
77284 + return PREFIX##_hash_insert_index_rcu (hash, HASH_FUNC(hash, &ins_item->KEY_NAME), \
77285 + ins_item); \
77286 +} \
77287 + \
77288 +static __inline__ ITEM_TYPE * \
77289 +PREFIX##_hash_first (PREFIX##_hash_table *hash, __u32 ind) \
77290 +{ \
77291 + ITEM_TYPE *first; \
77292 + \
77293 + for (first = NULL; ind < hash->_buckets; ++ ind) { \
77294 + first = hash->_table[ind]; \
77295 + if (first != NULL) \
77296 + break; \
77297 + } \
77298 + return first; \
77299 +} \
77300 + \
77301 +static __inline__ ITEM_TYPE * \
77302 +PREFIX##_hash_next (PREFIX##_hash_table *hash, \
77303 + ITEM_TYPE *item) \
77304 +{ \
77305 + ITEM_TYPE *next; \
77306 + \
77307 + if (item == NULL) \
77308 + return NULL; \
77309 + next = item->LINK_NAME._next; \
77310 + if (next == NULL) \
77311 + next = PREFIX##_hash_first (hash, HASH_FUNC(hash, &item->KEY_NAME) + 1); \
77312 + return next; \
77313 +} \
77314 + \
77315 +typedef struct {} PREFIX##_hash_dummy
77316 +
77317 +#define for_all_ht_buckets(table, head) \
77318 +for ((head) = &(table) -> _table[ 0 ] ; \
77319 + (head) != &(table) -> _table[ (table) -> _buckets ] ; ++ (head))
77320 +
77321 +#define for_all_in_bucket(bucket, item, next, field) \
77322 +for ((item) = *(bucket), (next) = (item) ? (item) -> field._next : NULL ; \
77323 + (item) != NULL ; \
77324 + (item) = (next), (next) = (item) ? (item) -> field._next : NULL )
77325 +
77326 +#define for_all_in_htable(table, prefix, item, next) \
77327 +for ((item) = prefix ## _hash_first ((table), 0), \
77328 + (next) = prefix ## _hash_next ((table), (item)) ; \
77329 + (item) != NULL ; \
77330 + (item) = (next), \
77331 + (next) = prefix ## _hash_next ((table), (item)))
77332 +
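/* To make steps 1-3 concrete, a minimal sketch of a client of this
   interface; the "foo" item type, hash function and equality macro are
   invented purely for illustration. */

typedef struct foo foo_t;
TYPE_SAFE_HASH_DECLARE(foo, foo_t);			/* step 1 */

struct foo {
	__u32 key;					/* the hashed key */
	foo_hash_link link;				/* step 2: embedded link */
};

/* HASH_FUNC gets the table and a const key pointer; EQ_FUNC gets two keys */
#define foo_hashfn(table, k)	(*(k) % (table)->_buckets)
#define foo_eq(k1, k2)		(*(k1) == *(k2))

TYPE_SAFE_HASH_DEFINE(foo, foo_t, __u32, key, link,	/* step 3 */
		      foo_hashfn, foo_eq);

/* usage: foo_hash_init(&table, 256), foo_hash_insert(&table, item),
   foo_hash_find(&table, &some_key), foo_hash_remove(&table, item) */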
77333 +/* __REISER4_TYPE_SAFE_HASH_H__ */
77334 +#endif
77335 +
77336 +/* Make Linus happy.
77337 + Local variables:
77338 + c-indentation-style: "K&R"
77339 + mode-name: "LC"
77340 + c-basic-offset: 8
77341 + tab-width: 8
77342 + fill-column: 120
77343 + End:
77344 +*/
77345 diff --git a/fs/reiser4/vfs_ops.c b/fs/reiser4/vfs_ops.c
77346 new file mode 100644
77347 index 0000000..31afd3e
77348 --- /dev/null
77349 +++ b/fs/reiser4/vfs_ops.c
77350 @@ -0,0 +1,259 @@
77351 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77352 + * reiser4/README */
77353 +
77354 +/* Interface to VFS. Reiser4 {super|export|dentry}_operations are defined
77355 + here. */
77356 +
77357 +#include "forward.h"
77358 +#include "debug.h"
77359 +#include "dformat.h"
77360 +#include "coord.h"
77361 +#include "plugin/item/item.h"
77362 +#include "plugin/file/file.h"
77363 +#include "plugin/security/perm.h"
77364 +#include "plugin/disk_format/disk_format.h"
77365 +#include "plugin/plugin.h"
77366 +#include "plugin/plugin_set.h"
77367 +#include "plugin/object.h"
77368 +#include "txnmgr.h"
77369 +#include "jnode.h"
77370 +#include "znode.h"
77371 +#include "block_alloc.h"
77372 +#include "tree.h"
77373 +#include "vfs_ops.h"
77374 +#include "inode.h"
77375 +#include "page_cache.h"
77376 +#include "ktxnmgrd.h"
77377 +#include "super.h"
77378 +#include "reiser4.h"
77379 +#include "entd.h"
77380 +#include "status_flags.h"
77381 +#include "flush.h"
77382 +#include "dscale.h"
77383 +
77384 +#include <linux/profile.h>
77385 +#include <linux/types.h>
77386 +#include <linux/mount.h>
77387 +#include <linux/vfs.h>
77388 +#include <linux/mm.h>
77389 +#include <linux/buffer_head.h>
77390 +#include <linux/dcache.h>
77391 +#include <linux/list.h>
77392 +#include <linux/pagemap.h>
77393 +#include <linux/slab.h>
77394 +#include <linux/seq_file.h>
77395 +#include <linux/init.h>
77396 +#include <linux/module.h>
77397 +#include <linux/writeback.h>
77398 +#include <linux/blkdev.h>
77399 +#include <linux/quotaops.h>
77400 +#include <linux/security.h>
77401 +#include <linux/reboot.h>
77402 +#include <linux/rcupdate.h>
77403 +
77404 +/* update inode stat-data by calling plugin */
77405 +int reiser4_update_sd(struct inode *object)
77406 +{
77407 + file_plugin *fplug;
77408 +
77409 + assert("nikita-2338", object != NULL);
77410 + /* check for read-only file system. */
77411 + if (IS_RDONLY(object))
77412 + return 0;
77413 +
77414 + fplug = inode_file_plugin(object);
77415 + assert("nikita-2339", fplug != NULL);
77416 + return fplug->write_sd_by_inode(object);
77417 +}
77418 +
77419 +/* helper function: increase inode nlink count and call plugin method to save
77420 + updated stat-data.
77421 +
77422 + Used by link/create and during creation of dot and dotdot in mkdir
77423 +*/
77424 +int reiser4_add_nlink(struct inode *object /* object to which link is added */ ,
77425 + struct inode *parent /* parent where new entry will be */
77426 + ,
77427 + int write_sd_p /* true if stat-data has to be
77428 + * updated */ )
77429 +{
77430 + file_plugin *fplug;
77431 + int result;
77432 +
77433 + assert("nikita-1351", object != NULL);
77434 +
77435 + fplug = inode_file_plugin(object);
77436 + assert("nikita-1445", fplug != NULL);
77437 +
77438 + /* ask plugin whether it can add yet another link to this
77439 + object */
77440 + if (!fplug->can_add_link(object))
77441 + return RETERR(-EMLINK);
77442 +
77443 + assert("nikita-2211", fplug->add_link != NULL);
77444 + /* call plugin to do actual addition of link */
77445 + result = fplug->add_link(object, parent);
77446 +
77447 + /* optionally update stat data */
77448 + if (result == 0 && write_sd_p)
77449 + result = fplug->write_sd_by_inode(object);
77450 + return result;
77451 +}
77452 +
77453 +/* helper function: decrease inode nlink count and call plugin method to save
77454 + updated stat-data.
77455 +
77456 + Used by unlink/create
77457 +*/
77458 +int reiser4_del_nlink(struct inode *object /* object from which link is
77459 + * removed */ ,
77460 + struct inode *parent /* parent where entry was */ ,
77461 +		      int write_sd_p	/* true if stat-data has to be
77462 +					 * updated */ )
77463 +{
77464 + file_plugin *fplug;
77465 + int result;
77466 +
77467 + assert("nikita-1349", object != NULL);
77468 +
77469 + fplug = inode_file_plugin(object);
77470 + assert("nikita-1350", fplug != NULL);
77471 + assert("nikita-1446", object->i_nlink > 0);
77472 + assert("nikita-2210", fplug->rem_link != NULL);
77473 +
77474 + /* call plugin to do actual deletion of link */
77475 + result = fplug->rem_link(object, parent);
77476 +
77477 + /* optionally update stat data */
77478 + if (result == 0 && write_sd_p)
77479 + result = fplug->write_sd_by_inode(object);
77480 + return result;
77481 +}
77482 +
77483 +/* Release a reiser4 dentry. This is the d_op->d_release() method. */
77484 +static void reiser4_d_release(struct dentry *dentry /* dentry released */ )
77485 +{
77486 + reiser4_free_dentry_fsdata(dentry);
77487 +}
77488 +
77489 +/*
77490 + * Called by reiser4_sync_inodes(), during speculative write-back (through
77491 + * pdflush, or balance_dirty_pages()).
77492 + */
77493 +void reiser4_writeout(struct super_block *sb, struct writeback_control *wbc)
77494 +{
77495 + long written = 0;
77496 + int repeats = 0;
77497 + int result;
77498 + struct address_space *mapping;
77499 +
77500 + /*
77501 + * Performs early flushing, trying to free some memory. If there is
77502 + * nothing to flush, commits some atoms.
77503 + */
77504 +
77505 + /* Commit all atoms if reiser4_writepages() is called from sys_sync() or
77506 + sys_fsync(). */
77507 + if (wbc->sync_mode != WB_SYNC_NONE) {
77508 + txnmgr_force_commit_all(sb, 0);
77509 + return;
77510 + }
77511 +
77512 + BUG_ON(reiser4_get_super_fake(sb) == NULL);
77513 + mapping = reiser4_get_super_fake(sb)->i_mapping;
77514 + do {
77515 + long nr_submitted = 0;
77516 + jnode *node = NULL;
77517 +
77518 +		/* do not submit more requests, to avoid overloading the write queue */
77519 + if (wbc->nonblocking &&
77520 + bdi_write_congested(mapping->backing_dev_info)) {
77521 + blk_run_address_space(mapping);
77522 + wbc->encountered_congestion = 1;
77523 + break;
77524 + }
77525 + repeats++;
77526 + BUG_ON(wbc->nr_to_write <= 0);
77527 +
77528 + if (get_current_context()->entd) {
77529 + entd_context *ent = get_entd_context(sb);
77530 +
77531 + if (ent->cur_request->node)
77532 + /*
77533 + * this is ent thread and it managed to capture
77534 + * requested page itself - start flush from
77535 + * that page
77536 + */
77537 + node = jref(ent->cur_request->node);
77538 + }
77539 +
77540 + result = flush_some_atom(node, &nr_submitted, wbc,
77541 + JNODE_FLUSH_WRITE_BLOCKS);
77542 + if (result != 0)
77543 + warning("nikita-31001", "Flush failed: %i", result);
77544 + if (node)
77545 + jput(node);
77546 + if (!nr_submitted)
77547 + break;
77548 +
77549 + wbc->nr_to_write -= nr_submitted;
77550 + written += nr_submitted;
77551 + } while (wbc->nr_to_write > 0);
77552 +}
77553 +
77554 +void reiser4_throttle_write(struct inode *inode)
77555 +{
77556 + reiser4_txn_restart_current();
77557 + balance_dirty_pages_ratelimited(inode->i_mapping);
77558 +}
77559 +
77560 +const char *REISER4_SUPER_MAGIC_STRING = "ReIsEr4";
77561 +const int REISER4_MAGIC_OFFSET = 16 * 4096; /* offset to magic string from the
77562 + * beginning of device */
77563 +
77564 +/*
77565 + * Reiser4 initialization/shutdown.
77566 + *
77567 + * Code below performs global reiser4 initialization that is done either as
77568 + * part of kernel initialization (when reiser4 is statically built-in), or
77569 + * during reiser4 module load (when compiled as module).
77570 + */
77571 +
77572 +void reiser4_handle_error(void)
77573 +{
77574 + struct super_block *sb = reiser4_get_current_sb();
77575 +
77576 + if (!sb)
77577 + return;
77578 + reiser4_status_write(REISER4_STATUS_DAMAGED, 0,
77579 +			     "Filesystem error occurred");
77580 + switch (get_super_private(sb)->onerror) {
77581 + case 0:
77582 +		reiser4_panic("foobar-42", "Filesystem error occurred\n");
77583 + case 1:
77584 + default:
77585 + if (sb->s_flags & MS_RDONLY)
77586 + return;
77587 + sb->s_flags |= MS_RDONLY;
77588 + break;
77589 + }
77590 +}
77591 +
77592 +struct dentry_operations reiser4_dentry_operations = {
77593 + .d_revalidate = NULL,
77594 + .d_hash = NULL,
77595 + .d_compare = NULL,
77596 + .d_delete = NULL,
77597 + .d_release = reiser4_d_release,
77598 + .d_iput = NULL,
77599 +};
77600 +
77601 +/* Make Linus happy.
77602 + Local variables:
77603 + c-indentation-style: "K&R"
77604 + mode-name: "LC"
77605 + c-basic-offset: 8
77606 + tab-width: 8
77607 + fill-column: 120
77608 + End:
77609 +*/
77610 diff --git a/fs/reiser4/vfs_ops.h b/fs/reiser4/vfs_ops.h
77611 new file mode 100644
77612 index 0000000..03e16ce
77613 --- /dev/null
77614 +++ b/fs/reiser4/vfs_ops.h
77615 @@ -0,0 +1,53 @@
77616 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77617 + * reiser4/README */
77618 +
77619 +/* vfs_ops.c's exported symbols */
77620 +
77621 +#if !defined( __FS_REISER4_VFS_OPS_H__ )
77622 +#define __FS_REISER4_VFS_OPS_H__
77623 +
77624 +#include "forward.h"
77625 +#include "coord.h"
77626 +#include "seal.h"
77627 +#include "plugin/file/file.h"
77628 +#include "super.h"
77629 +#include "readahead.h"
77630 +
77631 +#include <linux/types.h> /* for loff_t */
77632 +#include <linux/fs.h> /* for struct address_space */
77633 +#include <linux/dcache.h> /* for struct dentry */
77634 +#include <linux/mm.h>
77635 +#include <linux/backing-dev.h>
77636 +
77637 +/* address space operations */
77638 +int reiser4_writepage(struct page *, struct writeback_control *);
77639 +int reiser4_set_page_dirty(struct page *);
77640 +void reiser4_invalidatepage(struct page *, unsigned long offset);
77641 +int reiser4_releasepage(struct page *, gfp_t);
77642 +
77643 +extern int reiser4_update_sd(struct inode *);
77644 +extern int reiser4_add_nlink(struct inode *, struct inode *, int);
77645 +extern int reiser4_del_nlink(struct inode *, struct inode *, int);
77646 +
77647 +extern int reiser4_start_up_io(struct page *page);
77648 +extern void reiser4_throttle_write(struct inode *);
77649 +extern int jnode_is_releasable(jnode *);
77650 +
77651 +#define CAPTURE_APAGE_BURST (1024l)
77652 +void reiser4_writeout(struct super_block *, struct writeback_control *);
77653 +
77654 +extern void reiser4_handle_error(void);
77655 +
77656 +/* __FS_REISER4_VFS_OPS_H__ */
77657 +#endif
77658 +
77659 +/* Make Linus happy.
77660 + Local variables:
77661 + c-indentation-style: "K&R"
77662 + mode-name: "LC"
77663 + c-basic-offset: 8
77664 + tab-width: 8
77665 + fill-column: 120
77666 + scroll-step: 1
77667 + End:
77668 +*/
77669 diff --git a/fs/reiser4/wander.c b/fs/reiser4/wander.c
77670 new file mode 100644
77671 index 0000000..6d1d1d9
77672 --- /dev/null
77673 +++ b/fs/reiser4/wander.c
77674 @@ -0,0 +1,1797 @@
77675 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
77676 + * reiser4/README */
77677 +
77678 +/* Reiser4 Wandering Log */
77679 +
77680 +/* You should read http://www.namesys.com/txn-doc.html
77681 +
77682 + That describes how filesystem operations are performed as atomic
77683 + transactions, and how we try to arrange it so that we can write most of the
77684 + data only once while performing the operation atomically.
77685 +
77686 +   For the purposes of this code, it is enough to understand that it is
77687 +   told whether a given block should be written once or twice (if
77688 + twice then once to the wandered location and once to the real location).
77689 +
77690 + This code guarantees that those blocks that are defined to be part of an
77691 + atom either all take effect or none of them take effect.
77692 +
77693 + Relocate set nodes are submitted to write by the jnode_flush() routine, and
77694 + the overwrite set is submitted by reiser4_write_log(). This is because with
77695 + the overwrite set we seek to optimize writes, and with the relocate set we
77696 +   seek to cause disk order to correlate with the parent-first pre-order.
77697 +
77698 + reiser4_write_log() allocates and writes wandered blocks and maintains
77699 + additional on-disk structures of the atom as wander records (each wander
77700 + record occupies one block) for storing of the "wandered map" (a table which
77701 + contains a relation between wandered and real block numbers) and other
77702 + information which might be needed at transaction recovery time.
77703 +
77704 + The wander records are unidirectionally linked into a circle: each wander
77705 + record contains a block number of the next wander record, the last wander
77706 + record points to the first one.
77707 +
77708 + One wander record (named "tx head" in this file) has a format which is
77709 + different from the other wander records. The "tx head" has a reference to the
77710 + "tx head" block of the previously committed atom. Also, "tx head" contains
77711 +   fs information (the free blocks counter and the OID allocator state) which
77712 +   is logged in a special way.
77713 +
77714 + There are two journal control blocks, named journal header and journal
77715 + footer which have fixed on-disk locations. The journal header has a
77716 + reference to the "tx head" block of the last committed atom. The journal
77717 + footer points to the "tx head" of the last flushed atom. The atom is
77718 + "played" when all blocks from its overwrite set are written to disk the
77719 + second time (i.e. written to their real locations).
77720 +
77721 +   NOTE: People who know reiserfs internals and its journal structure might be
77722 +   confused by the terms journal footer and journal header. There is a table
77723 + with terms of similar semantics in reiserfs (reiser3) and reiser4:
77724 +
77725 + REISER3 TERM | REISER4 TERM | DESCRIPTION
77726 + --------------------+-----------------------+----------------------------
77727 + commit record | journal header | atomic write of this record
77728 + | | ends transaction commit
77729 + --------------------+-----------------------+----------------------------
77730 +   journal header      | journal footer        | atomic write of this record
77731 +                       |                       | ends post-commit writes.
77732 +                       |                       | After it is successfully
77733 +                       |                       | written, the journal blocks
77734 +                       |                       | (in reiser3) or wandered
77735 +                       |                       | blocks/records (in reiser4)
77736 +                       |                       | are free for re-use.
77737 + --------------------+-----------------------+----------------------------
77738 +
77739 + The atom commit process is the following:
77740 +
77741 + 1. The overwrite set is taken from atom's clean list, and its size is
77742 + counted.
77743 +
77744 + 2. The number of necessary wander records (including tx head) is calculated,
77745 + and the wander record blocks are allocated.
77746 +
77747 +   3. Allocate wandered blocks and populate wander records with the wandered map.
77748 +
77749 +   4. Submit write requests for wander records and wandered blocks.
77750 +
77751 +   5. Wait until the submitted write requests complete.
77752 +
77753 +   6. Update journal header: change the pointer to the block number of the
77754 +   just-written tx head, submit an i/o for the modified journal header block
77755 +   and wait for i/o completion.
77756 +
77757 + NOTE: The special logging for bitmap blocks and some reiser4 super block
77758 +   fields makes the processes of atom commit, flush and recovery a bit more
77759 + complex (see comments in the source code for details).
77760 +
77761 + The atom playing process is the following:
77762 +
77763 + 1. Write atom's overwrite set in-place.
77764 +
77765 + 2. Wait on i/o.
77766 +
77767 +   3. Update journal footer: change the pointer to the block number of the tx
77768 +   head block of the atom we are currently flushing, submit an i/o, wait on
77769 +   i/o completion.
77770 +
77771 + 4. Free disk space which was used for wandered blocks and wander records.
77772 +
77773 +   After the freeing of wandered blocks and wander records, the journal footer
77774 +   points to an on-disk structure which might be overwritten soon.  Neither the
77775 +   log writer nor the journal recovery procedure uses that pointer for
77776 +   accessing the data.  When the journal recovery procedure looks for the
77777 +   oldest transaction, it compares the journal footer pointer value with the
77778 +   "prev_tx" pointer value in each tx head; when the values are equal, the
77779 +   oldest not-yet-flushed transaction has been found.
77780 +
77781 +   NOTE on disk space leakage: the information about which blocks and how many
77782 +   blocks were allocated for wandered blocks and wander records is not written
77783 +   to disk, because of the special logging of bitmaps and some super block
77784 +   counters.  After a system crash reiser4 does not remember those
77785 +   allocations, so there is no disk space leakage of this kind.
77786 +*/
77787 +
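/* Stripped of error handling and locking, the commit and play sequences
   enumerated above reduce roughly to the sketch below; all names are from
   this file, but the real control flow (reiser4_write_log() and friends)
   is more involved. */

	struct commit_handle ch;

	init_commit_handle(&ch, atom);
	get_overwrite_set(&ch);		/* commit step 1: count overwrite set */
	get_tx_size(&ch);		/* commit step 2: nr of wander records */
	/* commit step 3: allocate wandered blocks; format_tx_head(),
	   format_wander_record() and store_wmap_actor() fill the records */
	/* commit steps 4-5: write_jnodes_to_disk_extent() submits the i/o,
	   jwait_io() waits for completion */
	update_journal_header(&ch, use_barrier);	/* commit step 6 */

	/* play: the overwrite set is written in place, then: */
	update_journal_footer(&ch, use_barrier);	/* play step 3 */
	dealloc_tx_list(&ch);				/* play step 4 */
	dealloc_wmap(&ch);
	done_commit_handle(&ch);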
77788 +/* Special logging of reiser4 super block fields. */
77789 +
77790 +/* Some reiser4 super block fields (the free block count and the OID allocator
77791 +   state, i.e. the number of files and the next free OID) are logged separately
77792 +   from the super block to avoid unnecessary atom fusion.
77793 +
77794 +   So, the reiser4 super block need not be captured by a transaction which
77795 +   allocates/deallocates disk blocks or creates/deletes file objects.  Moreover,
77796 +   the reiser4 on-disk super block is not touched when such a transaction is
77797 +   committed and flushed.  Those "specially logged" counters are logged in "tx
77798 +   head" blocks and in the journal footer block.
77799 +
77800 + A step-by-step description of special logging:
77801 +
77802 + 0. The per-atom information about deleted or created files and allocated or
77803 + freed blocks is collected during the transaction. The atom's
77804 + ->nr_objects_created and ->nr_objects_deleted are for object
77805 +   deletion/creation tracking; the numbers of allocated and freed blocks are
77806 +   calculated using the atom's delete set and capture lists -- all new and
77807 +   relocated nodes should be on the atom's clean list and should have the
77808 +   JNODE_RELOC bit set.
77809 +
77810 + 1. The "logged specially" reiser4 super block fields have their "committed"
77811 + versions in the reiser4 in-memory super block. They get modified only at
77812 +   atom commit time.  The atom's commit thread has exclusive access to those
77813 +   "committed" fields because the log writer implementation supports only one
77814 +   atom commit at a time (there is a per-fs "commit" mutex).  At that time the
77815 +   "committed" counters are modified using the per-atom information collected
77816 +   during the transaction.  These counters are stored on disk as part of the
77817 +   tx head block when the atom is committed.
77818 +
77819 + 2. When the atom is flushed the value of the free block counter and the OID
77820 + allocator state get written to the journal footer block. A special journal
77821 + procedure (journal_recover_sb_data()) takes those values from the journal
77822 + footer and updates the reiser4 in-memory super block.
77823 +
77824 + NOTE: That means free block count and OID allocator state are logged
77825 + separately from the reiser4 super block regardless of the fact that the
77826 + reiser4 super block has fields to store both the free block counter and the
77827 + OID allocator.
77828 +
77829 +   Writing the whole super block at commit time requires knowing the true
77830 +   values of all its fields without the changes made by not yet committed
77831 +   transactions.  That would be possible by keeping a "committed" version of
77832 +   the super block, the way the reiser4 bitmap blocks have "committed" and
77833 +   "working" versions.  However, another scheme was implemented, which stores
77834 +   the specially logged values in the unused free space inside the transaction
77835 +   head block.  In my opinion it has the advantage of not writing the whole
77836 +   super block when only part of it was modified. */
77837 +
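/* In code terms, the flow of the specially logged counters is implemented
   by functions later in this file; schematically (not the literal call
   order):

	commit:   format_tx_head()        - stores ch->free_blocks, ch->nr_files
	                                    and ch->next_oid in struct tx_header
	flush:    format_journal_footer() - stores the same counters in
	                                    struct journal_footer
	recovery: journal_recover_sb_data() reads them from the footer back
	          into the reiser4 in-memory super block (step 2 above). */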
77838 +#include "debug.h"
77839 +#include "dformat.h"
77840 +#include "txnmgr.h"
77841 +#include "jnode.h"
77842 +#include "znode.h"
77843 +#include "block_alloc.h"
77844 +#include "page_cache.h"
77845 +#include "wander.h"
77846 +#include "reiser4.h"
77847 +#include "super.h"
77848 +#include "vfs_ops.h"
77849 +#include "writeout.h"
77850 +#include "inode.h"
77851 +#include "entd.h"
77852 +
77853 +#include <linux/types.h>
77854 +#include <linux/fs.h> /* for struct super_block */
77855 +#include <linux/mm.h> /* for struct page */
77856 +#include <linux/pagemap.h>
77857 +#include <linux/bio.h> /* for struct bio */
77858 +#include <linux/blkdev.h>
77859 +
77860 +static int write_jnodes_to_disk_extent(
77861 + jnode *, int, const reiser4_block_nr *, flush_queue_t *, int);
77862 +
77863 +/* The commit_handle is a container for objects needed at atom commit time */
77864 +struct commit_handle {
77865 + /* A pointer to atom's list of OVRWR nodes */
77866 + struct list_head *overwrite_set;
77867 + /* atom's overwrite set size */
77868 + int overwrite_set_size;
77869 + /* jnodes for wander record blocks */
77870 + struct list_head tx_list;
77871 + /* number of wander records */
77872 + __u32 tx_size;
77873 + /* 'committed' sb counters are saved here until atom is completely
77874 + flushed */
77875 + __u64 free_blocks;
77876 + __u64 nr_files;
77877 + __u64 next_oid;
77878 + /* A pointer to the atom which is being committed */
77879 + txn_atom *atom;
77880 + /* A pointer to current super block */
77881 + struct super_block *super;
77882 + /* The counter of modified bitmaps */
77883 + reiser4_block_nr nr_bitmap;
77884 +};
77885 +
77886 +static void init_commit_handle(struct commit_handle *ch, txn_atom *atom)
77887 +{
77888 + memset(ch, 0, sizeof(struct commit_handle));
77889 + INIT_LIST_HEAD(&ch->tx_list);
77890 +
77891 + ch->atom = atom;
77892 + ch->super = reiser4_get_current_sb();
77893 +}
77894 +
77895 +static void done_commit_handle(struct commit_handle *ch)
77896 +{
77897 + assert("zam-690", list_empty(&ch->tx_list));
77898 +}
77899 +
77900 +static inline int reiser4_use_write_barrier(struct super_block * s)
77901 +{
77902 + return !reiser4_is_set(s, REISER4_NO_WRITE_BARRIER);
77903 +}
77904 +
77905 +static void disable_write_barrier(struct super_block * s)
77906 +{
77907 + notice("zam-1055", "%s does not support write barriers,"
77908 + " using synchronous write instead.", s->s_id);
77909 + set_bit((int)REISER4_NO_WRITE_BARRIER, &get_super_private(s)->fs_flags);
77910 +}
77911 +
77912 +/* fill journal header block data */
77913 +static void format_journal_header(struct commit_handle *ch)
77914 +{
77915 + struct reiser4_super_info_data *sbinfo;
77916 + struct journal_header *header;
77917 + jnode *txhead;
77918 +
77919 + sbinfo = get_super_private(ch->super);
77920 + assert("zam-479", sbinfo != NULL);
77921 + assert("zam-480", sbinfo->journal_header != NULL);
77922 +
77923 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
77924 +
77925 + jload(sbinfo->journal_header);
77926 +
77927 + header = (struct journal_header *)jdata(sbinfo->journal_header);
77928 + assert("zam-484", header != NULL);
77929 +
77930 + put_unaligned(cpu_to_le64(*jnode_get_block(txhead)),
77931 + &header->last_committed_tx);
77932 +
77933 + jrelse(sbinfo->journal_header);
77934 +}
77935 +
77936 +/* fill journal footer block data */
77937 +static void format_journal_footer(struct commit_handle *ch)
77938 +{
77939 + struct reiser4_super_info_data *sbinfo;
77940 + struct journal_footer *footer;
77941 + jnode *tx_head;
77942 +
77943 + sbinfo = get_super_private(ch->super);
77944 +
77945 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77946 +
77947 + assert("zam-493", sbinfo != NULL);
77948 +	assert("zam-494", sbinfo->journal_footer != NULL);
77949 +
77950 + check_me("zam-691", jload(sbinfo->journal_footer) == 0);
77951 +
77952 + footer = (struct journal_footer *)jdata(sbinfo->journal_footer);
77953 + assert("zam-495", footer != NULL);
77954 +
77955 + put_unaligned(cpu_to_le64(*jnode_get_block(tx_head)),
77956 + &footer->last_flushed_tx);
77957 + put_unaligned(cpu_to_le64(ch->free_blocks), &footer->free_blocks);
77958 +
77959 + put_unaligned(cpu_to_le64(ch->nr_files), &footer->nr_files);
77960 + put_unaligned(cpu_to_le64(ch->next_oid), &footer->next_oid);
77961 +
77962 + jrelse(sbinfo->journal_footer);
77963 +}
77964 +
77965 +/* wander record capacity depends on current block size */
77966 +static int wander_record_capacity(const struct super_block *super)
77967 +{
77968 + return (super->s_blocksize -
77969 + sizeof(struct wander_record_header)) /
77970 + sizeof(struct wander_entry);
77971 +}
77972 +
77973 +/* Fill the first wander record (tx head) in accordance with the supplied data */
77974 +static void format_tx_head(struct commit_handle *ch)
77975 +{
77976 + jnode *tx_head;
77977 + jnode *next;
77978 + struct tx_header *header;
77979 +
77980 + tx_head = list_entry(ch->tx_list.next, jnode, capture_link);
77981 + assert("zam-692", &ch->tx_list != &tx_head->capture_link);
77982 +
77983 + next = list_entry(tx_head->capture_link.next, jnode, capture_link);
77984 + if (&ch->tx_list == &next->capture_link)
77985 + next = tx_head;
77986 +
77987 + header = (struct tx_header *)jdata(tx_head);
77988 +
77989 + assert("zam-460", header != NULL);
77990 + assert("zam-462", ch->super->s_blocksize >= sizeof(struct tx_header));
77991 +
77992 + memset(jdata(tx_head), 0, (size_t) ch->super->s_blocksize);
77993 + memcpy(jdata(tx_head), TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE);
77994 +
77995 + put_unaligned(cpu_to_le32(ch->tx_size), &header->total);
77996 + put_unaligned(cpu_to_le64(get_super_private(ch->super)->last_committed_tx),
77997 + &header->prev_tx);
77998 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &header->next_block);
77999 + put_unaligned(cpu_to_le64(ch->free_blocks), &header->free_blocks);
78000 + put_unaligned(cpu_to_le64(ch->nr_files), &header->nr_files);
78001 + put_unaligned(cpu_to_le64(ch->next_oid), &header->next_oid);
78002 +}
78003 +
78004 +/* prepare ordinary wander record block (fill all service fields) */
78005 +static void
78006 +format_wander_record(struct commit_handle *ch, jnode *node, __u32 serial)
78007 +{
78008 + struct wander_record_header *LRH;
78009 + jnode *next;
78010 +
78011 + assert("zam-464", node != NULL);
78012 +
78013 + LRH = (struct wander_record_header *)jdata(node);
78014 + next = list_entry(node->capture_link.next, jnode, capture_link);
78015 +
78016 + if (&ch->tx_list == &next->capture_link)
78017 + next = list_entry(ch->tx_list.next, jnode, capture_link);
78018 +
78019 + assert("zam-465", LRH != NULL);
78020 + assert("zam-463",
78021 + ch->super->s_blocksize > sizeof(struct wander_record_header));
78022 +
78023 + memset(jdata(node), 0, (size_t) ch->super->s_blocksize);
78024 + memcpy(jdata(node), WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE);
78025 +
78026 + put_unaligned(cpu_to_le32(ch->tx_size), &LRH->total);
78027 + put_unaligned(cpu_to_le32(serial), &LRH->serial);
78028 + put_unaligned(cpu_to_le64(*jnode_get_block(next)), &LRH->next_block);
78029 +}
78030 +
78031 +/* add one wandered map entry to formatted wander record */
78032 +static void
78033 +store_entry(jnode * node, int index, const reiser4_block_nr * a,
78034 + const reiser4_block_nr * b)
78035 +{
78036 + char *data;
78037 + struct wander_entry *pairs;
78038 +
78039 + data = jdata(node);
78040 + assert("zam-451", data != NULL);
78041 +
78042 + pairs =
78043 + (struct wander_entry *)(data + sizeof(struct wander_record_header));
78044 +
78045 + put_unaligned(cpu_to_le64(*a), &pairs[index].original);
78046 + put_unaligned(cpu_to_le64(*b), &pairs[index].wandered);
78047 +}
78048 +
78049 +/* currently, wander records contain only the wandered map, whose size depends
78050 +   on the overwrite set size */
78051 +static void get_tx_size(struct commit_handle *ch)
78052 +{
78053 + assert("zam-440", ch->overwrite_set_size != 0);
78054 + assert("zam-695", ch->tx_size == 0);
78055 +
78056 + /* count all ordinary wander records
78057 + (<overwrite_set_size> - 1) / <wander_record_capacity> + 1 and add one
78058 + for tx head block */
78059 + ch->tx_size =
78060 + (ch->overwrite_set_size - 1) / wander_record_capacity(ch->super) +
78061 + 2;
78062 +}
78063 +
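/* For instance, with a 4096-byte block and assuming, purely for
   illustration, a 32-byte wander_record_header and a 16-byte wander_entry
   (the real sizes come from wander.h), wander_record_capacity() gives
   (4096 - 32) / 16 = 254 entries per record, so an overwrite set of 1000
   blocks yields ch->tx_size = (1000 - 1) / 254 + 2 = 5 wander records,
   tx head included. */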
78064 +/* A special structure for use in store_wmap_actor() to save its state
78065 +   between calls */
78066 +struct store_wmap_params {
78067 + jnode *cur; /* jnode of current wander record to fill */
78068 + int idx; /* free element index in wander record */
78069 + int capacity; /* capacity */
78070 +
78071 +#if REISER4_DEBUG
78072 + struct list_head *tx_list;
78073 +#endif
78074 +};
78075 +
78076 +/* an actor for use in the blocknr_set_iterator() routine which populates the
78077 +   list of pre-formatted wander records with wandered map info */
78078 +static int
78079 +store_wmap_actor(txn_atom * atom UNUSED_ARG, const reiser4_block_nr * a,
78080 + const reiser4_block_nr * b, void *data)
78081 +{
78082 + struct store_wmap_params *params = data;
78083 +
78084 + if (params->idx >= params->capacity) {
78085 + /* a new wander record should be taken from the tx_list */
78086 + params->cur = list_entry(params->cur->capture_link.next, jnode, capture_link);
78087 + assert("zam-454",
78088 + params->tx_list != &params->cur->capture_link);
78089 +
78090 + params->idx = 0;
78091 + }
78092 +
78093 + store_entry(params->cur, params->idx, a, b);
78094 + params->idx++;
78095 +
78096 + return 0;
78097 +}
78098 +
78099 +/* This function is called after the relocate set is written to disk, the
78100 +   overwrite set is written to the wandered locations and all wander records
78101 +   are written as well.  The updated journal header block contains a pointer
78102 +   (block number) to the first wander record of the just-written transaction */
78103 +static int update_journal_header(struct commit_handle *ch, int use_barrier)
78104 +{
78105 + struct reiser4_super_info_data *sbinfo = get_super_private(ch->super);
78106 + jnode *jh = sbinfo->journal_header;
78107 + jnode *head = list_entry(ch->tx_list.next, jnode, capture_link);
78108 + int ret;
78109 +
78110 + format_journal_header(ch);
78111 +
78112 + ret = write_jnodes_to_disk_extent(jh, 1, jnode_get_block(jh), NULL,
78113 + use_barrier ? WRITEOUT_BARRIER : 0);
78114 + if (ret)
78115 + return ret;
78116 +
78117 + // blk_run_address_space(sbinfo->fake->i_mapping);
78118 + /*blk_run_queues(); */
78119 +
78120 + ret = jwait_io(jh, WRITE);
78121 +
78122 + if (ret)
78123 + return ret;
78124 +
78125 + sbinfo->last_committed_tx = *jnode_get_block(head);
78126 +
78127 + return 0;
78128 +}
78129 +
78130 +/* This function is called after write-back is finished.  We update the journal
78131 +   footer block and free the blocks which were occupied by wandered blocks and
78132 +   the transaction's wander records */
78133 +static int update_journal_footer(struct commit_handle *ch, int use_barrier)
78134 +{
78135 + reiser4_super_info_data *sbinfo = get_super_private(ch->super);
78136 +
78137 + jnode *jf = sbinfo->journal_footer;
78138 +
78139 + int ret;
78140 +
78141 + format_journal_footer(ch);
78142 +
78143 + ret = write_jnodes_to_disk_extent(jf, 1, jnode_get_block(jf), NULL,
78144 + use_barrier ? WRITEOUT_BARRIER : 0);
78145 + if (ret)
78146 + return ret;
78147 +
78148 + // blk_run_address_space(sbinfo->fake->i_mapping);
78149 + /*blk_run_queue(); */
78150 +
78151 + ret = jwait_io(jf, WRITE);
78152 + if (ret)
78153 + return ret;
78154 +
78155 + return 0;
78156 +}
78157 +
78158 +/* free the block numbers of the wander records of a transaction already written in place */
78159 +static void dealloc_tx_list(struct commit_handle *ch)
78160 +{
78161 + while (!list_empty(&ch->tx_list)) {
78162 + jnode *cur = list_entry(ch->tx_list.next, jnode, capture_link);
78163 + list_del(&cur->capture_link);
78164 + ON_DEBUG(INIT_LIST_HEAD(&cur->capture_link));
78165 + reiser4_dealloc_block(jnode_get_block(cur), BLOCK_NOT_COUNTED,
78166 + BA_FORMATTED);
78167 +
78168 + unpin_jnode_data(cur);
78169 + reiser4_drop_io_head(cur);
78170 + }
78171 +}
78172 +
78173 +/* An actor for use in the blocknr_set_iterator() routine which frees wandered
78174 +   blocks from the atom's overwrite set. */
78175 +static int
78176 +dealloc_wmap_actor(txn_atom * atom UNUSED_ARG,
78177 + const reiser4_block_nr * a UNUSED_ARG,
78178 + const reiser4_block_nr * b, void *data UNUSED_ARG)
78179 +{
78180 +
78181 + assert("zam-499", b != NULL);
78182 + assert("zam-500", *b != 0);
78183 + assert("zam-501", !reiser4_blocknr_is_fake(b));
78184 +
78185 + reiser4_dealloc_block(b, BLOCK_NOT_COUNTED, BA_FORMATTED);
78186 + return 0;
78187 +}
78188 +
78189 +/* free wandered block locations of a transaction already written in place */
78190 +static void dealloc_wmap(struct commit_handle *ch)
78191 +{
78192 + assert("zam-696", ch->atom != NULL);
78193 +
78194 + blocknr_set_iterator(ch->atom, &ch->atom->wandered_map,
78195 + dealloc_wmap_actor, NULL, 1);
78196 +}
78197 +
78198 +/* helper function for alloc_wandered_blocks(); refills the set of block
78199 +   numbers needed for wandered blocks */
78200 +static int
78201 +get_more_wandered_blocks(int count, reiser4_block_nr * start, int *len)
78202 +{
78203 + reiser4_blocknr_hint hint;
78204 + int ret;
78205 +
78206 + reiser4_block_nr wide_len = count;
78207 +
78208 + /* FIXME-ZAM: A special policy needed for allocation of wandered blocks
78209 + ZAM-FIXME-HANS: yes, what happened to our discussion of using a fixed
78210 + reserved allocation area so as to get the best qualities of fixed
78211 + journals? */
78212 + reiser4_blocknr_hint_init(&hint);
78213 + hint.block_stage = BLOCK_GRABBED;
78214 +
78215 + ret = reiser4_alloc_blocks(&hint, start, &wide_len,
78216 + BA_FORMATTED | BA_USE_DEFAULT_SEARCH_START);
78217 + *len = (int)wide_len;
78218 +
78219 + return ret;
78220 +}
78221 +
78222 +/*
78223 + * roll back changes made before issuing a BIO, in the case of an IO error.
78224 + */
78225 +static void undo_bio(struct bio *bio)
78226 +{
78227 + int i;
78228 +
78229 + for (i = 0; i < bio->bi_vcnt; ++i) {
78230 + struct page *pg;
78231 + jnode *node;
78232 +
78233 + pg = bio->bi_io_vec[i].bv_page;
78234 + end_page_writeback(pg);
78235 + node = jprivate(pg);
78236 + spin_lock_jnode(node);
78237 + JF_CLR(node, JNODE_WRITEBACK);
78238 + JF_SET(node, JNODE_DIRTY);
78239 + spin_unlock_jnode(node);
78240 + }
78241 + bio_put(bio);
78242 +}
78243 +
78244 +/* put overwrite set back to atom's clean list */
78245 +static void put_overwrite_set(struct commit_handle *ch)
78246 +{
78247 + jnode *cur;
78248 +
78249 + list_for_each_entry(cur, ch->overwrite_set, capture_link)
78250 + jrelse_tail(cur);
78251 +}
78252 +
78253 +/* Count the overwrite set size and grab disk space for wandered block
78254 +   allocation. Since we have a separate list for the atom's overwrite set we
78255 +   just scan the list, counting bitmap and other non-leaf nodes whose wandered
78256 +   block allocation we have to grab space for. */
78257 +static int get_overwrite_set(struct commit_handle *ch)
78258 +{
78259 + int ret;
78260 + jnode *cur;
78261 + __u64 nr_not_leaves = 0;
78262 +#if REISER4_DEBUG
78263 + __u64 nr_formatted_leaves = 0;
78264 + __u64 nr_unformatted_leaves = 0;
78265 +#endif
78266 +
78267 + assert("zam-697", ch->overwrite_set_size == 0);
78268 +
78269 + ch->overwrite_set = ATOM_OVRWR_LIST(ch->atom);
78270 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
78271 +
78272 + while (ch->overwrite_set != &cur->capture_link) {
78273 + jnode *next = list_entry(cur->capture_link.next, jnode, capture_link);
78274 +
78275 +		/* Count bitmap nodes to get correct statistics on the number
78276 +		 * of blocks cleared by the transaction commit. */
78277 + if (jnode_get_type(cur) == JNODE_BITMAP)
78278 + ch->nr_bitmap++;
78279 +
78280 + assert("zam-939", JF_ISSET(cur, JNODE_OVRWR)
78281 + || jnode_get_type(cur) == JNODE_BITMAP);
78282 +
78283 + if (jnode_is_znode(cur) && znode_above_root(JZNODE(cur))) {
78284 + /* we replace fake znode by another (real)
78285 + znode which is suggested by disk_layout
78286 + plugin */
78287 +
78288 + /* FIXME: it looks like fake znode should be
78289 + replaced by jnode supplied by
78290 + disk_layout. */
78291 +
78292 + struct super_block *s = reiser4_get_current_sb();
78293 + reiser4_super_info_data *sbinfo =
78294 + get_current_super_private();
78295 +
78296 + if (sbinfo->df_plug->log_super) {
78297 + jnode *sj = sbinfo->df_plug->log_super(s);
78298 +
78299 + assert("zam-593", sj != NULL);
78300 +
78301 + if (IS_ERR(sj))
78302 + return PTR_ERR(sj);
78303 +
78304 + spin_lock_jnode(sj);
78305 + JF_SET(sj, JNODE_OVRWR);
78306 + insert_into_atom_ovrwr_list(ch->atom, sj);
78307 + spin_unlock_jnode(sj);
78308 +
78309 +				/* jload it like the rest of the overwrite set */
78310 + jload_gfp(sj, reiser4_ctx_gfp_mask_get(), 0);
78311 +
78312 + ch->overwrite_set_size++;
78313 + }
78314 + spin_lock_jnode(cur);
78315 + reiser4_uncapture_block(cur);
78316 + jput(cur);
78317 +
78318 + } else {
78319 + int ret;
78320 + ch->overwrite_set_size++;
78321 + ret = jload_gfp(cur, reiser4_ctx_gfp_mask_get(), 0);
78322 + if (ret)
78323 + reiser4_panic("zam-783",
78324 + "cannot load e-flushed jnode back (ret = %d)\n",
78325 + ret);
78326 + }
78327 +
78328 +		/* Count non-leaves here because we have to grab disk space
78329 + * for wandered blocks. They were not counted as "flush
78330 + * reserved". Counting should be done _after_ nodes are pinned
78331 + * into memory by jload(). */
78332 + if (!jnode_is_leaf(cur))
78333 + nr_not_leaves++;
78334 + else {
78335 +#if REISER4_DEBUG
78336 + /* at this point @cur either has JNODE_FLUSH_RESERVED
78337 + * or is eflushed. Locking is not strong enough to
78338 + * write an assertion checking for this. */
78339 + if (jnode_is_znode(cur))
78340 + nr_formatted_leaves++;
78341 + else
78342 + nr_unformatted_leaves++;
78343 +#endif
78344 + JF_CLR(cur, JNODE_FLUSH_RESERVED);
78345 + }
78346 +
78347 + cur = next;
78348 + }
78349 +
78350 +	/* Grab space for writing (wandered blocks of) non-leaves found in the
78351 +	 * overwrite set. */
78352 + ret = reiser4_grab_space_force(nr_not_leaves, BA_RESERVED);
78353 + if (ret)
78354 + return ret;
78355 +
78356 +	/* Disk space for allocation of wandered blocks of leaf nodes is already
78357 +	 * reserved as "flush reserved"; move it to the grabbed space counter. */
78358 + spin_lock_atom(ch->atom);
78359 + assert("zam-940",
78360 + nr_formatted_leaves + nr_unformatted_leaves <=
78361 + ch->atom->flush_reserved);
78362 + flush_reserved2grabbed(ch->atom, ch->atom->flush_reserved);
78363 + spin_unlock_atom(ch->atom);
78364 +
78365 + return ch->overwrite_set_size;
78366 +}
78367 +
78368 +/**
78369 + * write_jnodes_to_disk_extent - submit write request
78370 + * @first: first jnode of the list
78371 + * @nr: number of jnodes on the list
78372 + * @block_p: starting disk block number of the extent to write to
78373 + * @fq: flush queue used to wait for i/o completion efficiently, or NULL
78374 + * @flags: used to decide whether page is to get PG_reclaim flag
78375 + *
78376 + * Submits a write request for @nr jnodes beginning with @first; the other
78377 + * jnodes follow @first on the doubly-linked "capture" list. All jnodes will
78378 + * be written to the disk region of @nr blocks starting at block number
78379 + * @block_p. If @fq is not NULL, waiting for i/o completion will be done
78380 + * more efficiently by using flush_queue_t objects.
78381 + * This function is the one which writes a list of jnodes in batch mode. It
78382 + * does all the low-level work such as bio construction and page state
78383 + * manipulation.
78384 + *
78385 + * ZAM-FIXME-HANS: brief me on why this function exists, and why bios are
78386 + * aggregated in this function instead of being left to the layers below
78387 + *
78388 + * FIXME: ZAM->HANS: What layer are you talking about? Can you point me to that?
78389 + * Why that layer needed? Why BIOs cannot be constructed here?
78390 + */
78391 +static int write_jnodes_to_disk_extent(
78392 + jnode *first, int nr, const reiser4_block_nr *block_p,
78393 + flush_queue_t *fq, int flags)
78394 +{
78395 + struct super_block *super = reiser4_get_current_sb();
78396 + int write_op = ( flags & WRITEOUT_BARRIER ) ? WRITE_BARRIER : WRITE;
78397 + int max_blocks;
78398 + jnode *cur = first;
78399 + reiser4_block_nr block;
78400 +
78401 + assert("zam-571", first != NULL);
78402 + assert("zam-572", block_p != NULL);
78403 + assert("zam-570", nr > 0);
78404 +
78405 + block = *block_p;
78406 + max_blocks = min(bio_get_nr_vecs(super->s_bdev), BIO_MAX_PAGES);
78407 +
78408 + while (nr > 0) {
78409 + struct bio *bio;
78410 + int nr_blocks = min(nr, max_blocks);
78411 + int i;
78412 + int nr_used;
78413 +
78414 + bio = bio_alloc(GFP_NOIO, nr_blocks);
78415 + if (!bio)
78416 + return RETERR(-ENOMEM);
78417 +
78418 + bio->bi_bdev = super->s_bdev;
78419 + bio->bi_sector = block * (super->s_blocksize >> 9);
78420 + for (nr_used = 0, i = 0; i < nr_blocks; i++) {
78421 + struct page *pg;
78422 +
78423 + pg = jnode_page(cur);
78424 + assert("zam-573", pg != NULL);
78425 +
78426 + page_cache_get(pg);
78427 +
78428 + lock_and_wait_page_writeback(pg);
78429 +
78430 + if (!bio_add_page(bio, pg, super->s_blocksize, 0)) {
78431 + /*
78432 +			 * underlying device is saturated. Stop adding
78433 + * pages to the bio.
78434 + */
78435 + unlock_page(pg);
78436 + page_cache_release(pg);
78437 + break;
78438 + }
78439 +
78440 + spin_lock_jnode(cur);
78441 + assert("nikita-3166",
78442 + pg->mapping == jnode_get_mapping(cur));
78443 + assert("zam-912", !JF_ISSET(cur, JNODE_WRITEBACK));
78444 +#if REISER4_DEBUG
78445 + spin_lock(&cur->load);
78446 + assert("nikita-3165", !jnode_is_releasable(cur));
78447 + spin_unlock(&cur->load);
78448 +#endif
78449 + JF_SET(cur, JNODE_WRITEBACK);
78450 + JF_CLR(cur, JNODE_DIRTY);
78451 + ON_DEBUG(cur->written++);
78452 + spin_unlock_jnode(cur);
78453 +
78454 + ClearPageError(pg);
78455 + set_page_writeback(pg);
78456 +
78457 + if (get_current_context()->entd) {
78458 + /* this is ent thread */
78459 + entd_context *ent = get_entd_context(super);
78460 + struct wbq *rq, *next;
78461 +
78462 + spin_lock(&ent->guard);
78463 +
78464 + if (pg == ent->cur_request->page) {
78465 + /*
78466 + * entd is called for this page. This
78467 +				 * request is not in the todo list
78468 + */
78469 + ent->cur_request->written = 1;
78470 + } else {
78471 + /*
78472 + * if we have written a page for which writepage
78473 + * is called for - move request to another list.
78474 + */
78475 + list_for_each_entry_safe(rq, next, &ent->todo_list, link) {
78476 + assert("", rq->magic == WBQ_MAGIC);
78477 + if (pg == rq->page) {
78478 + /*
78479 + * remove request from
78480 + * entd's queue, but do
78481 + * not wake up a thread
78482 + * which put this
78483 + * request
78484 + */
78485 + list_del_init(&rq->link);
78486 + ent->nr_todo_reqs --;
78487 + list_add_tail(&rq->link, &ent->done_list);
78488 + ent->nr_done_reqs ++;
78489 + rq->written = 1;
78490 + break;
78491 + }
78492 + }
78493 + }
78494 + spin_unlock(&ent->guard);
78495 + }
78496 +
78497 + clear_page_dirty_for_io(pg);
78498 +
78499 + unlock_page(pg);
78500 +
78501 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78502 + nr_used++;
78503 + }
78504 + if (nr_used > 0) {
78505 + assert("nikita-3453",
78506 + bio->bi_size == super->s_blocksize * nr_used);
78507 + assert("nikita-3454", bio->bi_vcnt == nr_used);
78508 +
78509 + /* Check if we are allowed to write at all */
78510 + if (super->s_flags & MS_RDONLY)
78511 + undo_bio(bio);
78512 + else {
78513 + int not_supported;
78514 +
78515 + add_fq_to_bio(fq, bio);
78516 + bio_get(bio);
78517 + reiser4_submit_bio(write_op, bio);
78518 + not_supported = bio_flagged(bio, BIO_EOPNOTSUPP);
78519 + bio_put(bio);
78520 + if (not_supported)
78521 + return -EOPNOTSUPP;
78522 + }
78523 +
78524 + block += nr_used - 1;
78525 + update_blocknr_hint_default(super, &block);
78526 + block += 1;
78527 + } else {
78528 + bio_put(bio);
78529 + }
78530 + nr -= nr_used;
78531 + }
78532 +
78533 + return 0;
78534 +}
78535 +
78536 +/* This is a procedure which recovers contiguous sequences of disk block
78537 +   numbers in the given list of jnodes and submits write requests on a
78538 +   per-sequence basis */
78539 +int
78540 +write_jnode_list(struct list_head *head, flush_queue_t *fq,
78541 + long *nr_submitted, int flags)
78542 +{
78543 + int ret;
78544 + jnode *beg = list_entry(head->next, jnode, capture_link);
78545 +
78546 + while (head != &beg->capture_link) {
78547 + int nr = 1;
78548 + jnode *cur = list_entry(beg->capture_link.next, jnode, capture_link);
78549 +
78550 + while (head != &cur->capture_link) {
78551 + if (*jnode_get_block(cur) != *jnode_get_block(beg) + nr)
78552 + break;
78553 + ++nr;
78554 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78555 + }
78556 +
78557 + ret = write_jnodes_to_disk_extent(
78558 + beg, nr, jnode_get_block(beg), fq, flags);
78559 + if (ret)
78560 + return ret;
78561 +
78562 + if (nr_submitted)
78563 + *nr_submitted += nr;
78564 +
78565 + beg = cur;
78566 + }
78567 +
78568 + return 0;
78569 +}
78570 +
78571 +/* add given wandered mapping to atom's wandered map */
78572 +static int
78573 +add_region_to_wmap(jnode * cur, int len, const reiser4_block_nr * block_p)
78574 +{
78575 + int ret;
78576 + blocknr_set_entry *new_bsep = NULL;
78577 + reiser4_block_nr block;
78578 +
78579 + txn_atom *atom;
78580 +
78581 + assert("zam-568", block_p != NULL);
78582 + block = *block_p;
78583 + assert("zam-569", len > 0);
78584 +
78585 + while ((len--) > 0) {
78586 + do {
78587 + atom = get_current_atom_locked();
78588 + assert("zam-536",
78589 + !reiser4_blocknr_is_fake(jnode_get_block(cur)));
78590 + ret =
78591 + blocknr_set_add_pair(atom, &atom->wandered_map,
78592 + &new_bsep,
78593 + jnode_get_block(cur), &block);
78594 + } while (ret == -E_REPEAT);
78595 +
78596 + if (ret) {
78597 + /* deallocate blocks which were not added to wandered
78598 + map */
78599 + reiser4_block_nr wide_len = len;
78600 +
78601 + reiser4_dealloc_blocks(&block, &wide_len,
78602 + BLOCK_NOT_COUNTED,
78603 + BA_FORMATTED
78604 + /* formatted, without defer */ );
78605 +
78606 + return ret;
78607 + }
78608 +
78609 + spin_unlock_atom(atom);
78610 +
78611 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78612 + ++block;
78613 + }
78614 +
78615 + return 0;
78616 +}
78617 +
78618 +/* Allocate wandered blocks for the current atom's OVERWRITE SET and submit
78619 +   IO for them immediately. We assume the current atom is in a stage when any
78620 +   atom fusion is impossible, so it is unlocked and this is safe. */
78621 +static int alloc_wandered_blocks(struct commit_handle *ch, flush_queue_t *fq)
78622 +{
78623 + reiser4_block_nr block;
78624 +
78625 + int rest;
78626 + int len;
78627 + int ret;
78628 +
78629 + jnode *cur;
78630 +
78631 + assert("zam-534", ch->overwrite_set_size > 0);
78632 +
78633 + rest = ch->overwrite_set_size;
78634 +
78635 + cur = list_entry(ch->overwrite_set->next, jnode, capture_link);
78636 + while (ch->overwrite_set != &cur->capture_link) {
78637 + assert("zam-567", JF_ISSET(cur, JNODE_OVRWR));
78638 +
78639 + ret = get_more_wandered_blocks(rest, &block, &len);
78640 + if (ret)
78641 + return ret;
78642 +
78643 + rest -= len;
78644 +
78645 + ret = add_region_to_wmap(cur, len, &block);
78646 + if (ret)
78647 + return ret;
78648 +
78649 + ret = write_jnodes_to_disk_extent(cur, len, &block, fq, 0);
78650 + if (ret)
78651 + return ret;
78652 +
78653 + while ((len--) > 0) {
78654 + assert("zam-604",
78655 + ch->overwrite_set != &cur->capture_link);
78656 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78657 + }
78658 + }
78659 +
78660 + return 0;
78661 +}
78662 +
78663 +/* allocate the given number of nodes over the journal area and link them
78664 +   into ch->tx_list; returns 0 on success, a negative error code otherwise */
78665 +static int alloc_tx(struct commit_handle *ch, flush_queue_t * fq)
78666 +{
78667 + reiser4_blocknr_hint hint;
78668 + reiser4_block_nr allocated = 0;
78669 + reiser4_block_nr first, len;
78670 + jnode *cur;
78671 + jnode *txhead;
78672 + int ret;
78673 + reiser4_context *ctx;
78674 + reiser4_super_info_data *sbinfo;
78675 +
78676 + assert("zam-698", ch->tx_size > 0);
78677 + assert("zam-699", list_empty_careful(&ch->tx_list));
78678 +
78679 + ctx = get_current_context();
78680 + sbinfo = get_super_private(ctx->super);
78681 +
78682 + while (allocated < (unsigned)ch->tx_size) {
78683 + len = (ch->tx_size - allocated);
78684 +
78685 + reiser4_blocknr_hint_init(&hint);
78686 +
78687 + hint.block_stage = BLOCK_GRABBED;
78688 +
78689 + /* FIXME: there should be some block allocation policy for
78690 + nodes which contain wander records */
78691 +
78692 + /* We assume that disk space for wandered record blocks can be
78693 + * taken from reserved area. */
78694 + ret = reiser4_alloc_blocks(&hint, &first, &len,
78695 + BA_FORMATTED | BA_RESERVED |
78696 + BA_USE_DEFAULT_SEARCH_START);
78697 + reiser4_blocknr_hint_done(&hint);
78698 +
78699 + if (ret)
78700 + return ret;
78701 +
78702 + allocated += len;
78703 +
78704 + /* create jnodes for all wander records */
78705 + while (len--) {
78706 + cur = reiser4_alloc_io_head(&first);
78707 +
78708 + if (cur == NULL) {
78709 + ret = RETERR(-ENOMEM);
78710 + goto free_not_assigned;
78711 + }
78712 +
78713 + ret = jinit_new(cur, reiser4_ctx_gfp_mask_get());
78714 +
78715 + if (ret != 0) {
78716 + jfree(cur);
78717 + goto free_not_assigned;
78718 + }
78719 +
78720 + pin_jnode_data(cur);
78721 +
78722 + list_add_tail(&cur->capture_link, &ch->tx_list);
78723 +
78724 + first++;
78725 + }
78726 + }
78727 +
78728 +	{ /* format an on-disk linked list of wander records */
78729 + int serial = 1;
78730 +
78731 + txhead = list_entry(ch->tx_list.next, jnode, capture_link);
78732 + format_tx_head(ch);
78733 +
78734 + cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78735 + while (&ch->tx_list != &cur->capture_link) {
78736 + format_wander_record(ch, cur, serial++);
78737 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78738 + }
78739 + }
78740 +
78741 + { /* Fill wander records with Wandered Set */
78742 + struct store_wmap_params params;
78743 + txn_atom *atom;
78744 +
78745 + params.cur = list_entry(txhead->capture_link.next, jnode, capture_link);
78746 +
78747 + params.idx = 0;
78748 + params.capacity =
78749 + wander_record_capacity(reiser4_get_current_sb());
78750 +
78751 + atom = get_current_atom_locked();
78752 + blocknr_set_iterator(atom, &atom->wandered_map,
78753 + &store_wmap_actor, &params, 0);
78754 + spin_unlock_atom(atom);
78755 + }
78756 +
78757 +	{ /* release (jrelse) all jnodes from tx_list */
78758 + cur = list_entry(ch->tx_list.next, jnode, capture_link);
78759 + while (&ch->tx_list != &cur->capture_link) {
78760 + jrelse(cur);
78761 + cur = list_entry(cur->capture_link.next, jnode, capture_link);
78762 + }
78763 + }
78764 +
78765 + ret = write_jnode_list(&ch->tx_list, fq, NULL, 0);
78766 +
78767 + return ret;
78768 +
78769 + free_not_assigned:
78770 + /* We deallocate blocks not yet assigned to jnodes on tx_list. The
78771 +	   caller takes care of invalidating the tx list */
78772 + reiser4_dealloc_blocks(&first, &len, BLOCK_NOT_COUNTED, BA_FORMATTED);
78773 +
78774 + return ret;
78775 +}
78776 +
78777 +static int commit_tx(struct commit_handle *ch)
78778 +{
78779 + flush_queue_t *fq;
78780 + int barrier;
78781 + int ret;
78782 +
78783 + /* Grab more space for wandered records. */
78784 + ret = reiser4_grab_space_force((__u64) (ch->tx_size), BA_RESERVED);
78785 + if (ret)
78786 + return ret;
78787 +
78788 + fq = get_fq_for_current_atom();
78789 + if (IS_ERR(fq))
78790 + return PTR_ERR(fq);
78791 +
78792 + spin_unlock_atom(fq->atom);
78793 + do {
78794 + ret = alloc_wandered_blocks(ch, fq);
78795 + if (ret)
78796 + break;
78797 + ret = alloc_tx(ch, fq);
78798 + if (ret)
78799 + break;
78800 + } while (0);
78801 +
78802 + reiser4_fq_put(fq);
78803 + if (ret)
78804 + return ret;
78805 + repeat_wo_barrier:
78806 + barrier = reiser4_use_write_barrier(ch->super);
78807 + if (!barrier) {
78808 + ret = current_atom_finish_all_fq();
78809 + if (ret)
78810 + return ret;
78811 + }
78812 + ret = update_journal_header(ch, barrier);
78813 + if (barrier) {
78814 + if (ret) {
78815 + if (ret == -EOPNOTSUPP) {
78816 + disable_write_barrier(ch->super);
78817 + goto repeat_wo_barrier;
78818 + }
78819 + return ret;
78820 + }
78821 + ret = current_atom_finish_all_fq();
78822 + }
78823 + return ret;
78824 +}
78825 +
78826 +static int write_tx_back(struct commit_handle * ch)
78827 +{
78828 + flush_queue_t *fq;
78829 + int ret;
78830 + int barrier;
78831 +
78832 + reiser4_post_commit_hook();
78833 + fq = get_fq_for_current_atom();
78834 + if (IS_ERR(fq))
78835 + return PTR_ERR(fq);
78836 + spin_unlock_atom(fq->atom);
78837 + ret = write_jnode_list(
78838 + ch->overwrite_set, fq, NULL, WRITEOUT_FOR_PAGE_RECLAIM);
78839 + reiser4_fq_put(fq);
78840 + if (ret)
78841 + return ret;
78842 + repeat_wo_barrier:
78843 + barrier = reiser4_use_write_barrier(ch->super);
78844 + if (!barrier) {
78845 + ret = current_atom_finish_all_fq();
78846 + if (ret)
78847 + return ret;
78848 + }
78849 + ret = update_journal_footer(ch, barrier);
78850 + if (barrier) {
78851 + if (ret) {
78852 + if (ret == -EOPNOTSUPP) {
78853 + disable_write_barrier(ch->super);
78854 + goto repeat_wo_barrier;
78855 + }
78856 + return ret;
78857 + }
78858 + ret = current_atom_finish_all_fq();
78859 + }
78860 + if (ret)
78861 + return ret;
78862 + reiser4_post_write_back_hook();
78863 + return 0;
78864 +}
78865 +
78866 +/* We assume that at this moment all captured blocks are marked as RELOC or
78867 +   WANDER (belong to the Relocate or Overwrite set) and all nodes from the
78868 +   Relocate set have already been submitted for writing.
78869 +*/
78870 +
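+/* A sketch of the commit sequence implemented by reiser4_write_logs() below,
+   summarizing the helpers defined above (step names are the actual functions):
+
+     1. get_overwrite_set()     - pin the overwrite set, grab disk space
+     2. alloc_wandered_blocks() - write the overwrite set to wandered blocks
+     3. alloc_tx()              - build and write the wander records
+     4. update_journal_header() - point the journal at the new transaction
+     5. write_tx_back()         - write the overwrite set in place and run
+                                  update_journal_footer()
+     6. dealloc_tx_list()/dealloc_wmap() - free journal-related blocks
+*/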
78871 +int reiser4_write_logs(long *nr_submitted)
78872 +{
78873 + txn_atom *atom;
78874 + struct super_block *super = reiser4_get_current_sb();
78875 + reiser4_super_info_data *sbinfo = get_super_private(super);
78876 + struct commit_handle ch;
78877 + int ret;
78878 +
78879 + writeout_mode_enable();
78880 +
78881 + /* block allocator may add j-nodes to the clean_list */
78882 + ret = reiser4_pre_commit_hook();
78883 + if (ret)
78884 + return ret;
78885 +
78886 +	/* No locks are required if we take an atom whose stage >=
78887 +	 * ASTAGE_PRE_COMMIT */
78888 + atom = get_current_context()->trans->atom;
78889 + assert("zam-965", atom != NULL);
78890 +
78891 + /* relocate set is on the atom->clean_nodes list after
78892 + * current_atom_complete_writes() finishes. It can be safely
78893 + * uncaptured after commit_mutex is locked, because any atom that
78894 +	 * captures these nodes is guaranteed to commit after the current one.
78895 + *
78896 + * This can only be done after reiser4_pre_commit_hook(), because it is where
78897 + * early flushed jnodes with CREATED bit are transferred to the
78898 + * overwrite list. */
78899 + reiser4_invalidate_list(ATOM_CLEAN_LIST(atom));
78900 + spin_lock_atom(atom);
78901 + /* There might be waiters for the relocate nodes which we have
78902 + * released, wake them up. */
78903 + reiser4_atom_send_event(atom);
78904 + spin_unlock_atom(atom);
78905 +
78906 + if (REISER4_DEBUG) {
78907 + int level;
78908 +
78909 + for (level = 0; level < REAL_MAX_ZTREE_HEIGHT + 1; ++level)
78910 + assert("nikita-3352",
78911 + list_empty_careful(ATOM_DIRTY_LIST(atom, level)));
78912 + }
78913 +
78914 + sbinfo->nr_files_committed += (unsigned)atom->nr_objects_created;
78915 + sbinfo->nr_files_committed -= (unsigned)atom->nr_objects_deleted;
78916 +
78917 + init_commit_handle(&ch, atom);
78918 +
78919 + ch.free_blocks = sbinfo->blocks_free_committed;
78920 + ch.nr_files = sbinfo->nr_files_committed;
78921 + /* ZAM-FIXME-HANS: email me what the contention level is for the super
78922 + * lock. */
78923 + ch.next_oid = oid_next(super);
78924 +
78925 + /* count overwrite set and place it in a separate list */
78926 + ret = get_overwrite_set(&ch);
78927 +
78928 + if (ret <= 0) {
78929 +		/* It is possible that the overwrite set is empty here; it
78930 +		   means all captured nodes are clean */
78931 + goto up_and_ret;
78932 + }
78933 +
78934 +	/* Inform the caller how many dirty pages will be submitted to
78935 +	 * disk. */
78936 + *nr_submitted += ch.overwrite_set_size - ch.nr_bitmap;
78937 +
78938 +	/* count all records needed for storing the wandered set */
78939 + get_tx_size(&ch);
78940 +
78941 + ret = commit_tx(&ch);
78942 + if (ret)
78943 + goto up_and_ret;
78944 +
78945 + spin_lock_atom(atom);
78946 + reiser4_atom_set_stage(atom, ASTAGE_POST_COMMIT);
78947 + spin_unlock_atom(atom);
78948 +
78949 + ret = write_tx_back(&ch);
78950 + reiser4_post_write_back_hook();
78951 +
78952 + up_and_ret:
78953 + if (ret) {
78954 +		/* there could be fqs attached to the current atom; the only
78955 +		   way to remove them is: */
78956 + current_atom_finish_all_fq();
78957 + }
78958 +
78959 + /* free blocks of flushed transaction */
78960 + dealloc_tx_list(&ch);
78961 + dealloc_wmap(&ch);
78962 +
78963 + put_overwrite_set(&ch);
78964 +
78965 + done_commit_handle(&ch);
78966 +
78967 + writeout_mode_disable();
78968 +
78969 + return ret;
78970 +}
78971 +
78972 +/* consistency checks for journal data/control blocks: header, footer, log
78973 +   records, transaction head blocks. All functions return zero on success. */
78974 +
78975 +static int check_journal_header(const jnode * node UNUSED_ARG)
78976 +{
78977 + /* FIXME: journal header has no magic field yet. */
78978 + return 0;
78979 +}
78980 +
78981 +/* wait for write completion for all jnodes from given list */
78982 +static int wait_on_jnode_list(struct list_head *head)
78983 +{
78984 + jnode *scan;
78985 + int ret = 0;
78986 +
78987 + list_for_each_entry(scan, head, capture_link) {
78988 + struct page *pg = jnode_page(scan);
78989 +
78990 + if (pg) {
78991 + if (PageWriteback(pg))
78992 + wait_on_page_writeback(pg);
78993 +
78994 + if (PageError(pg))
78995 + ret++;
78996 + }
78997 + }
78998 +
78999 + return ret;
79000 +}
79001 +
79002 +static int check_journal_footer(const jnode * node UNUSED_ARG)
79003 +{
79004 + /* FIXME: journal footer has no magic field yet. */
79005 + return 0;
79006 +}
79007 +
79008 +static int check_tx_head(const jnode * node)
79009 +{
79010 + struct tx_header *header = (struct tx_header *)jdata(node);
79011 +
79012 + if (memcmp(&header->magic, TX_HEADER_MAGIC, TX_HEADER_MAGIC_SIZE) != 0) {
79013 + warning("zam-627", "tx head at block %s corrupted\n",
79014 + sprint_address(jnode_get_block(node)));
79015 + return RETERR(-EIO);
79016 + }
79017 +
79018 + return 0;
79019 +}
79020 +
79021 +static int check_wander_record(const jnode * node)
79022 +{
79023 + struct wander_record_header *RH =
79024 + (struct wander_record_header *)jdata(node);
79025 +
79026 + if (memcmp(&RH->magic, WANDER_RECORD_MAGIC, WANDER_RECORD_MAGIC_SIZE) !=
79027 + 0) {
79028 + warning("zam-628", "wander record at block %s corrupted\n",
79029 + sprint_address(jnode_get_block(node)));
79030 + return RETERR(-EIO);
79031 + }
79032 +
79033 + return 0;
79034 +}
79035 +
79036 +/* fill the commit_handle structure with everything needed for update_journal_footer */
79037 +static int restore_commit_handle(struct commit_handle *ch, jnode *tx_head)
79038 +{
79039 + struct tx_header *TXH;
79040 + int ret;
79041 +
79042 + ret = jload(tx_head);
79043 + if (ret)
79044 + return ret;
79045 +
79046 + TXH = (struct tx_header *)jdata(tx_head);
79047 +
79048 + ch->free_blocks = le64_to_cpu(get_unaligned(&TXH->free_blocks));
79049 + ch->nr_files = le64_to_cpu(get_unaligned(&TXH->nr_files));
79050 + ch->next_oid = le64_to_cpu(get_unaligned(&TXH->next_oid));
79051 +
79052 + jrelse(tx_head);
79053 +
79054 + list_add(&tx_head->capture_link, &ch->tx_list);
79055 +
79056 + return 0;
79057 +}
79058 +
79059 +/* replay one transaction: restore and write overwrite set in place */
79060 +static int replay_transaction(const struct super_block *s,
79061 + jnode * tx_head,
79062 + const reiser4_block_nr * log_rec_block_p,
79063 + const reiser4_block_nr * end_block,
79064 + unsigned int nr_wander_records)
79065 +{
79066 + reiser4_block_nr log_rec_block = *log_rec_block_p;
79067 + struct commit_handle ch;
79068 + LIST_HEAD(overwrite_set);
79069 + jnode *log;
79070 + int ret;
79071 +
79072 + init_commit_handle(&ch, NULL);
79073 + ch.overwrite_set = &overwrite_set;
79074 +
79075 + restore_commit_handle(&ch, tx_head);
79076 +
79077 + while (log_rec_block != *end_block) {
79078 + struct wander_record_header *header;
79079 + struct wander_entry *entry;
79080 +
79081 + int i;
79082 +
79083 + if (nr_wander_records == 0) {
79084 + warning("zam-631",
79085 + "number of wander records in the linked list"
79086 +				" is greater than the number stored in tx head.\n");
79087 + ret = RETERR(-EIO);
79088 + goto free_ow_set;
79089 + }
79090 +
79091 + log = reiser4_alloc_io_head(&log_rec_block);
79092 + if (log == NULL)
79093 + return RETERR(-ENOMEM);
79094 +
79095 + ret = jload(log);
79096 + if (ret < 0) {
79097 + reiser4_drop_io_head(log);
79098 + return ret;
79099 + }
79100 +
79101 + ret = check_wander_record(log);
79102 + if (ret) {
79103 + jrelse(log);
79104 + reiser4_drop_io_head(log);
79105 + return ret;
79106 + }
79107 +
79108 + header = (struct wander_record_header *)jdata(log);
79109 + log_rec_block = le64_to_cpu(get_unaligned(&header->next_block));
79110 +
79111 + entry = (struct wander_entry *)(header + 1);
79112 +
79113 + /* restore overwrite set from wander record content */
79114 + for (i = 0; i < wander_record_capacity(s); i++) {
79115 + reiser4_block_nr block;
79116 + jnode *node;
79117 +
79118 + block = le64_to_cpu(get_unaligned(&entry->wandered));
79119 + if (block == 0)
79120 + break;
79121 +
79122 + node = reiser4_alloc_io_head(&block);
79123 + if (node == NULL) {
79124 + ret = RETERR(-ENOMEM);
79125 + /*
79126 + * FIXME-VS:???
79127 + */
79128 + jrelse(log);
79129 + reiser4_drop_io_head(log);
79130 + goto free_ow_set;
79131 + }
79132 +
79133 + ret = jload(node);
79134 +
79135 + if (ret < 0) {
79136 + reiser4_drop_io_head(node);
79137 + /*
79138 + * FIXME-VS:???
79139 + */
79140 + jrelse(log);
79141 + reiser4_drop_io_head(log);
79142 + goto free_ow_set;
79143 + }
79144 +
79145 + block = le64_to_cpu(get_unaligned(&entry->original));
79146 +
79147 + assert("zam-603", block != 0);
79148 +
79149 + jnode_set_block(node, &block);
79150 +
79151 + list_add_tail(&node->capture_link, ch.overwrite_set);
79152 +
79153 + ++entry;
79154 + }
79155 +
79156 + jrelse(log);
79157 + reiser4_drop_io_head(log);
79158 +
79159 + --nr_wander_records;
79160 + }
79161 +
79162 + if (nr_wander_records != 0) {
79163 + warning("zam-632", "number of wander records in the linked list"
79164 +			" is less than the number stored in tx head.\n");
79165 + ret = RETERR(-EIO);
79166 + goto free_ow_set;
79167 + }
79168 +
79169 + { /* write wandered set in place */
79170 + write_jnode_list(ch.overwrite_set, NULL, NULL, 0);
79171 + ret = wait_on_jnode_list(ch.overwrite_set);
79172 +
79173 + if (ret) {
79174 + ret = RETERR(-EIO);
79175 + goto free_ow_set;
79176 + }
79177 + }
79178 +
79179 + ret = update_journal_footer(&ch, 0);
79180 +
79181 + free_ow_set:
79182 +
79183 + while (!list_empty(ch.overwrite_set)) {
79184 + jnode *cur = list_entry(ch.overwrite_set->next, jnode, capture_link);
79185 + list_del_init(&cur->capture_link);
79186 + jrelse(cur);
79187 + reiser4_drop_io_head(cur);
79188 + }
79189 +
79190 + list_del_init(&tx_head->capture_link);
79191 +
79192 + done_commit_handle(&ch);
79193 +
79194 + return ret;
79195 +}
79196 +
79197 +/* Find the oldest committed but not yet replayed transaction and replay it.
79198 + * The transaction was committed and the journal header block was updated, but
79199 + * writing the atom's overwrite set in place and updating the journal footer
79200 + * block were not completed. This function completes the process by recovering
79201 + * the atom's overwrite set from its wandered locations, writing it in place
79202 + * and updating the journal footer. */
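+/* On-disk picture assumed by the replay code (a sketch; see the structure
+   definitions in wander.h):
+
+     journal header                               journal footer
+      .last_committed_tx                           .last_flushed_tx
+            |                                             |
+            v                                             v
+     tx head N --prev_tx--> tx head N-1 --prev_tx--> ... tx head F (flushed)
+        |
+     next_block
+        v
+     wander record 1 --next_block--> wander record 2 --> ... (total - 1 records)
+*/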
79203 +static int replay_oldest_transaction(struct super_block *s)
79204 +{
79205 + reiser4_super_info_data *sbinfo = get_super_private(s);
79206 + jnode *jf = sbinfo->journal_footer;
79207 + unsigned int total;
79208 + struct journal_footer *F;
79209 + struct tx_header *T;
79210 +
79211 + reiser4_block_nr prev_tx;
79212 + reiser4_block_nr last_flushed_tx;
79213 + reiser4_block_nr log_rec_block = 0;
79214 +
79215 + jnode *tx_head;
79216 +
79217 + int ret;
79218 +
79219 + if ((ret = jload(jf)) < 0)
79220 + return ret;
79221 +
79222 + F = (struct journal_footer *)jdata(jf);
79223 +
79224 + last_flushed_tx = le64_to_cpu(get_unaligned(&F->last_flushed_tx));
79225 +
79226 + jrelse(jf);
79227 +
79228 + if (sbinfo->last_committed_tx == last_flushed_tx) {
79229 + /* all transactions are replayed */
79230 + return 0;
79231 + }
79232 +
79233 + prev_tx = sbinfo->last_committed_tx;
79234 +
79235 + /* searching for oldest not flushed transaction */
79236 + while (1) {
79237 + tx_head = reiser4_alloc_io_head(&prev_tx);
79238 + if (!tx_head)
79239 + return RETERR(-ENOMEM);
79240 +
79241 + ret = jload(tx_head);
79242 + if (ret < 0) {
79243 + reiser4_drop_io_head(tx_head);
79244 + return ret;
79245 + }
79246 +
79247 + ret = check_tx_head(tx_head);
79248 + if (ret) {
79249 + jrelse(tx_head);
79250 + reiser4_drop_io_head(tx_head);
79251 + return ret;
79252 + }
79253 +
79254 + T = (struct tx_header *)jdata(tx_head);
79255 +
79256 + prev_tx = le64_to_cpu(get_unaligned(&T->prev_tx));
79257 +
79258 + if (prev_tx == last_flushed_tx)
79259 + break;
79260 +
79261 + jrelse(tx_head);
79262 + reiser4_drop_io_head(tx_head);
79263 + }
79264 +
79265 + total = le32_to_cpu(get_unaligned(&T->total));
79266 + log_rec_block = le64_to_cpu(get_unaligned(&T->next_block));
79267 +
79268 + pin_jnode_data(tx_head);
79269 + jrelse(tx_head);
79270 +
79271 + ret =
79272 + replay_transaction(s, tx_head, &log_rec_block,
79273 + jnode_get_block(tx_head), total - 1);
79274 +
79275 + unpin_jnode_data(tx_head);
79276 + reiser4_drop_io_head(tx_head);
79277 +
79278 + if (ret)
79279 + return ret;
79280 + return -E_REPEAT;
79281 +}
79282 +
79283 +/* The current implementation of the reiser4 journal is optimized not to
79284 +   capture the super block when only certain super block fields are modified.
79285 +   Currently, that set is (<free block count>, <OID allocator>). These fields
79286 +   are logged in a special way which includes storing them in each transaction
79287 +   head block at atom commit time and writing that information to the journal
79288 +   footer block at atom flush time. To get the info from the journal footer
79289 +   block into the in-memory super block there is a special function,
79290 +   reiser4_journal_recover_sb_data(), which should be called after the disk
79291 +   format plugin re-reads the super block after journal replaying.
79292 +*/
79293 +
79294 +/* get the information from the journal footer into the in-memory super block */
79295 +int reiser4_journal_recover_sb_data(struct super_block *s)
79296 +{
79297 + reiser4_super_info_data *sbinfo = get_super_private(s);
79298 + struct journal_footer *jf;
79299 + int ret;
79300 +
79301 + assert("zam-673", sbinfo->journal_footer != NULL);
79302 +
79303 + ret = jload(sbinfo->journal_footer);
79304 + if (ret != 0)
79305 + return ret;
79306 +
79307 + ret = check_journal_footer(sbinfo->journal_footer);
79308 + if (ret != 0)
79309 + goto out;
79310 +
79311 + jf = (struct journal_footer *)jdata(sbinfo->journal_footer);
79312 +
79313 + /* was there at least one flushed transaction? */
79314 + if (jf->last_flushed_tx) {
79315 +
79316 + /* restore free block counter logged in this transaction */
79317 + reiser4_set_free_blocks(s, le64_to_cpu(get_unaligned(&jf->free_blocks)));
79318 +
79319 + /* restore oid allocator state */
79320 + oid_init_allocator(s,
79321 + le64_to_cpu(get_unaligned(&jf->nr_files)),
79322 + le64_to_cpu(get_unaligned(&jf->next_oid)));
79323 + }
79324 + out:
79325 + jrelse(sbinfo->journal_footer);
79326 + return ret;
79327 +}
79328 +
79329 +/* reiser4 journal replay procedure */
79330 +int reiser4_journal_replay(struct super_block *s)
79331 +{
79332 + reiser4_super_info_data *sbinfo = get_super_private(s);
79333 + jnode *jh, *jf;
79334 + struct journal_header *header;
79335 + int nr_tx_replayed = 0;
79336 + int ret;
79337 +
79338 + assert("zam-582", sbinfo != NULL);
79339 +
79340 + jh = sbinfo->journal_header;
79341 + jf = sbinfo->journal_footer;
79342 +
79343 + if (!jh || !jf) {
79344 + /* it is possible that disk layout does not support journal
79345 + structures, we just warn about this */
79346 + warning("zam-583",
79347 + "journal control blocks were not loaded by disk layout plugin. "
79348 + "journal replaying is not possible.\n");
79349 + return 0;
79350 + }
79351 +
79352 +	/* Take the free block count from the journal footer block. The free
79353 +	   block counter value corresponds to the last flushed transaction state */
79354 + ret = jload(jf);
79355 + if (ret < 0)
79356 + return ret;
79357 +
79358 + ret = check_journal_footer(jf);
79359 + if (ret) {
79360 + jrelse(jf);
79361 + return ret;
79362 + }
79363 +
79364 + jrelse(jf);
79365 +
79366 + /* store last committed transaction info in reiser4 in-memory super
79367 + block */
79368 + ret = jload(jh);
79369 + if (ret < 0)
79370 + return ret;
79371 +
79372 + ret = check_journal_header(jh);
79373 + if (ret) {
79374 + jrelse(jh);
79375 + return ret;
79376 + }
79377 +
79378 + header = (struct journal_header *)jdata(jh);
79379 + sbinfo->last_committed_tx = le64_to_cpu(get_unaligned(&header->last_committed_tx));
79380 +
79381 + jrelse(jh);
79382 +
79383 + /* replay committed transactions */
79384 + while ((ret = replay_oldest_transaction(s)) == -E_REPEAT)
79385 + nr_tx_replayed++;
79386 +
79387 + return ret;
79388 +}
79389 +
79390 +/* load journal control block (either journal header or journal footer block) */
79391 +static int
79392 +load_journal_control_block(jnode ** node, const reiser4_block_nr * block)
79393 +{
79394 + int ret;
79395 +
79396 + *node = reiser4_alloc_io_head(block);
79397 + if (!(*node))
79398 + return RETERR(-ENOMEM);
79399 +
79400 + ret = jload(*node);
79401 +
79402 + if (ret) {
79403 + reiser4_drop_io_head(*node);
79404 + *node = NULL;
79405 + return ret;
79406 + }
79407 +
79408 + pin_jnode_data(*node);
79409 + jrelse(*node);
79410 +
79411 + return 0;
79412 +}
79413 +
79414 +/* unload journal header or footer and free jnode */
79415 +static void unload_journal_control_block(jnode ** node)
79416 +{
79417 + if (*node) {
79418 + unpin_jnode_data(*node);
79419 + reiser4_drop_io_head(*node);
79420 + *node = NULL;
79421 + }
79422 +}
79423 +
79424 +/* release journal control blocks */
79425 +void reiser4_done_journal_info(struct super_block *s)
79426 +{
79427 + reiser4_super_info_data *sbinfo = get_super_private(s);
79428 +
79429 + assert("zam-476", sbinfo != NULL);
79430 +
79431 + unload_journal_control_block(&sbinfo->journal_header);
79432 + unload_journal_control_block(&sbinfo->journal_footer);
79433 + rcu_barrier();
79434 +}
79435 +
79436 +/* load journal control blocks */
79437 +int reiser4_init_journal_info(struct super_block *s)
79438 +{
79439 + reiser4_super_info_data *sbinfo = get_super_private(s);
79440 + journal_location *loc;
79441 + int ret;
79442 +
79443 + loc = &sbinfo->jloc;
79444 +
79445 + assert("zam-651", loc != NULL);
79446 + assert("zam-652", loc->header != 0);
79447 + assert("zam-653", loc->footer != 0);
79448 +
79449 + ret = load_journal_control_block(&sbinfo->journal_header, &loc->header);
79450 +
79451 + if (ret)
79452 + return ret;
79453 +
79454 + ret = load_journal_control_block(&sbinfo->journal_footer, &loc->footer);
79455 +
79456 + if (ret) {
79457 + unload_journal_control_block(&sbinfo->journal_header);
79458 + }
79459 +
79460 + return ret;
79461 +}
79462 +
79463 +/* Make Linus happy.
79464 + Local variables:
79465 + c-indentation-style: "K&R"
79466 + mode-name: "LC"
79467 + c-basic-offset: 8
79468 + tab-width: 8
79469 + fill-column: 80
79470 + End:
79471 +*/
79472 diff --git a/fs/reiser4/wander.h b/fs/reiser4/wander.h
79473 new file mode 100644
79474 index 0000000..8746710
79475 --- /dev/null
79476 +++ b/fs/reiser4/wander.h
79477 @@ -0,0 +1,135 @@
79478 +/* Copyright 2002, 2003 by Hans Reiser, licensing governed by reiser4/README */
79479 +
79480 +#if !defined (__FS_REISER4_WANDER_H__)
79481 +#define __FS_REISER4_WANDER_H__
79482 +
79483 +#include "dformat.h"
79484 +
79485 +#include <linux/fs.h> /* for struct super_block */
79486 +
79487 +/* REISER4 JOURNAL ON-DISK DATA STRUCTURES */
79488 +
79489 +#define TX_HEADER_MAGIC "TxMagic4"
79490 +#define WANDER_RECORD_MAGIC "LogMagc4"
79491 +
79492 +#define TX_HEADER_MAGIC_SIZE (8)
79493 +#define WANDER_RECORD_MAGIC_SIZE (8)
79494 +
79495 +/* journal header block format */
79496 +struct journal_header {
79497 + /* last written transaction head location */
79498 + d64 last_committed_tx;
79499 +};
79500 +
79501 +typedef struct journal_location {
79502 + reiser4_block_nr footer;
79503 + reiser4_block_nr header;
79504 +} journal_location;
79505 +
79506 +/* The wander.c head comment describes the usage and semantics of all these structures */
79507 +/* journal footer block format */
79508 +struct journal_footer {
79509 + /* last flushed transaction location. */
79510 +	/* This block number is no longer valid after the transaction it points
79511 +	   to gets flushed; it is used only at journal replaying time for
79512 +	   detecting the end of the on-disk list of committed transactions
79513 +	   which were not flushed completely */
79514 + d64 last_flushed_tx;
79515 +
79516 +	/* the free block counter is written to the journal footer at transaction
79517 +	   flush time, not to the super block, because the free block counter is
79518 +	   logged differently than super block fields (the root pointer, for
79519 +	   example). */
79520 + d64 free_blocks;
79521 +
79522 + /* number of used OIDs and maximal used OID are logged separately from
79523 + super block */
79524 + d64 nr_files;
79525 + d64 next_oid;
79526 +};
79527 +
79528 +/* Each wander record (except the first one) has a unified format: a wander
79529 +   record header followed by an array of log entries */
79530 +struct wander_record_header {
79531 + /* when there is no predefined location for wander records, this magic
79532 + string should help reiser4fsck. */
79533 + char magic[WANDER_RECORD_MAGIC_SIZE];
79534 +
79535 + /* transaction id */
79536 + d64 id;
79537 +
79538 + /* total number of wander records in current transaction */
79539 + d32 total;
79540 +
79541 + /* this block number in transaction */
79542 + d32 serial;
79543 +
79544 +	/* block number of the next wander record in the transaction */
79545 + d64 next_block;
79546 +};
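+
+/* A wander record occupies one fs block: the header above followed by as many
+   wander_entry pairs as fit. A hypothetical sketch of the capacity bound
+   (wander_record_capacity() in wander.c is the authoritative version):
+
+     capacity = (blocksize - sizeof(struct wander_record_header))
+                / sizeof(struct wander_entry);
+
+   Unused tail entries are left zero-filled. */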
79547 +
79548 +/* The first wander record (transaction head) of a written transaction has a
79549 +   special format */
79550 +struct tx_header {
79551 + /* magic string makes first block in transaction different from other
79552 + logged blocks, it should help fsck. */
79553 + char magic[TX_HEADER_MAGIC_SIZE];
79554 +
79555 + /* transaction id */
79556 + d64 id;
79557 +
79558 + /* total number of records (including this first tx head) in the
79559 + transaction */
79560 + d32 total;
79561 +
79562 +	/* align next field to 8-byte boundary; this field is always zero */
79563 + d32 padding;
79564 +
79565 + /* block number of previous transaction head */
79566 + d64 prev_tx;
79567 +
79568 + /* next wander record location */
79569 + d64 next_block;
79570 +
79571 + /* committed versions of free blocks counter */
79572 + d64 free_blocks;
79573 +
79574 + /* number of used OIDs (nr_files) and maximal used OID are logged
79575 + separately from super block */
79576 + d64 nr_files;
79577 + d64 next_oid;
79578 +};
79579 +
79580 +/* A transaction gets written to disk as a set of wander records (each wander
79581 +   record is one fs block in size) */
79582 +
79583 +/* As noted above, the rest of a wander record is filled with these log
79584 +   entries; unused space is filled with zeroes */
79585 +struct wander_entry {
79586 + d64 original; /* block original location */
79587 + d64 wandered; /* block wandered location */
79588 +};
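+
+/* An illustrative, compiled-out sketch (not part of the original interface)
+   of how replay walks the entries of one loaded wander record; the real code
+   lives in replay_transaction() in wander.c: */
+#if 0
+static void walk_wander_record_example(const char *data, unsigned capacity)
+{
+	const struct wander_record_header *h =
+		(const struct wander_record_header *)data;
+	const struct wander_entry *e = (const struct wander_entry *)(h + 1);
+	unsigned i;
+
+	for (i = 0; i < capacity; i++, e++) {
+		/* a zero "wandered" field marks the end of the used entries */
+		if (le64_to_cpu(get_unaligned(&e->wandered)) == 0)
+			break;
+		/* the block read from e->wandered must be written back to
+		   the location in e->original */
+	}
+}
+#endif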
79589 +
79590 +/* REISER4 JOURNAL WRITER FUNCTIONS */
79591 +
79592 +extern int reiser4_write_logs(long *);
79593 +extern int reiser4_journal_replay(struct super_block *);
79594 +extern int reiser4_journal_recover_sb_data(struct super_block *);
79595 +
79596 +extern int reiser4_init_journal_info(struct super_block *);
79597 +extern void reiser4_done_journal_info(struct super_block *);
79598 +
79599 +extern int write_jnode_list(struct list_head *, flush_queue_t *, long *, int);
79600 +
79601 +#endif /* __FS_REISER4_WANDER_H__ */
79602 +
79603 +/* Make Linus happy.
79604 + Local variables:
79605 + c-indentation-style: "K&R"
79606 + mode-name: "LC"
79607 + c-basic-offset: 8
79608 + tab-width: 8
79609 + fill-column: 80
79610 + scroll-step: 1
79611 + End:
79612 +*/
79613 diff --git a/fs/reiser4/writeout.h b/fs/reiser4/writeout.h
79614 new file mode 100644
79615 index 0000000..446b63b
79616 --- /dev/null
79617 +++ b/fs/reiser4/writeout.h
79618 @@ -0,0 +1,21 @@
79619 +/* Copyright 2002, 2003, 2004 by Hans Reiser, licensing governed by reiser4/README */
79620 +
79621 +#if !defined (__FS_REISER4_WRITEOUT_H__)
+#define __FS_REISER4_WRITEOUT_H__
79622 +
79623 +#define WRITEOUT_SINGLE_STREAM (0x1)
79624 +#define WRITEOUT_FOR_PAGE_RECLAIM (0x2)
79625 +#define WRITEOUT_BARRIER (0x4)
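+
+/* These flags are OR-ed together and passed down the writeout path (see
+   write_jnodes_to_disk_extent() in wander.c): WRITEOUT_BARRIER selects a
+   barrier write for the bio, and WRITEOUT_FOR_PAGE_RECLAIM decides whether
+   written pages get the PG_reclaim treatment. */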
79626 +
79627 +extern int reiser4_get_writeout_flags(void);
79628 +
79629 +#endif /* __FS_REISER4_WRITEOUT_H__ */
79630 +
79631 +/* Make Linus happy.
79632 + Local variables:
79633 + c-indentation-style: "K&R"
79634 + mode-name: "LC"
79635 + c-basic-offset: 8
79636 + tab-width: 8
79637 + fill-column: 80
79638 + End:
79639 +*/
79640 diff --git a/fs/reiser4/znode.c b/fs/reiser4/znode.c
79641 new file mode 100644
79642 index 0000000..b695111
79643 --- /dev/null
79644 +++ b/fs/reiser4/znode.c
79645 @@ -0,0 +1,1029 @@
79646 +/* Copyright 2001, 2002, 2003 by Hans Reiser, licensing governed by
79647 + * reiser4/README */
79648 +/* Znode manipulation functions. */
79649 +/* Znode is the in-memory header for a tree node. It is stored
79650 + separately from the node itself so that it does not get written to
79651 + disk. In this respect znode is like buffer head or page head. We
79652 + also use znodes for additional reiser4 specific purposes:
79653 +
79654 + . they are organized into tree structure which is a part of whole
79655 + reiser4 tree.
79656 + . they are used to implement node grained locking
79657 + . they are used to keep additional state associated with a
79658 + node
79659 + . they contain links to lists used by the transaction manager
79660 +
79661 + Znode is attached to some variable "block number" which is instance of
79662 + fs/reiser4/tree.h:reiser4_block_nr type. Znode can exist without
79663 + appropriate node being actually loaded in memory. Existence of znode itself
79664 + is regulated by reference count (->x_count) in it. Each time thread
79665 + acquires reference to znode through call to zget(), ->x_count is
79666 + incremented and decremented on call to zput(). Data (content of node) are
79667 + brought in memory through call to zload(), which also increments ->d_count
79668 + reference counter. zload can block waiting on IO. Call to zrelse()
79669 + decreases this counter. Also, ->c_count keeps track of number of child
79670 + znodes and prevents parent znode from being recycled until all of its
79671 + children are. ->c_count is decremented whenever child goes out of existence
79672 + (being actually recycled in zdestroy()) which can be some time after last
79673 + reference to this child dies if we support some form of LRU cache for
79674 + znodes.
79675 +
79676 +*/
79677 +/* EVERY ZNODE'S STORY
79678 +
79679 + 1. His infancy.
79680 +
79681 + Once upon a time, the znode was born deep inside of zget() by call to
79682 + zalloc(). At the return from zget() znode had:
79683 +
79684 + . reference counter (x_count) of 1
79685 + . assigned block number, marked as used in bitmap
79686 + . pointer to parent znode. Root znode parent pointer points
79687 + to its father: "fake" znode. This, in turn, has NULL parent pointer.
79688 + . hash table linkage
79689 + . no data loaded from disk
79690 + . no node plugin
79691 + . no sibling linkage
79692 +
79693 + 2. His childhood
79694 +
79695 + Each node is either brought into memory as a result of tree traversal, or
79696 + created afresh, creation of the root being a special case of the latter. In
79697 + either case it's inserted into sibling list. This will typically require
79698 + some ancillary tree traversing, but ultimately both sibling pointers will
79699 + exist and JNODE_LEFT_CONNECTED and JNODE_RIGHT_CONNECTED will be true in
79700 + zjnode.state.
79701 +
79702 + 3. His youth.
79703 +
79704 + If znode is bound to already existing node in a tree, its content is read
79705 + from the disk by call to zload(). At that moment, JNODE_LOADED bit is set
79706 + in zjnode.state and zdata() function starts to return non null for this
79707 + znode. zload() further calls zparse() that determines which node layout
79708 + this node is rendered in, and sets ->nplug on success.
79709 +
79710 + If znode is for new node just created, memory for it is allocated and
79711 + zinit_new() function is called to initialise data, according to selected
79712 + node layout.
79713 +
79714 + 4. His maturity.
79715 +
79716 + After this point, znode lingers in memory for some time. Threads can
79717 + acquire references to znode either by blocknr through call to zget(), or by
79718 + following a pointer to unallocated znode from internal item. Each time
79719 + reference to znode is obtained, x_count is increased. Thread can read/write
79720 + lock znode. Znode data can be loaded through calls to zload(), d_count will
79721 + be increased appropriately. If all references to znode are released
79722 + (x_count drops to 0), znode is not recycled immediately. Rather, it is
79723 + still cached in the hash table in the hope that it will be accessed
79724 + shortly.
79725 +
79726 + There are two ways in which znode existence can be terminated:
79727 +
79728 + . sudden death: node bound to this znode is removed from the tree
79729 + . overpopulation: znode is purged out of memory due to memory pressure
79730 +
79731 + 5. His death.
79732 +
79733 +   Death is a complex process.
79734 +
79735 + When we irrevocably commit ourselves to decision to remove node from the
79736 + tree, JNODE_HEARD_BANSHEE bit is set in zjnode.state of corresponding
79737 + znode. This is done either in ->kill_hook() of internal item or in
79738 + reiser4_kill_root() function when tree root is removed.
79739 +
79740 + At this moment znode still has:
79741 +
79742 +   . locks held on it, necessarily including write ones
79743 + . references to it
79744 + . disk block assigned to it
79745 + . data loaded from the disk
79746 + . pending requests for lock
79747 +
79748 + But once JNODE_HEARD_BANSHEE bit set, last call to unlock_znode() does node
79749 + deletion. Node deletion includes two phases. First all ways to get
79750 + references to that znode (sibling and parent links and hash lookup using
79751 + block number stored in parent node) should be deleted -- it is done through
79752 + sibling_list_remove(), also we assume that nobody uses down link from
79753 + parent node due to its nonexistence or proper parent node locking and
79754 + nobody uses parent pointers from children due to absence of them. Second we
79755 + invalidate all pending lock requests which still are on znode's lock
79756 + request queue, this is done by reiser4_invalidate_lock(). Another
79757 + JNODE_IS_DYING znode status bit is used to invalidate pending lock requests.
79758 + Once it set all requesters are forced to return -EINVAL from
79759 + longterm_lock_znode(). Future locking attempts are not possible because all
79760 + ways to get references to that znode are removed already. Last, node is
79761 + uncaptured from transaction.
79762 +
79763 + When last reference to the dying znode is just about to be released,
79764 + block number for this lock is released and znode is removed from the
79765 + hash table.
79766 +
79767 + Now znode can be recycled.
79768 +
79769 + [it's possible to free bitmap block and remove znode from the hash
79770 + table when last lock is released. This will result in having
79771 + referenced but completely orphaned znode]
79772 +
79773 + 6. Limbo
79774 +
79775 +   As has been mentioned above, znodes with reference counter 0 are
79776 + still cached in a hash table. Once memory pressure increases they are
79777 + purged out of there [this requires something like LRU list for
79778 + efficient implementation. LRU list would also greatly simplify
79779 + implementation of coord cache that would in this case morph to just
79780 + scanning some initial segment of LRU list]. Data loaded into
79781 + unreferenced znode are flushed back to the durable storage if
79782 + necessary and memory is freed. Znodes themselves can be recycled at
79783 + this point too.
79784 +
79785 +*/
79786 +
79787 +#include "debug.h"
79788 +#include "dformat.h"
79789 +#include "key.h"
79790 +#include "coord.h"
79791 +#include "plugin/plugin_header.h"
79792 +#include "plugin/node/node.h"
79793 +#include "plugin/plugin.h"
79794 +#include "txnmgr.h"
79795 +#include "jnode.h"
79796 +#include "znode.h"
79797 +#include "block_alloc.h"
79798 +#include "tree.h"
79799 +#include "tree_walk.h"
79800 +#include "super.h"
79801 +#include "reiser4.h"
79802 +
79803 +#include <linux/pagemap.h>
79804 +#include <linux/spinlock.h>
79805 +#include <linux/slab.h>
79806 +#include <linux/err.h>
79807 +
79808 +static z_hash_table *get_htable(reiser4_tree *,
79809 + const reiser4_block_nr * const blocknr);
79810 +static z_hash_table *znode_get_htable(const znode *);
79811 +static void zdrop(znode *);
79812 +
79813 +/* hash table support */
79814 +
79815 +/* compare two block numbers for equality. Used by hash-table macros */
79816 +static inline int
79817 +blknreq(const reiser4_block_nr * b1, const reiser4_block_nr * b2)
79818 +{
79819 + assert("nikita-534", b1 != NULL);
79820 + assert("nikita-535", b2 != NULL);
79821 +
79822 + return *b1 == *b2;
79823 +}
79824 +
79825 +/* Hash znode by block number. Used by hash-table macros */
79826 +/* Audited by: umka (2002.06.11) */
79827 +static inline __u32
79828 +blknrhashfn(z_hash_table * table, const reiser4_block_nr * b)
79829 +{
79830 + assert("nikita-536", b != NULL);
79831 +
79832 + return *b & (REISER4_ZNODE_HASH_TABLE_SIZE - 1);
79833 +}
79834 +
79835 +/* The hash table definition */
79836 +#define KMALLOC(size) kmalloc((size), reiser4_ctx_gfp_mask_get())
79837 +#define KFREE(ptr, size) kfree(ptr)
79838 +TYPE_SAFE_HASH_DEFINE(z, znode, reiser4_block_nr, zjnode.key.z, zjnode.link.z,
79839 + blknrhashfn, blknreq);
79840 +#undef KFREE
79841 +#undef KMALLOC
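+
+/* TYPE_SAFE_HASH_DEFINE above generates the z_hash_* helpers used in this
+   file (z_hash_init(), z_hash_done(), z_hash_remove_rcu(),
+   for_all_in_htable(), ...), hashing znodes by block number via
+   blknrhashfn() and comparing keys with blknreq(). */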
79842 +
79843 +/* slab for znodes */
79844 +static struct kmem_cache *znode_cache;
79845 +
79846 +int znode_shift_order;
79847 +
79848 +/**
79849 + * init_znodes - create znode cache
79850 + *
79851 + * Initializes slab cache of znodes. It is part of reiser4 module initialization.
79852 + */
79853 +int init_znodes(void)
79854 +{
79855 + znode_cache = kmem_cache_create("znode", sizeof(znode), 0,
79856 + SLAB_HWCACHE_ALIGN |
79857 + SLAB_RECLAIM_ACCOUNT, NULL, NULL);
79858 + if (znode_cache == NULL)
79859 + return RETERR(-ENOMEM);
79860 +
79861 + for (znode_shift_order = 0; (1 << znode_shift_order) < sizeof(znode);
79862 + ++znode_shift_order);
79863 + --znode_shift_order;
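+	/* znode_shift_order is now the largest order such that
+	 * (1 << znode_shift_order) < sizeof(znode); e.g. (hypothetically)
+	 * a 400-byte znode would yield znode_shift_order == 8 */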
79864 + return 0;
79865 +}
79866 +
79867 +/**
79868 + * done_znodes - delete znode cache
79869 + *
79870 + * This is called on reiser4 module unloading or system shutdown.
79871 + */
79872 +void done_znodes(void)
79873 +{
79874 + destroy_reiser4_cache(&znode_cache);
79875 +}
79876 +
79877 +/* call this to initialise tree of znodes */
79878 +int znodes_tree_init(reiser4_tree * tree /* tree to initialise znodes for */ )
79879 +{
79880 + int result;
79881 + assert("umka-050", tree != NULL);
79882 +
79883 + rwlock_init(&tree->dk_lock);
79884 +
79885 + result = z_hash_init(&tree->zhash_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79886 + if (result != 0)
79887 + return result;
79888 + result = z_hash_init(&tree->zfake_table, REISER4_ZNODE_HASH_TABLE_SIZE);
79889 + return result;
79890 +}
79891 +
79892 +/* free this znode */
79893 +void zfree(znode * node /* znode to free */ )
79894 +{
79895 + assert("nikita-465", node != NULL);
79896 + assert("nikita-2120", znode_page(node) == NULL);
79897 + assert("nikita-2301", list_empty_careful(&node->lock.owners));
79898 + assert("nikita-2302", list_empty_careful(&node->lock.requestors));
79899 + assert("nikita-2663", (list_empty_careful(&ZJNODE(node)->capture_link) &&
79900 + NODE_LIST(ZJNODE(node)) == NOT_CAPTURED));
79901 + assert("nikita-3220", list_empty(&ZJNODE(node)->jnodes));
79902 + assert("nikita-3293", !znode_is_right_connected(node));
79903 + assert("nikita-3294", !znode_is_left_connected(node));
79904 + assert("nikita-3295", node->left == NULL);
79905 + assert("nikita-3296", node->right == NULL);
79906 +
79907 + /* not yet phash_jnode_destroy(ZJNODE(node)); */
79908 +
79909 + kmem_cache_free(znode_cache, node);
79910 +}
79911 +
79912 +/* call this to free tree of znodes */
79913 +void znodes_tree_done(reiser4_tree * tree /* tree to finish with znodes of */ )
79914 +{
79915 + znode *node;
79916 + znode *next;
79917 + z_hash_table *ztable;
79918 +
79919 + /* scan znode hash-tables and kill all znodes, then free hash tables
79920 + * themselves. */
79921 +
79922 + assert("nikita-795", tree != NULL);
79923 +
79924 + ztable = &tree->zhash_table;
79925 +
79926 + if (ztable->_table != NULL) {
79927 + for_all_in_htable(ztable, z, node, next) {
79928 + node->c_count = 0;
79929 + node->in_parent.node = NULL;
79930 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79931 + zdrop(node);
79932 + }
79933 +
79934 + z_hash_done(&tree->zhash_table);
79935 + }
79936 +
79937 + ztable = &tree->zfake_table;
79938 +
79939 + if (ztable->_table != NULL) {
79940 + for_all_in_htable(ztable, z, node, next) {
79941 + node->c_count = 0;
79942 + node->in_parent.node = NULL;
79943 + assert("nikita-2179", atomic_read(&ZJNODE(node)->x_count) == 0);
79944 + zdrop(node);
79945 + }
79946 +
79947 + z_hash_done(&tree->zfake_table);
79948 + }
79949 +}
79950 +
79951 +/* ZNODE STRUCTURES */
79952 +
79953 +/* allocate fresh znode */
79954 +znode *zalloc(gfp_t gfp_flag /* allocation flag */ )
79955 +{
79956 + znode *node;
79957 +
79958 + node = kmem_cache_alloc(znode_cache, gfp_flag);
79959 + return node;
79960 +}
79961 +
79962 +/* Initialize fields of znode
79963 + @node: znode to initialize;
79964 + @parent: parent znode;
79965 + @tree: tree we are in. */
79966 +void zinit(znode * node, const znode * parent, reiser4_tree * tree)
79967 +{
79968 + assert("nikita-466", node != NULL);
79969 + assert("umka-268", current_tree != NULL);
79970 +
79971 + memset(node, 0, sizeof *node);
79972 +
79973 + assert("umka-051", tree != NULL);
79974 +
79975 + jnode_init(&node->zjnode, tree, JNODE_FORMATTED_BLOCK);
79976 + reiser4_init_lock(&node->lock);
79977 + init_parent_coord(&node->in_parent, parent);
79978 +}
79979 +
79980 +/*
79981 + * remove znode from indices. This is called by jput() when the last reference on the
79982 + * znode is released.
79983 + */
79984 +void znode_remove(znode * node /* znode to remove */ , reiser4_tree * tree)
79985 +{
79986 + assert("nikita-2108", node != NULL);
79987 + assert("nikita-470", node->c_count == 0);
79988 + assert_rw_write_locked(&(tree->tree_lock));
79989 +
79990 + /* remove reference to this znode from cbk cache */
79991 + cbk_cache_invalidate(node, tree);
79992 +
79993 + /* update c_count of parent */
79994 + if (znode_parent(node) != NULL) {
79995 + assert("nikita-472", znode_parent(node)->c_count > 0);
79996 + /* father, onto your hands I forward my spirit... */
79997 + znode_parent(node)->c_count--;
79998 + node->in_parent.node = NULL;
79999 + } else {
80000 + /* orphaned znode?! Root? */
80001 + }
80002 +
80003 + /* remove znode from hash-table */
80004 + z_hash_remove_rcu(znode_get_htable(node), node);
80005 +}
80006 +
80007 +/* zdrop() -- Remove znode from the tree.
80008 +
80009 +   This is called when the znode is removed from memory. */
80010 +static void zdrop(znode * node /* znode to finish with */ )
80011 +{
80012 + jdrop(ZJNODE(node));
80013 +}
80014 +
80015 +/*
80016 + * put znode into right place in the hash table. This is called by relocate
80017 + * code.
80018 + */
80019 +int znode_rehash(znode * node /* node to rehash */ ,
80020 + const reiser4_block_nr * new_block_nr /* new block number */ )
80021 +{
80022 + z_hash_table *oldtable;
80023 + z_hash_table *newtable;
80024 + reiser4_tree *tree;
80025 +
80026 + assert("nikita-2018", node != NULL);
80027 +
80028 + tree = znode_get_tree(node);
80029 + oldtable = znode_get_htable(node);
80030 + newtable = get_htable(tree, new_block_nr);
80031 +
80032 + write_lock_tree(tree);
80033 + /* remove znode from hash-table */
80034 + z_hash_remove_rcu(oldtable, node);
80035 +
80036 + /* assertion no longer valid due to RCU */
80037 + /* assert("nikita-2019", z_hash_find(newtable, new_block_nr) == NULL); */
80038 +
80039 + /* update blocknr */
80040 + znode_set_block(node, new_block_nr);
80041 + node->zjnode.key.z = *new_block_nr;
80042 +
80043 + /* insert it into hash */
80044 + z_hash_insert_rcu(newtable, node);
80045 + write_unlock_tree(tree);
80046 + return 0;
80047 +}
80048 +
80049 +/* ZNODE LOOKUP, GET, PUT */
80050 +
80051 +/* zlook() - get znode with given block_nr in a hash table or return NULL
80052 +
80053 + If result is non-NULL then the znode's x_count is incremented. Internal version
80054 +   accepts pre-computed hash index. The hash table is accessed under an RCU
80055 +   read lock.
80056 +*/
80057 +znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const blocknr)
80058 +{
80059 + znode *result;
80060 + __u32 hash;
80061 + z_hash_table *htable;
80062 +
80063 + assert("jmacd-506", tree != NULL);
80064 + assert("jmacd-507", blocknr != NULL);
80065 +
80066 + htable = get_htable(tree, blocknr);
80067 + hash = blknrhashfn(htable, blocknr);
80068 +
80069 + rcu_read_lock();
80070 + result = z_hash_find_index(htable, hash, blocknr);
80071 +
80072 + if (result != NULL) {
80073 + add_x_ref(ZJNODE(result));
80074 + result = znode_rip_check(tree, result);
80075 + }
80076 + rcu_read_unlock();
80077 +
80078 + return result;
80079 +}
80080 +
80081 +/* return hash table where znode with block @blocknr is (or should be)
80082 + * stored */
80083 +static z_hash_table *get_htable(reiser4_tree * tree,
80084 + const reiser4_block_nr * const blocknr)
80085 +{
80086 + z_hash_table *table;
80087 + if (is_disk_addr_unallocated(blocknr))
80088 + table = &tree->zfake_table;
80089 + else
80090 + table = &tree->zhash_table;
80091 + return table;
80092 +}
80093 +
80094 +/* return hash table where znode @node is (or should be) stored */
80095 +static z_hash_table *znode_get_htable(const znode * node)
80096 +{
80097 + return get_htable(znode_get_tree(node), znode_get_block(node));
80098 +}
80099 +
80100 +/* zget() - get znode from hash table, allocating it if necessary.
80101 +
80102 +   First the hash table is searched, as in zlook(), locating an x-referenced
80103 +   znode if one exists. If no znode is found, a new one is allocated,
80104 +   initialized and inserted. The result is returned with its x_count increased.
80105 +
80106 + LOCKS TAKEN: TREE_LOCK, ZNODE_LOCK
80107 + LOCK ORDERING: NONE
80108 +*/
80109 +znode *zget(reiser4_tree * tree,
80110 + const reiser4_block_nr * const blocknr,
80111 + znode * parent, tree_level level, gfp_t gfp_flag)
80112 +{
80113 + znode *result;
80114 + __u32 hashi;
80115 +
80116 + z_hash_table *zth;
80117 +
80118 + assert("jmacd-512", tree != NULL);
80119 + assert("jmacd-513", blocknr != NULL);
80120 + assert("jmacd-514", level < REISER4_MAX_ZTREE_HEIGHT);
80121 +
80122 + zth = get_htable(tree, blocknr);
80123 + hashi = blknrhashfn(zth, blocknr);
80124 +
80125 +	/* NOTE-NIKITA address-as-unallocated-blocknr is still not
80126 +	   implemented. */
80127 +
80128 + z_hash_prefetch_bucket(zth, hashi);
80129 +
80130 + rcu_read_lock();
80131 + /* Find a matching BLOCKNR in the hash table. If the znode is found,
80132 +      we obtain a reference (x_count) but the znode remains unlocked.
80133 + Have to worry about race conditions later. */
80134 + result = z_hash_find_index(zth, hashi, blocknr);
80135 + /* According to the current design, the hash table lock protects new
80136 + znode references. */
80137 + if (result != NULL) {
80138 + add_x_ref(ZJNODE(result));
80139 + /* NOTE-NIKITA it should be so, but special case during
80140 + creation of new root makes such assertion highly
80141 + complicated. */
80142 + assert("nikita-2131", 1 || znode_parent(result) == parent ||
80143 + (ZF_ISSET(result, JNODE_ORPHAN)
80144 + && (znode_parent(result) == NULL)));
80145 + result = znode_rip_check(tree, result);
80146 + }
80147 +
80148 + rcu_read_unlock();
80149 +
80150 + if (!result) {
80151 + znode *shadow;
80152 +
80153 + result = zalloc(gfp_flag);
80154 + if (!result) {
80155 + return ERR_PTR(RETERR(-ENOMEM));
80156 + }
80157 +
80158 + zinit(result, parent, tree);
80159 + ZJNODE(result)->blocknr = *blocknr;
80160 + ZJNODE(result)->key.z = *blocknr;
80161 + result->level = level;
80162 +
80163 + write_lock_tree(tree);
80164 +
80165 + shadow = z_hash_find_index(zth, hashi, blocknr);
80166 + if (unlikely(shadow != NULL && !ZF_ISSET(shadow, JNODE_RIP))) {
80167 + jnode_list_remove(ZJNODE(result));
80168 + zfree(result);
80169 + result = shadow;
80170 + } else {
80171 + result->version = znode_build_version(tree);
80172 + z_hash_insert_index_rcu(zth, hashi, result);
80173 +
80174 + if (parent != NULL)
80175 + ++parent->c_count;
80176 + }
80177 +
80178 + add_x_ref(ZJNODE(result));
80179 +
80180 + write_unlock_tree(tree);
80181 + }
80182 +#if REISER4_DEBUG
80183 + if (!reiser4_blocknr_is_fake(blocknr) && *blocknr != 0)
80184 + reiser4_check_block(blocknr, 1);
80185 +#endif
80186 + /* Check for invalid tree level, return -EIO */
80187 + if (unlikely(znode_get_level(result) != level)) {
80188 + warning("jmacd-504",
80189 + "Wrong level for cached block %llu: %i expecting %i",
80190 + (unsigned long long)(*blocknr), znode_get_level(result),
80191 + level);
80192 + zput(result);
80193 + return ERR_PTR(RETERR(-EIO));
80194 + }
80195 +
80196 + assert("nikita-1227", znode_invariant(result));
80197 +
80198 + return result;
80199 +}
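+
+/* Usage sketch (editorial, not in the original patch): a typical caller
+   pairs zget() with zload()/zrelse() for data access and zput() to drop
+   the reference; "use zdata(node)" stands for arbitrary caller code:
+
+	znode *node;
+	int ret;
+
+	node = zget(tree, blocknr, parent, level, reiser4_ctx_gfp_mask_get());
+	if (IS_ERR(node))
+		return PTR_ERR(node);
+	ret = zload(node);
+	if (ret == 0) {
+		... use zdata(node) ...
+		zrelse(node);
+	}
+	zput(node);
+	return ret;
+*/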
80200 +
80201 +/* ZNODE PLUGINS/DATA */
80202 +
80203 +/* "guess" plugin for node loaded from the disk. Plugin id of node plugin is
80204 + stored at the fixed offset from the beginning of the node. */
80205 +static node_plugin *znode_guess_plugin(const znode * node /* znode to guess
80206 + * plugin of */ )
80207 +{
80208 + reiser4_tree *tree;
80209 +
80210 + assert("nikita-1053", node != NULL);
80211 + assert("nikita-1055", zdata(node) != NULL);
80212 +
80213 + tree = znode_get_tree(node);
80214 + assert("umka-053", tree != NULL);
80215 +
80216 + if (reiser4_is_set(tree->super, REISER4_ONE_NODE_PLUGIN)) {
80217 + return tree->nplug;
80218 + } else {
80219 + return node_plugin_by_disk_id
80220 + (tree, &((common_node_header *) zdata(node))->plugin_id);
80221 +#ifdef GUESS_EXISTS
80222 + reiser4_plugin *plugin;
80223 +
80224 + /* NOTE-NIKITA add locking here when dynamic plugins will be
80225 + * implemented */
80226 + for_all_plugins(REISER4_NODE_PLUGIN_TYPE, plugin) {
80227 + if ((plugin->u.node.guess != NULL)
80228 + && plugin->u.node.guess(node))
80229 + return plugin;
80230 + }
80231 + warning("nikita-1057", "Cannot guess node plugin");
80232 + print_znode("node", node);
80233 + return NULL;
80234 +#endif
80235 + }
80236 +}
80237 +
80238 +/* parse node header and install ->node_plugin */
80239 +int zparse(znode * node /* znode to parse */ )
80240 +{
80241 + int result;
80242 +
80243 + assert("nikita-1233", node != NULL);
80244 + assert("nikita-2370", zdata(node) != NULL);
80245 +
80246 + if (node->nplug == NULL) {
80247 + node_plugin *nplug;
80248 +
80249 + nplug = znode_guess_plugin(node);
80250 + if (likely(nplug != NULL)) {
80251 + result = nplug->parse(node);
80252 + if (likely(result == 0))
80253 + node->nplug = nplug;
80254 + } else {
80255 + result = RETERR(-EIO);
80256 + }
80257 + } else
80258 + result = 0;
80259 + return result;
80260 +}
80261 +
80262 +/* zload with readahead */
80263 +int zload_ra(znode * node /* znode to load */ , ra_info_t * info)
80264 +{
80265 + int result;
80266 +
80267 + assert("nikita-484", node != NULL);
80268 + assert("nikita-1377", znode_invariant(node));
80269 + assert("jmacd-7771", !znode_above_root(node));
80270 + assert("nikita-2125", atomic_read(&ZJNODE(node)->x_count) > 0);
80271 + assert("nikita-3016", reiser4_schedulable());
80272 +
80273 + if (info)
80274 + formatted_readahead(node, info);
80275 +
80276 + result = jload(ZJNODE(node));
80277 + assert("nikita-1378", znode_invariant(node));
80278 + return result;
80279 +}
80280 +
80281 +/* load content of node into memory */
80282 +int zload(znode * node)
80283 +{
80284 + return zload_ra(node, NULL);
80285 +}
80286 +
80287 +/* call node plugin to initialise newly allocated node. */
80288 +int zinit_new(znode * node /* znode to initialise */ , gfp_t gfp_flags)
80289 +{
80290 + return jinit_new(ZJNODE(node), gfp_flags);
80291 +}
80292 +
80293 +/* drop reference to node data. When last reference is dropped, data are
80294 + unloaded. */
80295 +void zrelse(znode * node /* znode to release references to */ )
80296 +{
80297 + assert("nikita-1381", znode_invariant(node));
80298 +
80299 + jrelse(ZJNODE(node));
80300 +}
80301 +
80302 +/* returns free space in node */
80303 +unsigned znode_free_space(znode * node /* znode to query */ )
80304 +{
80305 + assert("nikita-852", node != NULL);
80306 + return node_plugin_by_node(node)->free_space(node);
80307 +}
80308 +
80309 +/* right delimiting key of znode */
80310 +reiser4_key *znode_get_rd_key(znode * node /* znode to query */ )
80311 +{
80312 + assert("nikita-958", node != NULL);
80313 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
80314 + assert("nikita-3067", LOCK_CNT_GTZ(rw_locked_dk));
80315 + assert("nikita-30671", node->rd_key_version != 0);
80316 + return &node->rd_key;
80317 +}
80318 +
80319 +/* left delimiting key of znode */
80320 +reiser4_key *znode_get_ld_key(znode * node /* znode to query */ )
80321 +{
80322 + assert("nikita-974", node != NULL);
80323 + assert_rw_locked(&(znode_get_tree(node)->dk_lock));
80324 + assert("nikita-3068", LOCK_CNT_GTZ(rw_locked_dk));
80325 + assert("nikita-30681", node->ld_key_version != 0);
80326 + return &node->ld_key;
80327 +}
80328 +
80329 +ON_DEBUG(atomic_t delim_key_version = ATOMIC_INIT(0);
80330 + )
80331 +
80332 +/* update right-delimiting key of @node */
80333 +reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key)
80334 +{
80335 + assert("nikita-2937", node != NULL);
80336 + assert("nikita-2939", key != NULL);
80337 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
80338 + assert("nikita-3069", LOCK_CNT_GTZ(write_locked_dk));
80339 + assert("nikita-2944",
80340 + znode_is_any_locked(node) ||
80341 + znode_get_level(node) != LEAF_LEVEL ||
80342 + keyge(key, &node->rd_key) ||
80343 + keyeq(&node->rd_key, reiser4_min_key()) ||
80344 + ZF_ISSET(node, JNODE_HEARD_BANSHEE));
80345 +
80346 + node->rd_key = *key;
80347 + ON_DEBUG(node->rd_key_version = atomic_inc_return(&delim_key_version));
80348 + return &node->rd_key;
80349 +}
80350 +
80351 +/* update left-delimiting key of @node */
80352 +reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key)
80353 +{
80354 + assert("nikita-2940", node != NULL);
80355 + assert("nikita-2941", key != NULL);
80356 + assert_rw_write_locked(&(znode_get_tree(node)->dk_lock));
80357 + assert("nikita-3070", LOCK_CNT_GTZ(write_locked_dk));
80358 + assert("nikita-2943",
80359 + znode_is_any_locked(node) || keyeq(&node->ld_key,
80360 + reiser4_min_key()));
80361 +
80362 + node->ld_key = *key;
80363 + ON_DEBUG(node->ld_key_version = atomic_inc_return(&delim_key_version));
80364 + return &node->ld_key;
80365 +}
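+
+/* Usage sketch (editorial): delimiting keys are updated with the tree's
+   dk_lock held for writing. Assuming write_lock_dk()/write_unlock_dk()
+   wrappers (only the read-side pair appears in this file, so these names
+   are an assumption), an update looks like:
+
+	write_lock_dk(znode_get_tree(node));
+	znode_set_ld_key(node, &new_ld);
+	znode_set_rd_key(node, &new_rd);
+	write_unlock_dk(znode_get_tree(node));
+*/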
80366 +
80367 +/* true if @key is inside key range for @node */
80368 +int znode_contains_key(znode * node /* znode to look in */ ,
80369 + const reiser4_key * key /* key to look for */ )
80370 +{
80371 + assert("nikita-1237", node != NULL);
80372 + assert("nikita-1238", key != NULL);
80373 +
80374 + /* left_delimiting_key <= key <= right_delimiting_key */
80375 + return keyle(znode_get_ld_key(node), key)
80376 + && keyle(key, znode_get_rd_key(node));
80377 +}
80378 +
80379 +/* same as znode_contains_key(), but lock dk lock */
80380 +int znode_contains_key_lock(znode * node /* znode to look in */ ,
80381 + const reiser4_key * key /* key to look for */ )
80382 +{
80383 + int result;
80384 +
80385 + assert("umka-056", node != NULL);
80386 + assert("umka-057", key != NULL);
80387 +
80388 + read_lock_dk(znode_get_tree(node));
80389 + result = znode_contains_key(node, key);
80390 + read_unlock_dk(znode_get_tree(node));
80391 + return result;
80392 +}
80393 +
80394 +/* get parent pointer, assuming tree is not locked */
80395 +znode *znode_parent_nolock(const znode * node /* child znode */ )
80396 +{
80397 + assert("nikita-1444", node != NULL);
80398 + return node->in_parent.node;
80399 +}
80400 +
80401 +/* get parent pointer of znode */
80402 +znode *znode_parent(const znode * node /* child znode */ )
80403 +{
80404 + assert("nikita-1226", node != NULL);
80405 + assert("nikita-1406", LOCK_CNT_GTZ(rw_locked_tree));
80406 + return znode_parent_nolock(node);
80407 +}
80408 +
80409 +/* detect uber znode used to protect in-superblock tree root pointer */
80410 +int znode_above_root(const znode * node /* znode to query */ )
80411 +{
80412 + assert("umka-059", node != NULL);
80413 +
80414 + return disk_addr_eq(&ZJNODE(node)->blocknr, &UBER_TREE_ADDR);
80415 +}
80416 +
80417 +/* check that @node is root, i.e., that its block number is recorded in the
80418 +   tree as that of the root node */
80419 +#if REISER4_DEBUG
80420 +static int znode_is_true_root(const znode * node /* znode to query */ )
80421 +{
80422 + assert("umka-060", node != NULL);
80423 + assert("umka-061", current_tree != NULL);
80424 +
80425 + return disk_addr_eq(znode_get_block(node),
80426 + &znode_get_tree(node)->root_block);
80427 +}
80428 +#endif
80429 +
80430 +/* check that @node is root */
80431 +int znode_is_root(const znode * node /* znode to query */ )
80432 +{
80433 + assert("nikita-1206", node != NULL);
80434 +
80435 + return znode_get_level(node) == znode_get_tree(node)->height;
80436 +}
80437 +
80438 +/* Returns true if @node was just created by zget() and was never loaded
80439 +   into memory. */
80440 +/* NIKITA-HANS: yes */
80441 +int znode_just_created(const znode * node)
80442 +{
80443 + assert("nikita-2188", node != NULL);
80444 + return (znode_page(node) == NULL);
80445 +}
80446 +
80447 +/* obtain updated ->znode_epoch. See seal.c for description. */
80448 +__u64 znode_build_version(reiser4_tree * tree)
80449 +{
80450 + __u64 result;
80451 +
80452 + spin_lock(&tree->epoch_lock);
80453 + result = ++tree->znode_epoch;
80454 + spin_unlock(&tree->epoch_lock);
80455 + return result;
80456 +}
80457 +
80458 +void init_load_count(load_count * dh)
80459 +{
80460 + assert("nikita-2105", dh != NULL);
80461 + memset(dh, 0, sizeof *dh);
80462 +}
80463 +
80464 +void done_load_count(load_count * dh)
80465 +{
80466 + assert("nikita-2106", dh != NULL);
80467 + if (dh->node != NULL) {
80468 + for (; dh->d_ref > 0; --dh->d_ref)
80469 + zrelse(dh->node);
80470 + dh->node = NULL;
80471 + }
80472 +}
80473 +
80474 +static int incr_load_count(load_count * dh)
80475 +{
80476 + int result;
80477 +
80478 + assert("nikita-2110", dh != NULL);
80479 + assert("nikita-2111", dh->node != NULL);
80480 +
80481 + result = zload(dh->node);
80482 + if (result == 0)
80483 + ++dh->d_ref;
80484 + return result;
80485 +}
80486 +
80487 +int incr_load_count_znode(load_count * dh, znode * node)
80488 +{
80489 + assert("nikita-2107", dh != NULL);
80490 + assert("nikita-2158", node != NULL);
80491 + assert("nikita-2109",
80492 + ergo(dh->node != NULL, (dh->node == node) || (dh->d_ref == 0)));
80493 +
80494 + dh->node = node;
80495 + return incr_load_count(dh);
80496 +}
80497 +
80498 +int incr_load_count_jnode(load_count * dh, jnode * node)
80499 +{
80500 + if (jnode_is_znode(node)) {
80501 + return incr_load_count_znode(dh, JZNODE(node));
80502 + }
80503 + return 0;
80504 +}
80505 +
80506 +void copy_load_count(load_count * new, load_count * old)
80507 +{
80508 + int ret = 0;
80509 + done_load_count(new);
80510 + new->node = old->node;
80511 + new->d_ref = 0;
80512 +
80513 + while ((new->d_ref < old->d_ref) && (ret = incr_load_count(new)) == 0) {
80514 + }
80515 +
80516 + assert("jmacd-87589", ret == 0);
80517 +}
80518 +
80519 +void move_load_count(load_count * new, load_count * old)
80520 +{
80521 + done_load_count(new);
80522 + new->node = old->node;
80523 + new->d_ref = old->d_ref;
80524 + old->node = NULL;
80525 + old->d_ref = 0;
80526 +}
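+
+/* Usage sketch (editorial): a load_count pairs zload()/zrelse() calls
+   automatically, which keeps functions with many error paths simple;
+   some_failing_step() is a placeholder, not a real helper:
+
+	load_count dh;
+	int ret;
+
+	init_load_count(&dh);
+	ret = incr_load_count_znode(&dh, node);
+	if (ret == 0)
+		ret = some_failing_step(node);
+	done_load_count(&dh);	<- zrelse() happens here on every path
+	return ret;
+*/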
80527 +
80528 +/* convert parent pointer into coord */
80529 +void parent_coord_to_coord(const parent_coord_t * pcoord, coord_t * coord)
80530 +{
80531 + assert("nikita-3204", pcoord != NULL);
80532 + assert("nikita-3205", coord != NULL);
80533 +
80534 + coord_init_first_unit_nocheck(coord, pcoord->node);
80535 + coord_set_item_pos(coord, pcoord->item_pos);
80536 + coord->between = AT_UNIT;
80537 +}
80538 +
80539 +/* pack coord into parent_coord_t */
80540 +void coord_to_parent_coord(const coord_t * coord, parent_coord_t * pcoord)
80541 +{
80542 + assert("nikita-3206", pcoord != NULL);
80543 + assert("nikita-3207", coord != NULL);
80544 +
80545 + pcoord->node = coord->node;
80546 + pcoord->item_pos = coord->item_pos;
80547 +}
80548 +
80549 +/* Initialize a parent hint pointer. (parent hint pointer is a field in znode,
80550 + look for comments there) */
80551 +void init_parent_coord(parent_coord_t * pcoord, const znode * node)
80552 +{
80553 + pcoord->node = (znode *) node;
80554 + pcoord->item_pos = (unsigned short)~0;
80555 +}
80556 +
80557 +#if REISER4_DEBUG
80558 +
80559 +/* debugging aid: znode invariant */
80560 +static int znode_invariant_f(const znode * node /* znode to check */ ,
80561 + char const **msg /* where to store error
80562 + * message, if any */ )
80563 +{
80564 +#define _ergo(ant, con) \
80565 + ((*msg) = "{" #ant "} ergo {" #con "}", ergo((ant), (con)))
80566 +
80567 +#define _equi(e1, e2) \
80568 + ((*msg) = "{" #e1 "} <=> {" #e2 "}", equi((e1), (e2)))
80569 +
80570 +#define _check(exp) ((*msg) = #exp, (exp))
80571 +
80572 + return jnode_invariant_f(ZJNODE(node), msg) &&
80573 + /* [znode-fake] invariant */
80574 + /* fake znode doesn't have a parent, and */
80575 + _ergo(znode_get_level(node) == 0, znode_parent(node) == NULL) &&
80576 + /* there is another way to express this very check, and */
80577 + _ergo(znode_above_root(node), znode_parent(node) == NULL) &&
80578 + /* it has special block number, and */
80579 + _ergo(znode_get_level(node) == 0,
80580 + disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80581 + /* it is the only znode with such block number, and */
80582 + _ergo(!znode_above_root(node) && znode_is_loaded(node),
80583 + !disk_addr_eq(znode_get_block(node), &UBER_TREE_ADDR)) &&
80584 + /* it is parent of the tree root node */
80585 + _ergo(znode_is_true_root(node),
80586 + znode_above_root(znode_parent(node))) &&
80587 + /* [znode-level] invariant */
80588 + /* level of parent znode is one larger than that of child,
80589 + except for the fake znode, and */
80590 + _ergo(znode_parent(node) && !znode_above_root(znode_parent(node)),
80591 + znode_get_level(znode_parent(node)) ==
80592 + znode_get_level(node) + 1) &&
80593 + /* left neighbor is at the same level, and */
80594 + _ergo(znode_is_left_connected(node) && node->left != NULL,
80595 + znode_get_level(node) == znode_get_level(node->left)) &&
80596 + /* right neighbor is at the same level */
80597 + _ergo(znode_is_right_connected(node) && node->right != NULL,
80598 + znode_get_level(node) == znode_get_level(node->right)) &&
80599 + /* [znode-connected] invariant */
80600 + _ergo(node->left != NULL, znode_is_left_connected(node)) &&
80601 + _ergo(node->right != NULL, znode_is_right_connected(node)) &&
80602 + _ergo(!znode_is_root(node) && node->left != NULL,
80603 + znode_is_right_connected(node->left) &&
80604 + node->left->right == node) &&
80605 + _ergo(!znode_is_root(node) && node->right != NULL,
80606 + znode_is_left_connected(node->right) &&
80607 + node->right->left == node) &&
80608 + /* [znode-c_count] invariant */
80609 + /* for any znode, c_count of its parent is greater than 0 */
80610 + _ergo(znode_parent(node) != NULL &&
80611 + !znode_above_root(znode_parent(node)),
80612 + znode_parent(node)->c_count > 0) &&
80613 + /* leaves don't have children */
80614 + _ergo(znode_get_level(node) == LEAF_LEVEL,
80615 + node->c_count == 0) &&
80616 + _check(node->zjnode.jnodes.prev != NULL) &&
80617 + _check(node->zjnode.jnodes.next != NULL) &&
80618 + /* orphan doesn't have a parent */
80619 + _ergo(ZF_ISSET(node, JNODE_ORPHAN), znode_parent(node) == 0) &&
80620 + /* [znode-modify] invariant */
80621 + /* if znode is not write-locked, its checksum remains
80622 + * invariant */
80623 + /* unfortunately, zlock is unordered w.r.t. jnode_lock, so we
80624 + * cannot check this. */
80625 + /* [znode-refs] invariant */
80626 + /* only referenced znode can be long-term locked */
80627 + _ergo(znode_is_locked(node),
80628 + atomic_read(&ZJNODE(node)->x_count) != 0);
80629 +}
80630 +
80631 +/* debugging aid: check znode invariant and panic if it doesn't hold */
80632 +int znode_invariant(znode * node /* znode to check */ )
80633 +{
80634 + char const *failed_msg;
80635 + int result;
80636 +
80637 + assert("umka-063", node != NULL);
80638 + assert("umka-064", current_tree != NULL);
80639 +
80640 + spin_lock_znode(node);
80641 + read_lock_tree(znode_get_tree(node));
80642 + result = znode_invariant_f(node, &failed_msg);
80643 + if (!result) {
80644 + /* print_znode("corrupted node", node); */
80645 + warning("jmacd-555", "Condition %s failed", failed_msg);
80646 + }
80647 + read_unlock_tree(znode_get_tree(node));
80648 + spin_unlock_znode(node);
80649 + return result;
80650 +}
80651 +
80652 +/* return non-0 iff data are loaded into znode */
80653 +int znode_is_loaded(const znode * node /* znode to query */ )
80654 +{
80655 + assert("nikita-497", node != NULL);
80656 + return jnode_is_loaded(ZJNODE(node));
80657 +}
80658 +
80659 +unsigned long znode_times_locked(const znode * z)
80660 +{
80661 + return z->times_locked;
80662 +}
80663 +
80664 +#endif /* REISER4_DEBUG */
80665 +
80666 +/* Make Linus happy.
80667 + Local variables:
80668 + c-indentation-style: "K&R"
80669 + mode-name: "LC"
80670 + c-basic-offset: 8
80671 + tab-width: 8
80672 + fill-column: 120
80673 + End:
80674 +*/
80675 diff --git a/fs/reiser4/znode.h b/fs/reiser4/znode.h
80676 new file mode 100644
80677 index 0000000..4699d0f
80678 --- /dev/null
80679 +++ b/fs/reiser4/znode.h
80680 @@ -0,0 +1,434 @@
80681 +/* Copyright 2001, 2002, 2003, 2004 by Hans Reiser, licensing governed by
80682 + * reiser4/README */
80683 +
80684 +/* Declaration of znode (Zam's node). See znode.c for more details. */
80685 +
80686 +#ifndef __ZNODE_H__
80687 +#define __ZNODE_H__
80688 +
80689 +#include "forward.h"
80690 +#include "debug.h"
80691 +#include "dformat.h"
80692 +#include "key.h"
80693 +#include "coord.h"
80694 +#include "plugin/node/node.h"
80695 +#include "jnode.h"
80696 +#include "lock.h"
80697 +#include "readahead.h"
80698 +
80699 +#include <linux/types.h>
80700 +#include <linux/spinlock.h>
80701 +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
80702 +#include <asm/atomic.h>
80703 +#include <asm/semaphore.h>
80704 +
80705 +/* znode tracks its position within its parent (the internal item in the
80706 + * parent node that contains this znode's block number). */
80707 +typedef struct parent_coord {
80708 + znode *node;
80709 + pos_in_node_t item_pos;
80710 +} parent_coord_t;
80711 +
80712 +/* &znode - node in a reiser4 tree.
80713 +
80714 + NOTE-NIKITA fields in this struct have to be rearranged (later) to reduce
80715 + cacheline pressure.
80716 +
80717 + Locking:
80718 +
80719 + Long term: data in a disk node attached to this znode are protected
80720 + by long term, deadlock aware lock ->lock;
80721 +
80722 + Spin lock: the following fields are protected by the spin lock:
80723 +
80724 + ->lock
80725 +
80726 + Following fields are protected by the global tree lock:
80727 +
80728 + ->left
80729 + ->right
80730 + ->in_parent
80731 + ->c_count
80732 +
80733 + Following fields are protected by the global delimiting key lock (dk_lock):
80734 +
80735 + ->ld_key (to update ->ld_key long-term lock on the node is also required)
80736 + ->rd_key
80737 +
80738 + Following fields are protected by the long term lock:
80739 +
80740 + ->nr_items
80741 +
80742 +   ->node_plugin is never changed once set. This means that once code has
80743 +   verified that the field is valid, it can be accessed without any
80744 +   additional locking.
80745 +
80746 + ->level is immutable.
80747 +
80748 + Invariants involving this data-type:
80749 +
80750 + [znode-fake]
80751 + [znode-level]
80752 + [znode-connected]
80753 + [znode-c_count]
80754 + [znode-refs]
80755 + [jnode-refs]
80756 + [jnode-queued]
80757 + [znode-modify]
80758 +
80759 + For this to be made into a clustering or NUMA filesystem, we would want to eliminate all of the global locks.
80760 + Suggestions for how to do that are desired.*/
80761 +struct znode {
80762 + /* Embedded jnode. */
80763 + jnode zjnode;
80764 +
80765 +	/* position of this znode within its parent: the parent node and the
80766 +	   index (item_pos) of the internal item that points to this znode.
80767 +
80768 +	   item_pos is only a hint that is cached to speed up lookups during
80769 +	   balancing. It is not required to be up to date; synched in find_child_ptr().
80770 +
80771 + This value allows us to avoid expensive binary searches.
80772 +
80773 + in_parent->node points to the parent of this node, and is NOT a
80774 + hint.
80775 + */
80776 + parent_coord_t in_parent;
80777 +
80778 + /*
80779 + * sibling list pointers
80780 + */
80781 +
80782 + /* left-neighbor */
80783 + znode *left;
80784 + /* right-neighbor */
80785 + znode *right;
80786 +
80787 + /* long term lock on node content. This lock supports deadlock
80788 + detection. See lock.c
80789 + */
80790 + zlock lock;
80791 +
80792 + /* You cannot remove from memory a node that has children in
80793 + memory. This is because we rely on the fact that parent of given
80794 + node can always be reached without blocking for io. When reading a
80795 + node into memory you must increase the c_count of its parent, when
80796 + removing it from memory you must decrease the c_count. This makes
80797 + the code simpler, and the cases where it is suboptimal are truly
80798 + obscure.
80799 + */
80800 + int c_count;
80801 +
80802 + /* plugin of node attached to this znode. NULL if znode is not
80803 + loaded. */
80804 + node_plugin *nplug;
80805 +
80806 + /* version of znode data. This is increased on each modification. This
80807 + * is necessary to implement seals (see seal.[ch]) efficiently. */
80808 + __u64 version;
80809 +
80810 + /* left delimiting key. Necessary to efficiently perform
80811 + balancing with node-level locking. Kept in memory only. */
80812 + reiser4_key ld_key;
80813 + /* right delimiting key. */
80814 + reiser4_key rd_key;
80815 +
80816 + /* znode's tree level */
80817 + __u16 level;
80818 + /* number of items in this node. This field is modified by node
80819 + * plugin. */
80820 + __u16 nr_items;
80821 +
80822 +#if REISER4_DEBUG
80823 + void *creator;
80824 + reiser4_key first_key;
80825 + unsigned long times_locked;
80826 + int left_version; /* when node->left was updated */
80827 + int right_version; /* when node->right was updated */
80828 + int ld_key_version; /* when node->ld_key was updated */
80829 + int rd_key_version; /* when node->rd_key was updated */
80830 +#endif
80831 +
80832 +} __attribute__ ((aligned(16)));
80833 +
80834 +ON_DEBUG(extern atomic_t delim_key_version;
80835 + )
80836 +
80837 +/* In general I think these macros should not be exposed. */
80838 +#define znode_is_locked(node) (lock_is_locked(&node->lock))
80839 +#define znode_is_rlocked(node) (lock_is_rlocked(&node->lock))
80840 +#define znode_is_wlocked(node) (lock_is_wlocked(&node->lock))
80841 +#define znode_is_wlocked_once(node) (lock_is_wlocked_once(&node->lock))
80842 +#define znode_can_be_rlocked(node) (lock_can_be_rlocked(&node->lock))
80843 +#define is_lock_compatible(node, mode) (lock_mode_compatible(&node->lock, mode))
80844 +/* Macros for accessing the znode state. */
80845 +#define ZF_CLR(p,f) JF_CLR (ZJNODE(p), (f))
80846 +#define ZF_ISSET(p,f) JF_ISSET(ZJNODE(p), (f))
80847 +#define ZF_SET(p,f) JF_SET (ZJNODE(p), (f))
80848 +extern znode *zget(reiser4_tree * tree, const reiser4_block_nr * const block,
80849 + znode * parent, tree_level level, gfp_t gfp_flag);
80850 +extern znode *zlook(reiser4_tree * tree, const reiser4_block_nr * const block);
80851 +extern int zload(znode * node);
80852 +extern int zload_ra(znode * node, ra_info_t * info);
80853 +extern int zinit_new(znode * node, gfp_t gfp_flags);
80854 +extern void zrelse(znode * node);
80855 +extern void znode_change_parent(znode * new_parent, reiser4_block_nr * block);
80856 +
80857 +/* size of data in znode */
80858 +static inline unsigned
80859 +znode_size(const znode * node UNUSED_ARG /* znode to query */ )
80860 +{
80861 + assert("nikita-1416", node != NULL);
80862 + return PAGE_CACHE_SIZE;
80863 +}
80864 +
80865 +extern void parent_coord_to_coord(const parent_coord_t * pcoord,
80866 + coord_t * coord);
80867 +extern void coord_to_parent_coord(const coord_t * coord,
80868 + parent_coord_t * pcoord);
80869 +extern void init_parent_coord(parent_coord_t * pcoord, const znode * node);
80870 +
80871 +extern unsigned znode_free_space(znode * node);
80872 +
80873 +extern reiser4_key *znode_get_rd_key(znode * node);
80874 +extern reiser4_key *znode_get_ld_key(znode * node);
80875 +
80876 +extern reiser4_key *znode_set_rd_key(znode * node, const reiser4_key * key);
80877 +extern reiser4_key *znode_set_ld_key(znode * node, const reiser4_key * key);
80878 +
80879 +/* `connected' state checks */
80880 +static inline int znode_is_right_connected(const znode * node)
80881 +{
80882 + return ZF_ISSET(node, JNODE_RIGHT_CONNECTED);
80883 +}
80884 +
80885 +static inline int znode_is_left_connected(const znode * node)
80886 +{
80887 + return ZF_ISSET(node, JNODE_LEFT_CONNECTED);
80888 +}
80889 +
80890 +static inline int znode_is_connected(const znode * node)
80891 +{
80892 + return znode_is_right_connected(node) && znode_is_left_connected(node);
80893 +}
80894 +
80895 +extern int znode_shift_order;
80896 +extern int znode_rehash(znode * node, const reiser4_block_nr * new_block_nr);
80897 +extern void znode_remove(znode *, reiser4_tree *);
80898 +extern znode *znode_parent(const znode * node);
80899 +extern znode *znode_parent_nolock(const znode * node);
80900 +extern int znode_above_root(const znode * node);
80901 +extern int init_znodes(void);
80902 +extern void done_znodes(void);
80903 +extern int znodes_tree_init(reiser4_tree * ztree);
80904 +extern void znodes_tree_done(reiser4_tree * ztree);
80905 +extern int znode_contains_key(znode * node, const reiser4_key * key);
80906 +extern int znode_contains_key_lock(znode * node, const reiser4_key * key);
80907 +extern unsigned znode_save_free_space(znode * node);
80908 +extern unsigned znode_recover_free_space(znode * node);
80909 +extern znode *zalloc(gfp_t gfp_flag);
80910 +extern void zinit(znode *, const znode * parent, reiser4_tree *);
80911 +extern int zparse(znode * node);
80912 +
80913 +extern int znode_just_created(const znode * node);
80914 +
80915 +extern void zfree(znode * node);
80916 +
80917 +#if REISER4_DEBUG
80918 +extern void print_znode(const char *prefix, const znode * node);
80919 +#else
80920 +#define print_znode( p, n ) noop
80921 +#endif
80922 +
80923 +/* Make it look like various znode functions exist instead of treating znodes as
80924 + jnodes in znode-specific code. */
80925 +#define znode_page(x) jnode_page ( ZJNODE(x) )
80926 +#define zdata(x) jdata ( ZJNODE(x) )
80927 +#define znode_get_block(x) jnode_get_block ( ZJNODE(x) )
80928 +#define znode_created(x) jnode_created ( ZJNODE(x) )
80929 +#define znode_set_created(x) jnode_set_created ( ZJNODE(x) )
80930 +#define znode_convertible(x) jnode_convertible (ZJNODE(x))
80931 +#define znode_set_convertible(x) jnode_set_convertible (ZJNODE(x))
80932 +
80933 +#define znode_is_dirty(x) jnode_is_dirty ( ZJNODE(x) )
80934 +#define znode_check_dirty(x) jnode_check_dirty ( ZJNODE(x) )
80935 +#define znode_make_clean(x) jnode_make_clean ( ZJNODE(x) )
80936 +#define znode_set_block(x, b) jnode_set_block ( ZJNODE(x), (b) )
80937 +
80938 +#define spin_lock_znode(x) spin_lock_jnode ( ZJNODE(x) )
80939 +#define spin_unlock_znode(x) spin_unlock_jnode ( ZJNODE(x) )
80940 +#define spin_trylock_znode(x) spin_trylock_jnode ( ZJNODE(x) )
80941 +#define spin_znode_is_locked(x) spin_jnode_is_locked ( ZJNODE(x) )
80942 +#define spin_znode_is_not_locked(x) spin_jnode_is_not_locked ( ZJNODE(x) )
80943 +
80944 +#if REISER4_DEBUG
80945 +extern int znode_x_count_is_protected(const znode * node);
80946 +extern int znode_invariant(znode * node);
80947 +#endif
80948 +
80949 +/* acquire reference to @node */
80950 +static inline znode *zref(znode * node)
80951 +{
80952 + /* change of x_count from 0 to 1 is protected by tree spin-lock */
80953 + return JZNODE(jref(ZJNODE(node)));
80954 +}
80955 +
80956 +/* release reference to @node */
80957 +static inline void zput(znode * node)
80958 +{
80959 + assert("nikita-3564", znode_invariant(node));
80960 + jput(ZJNODE(node));
80961 +}
80962 +
80963 +/* get the level field for a znode */
80964 +static inline tree_level znode_get_level(const znode * node)
80965 +{
80966 + return node->level;
80967 +}
80968 +
80969 +/* get the level field for a jnode */
80970 +static inline tree_level jnode_get_level(const jnode * node)
80971 +{
80972 + if (jnode_is_znode(node))
80973 + return znode_get_level(JZNODE(node));
80974 + else
80975 + /* unformatted nodes are all at the LEAF_LEVEL and for
80976 + "semi-formatted" nodes like bitmaps, level doesn't matter. */
80977 + return LEAF_LEVEL;
80978 +}
80979 +
80980 +/* true if jnode is on leaf level */
80981 +static inline int jnode_is_leaf(const jnode * node)
80982 +{
80983 + if (jnode_is_znode(node))
80984 + return (znode_get_level(JZNODE(node)) == LEAF_LEVEL);
80985 + if (jnode_get_type(node) == JNODE_UNFORMATTED_BLOCK)
80986 + return 1;
80987 + return 0;
80988 +}
80989 +
80990 +/* return znode's tree */
80991 +static inline reiser4_tree *znode_get_tree(const znode * node)
80992 +{
80993 + assert("nikita-2692", node != NULL);
80994 + return jnode_get_tree(ZJNODE(node));
80995 +}
80996 +
80997 +/* resolve race with zput */
80998 +static inline znode *znode_rip_check(reiser4_tree * tree, znode * node)
80999 +{
81000 + jnode *j;
81001 +
81002 + j = jnode_rip_sync(tree, ZJNODE(node));
81003 + if (likely(j != NULL))
81004 + node = JZNODE(j);
81005 + else
81006 + node = NULL;
81007 + return node;
81008 +}
81009 +
81010 +#if defined(REISER4_DEBUG)
81011 +int znode_is_loaded(const znode * node /* znode to query */ );
81012 +#endif
81013 +
81014 +extern __u64 znode_build_version(reiser4_tree * tree);
81015 +
81016 +/* Data-handles. A data handle object manages pairing calls to zload() and zrelse(). We
81017 + must load the data for a node in many places. We could do this by simply calling
81018 +   zload() everywhere; the difficulty arises when we must release the loaded data by
81019 +   calling zrelse(). In a function with many possible error/return paths, it requires extra
81020 +   work to figure out which exit paths must call zrelse() and which do not. The data
81021 + handle automatically calls zrelse for every zload that it is responsible for. In that
81022 + sense, it acts much like a lock_handle.
81023 +*/
81024 +typedef struct load_count {
81025 + znode *node;
81026 + int d_ref;
81027 +} load_count;
81028 +
81029 +extern void init_load_count(load_count * lc); /* Initialize a load_count set the current node to NULL. */
81030 +extern void done_load_count(load_count * dh); /* Finalize a load_count: call zrelse() if necessary */
81031 +extern int incr_load_count_znode(load_count * dh, znode * node); /* Set the argument znode to the current node, call zload(). */
81032 +extern int incr_load_count_jnode(load_count * dh, jnode * node); /* If the argument jnode is formatted, do the same as
81033 + * incr_load_count_znode, otherwise do nothing (unformatted nodes
81034 + * don't require zload/zrelse treatment). */
81035 +extern void move_load_count(load_count * new, load_count * old); /* Move the contents of a load_count. Old handle is released. */
81036 +extern void copy_load_count(load_count * new, load_count * old); /* Copy the contents of a load_count. Old handle remains held. */
81037 +
81038 +/* Variable initializers for load_count. */
81039 +#define INIT_LOAD_COUNT ( load_count ){ .node = NULL, .d_ref = 0 }
81040 +#define INIT_LOAD_COUNT_NODE( n ) ( load_count ){ .node = ( n ), .d_ref = 0 }
81041 +/* A convenience macro for use in assertions or debug-only code, where loaded
81042 + data is only required to perform the debugging check. This macro
81043 + encapsulates an expression inside a pair of calls to zload()/zrelse(). */
81044 +#define WITH_DATA( node, exp ) \
81045 +({ \
81046 + long __with_dh_result; \
81047 + znode *__with_dh_node; \
81048 + \
81049 + __with_dh_node = ( node ); \
81050 + __with_dh_result = zload( __with_dh_node ); \
81051 + if( __with_dh_result == 0 ) { \
81052 + __with_dh_result = ( long )( exp ); \
81053 + zrelse( __with_dh_node ); \
81054 + } \
81055 + __with_dh_result; \
81056 +})
81057 +
81058 +/* Same as above, but accepts a return value in case zload fails. */
81059 +#define WITH_DATA_RET( node, ret, exp ) \
81060 +({ \
81061 + int __with_dh_result; \
81062 + znode *__with_dh_node; \
81063 + \
81064 + __with_dh_node = ( node ); \
81065 + __with_dh_result = zload( __with_dh_node ); \
81066 + if( __with_dh_result == 0 ) { \
81067 + __with_dh_result = ( int )( exp ); \
81068 + zrelse( __with_dh_node ); \
81069 + } else \
81070 + __with_dh_result = ( ret ); \
81071 + __with_dh_result; \
81072 +})
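+
+/* Example (editorial sketch): WITH_DATA/WITH_DATA_RET suit assertions and
+   other one-shot reads where node data is needed only for the enclosed
+   expression, e.g.:
+
+	unsigned free;
+
+	free = WITH_DATA_RET(node, 0, znode_free_space(node));
+
+   znode_free_space() runs with the node loaded, zrelse() is called
+   automatically, and 0 is returned if zload() fails. */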
81073 +
81074 +#define WITH_COORD(coord, exp) \
81075 +({ \
81076 + coord_t *__coord; \
81077 + \
81078 + __coord = (coord); \
81079 + coord_clear_iplug(__coord); \
81080 + WITH_DATA(__coord->node, exp); \
81081 +})
81082 +
81083 +#if REISER4_DEBUG
81084 +#define STORE_COUNTERS \
81085 + reiser4_lock_counters_info __entry_counters = \
81086 + *reiser4_lock_counters()
81087 +#define CHECK_COUNTERS \
81088 +ON_DEBUG_CONTEXT( \
81089 +({ \
81090 + __entry_counters.x_refs = reiser4_lock_counters() -> x_refs; \
81091 + __entry_counters.t_refs = reiser4_lock_counters() -> t_refs; \
81092 + __entry_counters.d_refs = reiser4_lock_counters() -> d_refs; \
81093 + assert("nikita-2159", \
81094 + !memcmp(&__entry_counters, reiser4_lock_counters(), \
81095 + sizeof __entry_counters)); \
81096 +}) )
81097 +
81098 +#else
81099 +#define STORE_COUNTERS
81100 +#define CHECK_COUNTERS noop
81101 +#endif
81102 +
81103 +/* __ZNODE_H__ */
81104 +#endif
81105 +
81106 +/* Make Linus happy.
81107 + Local variables:
81108 + c-indentation-style: "K&R"
81109 + mode-name: "LC"
81110 + c-basic-offset: 8
81111 + tab-width: 8
81112 + fill-column: 120
81113 + End:
81114 +*/
81115 diff --git a/include/linux/fs.h b/include/linux/fs.h
81116 index 1410e53..dd12411 100644
81117 --- a/include/linux/fs.h
81118 +++ b/include/linux/fs.h
81119 @@ -1165,6 +1165,8 @@ struct super_operations {
81120 void (*clear_inode) (struct inode *);
81121 void (*umount_begin) (struct vfsmount *, int);
81122
81123 + void (*sync_inodes) (struct super_block *sb,
81124 + struct writeback_control *wbc);
81125 int (*show_options)(struct seq_file *, struct vfsmount *);
81126 int (*show_stats)(struct seq_file *, struct vfsmount *);
81127 #ifdef CONFIG_QUOTA
81128 @@ -1583,6 +1585,7 @@ extern int invalidate_inode_pages2(struct address_space *mapping);
81129 extern int invalidate_inode_pages2_range(struct address_space *mapping,
81130 pgoff_t start, pgoff_t end);
81131 extern int write_inode_now(struct inode *, int);
81132 +extern void generic_sync_sb_inodes(struct super_block *, struct writeback_control *);
81133 extern int filemap_fdatawrite(struct address_space *);
81134 extern int filemap_flush(struct address_space *);
81135 extern int filemap_fdatawait(struct address_space *);
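
    (Editorial note, not part of the patch.) The hunk above adds an optional
    ->sync_inodes hook to super_operations so a filesystem can take over
    per-superblock inode writeback, and exports generic_sync_sb_inodes() as
    the fallback. A filesystem would wire the hook up roughly as below; the
    reiser4_sync_inodes name is illustrative:

	static struct super_operations example_sops = {
		/* ... other operations ... */
		.sync_inodes = reiser4_sync_inodes,
	};

    where the hook either performs its own writeback or calls
    generic_sync_sb_inodes(sb, wbc).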
81136 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
81137 index d69ddbe..ed3e15f 100644
81138 --- a/lib/radix-tree.c
81139 +++ b/lib/radix-tree.c
81140 @@ -151,6 +151,7 @@ int radix_tree_preload(gfp_t gfp_mask)
81141 out:
81142 return ret;
81143 }
81144 +EXPORT_SYMBOL(radix_tree_preload);
81145
81146 static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
81147 int offset)
81148 diff --git a/mm/filemap.c b/mm/filemap.c
81149 index 8332c77..b16d2cb 100644
81150 --- a/mm/filemap.c
81151 +++ b/mm/filemap.c
81152 @@ -121,6 +121,7 @@ void __remove_from_page_cache(struct page *page)
81153 mapping->nrpages--;
81154 __dec_zone_page_state(page, NR_FILE_PAGES);
81155 }
81156 +EXPORT_SYMBOL(__remove_from_page_cache);
81157
81158 void remove_from_page_cache(struct page *page)
81159 {
81160 @@ -132,6 +133,7 @@ void remove_from_page_cache(struct page *page)
81161 __remove_from_page_cache(page);
81162 write_unlock_irq(&mapping->tree_lock);
81163 }
81164 +EXPORT_SYMBOL(remove_from_page_cache);
81165
81166 static int sync_page(void *word)
81167 {
81168 @@ -465,6 +467,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
81169 lru_cache_add(page);
81170 return ret;
81171 }
81172 +EXPORT_SYMBOL(add_to_page_cache_lru);
81173
81174 #ifdef CONFIG_NUMA
81175 struct page *__page_cache_alloc(gfp_t gfp)
81176 @@ -738,6 +741,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
81177 read_unlock_irq(&mapping->tree_lock);
81178 return ret;
81179 }
81180 +EXPORT_SYMBOL(find_get_pages);
81181
81182 /**
81183 * find_get_pages_contig - gang contiguous pagecache lookup
81184 @@ -798,6 +802,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
81185 read_unlock_irq(&mapping->tree_lock);
81186 return ret;
81187 }
81188 +EXPORT_SYMBOL(find_get_pages_tag);
81189
81190 /**
81191 * grab_cache_page_nowait - returns locked page at given index in given cache
81192 diff --git a/mm/readahead.c b/mm/readahead.c
81193 index 0f539e8..9db41de 100644
81194 --- a/mm/readahead.c
81195 +++ b/mm/readahead.c
81196 @@ -568,6 +568,7 @@ void handle_ra_miss(struct address_space *mapping,
81197 ra->flags &= ~RA_FLAG_INCACHE;
81198 ra->cache_hit = 0;
81199 }
81200 +EXPORT_SYMBOL_GPL(handle_ra_miss);
81201
81202 /*
81203 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a