Merge of master-melb:xfs-cmds:29147a by kenmcd.
Perform parallel sequential bulk read prefetching in xfs_repair
#ifndef __CACHE_H__
#define __CACHE_H__
+#define HASH_CACHE_RATIO 8
+
/*
* Simple, generic implementation of a cache (arbitrary data).
* Provides a hash table with a capped number of cache entries.
struct cache_node;
typedef void *cache_key_t;
+
typedef void (*cache_walk_t)(struct cache_node *);
-typedef struct cache_node * (*cache_node_alloc_t)(void);
+typedef struct cache_node * (*cache_node_alloc_t)(cache_key_t);
typedef void (*cache_node_flush_t)(struct cache_node *);
typedef void (*cache_node_relse_t)(struct cache_node *);
typedef unsigned int (*cache_node_hash_t)(cache_key_t, unsigned int);
void cache_node_put(struct cache_node *);
int cache_node_purge(struct cache *, cache_key_t, struct cache_node *);
void cache_report(FILE *fp, const char *, struct cache *);
+int cache_overflowed(struct cache *);
#endif /* __CACHE_H__ */
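For orientation, a minimal client of this cache interface would look roughly
like the sketch below. The embedded-node layout follows how xfs_buf_t is cast
to struct cache_node elsewhere in this patch; the exact layout of struct
cache_operations and the helper names here are assumptions for illustration.

    #include <stdlib.h>

    struct my_node {
    	struct cache_node	mn_node;	/* must be first: the cache
    						 * code casts node pointers */
    	int			mn_key;
    };

    static unsigned int
    my_hash(cache_key_t key, unsigned int hashsize)
    {
    	return (*(unsigned int *)key) % hashsize;
    }

    /* note the new callback signature: the allocator receives the key */
    static struct cache_node *
    my_alloc(cache_key_t key)
    {
    	struct my_node	*n = calloc(1, sizeof(*n));

    	if (n)
    		n->mn_key = *(int *)key;
    	return (struct cache_node *)n;
    }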
#define LIBXFS_MOUNT_32BITINOOPT 0x0008
#define LIBXFS_MOUNT_COMPAT_ATTR 0x0010
-#define LIBXFS_IHASHSIZE(sbp) (1<<16) /* tweak based on icount? */
-#define LIBXFS_BHASHSIZE(sbp) (1<<16) /* ditto, on blocks used? */
+#define LIBXFS_IHASHSIZE(sbp) (1<<10)
+#define LIBXFS_BHASHSIZE(sbp) (1<<10)
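+/* with HASH_CACHE_RATIO (8), these defaults cap each cache at
+ * 1024 buckets x 8 = 8192 entries, unless libxfs_ihash_size or
+ * libxfs_bhash_size override them */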
extern xfs_mount_t *libxfs_mount (xfs_mount_t *, xfs_sb_t *,
dev_t, dev_t, dev_t, int);
xfs_daddr_t b_blkno;
unsigned b_bcount;
dev_t b_dev;
+ pthread_mutex_t b_lock;
void *b_fsprivate;
void *b_fsprivate2;
void *b_fsprivate3;
char *b_addr;
+#ifdef XFS_BUF_TRACING
+ struct list_head b_lock_list;
+ const char *b_func;
+ const char *b_file;
+ int b_line;
+#endif
} xfs_buf_t;
enum xfs_buf_flags_t { /* b_flags bits */
#define XFS_BUF_FSPRIVATE3(bp,type) ((type)(bp)->b_fsprivate3)
#define XFS_BUF_SET_FSPRIVATE3(bp,val) (bp)->b_fsprivate3 = (void *)(val)
-extern xfs_buf_t *libxfs_getsb (xfs_mount_t *, int);
-extern xfs_buf_t *libxfs_readbuf (dev_t, xfs_daddr_t, int, int);
-extern int libxfs_readbufr (dev_t, xfs_daddr_t, xfs_buf_t *, int, int);
-extern int libxfs_writebuf (xfs_buf_t *, int);
-extern int libxfs_writebufr (xfs_buf_t *);
-extern int libxfs_writebuf_int (xfs_buf_t *, int);
-
/* Buffer Cache Interfaces */
+
extern struct cache *libxfs_bcache;
extern struct cache_operations libxfs_bcache_operations;
-extern void libxfs_bcache_purge (void);
-extern void libxfs_bcache_flush (void);
-extern xfs_buf_t *libxfs_getbuf (dev_t, xfs_daddr_t, int);
+
+#ifdef XFS_BUF_TRACING
+
+#define libxfs_readbuf(dev, daddr, len, flags) \
+ libxfs_trace_readbuf(__FUNCTION__, __FILE__, __LINE__, (dev), (daddr), (len), (flags))
+#define libxfs_writebuf(buf, flags) \
+ libxfs_trace_writebuf(__FUNCTION__, __FILE__, __LINE__, (buf), (flags))
+#define libxfs_getbuf(dev, daddr, len) \
+ libxfs_trace_getbuf(__FUNCTION__, __FILE__, __LINE__, (dev), (daddr), (len))
+#define libxfs_putbuf(buf) \
+ libxfs_trace_putbuf(__FUNCTION__, __FILE__, __LINE__, (buf))
+
+extern xfs_buf_t *libxfs_trace_readbuf(const char *, const char *, int, dev_t, xfs_daddr_t, int, int);
+extern int libxfs_trace_writebuf(const char *, const char *, int, xfs_buf_t *, int);
+extern xfs_buf_t *libxfs_trace_getbuf(const char *, const char *, int, dev_t, xfs_daddr_t, int);
+extern void libxfs_trace_putbuf (const char *, const char *, int, xfs_buf_t *);
+
+#else
+
+extern xfs_buf_t *libxfs_readbuf(dev_t, xfs_daddr_t, int, int);
+extern int libxfs_writebuf(xfs_buf_t *, int);
+extern xfs_buf_t *libxfs_getbuf(dev_t, xfs_daddr_t, int);
extern void libxfs_putbuf (xfs_buf_t *);
-extern void libxfs_purgebuf (xfs_buf_t *);
+
+#endif
+
+extern xfs_buf_t *libxfs_getsb(xfs_mount_t *, int);
+extern void libxfs_bcache_purge(void);
+extern void libxfs_bcache_flush(void);
+extern void libxfs_purgebuf(xfs_buf_t *);
+extern int libxfs_bcache_overflowed(void);
+extern int libxfs_bcache_usage(void);
/* Buffer (Raw) Interfaces */
-extern xfs_buf_t *libxfs_getbufr (dev_t, xfs_daddr_t, int);
-extern void libxfs_putbufr (xfs_buf_t *);
+extern xfs_buf_t *libxfs_getbufr(dev_t, xfs_daddr_t, int);
+extern void libxfs_putbufr(xfs_buf_t *);
+
+extern int libxfs_writebuf_int(xfs_buf_t *, int);
+extern int libxfs_readbufr(dev_t, xfs_daddr_t, xfs_buf_t *, int, int);
extern int libxfs_bhash_size;
extern int libxfs_ihash_size;
extern void cmn_err(int, char *, ...);
enum ce { CE_DEBUG, CE_CONT, CE_NOTE, CE_WARN, CE_ALERT, CE_PANIC };
-/* lio interface */
-/* lio_listio(3) interface (POSIX linked asynchronous I/O) */
-extern int libxfs_lio_ino_count;
-extern int libxfs_lio_dir_count;
-extern int libxfs_lio_aio_count;
-
-extern int libxfs_lio_init(void);
-extern void libxfs_lio_allocate(void);
-extern void *libxfs_get_lio_buffer(int type);
-extern void libxfs_put_lio_buffer(void *buffer);
-extern int libxfs_readbuf_list(dev_t dev, int nent, void *voidp, int type);
-
-typedef struct libxfs_lio_req {
- xfs_daddr_t blkno;
- int len; /* bbs */
-} libxfs_lio_req_t;
-
-#define LIBXFS_LIO_TYPE_INO 0x1
-#define LIBXFS_LIO_TYPE_DIR 0x2
-#define LIBXFS_LIO_TYPE_RAW 0x3
#define LIBXFS_BBTOOFF64(bbs) (((xfs_off_t)(bbs)) << BBSHIFT)
-extern int libxfs_nproc(void);
+extern int libxfs_nproc(void);
+extern unsigned long libxfs_physmem(void); /* in kilobytes */
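A note on units: XFS basic blocks are 512 bytes (BBSHIFT is 9), so the
conversion macro above works out as in this small example:

    xfs_off_t off = LIBXFS_BBTOOFF64(8);	/* 8 BBs << 9 = 4096 bytes */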
#include <xfs/xfs_ialloc.h>
#include <xfs/xfs_rtalloc.h>
#define _BOOLEAN_T_DEFINED 1
#endif
-#ifdef __USE_GNU
-typedef struct aiocb64 aiocb64_t;
-#define _AIOCB64_T_DEFINED 1
-#endif
-
#endif /* __XFS_LINUX_H__ */
LT_AGE = 0
HFILES = xfs.h init.h
-CFILES = bit.c cache.c init.c lio.c logitem.c rdwr.c trans.c util.c \
+CFILES = bit.c cache.c init.c logitem.c rdwr.c trans.c util.c \
xfs_alloc.c xfs_ialloc.c xfs_rtalloc.c \
xfs_inode.c xfs_btree.c xfs_alloc_btree.c xfs_ialloc_btree.c \
xfs_bmap_btree.c xfs_da_btree.c xfs_dir.c xfs_dir_leaf.c \
#define CACHE_DEBUG 1
#undef CACHE_ABORT
/* #define CACHE_ABORT 1 */
-#define HASH_CACHE_RATIO 8
static unsigned int cache_generic_bulkrelse(struct cache *, struct list_head *);
struct cache_node *
cache_node_allocate(
struct cache * cache,
- struct cache_hash * hashlist)
+ struct cache_hash * hashlist,
+ cache_key_t key)
{
unsigned int nodesfree;
struct cache_node * node;
pthread_mutex_unlock(&cache->c_mutex);
if (!nodesfree)
return NULL;
- if (!(node = cache->alloc())) { /* uh-oh */
+ if (!(node = cache->alloc(key))) { /* uh-oh */
pthread_mutex_lock(&cache->c_mutex);
cache->c_count--;
pthread_mutex_unlock(&cache->c_mutex);
return node;
}
+int
+cache_overflowed(
+ struct cache * cache)
+{
+ return (cache->c_maxcount == cache->c_max);
+}
+
/*
* Lookup in the cache hash table. With any luck we'll get a cache
* hit, in which case this will all be over quickly and painlessly.
break;
}
if (pos == head) {
- node = cache_node_allocate(cache, hash);
+ node = cache_node_allocate(cache, hash, key);
if (!node) {
priority = cache_shake(cache, hash, priority);
goto restart;
}
/*
- * Flush all nodes in the cache to disk.
+ * Flush all nodes in the cache to disk.
*/
void
cache_flush(
struct list_head * pos;
struct cache_node * node;
int i;
-
+
if (!cache->flush)
return;
-
+
for (i = 0; i < cache->c_hashsize; i++) {
hash = &cache->c_hash[i];
-
+
pthread_mutex_lock(&hash->ch_mutex);
head = &hash->ch_list;
for (pos = head->next; pos != head; pos = pos->next) {
total += i*hash_bucket_lengths[i];
if (hash_bucket_lengths[i] == 0)
continue;
- fprintf(fp, "Hash buckets with %2d entries %5ld (%3ld%%)\n",
+ fprintf(fp, "Hash buckets with %2d entries %5ld (%3ld%%)\n",
i, hash_bucket_lengths[i], (i*hash_bucket_lengths[i]*100)/cache->c_count);
}
if (hash_bucket_lengths[i]) /* last report bucket is the overflow bucket */
- fprintf(fp, "Hash buckets with >%2d entries %5ld (%3ld%%)\n",
+ fprintf(fp, "Hash buckets with >%2d entries %5ld (%3ld%%)\n",
i-1, hash_bucket_lengths[i], ((cache->c_count-total)*100)/cache->c_count);
}
#include <sys/mount.h>
#include <sys/ioctl.h>
#include <xfs/libxfs.h>
+#include <sys/sysctl.h>
int platform_has_uuid = 1;
extern char *progname;
*bsz = BBSIZE;
}
-/* ARGSUSED */
-int
-platform_aio_init(int aio_count)
-{
- return 0; /* aio/lio_listio not available */
-}
-
char *
platform_findrawpath(char *path)
{
int
platform_nproc(void)
{
- return 1;
+ int ncpu;
+ size_t len = sizeof(ncpu);
+ static int mib[2] = {CTL_HW, HW_NCPU};
+
+ if (sysctl(mib, 2, &ncpu, &len, NULL, 0) < 0)
+ ncpu = 1;
+
+ return ncpu;
}
+
+unsigned long
+platform_physmem(void)
+{
+ unsigned long physmem;
+ size_t len = sizeof(physmem);
+ static int mib[2] = {CTL_HW, HW_PHYSMEM};
+
+ if (sysctl(mib, 2, &physmem, &len, NULL, 0) < 0) {
+ fprintf(stderr, _("%s: can't determine memory size\n"),
+ progname);
+ exit(1);
+ }
+ return physmem >> 10;
+}
+
*bsz = (int)ssize;
}
-/* ARGSUSED */
-int
-platform_aio_init(int aio_count)
-{
- return 0; /* aio/lio_listio not available */
-}
-
char *
platform_findrawpath(char *path)
{
int
platform_nproc(void)
{
- return 1;
+ int ncpu;
+ size_t len = sizeof(ncpu);
+ static int mib[2] = {CTL_HW, HW_NCPU};
+
+ if (sysctl(mib, 2, &ncpu, &len, NULL, 0) < 0)
+ ncpu = 1;
+
+ return ncpu;
+}
+
+unsigned long
+platform_physmem(void)
+{
+ unsigned long physmem;
+ size_t len = sizeof(physmem);
+ static int mib[2] = {CTL_HW, HW_PHYSMEM};
+
+ if (sysctl(mib, 2, &physmem, &len, NULL, 0) < 0) {
+ fprintf(stderr, _("%s: can't determine memory size\n"),
+ progname);
+ exit(1);
+ }
+ return physmem >> 10;
}
{
return platform_nproc();
}
+
+unsigned long
+libxfs_physmem(void)
+{
+ return platform_physmem();
+}
\ No newline at end of file
extern char *platform_findblockpath (char *path);
extern int platform_direct_blockdev (void);
extern int platform_align_blockdev (void);
-extern int platform_aio_init (int aio_count);
extern int platform_nproc(void);
+extern unsigned long platform_physmem(void); /* in kilobytes */
extern int platform_has_uuid;
#endif /* LIBXFS_INIT_H */
*/
#include <xfs/libxfs.h>
-#include <aio.h>
#include <diskinfo.h>
#include <sys/sysmp.h>
*bsz = BBSIZE;
}
-int
-platform_aio_init(int aio_count)
-{
- struct aioinit aio_init;
-
- memset(&aio_init, 0, sizeof(aio_init));
- aio_init.aio_threads = aio_count;
- aio_init.aio_numusers = aio_count;
-
- aio_sgi_init64(&aio_init);
- return (1); /* aio/lio_listio available */
-}
-
char *
platform_findrawpath(char *path)
{
return sysmp(MP_NPROCS);
}
+unsigned long
+platform_physmem(void)
+{
+ struct rminfo ri;
+
+	if (sysmp(MP_SAGET, MPSA_RMINFO, &ri, sizeof(ri)) < 0) {
+ fprintf(stderr, _("%s: can't determine memory size\n"),
+ progname);
+ exit(1);
+ }
+ return (ri.physmem >> 10) * getpagesize(); /* kilobytes */
+}
\ No newline at end of file
#include <xfs/libxfs.h>
#include <mntent.h>
#include <sys/stat.h>
-#include <aio.h>
#undef ustat
#include <sys/ustat.h>
#include <sys/mount.h>
#include <sys/ioctl.h>
+#include <sys/sysinfo.h>
int platform_has_uuid = 1;
extern char *progname;
max_block_alignment = *bsz;
}
-int
-platform_aio_init(int aio_count)
-{
- struct aioinit lcl_aio_init;
-
- memset(&lcl_aio_init, 0, sizeof(lcl_aio_init));
- lcl_aio_init.aio_threads = aio_count;
- lcl_aio_init.aio_numusers = aio_count;
-
- aio_init(&lcl_aio_init);
- return (1); /* aio/lio_listio available */
-}
-
char *
platform_findrawpath(char *path)
{
{
return sysconf(_SC_NPROCESSORS_ONLN);
}
+
+unsigned long
+platform_physmem(void)
+{
+ struct sysinfo si;
+
+ if (sysinfo(&si) < 0) {
+ fprintf(stderr, _("%s: can't determine memory size\n"),
+ progname);
+ exit(1);
+ }
+ return (si.totalram >> 10) * si.mem_unit; /* kilobytes */
+}
+++ /dev/null
-#include <xfs/libxfs.h>
-#include "init.h"
-#include "aio.h"
-
-#define DEF_PREFETCH_INOS 16
-#define DEF_PREFETCH_DIRS 16
-#define DEF_PREFETCH_AIO 32
-int libxfs_lio_ino_count = DEF_PREFETCH_INOS;
-int libxfs_lio_dir_count = DEF_PREFETCH_DIRS;
-int libxfs_lio_aio_count = DEF_PREFETCH_AIO;
-
-static pthread_key_t lio_ino_key;
-static pthread_key_t lio_dir_key;
-
-void
-libxfs_lio_allocate(void)
-{
-#ifdef _AIOCB64_T_DEFINED
- size_t size;
- void *voidp;
-
- /*
- * allocate a per-thread buffer which will be used in libxfs_readbuf_list
- * in the following order:
- * libxfs_lio_req_t array
- * aiocb64_t array
- * aiocb64_t * array
- * xfs_buf_t * array
- */
- size = sizeof(libxfs_lio_req_t) + sizeof(aiocb64_t) + sizeof(aiocb64_t *) + sizeof(xfs_buf_t *);
-
- voidp = malloc(libxfs_lio_ino_count*size);
- if (voidp == NULL) {
- fprintf(stderr, "lio_allocate: cannot allocate thread specific storage\n");
- exit(1);
- /* NO RETURN */
- return;
- }
- pthread_setspecific(lio_ino_key, voidp);
-
- voidp = malloc(libxfs_lio_dir_count*size);
- if (voidp == NULL) {
- fprintf(stderr, "lio_allocate: cannot allocate thread specific storage\n");
- exit(1);
- /* NO RETURN */
- return;
- }
- pthread_setspecific(lio_dir_key, voidp);
-#endif /* _AIOCB64_T_DEFINED */
-}
-
-int
-libxfs_lio_init(void)
-{
-#ifdef _AIOCB64_T_DEFINED
- if (platform_aio_init(libxfs_lio_aio_count)) {
- pthread_key_create(&lio_ino_key, NULL);
- pthread_key_create(&lio_dir_key, NULL);
- return (1);
- }
-#endif /* _AIOCB64_T_DEFINED */
- return (0);
-}
-
-void *
-libxfs_get_lio_buffer(int type)
-{
-#ifdef _AIOCB64_T_DEFINED
- if (type == LIBXFS_LIO_TYPE_INO)
- return pthread_getspecific(lio_ino_key);
- if (type == LIBXFS_LIO_TYPE_DIR)
- return pthread_getspecific(lio_dir_key);
- if (type == LIBXFS_LIO_TYPE_RAW) {
- /* use the inode buffers since there is
- * no overlap with the other requests.
- */
- return pthread_getspecific(lio_ino_key);
- }
- fprintf(stderr, "get_lio_buffer: invalid type 0x%x\n", type);
- exit(1);
-#endif
- return NULL;
-}
-
-/* ARGSUSED */
-void
-libxfs_put_lio_buffer(void *buffer)
-{
- return; /* nothing to do */
-}
-
-static int
-lio_compare(const void *e1, const void *e2)
-{
- libxfs_lio_req_t *r1 = (libxfs_lio_req_t *) e1;
- libxfs_lio_req_t *r2 = (libxfs_lio_req_t *) e2;
-
- return (int) (r1->blkno - r2->blkno);
-}
-
-int
-libxfs_readbuf_list(dev_t dev, int nent, void *voidp, int type)
-{
-#ifdef _AIOCB64_T_DEFINED
- libxfs_lio_req_t *rblp;
- xfs_buf_t *bp, **bplist;
- aiocb64_t *aioclist, **aiocptr;
- int i, nbp, err;
- int fd;
-
- if (nent <= 0)
- return 0;
- if ((type == LIBXFS_LIO_TYPE_INO) || (type == LIBXFS_LIO_TYPE_RAW)) {
- if (libxfs_lio_ino_count == 0)
- return (0);
- if (nent > libxfs_lio_ino_count)
- nent = libxfs_lio_ino_count;
- }
- else if (type == LIBXFS_LIO_TYPE_DIR) {
- if (libxfs_lio_dir_count == 0)
- return (0);
- if (nent > libxfs_lio_dir_count)
- nent = libxfs_lio_dir_count;
- if (nent > 2)
- qsort(voidp, nent, sizeof(libxfs_lio_req_t), lio_compare);
- }
- else {
- fprintf(stderr, "Invalid type 0x%x in libxfs_readbuf_list\n", type);
- abort();
- /* NO RETURN */
- return (0);
- }
-
- /* space for lio_listio processing, see libxfs_lio_allocate */
- rblp = (libxfs_lio_req_t *) voidp;
- aioclist = (aiocb64_t *) (rblp + nent);
- aiocptr = (aiocb64_t **) (aioclist + nent);
- bplist = (xfs_buf_t **) (aiocptr + nent);
-
- bzero(aioclist, nent*sizeof(aiocb64_t));
-
- /* look in buffer cache */
- for (i = 0, nbp = 0; i < nent; i++) {
- ASSERT(rblp[i].len);
- bp = libxfs_getbuf(dev, rblp[i].blkno, rblp[i].len);
- if (bp == NULL)
- continue;
- if (bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY)) {
- /* already in cache */
- libxfs_putbuf(bp);
- continue;
- }
- bplist[nbp++] = bp;
- }
-
- if (nbp == 0)
- return (0); /* Nothing to do */
-
- if (nbp == 1) {
- libxfs_putbuf(bplist[0]); /* single buffer, no point */
- return (0);
- }
-
- fd = libxfs_device_to_fd(dev);
-
- for (i = 0; i < nbp; i++) {
- aioclist[i].aio_fildes = fd;
- aioclist[i].aio_nbytes = XFS_BUF_COUNT(bplist[i]);
- aioclist[i].aio_buf = XFS_BUF_PTR(bplist[i]);
- aioclist[i].aio_offset = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i]));
- aioclist[i].aio_lio_opcode = LIO_READ;
- aiocptr[i] = &aioclist[i];
- }
-
- err = lio_listio64(LIO_WAIT, aiocptr, nbp, NULL);
-
- if (err != 0) {
- fprintf(stderr, "lio_listio (%d entries) failure err = %d\n", nbp, err);
- }
-
- for (i = 0; i < nbp; i++) {
- /* buffer with data in cache available via future libxfs_readbuf */
- if (err == 0)
- bplist[i]->b_flags |= LIBXFS_B_UPTODATE;
- libxfs_putbuf(bplist[i]);
- }
-
- return (err == 0? nbp : -1);
-#else /* _AIOCB64_T_DEFINED */
- return -1;
-#endif /* _AIOCB64_T_DEFINED */
-}
#define BDSTRAT_SIZE (256 * 1024)
#define min(x, y) ((x) < (y) ? (x) : (y))
+#define IO_BCOMPARE_CHECK
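+/* IO_BCOMPARE_CHECK enables the sanity check in libxfs_bcompare()
+ * below, which warns when a lookup matches on device and block
+ * number but disagrees on buffer length */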
+
void
libxfs_device_zero(dev_t dev, xfs_daddr_t start, uint len)
{
return BBTOB(len);
}
+/*
+ * Simple I/O (buffer cache) interface
+ */
+
+
+#ifdef XFS_BUF_TRACING
+
+#undef libxfs_readbuf
+#undef libxfs_writebuf
+#undef libxfs_getbuf
+#undef libxfs_putbuf
+
+xfs_buf_t *libxfs_readbuf(dev_t, xfs_daddr_t, int, int);
+int libxfs_writebuf(xfs_buf_t *, int);
+xfs_buf_t *libxfs_getbuf(dev_t, xfs_daddr_t, int);
+void libxfs_putbuf (xfs_buf_t *);
+
+xfs_buf_t *
+libxfs_trace_readbuf(const char *func, const char *file, int line, dev_t dev, xfs_daddr_t blkno, int len, int flags)
+{
+ xfs_buf_t *bp = libxfs_readbuf(dev, blkno, len, flags);
+
+ bp->b_func = func;
+ bp->b_file = file;
+ bp->b_line = line;
+
+ return bp;
+}
+
+int
+libxfs_trace_writebuf(const char *func, const char *file, int line, xfs_buf_t *bp, int flags)
+{
+ bp->b_func = func;
+ bp->b_file = file;
+ bp->b_line = line;
+
+ return libxfs_writebuf(bp, flags);
+}
+
+xfs_buf_t *
+libxfs_trace_getbuf(const char *func, const char *file, int line, dev_t device, xfs_daddr_t blkno, int len)
+{
+ xfs_buf_t *bp = libxfs_getbuf(device, blkno, len);
+
+ bp->b_func = func;
+ bp->b_file = file;
+ bp->b_line = line;
+
+ return bp;
+}
+
+void
+libxfs_trace_putbuf(const char *func, const char *file, int line, xfs_buf_t *bp)
+{
+ bp->b_func = func;
+ bp->b_file = file;
+ bp->b_line = line;
+
+ libxfs_putbuf(bp);
+}
+
+
+#endif
+
+
xfs_buf_t *
libxfs_getsb(xfs_mount_t *mp, int flags)
{
XFS_FSS_TO_BB(mp, 1), flags);
}
-
-/*
- * Simple I/O (buffer cache) interface
- */
-
xfs_zone_t *xfs_buf_zone;
typedef struct {
dev_t device;
xfs_daddr_t blkno;
- unsigned int count;
+ unsigned int bblen;
} xfs_bufkey_t;
static unsigned int
libxfs_bhash(cache_key_t key, unsigned int hashsize)
{
- return ((unsigned int)((xfs_bufkey_t *)key)->blkno) % hashsize;
+ return (((unsigned int)((xfs_bufkey_t *)key)->blkno) >> 5) % hashsize;
}
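The added shift is presumably there because buffer addresses tend to be
aligned to multi-block cluster boundaries, leaving the low bits of b_blkno
with little entropy. A hypothetical illustration with hashsize = 1024 and
buffers laid out on 16-BB (8 KiB) boundaries:

    /* blkno % 1024        -> only every 16th bucket ever used
     * (blkno >> 5) % 1024 -> all buckets used, ~2 neighbours each */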
static int
#ifdef IO_BCOMPARE_CHECK
if (bp->b_dev == bkey->device &&
bp->b_blkno == bkey->blkno &&
- bp->b_bcount != bkey->count)
- fprintf(stderr, "Badness in key lookup (length)\n"
- "bp=(bno %llu, len %u bb) key=(bno %llu, len %u bbs)\n",
+ bp->b_bcount != BBTOB(bkey->bblen))
+ fprintf(stderr, "%lx: Badness in key lookup (length)\n"
+ "bp=(bno %llu, len %u bytes) key=(bno %llu, len %u bytes)\n",
+ pthread_self(),
(unsigned long long)bp->b_blkno, (int)bp->b_bcount,
- (unsigned long long)bkey->blkno, (int)bkey->count);
+ (unsigned long long)bkey->blkno, BBTOB(bkey->bblen));
#endif
return (bp->b_dev == bkey->device &&
bp->b_blkno == bkey->blkno &&
- bp->b_bcount == bkey->count);
+ bp->b_bcount == BBTOB(bkey->bblen));
}
void
bp->b_flags, bp->b_node.cn_count);
}
-static void
-libxfs_brelse(struct cache_node *node)
-{
- xfs_buf_t *bp = (xfs_buf_t *)node;
- xfs_buf_log_item_t *bip;
- extern xfs_zone_t *xfs_buf_item_zone;
-
- if (bp != NULL) {
- if (bp->b_flags & LIBXFS_B_DIRTY)
- libxfs_writebufr(bp);
- bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
- if (bip)
- libxfs_zone_free(xfs_buf_item_zone, bip);
- free(bp->b_addr);
- bp->b_addr = NULL;
- bp->b_flags = 0;
- free(bp);
- bp = NULL;
- }
-}
-
static void
libxfs_initbuf(xfs_buf_t *bp, dev_t device, xfs_daddr_t bno, unsigned int bytes)
{
strerror(errno));
exit(1);
}
+#ifdef XFS_BUF_TRACING
+ list_head_init(&bp->b_lock_list);
+#endif
+ pthread_mutex_init(&bp->b_lock, NULL);
}
xfs_buf_t *
xfs_buf_t *bp;
bp = libxfs_zone_zalloc(xfs_buf_zone);
- libxfs_initbuf(bp, device, blkno, BBTOB(len));
+ if (bp != NULL)
+ libxfs_initbuf(bp, device, blkno, BBTOB(len));
+#ifdef IO_DEBUG
+ printf("%lx: %s: allocated %u bytes buffer, key=%llu(%llu), %p\n",
+ pthread_self(), __FUNCTION__, BBTOB(len),
+ (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
+#endif
return bp;
}
-void
-libxfs_putbufr(xfs_buf_t *bp)
-{
- libxfs_brelse((struct cache_node *)bp);
-}
+
+#ifdef XFS_BUF_TRACING
+struct list_head lock_buf_list = {&lock_buf_list, &lock_buf_list};
+int lock_buf_count = 0;
+#endif
xfs_buf_t *
libxfs_getbuf(dev_t device, xfs_daddr_t blkno, int len)
{
xfs_buf_t *bp;
xfs_bufkey_t key;
- unsigned int bytes = BBTOB(len);
+ int miss;
key.device = device;
key.blkno = blkno;
- key.count = bytes;
-
- if (cache_node_get(libxfs_bcache, &key, (struct cache_node **)&bp)) {
+ key.bblen = len;
+
+ miss = cache_node_get(libxfs_bcache, &key, (struct cache_node **)&bp);
+ if (bp) {
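+		/*
+		 * Buffers are handed out locked; the matching
+		 * libxfs_putbuf() drops b_lock again, serializing
+		 * repair threads on a per-buffer basis.
+		 */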
+ pthread_mutex_lock(&bp->b_lock);
+#ifdef XFS_BUF_TRACING
+ pthread_mutex_lock(&libxfs_bcache->c_mutex);
+ lock_buf_count++;
+ list_add(&bp->b_lock_list, &lock_buf_list);
+ pthread_mutex_unlock(&libxfs_bcache->c_mutex);
+#endif
#ifdef IO_DEBUG
- fprintf(stderr, "%s: allocated %ubytes buffer, key=%llu(%llu), %p\n",
- __FUNCTION__, BBTOB(len),
- (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
+ printf("%lx %s: %s buffer %p for bno = %llu\n",
+ pthread_self(), __FUNCTION__, miss ? "miss" : "hit",
+ bp, (long long)LIBXFS_BBTOOFF64(blkno));
#endif
- libxfs_initbuf(bp, device, blkno, bytes);
}
+
return bp;
}
void
libxfs_putbuf(xfs_buf_t *bp)
{
+#ifdef XFS_BUF_TRACING
+ pthread_mutex_lock(&libxfs_bcache->c_mutex);
+ lock_buf_count--;
+ ASSERT(lock_buf_count >= 0);
+ list_del_init(&bp->b_lock_list);
+ pthread_mutex_unlock(&libxfs_bcache->c_mutex);
+#endif
+ pthread_mutex_unlock(&bp->b_lock);
cache_node_put((struct cache_node *)bp);
}
key.device = bp->b_dev;
key.blkno = bp->b_blkno;
- key.count = bp->b_bcount;
+ key.bblen = bp->b_bcount >> BBSHIFT;
cache_node_purge(libxfs_bcache, &key, (struct cache_node *)bp);
}
static struct cache_node *
-libxfs_balloc(void)
+libxfs_balloc(cache_key_t key)
{
- return libxfs_zone_zalloc(xfs_buf_zone);
+ xfs_bufkey_t *bufkey = (xfs_bufkey_t *)key;
+
+ return (struct cache_node *)libxfs_getbufr(bufkey->device,
+ bufkey->blkno, bufkey->bblen);
}
int
return errno;
}
#ifdef IO_DEBUG
- fprintf(stderr, "readbufr read %ubytes, blkno=%llu(%llu), %p\n",
- bytes, (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
+ printf("%lx: %s: read %u bytes, blkno=%llu(%llu), %p\n",
+ pthread_self(), __FUNCTION__, bytes,
+ (long long)LIBXFS_BBTOOFF64(blkno), (long long)blkno, bp);
#endif
if (bp->b_dev == dev &&
bp->b_blkno == blkno &&
int error;
bp = libxfs_getbuf(dev, blkno, len);
- if (!(bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
+ if (bp && !(bp->b_flags & (LIBXFS_B_UPTODATE|LIBXFS_B_DIRTY))) {
error = libxfs_readbufr(dev, blkno, bp, len, flags);
if (error) {
libxfs_putbuf(bp);
return EIO;
}
#ifdef IO_DEBUG
- fprintf(stderr, "writebufr wrote %ubytes, blkno=%llu(%llu), %p\n",
- bp->b_bcount, (long long)LIBXFS_BBTOOFF64(bp->b_blkno),
- (long long)bp->b_blkno, bp);
+ printf("%lx: %s: wrote %u bytes, blkno=%llu(%llu), %p\n",
+ pthread_self(), __FUNCTION__, bp->b_bcount,
+ (long long)LIBXFS_BBTOOFF64(bp->b_blkno),
+ (long long)bp->b_blkno, bp);
#endif
bp->b_flags |= LIBXFS_B_UPTODATE;
bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_EXIT);
{
#ifdef IO_DEBUG
if (boff + len > bp->b_bcount) {
- fprintf(stderr, "Badness, iomove out of range!\n"
+ printf("Badness, iomove out of range!\n"
"bp=(bno %llu, bytes %u) range=(boff %u, bytes %u)\n",
(long long)bp->b_blkno, bp->b_bcount, boff, len);
abort();
libxfs_writebufr(bp);
}
+static void
+libxfs_brelse(struct cache_node *node)
+{
+ xfs_buf_t *bp = (xfs_buf_t *)node;
+ xfs_buf_log_item_t *bip;
+ extern xfs_zone_t *xfs_buf_item_zone;
+
+ if (bp != NULL) {
+ if (bp->b_flags & LIBXFS_B_DIRTY)
+ libxfs_writebufr(bp);
+ bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+ if (bip)
+ libxfs_zone_free(xfs_buf_item_zone, bip);
+ free(bp->b_addr);
+ pthread_mutex_destroy(&bp->b_lock);
+ bp->b_addr = NULL;
+ bp->b_flags = 0;
+ free(bp);
+ bp = NULL;
+ }
+}
+
+void
+libxfs_putbufr(xfs_buf_t *bp)
+{
+ libxfs_brelse((struct cache_node *)bp);
+}
+
+
void
libxfs_bcache_purge(void)
{
cache_flush(libxfs_bcache);
}
+int
+libxfs_bcache_overflowed(void)
+{
+ return cache_overflowed(libxfs_bcache);
+}
+
struct cache_operations libxfs_bcache_operations = {
/* .hash */ libxfs_bhash,
/* .alloc */ libxfs_balloc,
}
static struct cache_node *
-libxfs_ialloc(void)
+libxfs_ialloc(cache_key_t key)
{
return libxfs_zone_zalloc(xfs_inode_zone);
}
XFS_BUF_SET_FSPRIVATE2(bp, NULL); /* remove xact ptr */
hold = (bip->bli_flags & XFS_BLI_HOLD);
- if (bip->bli_flags & (XFS_BLI_DIRTY|XFS_BLI_STALE)) {
+ if (bip->bli_flags & XFS_BLI_DIRTY) {
#ifdef XACT_DEBUG
fprintf(stderr, "flushing/staling buffer %p (hold=%d)\n",
bp, hold);
#endif
- if (bip->bli_flags & XFS_BLI_DIRTY)
- libxfs_writebuf_int(bp, 0);
- if (hold)
- bip->bli_flags &= ~XFS_BLI_HOLD;
- else
- libxfs_putbuf(bp);
+ libxfs_writebuf_int(bp, 0);
}
+ if (hold)
+ bip->bli_flags &= ~XFS_BLI_HOLD;
+ else
+ libxfs_putbuf(bp);
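+	/*
+	 * The hold flag and the buffer reference are now dropped even
+	 * when the buffer item was clean, not only on the dirty path.
+	 */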
/* release the buf item */
kmem_zone_free(xfs_buf_item_zone, bip);
}
.SH NAME
xfs_repair \- repair an XFS filesystem
.SH SYNOPSIS
-.nf
-\f3xfs_repair\f1 [ \f3\-dLMnPvV\f1 ] [ \f3\-o\f1 subopt[=value] ]
- [\f3-t\f1 interval] [\f3-l\f1 logdev] [\f3-r\f1 rtdev] xfs_special
-.sp .8v
-\f3xfs_repair\f1 \f3\-f\f1 [ \f3\-dLMnPvV\f1 ] [ \f3\-o\f1 subopt[=value] ]
- [\f3-t\f1 interval] [\f3-l\f1 logdev] [\f3-r\f1 rtdev] ... file
-.fi
+.B xfs_repair
+[
+.B \-dfLnPv
+] [
+.B \-o
+.I subopt\c
+[\c
+.B =\c
+.IR value ]
+] [
+.B \-t
+.I interval
+] [
+.B \-l
+.I logdev
+] [
+.B \-r
+.I rtdev
+]
+.I device
+.br
+.B xfs_repair \-V
.SH DESCRIPTION
-.I xfs_repair
-is a parallelized version of
-.IR xfs_repair (1m)
-that repairs corrupt or damaged XFS filesystems
+.B xfs_repair
+repairs corrupt or damaged XFS filesystems
(see
-.IR xfs (5)).
+.BR xfs (5)).
The filesystem is specified using the
-.I xfs_special
-argument which should be the device name of the
-disk partition or volume containing
-the filesystem.
-If given the name of a block device,
-.I xfs_repair
+.I device
+argument which should be the device name of the disk partition or
+volume containing the filesystem. If given the name of a block device,
+.B xfs_repair
will attempt to find the raw device associated
-with the specified block device and will use the raw device
-instead.
+with the specified block device and will use the raw device instead.
.PP
Regardless, the filesystem to be repaired
must be unmounted;
otherwise, the resulting filesystem may be inconsistent or corrupt.
-.PP
-The options to \f2xfs_repair\f1 are:
+.SH OPTIONS
.TP
.B \-f
-Specifies that the special device is actually a file (see the
-\f2mkfs.xfs\f1 \f3\-d\f1 \f2file\f1 option).
-This might happen if an image copy
+Specifies that the filesystem image to be processed is stored in a
+regular file at
+.I device
+(see the
+.B mkfs.xfs \-d
+.I file
+option). This might happen if an image copy
of a filesystem has been copied or written into an ordinary file.
This option implies that any external log or realtime section
is also in an ordinary file.
.B \-L
Force Log Zeroing.
Forces
-.I xfs_repair
+.B xfs_repair
to zero the log even if it is dirty (contains metadata changes).
When using this option the filesystem will likely appear to be corrupt,
and its use can cause the loss of user files and/or data.
.TP
-\f3-l\f1 \f2logdev\f1
+.BI \-l " logdev"
Specifies the device special file where the filesystem's external
-log resides.
-Only for those filesystems which use an external log.
+log resides. Only for those filesystems which use an external log.
See the
-\f2mkfs.xfs\f1 \f3\-l\f1 option, and refer to
-.IR xfs (5)
+.B mkfs.xfs \-l
+option, and refer to
+.BR xfs (5)
for a detailed description of the XFS log.
.TP
-\f3-r\f1 \f2rtdev\f1
+.BI \-r " rtdev"
Specifies the device special file where the filesystem's realtime
-section resides.
-Only for those filesystems which use a realtime section.
+section resides. Only for those filesystems which use a realtime section.
See the
-\f2mkfs.xfs\f1 \f3\-r\f1 option, and refer to
-.IR xfs (5)
+.B mkfs.xfs \-r
+option, and refer to
+.BR xfs (5)
for a detailed description of the XFS realtime section.
.TP
.B \-n
-No modify mode.
-Specifies that
-.I xfs_repair
+No modify mode. Specifies that
+.B xfs_repair
should not modify the filesystem but should only scan the
filesystem and indicate what repairs would have been made.
-.TP
-\f3-o\f1 \f2subopt[=value]\f1
+.HP
+.B \-o
+.I subopt\c
+[\c
+.B =\c
+.IR value ]
+.br
Override what the program might conclude about the filesystem
if left to its own devices.
.IP
The
-.B assume_xfs
-suboption
-specifies that the filesystem is an XFS filesystem.
-Normally, if
-.I xfs_repair
-cannot find an XFS superblock, it checks to see if the
-filesystem is an EFS filesystem before it tries to
-regenerate the XFS superblock.
-If the
-.B assume_xfs
-option is in effect,
-.I xfs_repair
-will assume that the filesystem is an XFS filesystem and
-will ignore an EFS superblock if one is found.
-.IP
-The
+.IR subopt ions
+supported are:
+.RS 1.0i
+.TP
.BI ihash= ihashsize
-suboption modifies the default xfs_repair inode cache hash size.
-The total number of inode cache entries are limited to 8 times this
-amount.
-.IP
-The
+overrides the default inode cache hash size. The total number of
+inode cache entries is limited to 8 times this amount. The default
+.I ihashsize
+is 1024 (for a total of 8192 entries).
+.TP
.BI bhash= bhashsize
-suboption modifies the default xfs_repair buffer cache hash size.
-The total number of buffer cache entries are limited to 8 times this
-amount.
-.IP
-The
-.BI pfino= inode_blocks
-suboption modifies the default size of read ahead xfs_repair inode
-blocks.
-.IP
-The
-.BI pfdir= dir_blocks
-suboption modifies the default size of read ahead xfs_repair dir
-blocks.
-.IP
-The
-.BI thread= thread_count
-suboption modifies the number of xfs_repair parallel threads.
+overrides the default buffer cache hash size. The total number of
+buffer cache entries is limited to 8 times this amount. The default
+size is set to use up the remainder of 75% of the system's physical
+RAM.
.TP
-\f3-t\f1 \f2interval\f1
-Modify reporting interval. During long runs xfs_repair outputs
-its progress every 15 minutes. Reporting is only activated when
-xfs_repair is multi-threaded.
+.BI ag_stride= ags_per_concat_unit
+This creates additional processing threads to process, in parallel,
+AGs that span multiple concat units. This can significantly
+reduce repair times on concat-based filesystems.
+.RE
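.IP
For example, both cache hash sizes could be doubled from their defaults with
an invocation such as the following (hypothetical device name):
.IP
.B xfs_repair \-o ihash=2048 \-o bhash=2048 /dev/sda7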
.TP
-.B \-M
-Disable multi-threaded mode. Normally, xfs_repair runs with twice
-the number of threads as processors.
+.BI \-t " interval"
+Modify the reporting interval. During long runs
+.B xfs_repair
+outputs its progress every 15 minutes. Reporting is only activated when
+ag_stride is enabled.
.TP
.B \-P
-Disable read ahead of inode and directory blocks. If applicable,
-a read ahead of up to 16 additional blocks is done.
+Disable prefetching of inode and directory blocks.
.TP
.B \-v
Verbose output.
.TP
.B \-d
-Repair dangerously. Allow xfs_repair to repair an XFS filesystem
-mounted read only. This is typically done on a root fileystem from
-single user mode, immediately followed by a reboot.
+Repair dangerously. Allow
+.B xfs_repair
+to repair an XFS filesystem mounted read only. This is typically done
+on a root filesystem from single user mode, immediately followed by a reboot.
+.TP
+.B \-V
+Prints out the current version number and exits.
.SS Checks Performed
Inconsistencies corrected include the following:
-.TP
-1.
+.IP 1.
Inode and inode blockmap (addressing) checks:
bad magic number in inode,
bad magic numbers in inode blockmap blocks,
incorrect number of records in inode blockmap blocks,
blocks claimed that are not in a legal data area of the filesystem,
blocks that are claimed by more than one inode.
-.TP
-2.
+.IP 2.
Inode allocation map checks:
bad magic number in inode map blocks,
inode state as indicated by map (free or in-use) inconsistent
the inode allocation map,
inode allocation map referencing blocks that do not appear
to contain inodes.
-.TP
-3.
+.IP 3.
Size checks:
number of blocks claimed by inode inconsistent with inode size,
directory size not block aligned,
inode size not consistent with inode format.
-.TP
-4.
+.IP 4.
Directory checks:
bad magic numbers in directory blocks,
incorrect number of entries in a directory block,
entries out of hashvalue order,
incorrect internal directory pointers,
directory type not consistent with inode format and size.
-.TP
-5.
+.IP 5.
Pathname checks:
files or directories not referenced by a pathname starting from
the filesystem root,
illegal pathname components.
-.TP
-6.
+.IP 6.
Link count checks:
link counts that do not agree with the number of
directory references to the inode.
-.TP
-7.
+.IP 7.
Freemap checks:
blocks claimed free by the freemap but also claimed by an inode,
blocks unclaimed by any inode but not appearing in the freemap.
-.TP
-8.
+.IP 8.
Super Block checks:
total free block and/or free i-node count incorrect,
filesystem geometry inconsistent,
directory.
The name assigned is the inode number.
.SS Disk Errors
-.I xfs_repair
-aborts on most disk I/O errors.
-Therefore, if you are trying
+.B xfs_repair
+aborts on most disk I/O errors. Therefore, if you are trying
to repair a filesystem that was damaged due to a disk drive failure,
-steps should be taken to ensure that
-all blocks in the filesystem are readable and writeable
-before attempting to use
-.I xfs_repair
-to repair the filesystem.
-A possible method is using
-.IR dd (8)
+steps should be taken to ensure that all blocks in the filesystem are
+readable and writeable before attempting to use
+.B xfs_repair
+to repair the filesystem. A possible method is using
+.BR dd (8)
to copy the data onto a good disk.
.SS lost+found
The directory
.I lost+found
does not have to already exist in the filesystem being repaired.
-If the directory does not exist, it is automatically created.
-If the \f2lost+found\f1 directory already exists,
-the \f2lost+found\f1
-directory is deleted and recreated every time \f2xfs_repair\f1
-runs.
-This ensures that there are no name conflicts in \f2lost+found\f1.
-However, if you rename a file in \f2lost+found\f1 and leave it there,
-if \f2xfs_repair\f1 is run again, that file is renamed back to
-its inode number.
+If the directory does not exist, it is automatically created if required.
+If it already exists, it will be checked for consistency and if valid
+will be used for additional orphaned files. Invalid
+.I lost+found
+directories are removed and recreated. Existing files in a valid
+.I lost+found
+are not removed or renamed.
.SS Corrupted Superblocks
XFS has both primary and secondary superblocks.
-\f2xfs_repair\f1 uses information in the primary superblock
+.B xfs_repair
+uses information in the primary superblock
to automatically find and validate the primary superblock
against the secondary superblocks before proceeding.
Should the primary be too corrupted to be useful in locating
until it finds and validates some secondary superblocks.
At that point, it generates a primary superblock.
.SS Quotas
-If quotas are in use, it is possible that \f2xfs_repair\f1 will clear
-some or all of the filesystem quota information.
+If quotas are in use, it is possible that
+.B xfs_repair
+will clear some or all of the filesystem quota information.
If so, the program issues a warning just before it terminates.
If all quota information is lost, quotas are disabled and the
program issues a warning to that effect.
.PP
-Note that \f2xfs_repair\f1 does not check the validity of quota limits.
-It is recommended that you check the quota limit information manually
-after \f2xfs_repair\f1.
+Note that
+.B xfs_repair
+does not check the validity of quota limits. It is recommended
+that you check the quota limit information manually after
+.BR xfs_repair .
Also, space usage information is automatically regenerated the
next time the filesystem is mounted with quotas turned on, so the
next quota mount of the filesystem may take some time.
.SH DIAGNOSTICS
-.I xfs_repair
+.B xfs_repair
issues informative messages as it proceeds
indicating what it has found that is abnormal or any corrective
action that it has taken.
Most of the messages are completely understandable only to those
who are knowledgeable about the structure of the filesystem.
Some of the more common messages are explained here.
-Note that the language of the messages is slightly different
-if \f2xfs_repair\f1 is run in no-modify mode because the program is not
-changing anything on disk.
+Note that the language of the messages is slightly different if
+.B xfs_repair
+is run in no-modify mode because the program is not changing anything on disk.
No-modify mode indicates what it would do to repair the filesystem
if run without the no-modify flag.
.PP
-disconnected inode \f3xxxx\f1, moving to \f2lost+found\f1
+.B disconnected inode
+.IB ino ,
+.B moving to lost+found
.IP
An inode numbered
-.B xxxx
+.I ino
was not connected to the filesystem
-directory tree and was reconnected to the \f2lost+found\f1 directory.
-The inode is assigned the name of its inode number (i-number).
-If a \f2lost+found\f1 directory does not exist, it is automatically
-created.
+directory tree and was reconnected to the
+.I lost+found
+directory. The inode is assigned the name of its inode number
+.RI ( ino ).
+If a
+.I lost+found
+directory does not exist, it is automatically created.
.PP
-disconnected dir inode \f3xxxx\f1, moving to \f2lost+found\f1
+.B disconnected dir inode
+.IB ino ,
+.B moving to lost+found
.IP
As above only the inode is a directory inode.
-If a directory inode is attached to \f2lost+found\f1, all of its
-children (if any) stay attached to the directory and therefore
+If a directory inode is attached to
+.IR lost+found ,
+all of its children (if any) stay attached to the directory and therefore
get automatically reconnected when the directory is reconnected.
.PP
-imap claims in-use inode \f3xxxx\f1 is free, correcting imap
+.B imap claims in-use inode
+.I ino
+.B is free, correcting imap
.IP
-The inode allocation map thinks that inode \f3xxxx\f1 is
-free whereas examination of the inode indicates that the
+The inode allocation map thinks that inode
+.I ino
+is free whereas examination of the inode indicates that the
inode may be in use (although it may be disconnected).
The program updates the inode allocation map.
.PP
-imap claims free inode \f3xxxx\f1 is in use, correcting imap
+.B imap claims free inode
+.I ino
+.B is in use, correcting imap
.IP
-The inode allocation map thinks that inode \f3xxxx\f1 is
-in use whereas examination of the inode indicates that the
+The inode allocation map thinks that inode
+.I ino
+is in use whereas examination of the inode indicates that the
inode is not in use and therefore is free.
The program updates the inode allocation map.
.PP
-resetting inode \f3xxxx\f1 nlinks from \f3x\f1 to \f3y\f1
+.B resetting inode
+.I ino
+.B nlinks from
+.I x
+.B to
+.I y
.IP
The program detected a mismatch between the
-number of valid directory entries referencing inode \f3xxxx\f1
+number of valid directory entries referencing inode
+.I ino
and the number of references recorded in the inode and corrected the
number in the inode.
.PP
-\f3fork-type\f1 fork in ino \f3xxxx\f1 claims used block \f3yyyy\f1
+.I fork-type
+.B fork in ino
+.I ino
+.B claims used block
+.I bno
.IP
-Inode \f3xxxx\f1 claims a block \f3yyyy\f1 that is used (claimed)
-by either another inode or the filesystem itself for metadata storage.
-The \f3fork-type\f1 is either \f3data\f1 or \f3attr\f1
+Inode
+.I ino
+claims a block
+.I bno
+that is used (claimed) by either another inode or the filesystem
+itself for metadata storage. The
+.I fork-type
+is either
+.B data
+or
+.B attr
indicating whether the problem lies in the portion of the
inode that tracks regular data or the portion of the inode
that stores XFS attributes.
Any inode that claims blocks used by the filesystem is deleted.
If two or more inodes claim the same block, they are both deleted.
.PP
-\f3fork-type\f1 fork in ino \f3xxxx\f1 claims dup extent ...
+.I fork-type
+.B fork in ino
+.I ino
+.B claims dup extent ...
.IP
-Inode \f3xxxx\f1 claims a block in an extent known to be
-claimed more than once.
+Inode
+.I ino
+claims a block in an extent known to be claimed more than once.
The offset in the inode, start and length of the extent is given.
The message is slightly different
if the inode is a real-time (rt) inode and the extent is therefore
a real-time (rt) extent.
.PP
-inode \f3xxxx\f1 - bad extent ...
+.B inode
+.I ino
+.B \- bad extent ...
.IP
-An extent record in the blockmap of inode \f3xxxx\f1 claims
-blocks that are out of the legal range of the filesystem.
-The message supplies the start, end, and file offset of
-the extent.
-The message is slightly different
-if the extent is a real-time (rt) extent.
+An extent record in the blockmap of inode
+.I ino
+claims blocks that are out of the legal range of the filesystem.
+The message supplies the start, end, and file offset of the extent.
+The message is slightly different if the extent is a real-time (rt) extent.
.PP
-bad \f3fork-type\f1 fork in inode \f3xxxx\f1
+.B bad
+.I fork-type
+.B fork in inode
+.I ino
.IP
There was something structurally wrong or inconsistent with the
data structures that map offsets to filesystem blocks.
.PP
-cleared inode \f3xxxx\f1
+.B cleared inode
+.I ino
.IP
There was something wrong with the inode that
was uncorrectable so the program freed the inode.
This usually happens because the inode claims
blocks that are used by something else or the inode itself
-is badly corrupted.
-Typically, this message
+is badly corrupted. Typically, this message
is preceded by one or more messages indicating why the
inode needed to be cleared.
.PP
-bad attribute fork in inode \f3xxxx\f1, clearing attr fork
+.B bad attribute fork in inode
+.IR ino ,
+.B clearing attr fork
.IP
There was something wrong with the portion of the inode that
stores XFS attributes (the attribute fork) so the program reset
the attribute fork.
As a result of this, all attributes on that inode are lost.
.PP
-correcting nextents for inode \f3xxxx\f1, was \f3x\f1 - counted \f3y\f1
+.B correcting nextents for inode
+.IR ino ,
+.B was
+.I x
+.B \- counted
+.I y
.IP
The program found that the number of extents used to store
the data in the inode is wrong and corrected the number.
The message refers to anextents if the count is wrong
on the number of extents used to store attribute information.
.PP
-entry \f3"name"\f1 in dir \f3xxxx\f1 not consistent
-with ..
-value (\f3yyyy\f1) in dir ino \f3xxxx\f1,
-junking entry \f3"name"\f1 in directory inode \f3xxxx\f1
+.B entry
+.I name
+.B in dir
+.I dir_ino
+.B not consistent with .. value
+.BI ( xxxx )
+.B in dir ino
+.IB ino ,
+.B junking entry
+.I name
+.B in directory inode
+.I dir_ino
.IP
-The entry \f3"name"\f1 in directory inode \f3xxxx\f1 references a
-directory inode \f3yyyy\f1.
-However, the ..\& entry in directory \f3yyyy\f1 does not point
-back to directory \f3xxxx\f1,
-so the program deletes the entry \f3"name"\f1 in directory inode
-\f3xxxx\f1.
-If the directory inode \f3yyyy\f1 winds up becoming a disconnected
-inode as a result of this, it is moved to \f2lost+found\f1 later.
+The entry
+.I name
+in directory inode
+.I dir_ino
+references a directory inode
+.IR ino .
+However, the ..\& entry in directory
+.I ino
+does not point back to directory
+.IR dir_ino ,
+so the program deletes the entry
+.I name
+in directory inode
+.IR dir_ino .
+If the directory inode
+.I ino
+winds up becoming a disconnected inode as a result of this, it is moved to
+.I lost+found
+later.
.PP
-entry \f3"name"\f1 in dir \f3xxxx\f1 references already
-connected dir ino \f3yyyy\f1,
-junking entry \f3"name"\f1 in directory inode \f3xxxx\f1
+.B entry
+.I name
+.B in dir
+.I dir_ino
+.B references already connected dir ino
+.IB ino ,
+.B junking entry
+.I name
+.B in directory inode
+.I dir_ino
.IP
-The entry \f3"name"\f1 in directory inode \f3xxxx\f1 points to a
-directory inode \f3yyyy\f1 that is known to be a child of another
-directory.
+The entry
+.I name
+in directory inode
+.I dir_ino
+points to a directory inode
+.I ino
+that is known to be a child of another directory.
Therefore, the entry is invalid and is deleted.
This message refers to an entry in a small directory.
If this were a large directory, the last phrase would read
"will clear entry".
.PP
-entry references free inode \f3xxxx\f1 in directory \f3yyyy\f1,
-will clear entry
+.B entry references free inode
+.I ino
+.B in directory
+.IB dir_ino ,
+.B will clear entry
.IP
-An entry in directory inode \f3yyyy\f1 references an inode \f3xxxx\f1
-that is known to be free.
-The entry is therefore invalid and is deleted.
+An entry in directory inode
+.I dir_ino
+references an inode
+.I ino
+that is known to be free. The entry is therefore invalid and is deleted.
This message refers to a large directory.
If the directory were small, the message would read "junking entry ...".
.SH EXIT STATUS
-.I xfs_repair -n
+.B xfs_repair \-n
(no modify mode)
will return a status of 1 if filesystem corruption was detected and
0 if no filesystem corruption was detected.
-.I xfs_repair
+.B xfs_repair
run without the \-n option will always return a status code of 0.
.SH BUGS
The filesystem to be checked and repaired must have been
unmounted cleanly using normal system administration procedures
(the
-.IR umount (8)
+.BR umount (8)
command or system shutdown), not as a result of a crash or system reset.
If the filesystem has not been unmounted cleanly, mount it and unmount
it cleanly before running
-.IR xfs_repair .
+.BR xfs_repair .
.PP
-.I xfs_repair
+.B xfs_repair
does not do a thorough job on XFS extended attributes.
The structure of the attribute fork will be consistent,
but only the contents of attribute forks that will fit into
an inode are checked.
This limitation will be fixed in the future.
.PP
-The no-modify mode (\f3\-n\f1 option) is not completely
-accurate.
+The no-modify mode
+.RB ( \-n
+option) is not completely accurate.
It does not catch inconsistencies in the freespace and inode
maps, particularly lost blocks or subtly corrupted maps (trees).
.PP
.B xfs_repair
fixed and/or improved.
.SH SEE ALSO
-dd(1),
-mkfs.xfs(8),
-umount(8),
-xfs_check(8),
-xfs_metadump(8),
-xfs(5).
+.BR dd (1),
+.BR mkfs.xfs (8),
+.BR umount (8),
+.BR xfs_check (8),
+.BR xfs_metadump (8),
+.BR xfs (5).
LTCOMMAND = xfs_repair
HFILES = agheader.h attr_repair.h avl.h avl64.h bmap.h dinode.h dir.h \
- dir2.h dir_stack.h err_protos.h globals.h incore.h protos.h rt.h \
- progress.h scan.h versions.h prefetch.h threads.h
+ dir2.h err_protos.h globals.h incore.h protos.h rt.h \
+ progress.h scan.h versions.h prefetch.h radix-tree.h threads.h
CFILES = agheader.c attr_repair.c avl.c avl64.c bmap.c dino_chunks.c \
- dinode.c dir.c dir2.c dir_stack.c globals.c incore.c \
+ dinode.c dir.c dir2.c globals.c incore.c \
incore_bmc.c init.c incore_ext.c incore_ino.c phase1.c \
- phase2.c phase3.c phase4.c phase5.c phase6.c phase7.c rt.c sb.c \
- progress.c prefetch.c scan.c threads.c versions.c xfs_repair.c
+ phase2.c phase3.c phase4.c phase5.c phase6.c phase7.c \
+ progress.c prefetch.c radix-tree.c rt.c sb.c scan.c threads.c \
+ versions.c xfs_repair.c
LLDLIBS = $(LIBXFS) $(LIBXLOG) $(LIBUUID) $(LIBPTHREAD) $(LIBRT)
LTDEPENDENCIES = $(LIBXFS) $(LIBXLOG)
# -DXR_BLD_INO_TRACE building on-disk inode allocation btrees
# -DXR_BLD_ADD_EXTENT track phase 5 block extent creation
# -DXR_BCKPTR_DBG parent list debugging info
+# -DXR_PF_TRACE prefetch trace
#
#CFLAGS += ...
#include "err_protos.h"
#include "dir.h"
#include "dinode.h"
-#include "prefetch.h"
-#include "threads.h"
#include "versions.h"
+#include "prefetch.h"
#include "progress.h"
/*
if (check_inode_block(mp, ino) == 0)
return(0);
- PREPAIR_RW_WRITE_LOCK(&per_ag_lock[agno]);
+ pthread_mutex_lock(&ag_locks[agno]);
+
switch (state = get_agbno_state(mp, agno, agbno)) {
case XR_E_INO:
do_warn(
_("uncertain inode block %d/%d already known\n"),
agno, agbno);
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
break;
case XR_E_UNKNOWN:
case XR_E_FREE1:
case XR_E_FREE:
set_agbno_state(mp, agno, agbno, XR_E_INO);
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
break;
case XR_E_MULT:
case XR_E_INUSE:
_("inode block %d/%d multiply claimed, (state %d)\n"),
agno, agbno, state);
set_agbno_state(mp, agno, agbno, XR_E_MULT);
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
+ pthread_mutex_unlock(&ag_locks[agno]);
return(0);
default:
do_warn(
_("inode block %d/%d bad state, (state %d)\n"),
agno, agbno, state);
set_agbno_state(mp, agno, agbno, XR_E_INO);
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
break;
}
+ pthread_mutex_unlock(&ag_locks[agno]);
+
start_agino = XFS_OFFBNO_TO_AGINO(mp, agbno, 0);
*start_ino = XFS_AGINO_TO_INO(mp, agno, start_agino);
* user data -- we're probably here as a result of a directory
* entry or an iunlinked pointer
*/
- PREPAIR_RW_WRITE_LOCK(&per_ag_lock[agno]);
+ pthread_mutex_lock(&ag_locks[agno]);
for (j = 0, cur_agbno = chunk_start_agbno;
cur_agbno < chunk_stop_agbno; cur_agbno++) {
switch (state = get_agbno_state(mp, agno, cur_agbno)) {
}
if (j) {
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
+ pthread_mutex_unlock(&ag_locks[agno]);
return(0);
}
}
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
+ pthread_mutex_unlock(&ag_locks[agno]);
/*
* ok, chunk is good. put the record into the tree if required,
set_inode_used(irec_p, agino - start_agino);
- PREPAIR_RW_WRITE_LOCK(&per_ag_lock[agno]);
+ pthread_mutex_lock(&ag_locks[agno]);
+
for (cur_agbno = chunk_start_agbno;
cur_agbno < chunk_stop_agbno; cur_agbno++) {
switch (state = get_agbno_state(mp, agno, cur_agbno)) {
break;
}
}
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
+ pthread_mutex_unlock(&ag_locks[agno]);
return(ino_cnt);
}
*
* *bogus is set to 1 if the entire set of inodes is bad.
*/
+
/* ARGSUSED */
-int
-process_inode_chunk(xfs_mount_t *mp, xfs_agnumber_t agno, int num_inos,
- ino_tree_node_t *first_irec, int ino_discovery,
- int check_dups, int extra_attr_check, int *bogus)
+static int
+process_inode_chunk(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno,
+ int num_inos,
+ ino_tree_node_t *first_irec,
+ int ino_discovery,
+ int check_dups,
+ int extra_attr_check,
+ int *bogus)
{
xfs_ino_t parent;
ino_tree_node_t *ino_rec;
- xfs_buf_t *bp;
+ xfs_buf_t **bplist;
xfs_dinode_t *dino;
int icnt;
int status;
int is_used;
int state;
- int done;
int ino_dirty;
int irec_offset;
int ibuf_offset;
int dirty = 0;
int cleared = 0;
int isa_dir = 0;
+ int blks_per_cluster;
+ int cluster_count;
+ int bp_index;
+ int cluster_offset;
ASSERT(first_irec != NULL);
ASSERT(XFS_AGINO_TO_OFFSET(mp, first_irec->ino_startnum) == 0);
*bogus = 0;
ASSERT(XFS_IALLOC_BLOCKS(mp) > 0);
+ blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
+ if (blks_per_cluster == 0)
+ blks_per_cluster = 1;
+ cluster_count = XFS_INODES_PER_CHUNK / inodes_per_cluster;
+ ASSERT(cluster_count > 0);
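+	/*
+	 * e.g. with 4 KiB blocks, 256-byte inodes and 8 KiB inode
+	 * clusters: blks_per_cluster = 2, inodes_per_cluster = 32,
+	 * so a 64-inode chunk is read as cluster_count = 2 buffers.
+	 */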
+
/*
* get all blocks required to read in this chunk (may wind up
* having to process more chunks in a multi-chunk per block fs)
*/
agbno = XFS_AGINO_TO_AGBNO(mp, first_irec->ino_startnum);
- bp = libxfs_readbuf(mp->m_dev, XFS_AGB_TO_DADDR(mp, agno, agbno),
- XFS_FSB_TO_BB(mp, XFS_IALLOC_BLOCKS(mp)), 0);
- if (!bp) {
- do_warn(_("cannot read inode %llu, disk block %lld, cnt %d\n"),
- XFS_AGINO_TO_INO(mp, agno, first_irec->ino_startnum),
- XFS_AGB_TO_DADDR(mp, agno, agbno),
- (int)XFS_FSB_TO_BB(mp, XFS_IALLOC_BLOCKS(mp)));
- return(1);
- }
-
/*
* set up first irec
*/
ino_rec = first_irec;
+
+ bplist = malloc(cluster_count * sizeof(xfs_buf_t *));
+ if (bplist == NULL)
+		do_error(_("failed to allocate %zu bytes of memory\n"),
+			cluster_count * sizeof(xfs_buf_t *));
+
+ for (bp_index = 0; bp_index < cluster_count; bp_index++) {
+#ifdef XR_PF_TRACE
+ pftrace("about to read off %llu in AG %d",
+ (long long)XFS_AGB_TO_DADDR(mp, agno, agbno), agno);
+#endif
+ bplist[bp_index] = libxfs_readbuf(mp->m_dev,
+ XFS_AGB_TO_DADDR(mp, agno, agbno),
+ XFS_FSB_TO_BB(mp, blks_per_cluster), 0);
+ if (!bplist[bp_index]) {
+ do_warn(_("cannot read inode %llu, disk block %lld, cnt %d\n"),
+ XFS_AGINO_TO_INO(mp, agno, first_irec->ino_startnum),
+ XFS_AGB_TO_DADDR(mp, agno, agbno),
+ (int)XFS_FSB_TO_BB(mp, blks_per_cluster));
+ while (bp_index > 0) {
+ bp_index--;
+ libxfs_putbuf(bplist[bp_index]);
+ }
+ free(bplist);
+ return(1);
+ }
+ agbno += blks_per_cluster;
+
+#ifdef XR_PF_TRACE
+ pftrace("readbuf %p (%llu, %d) in AG %d", bplist[bp_index],
+ (long long)XFS_BUF_ADDR(bplist[bp_index]),
+ XFS_BUF_COUNT(bplist[bp_index]), agno);
+#endif
+ }
+ agbno = XFS_AGINO_TO_AGBNO(mp, first_irec->ino_startnum);
+
/*
* initialize counters
*/
irec_offset = 0;
ibuf_offset = 0;
+ cluster_offset = 0;
icnt = 0;
status = 0;
- done = 0;
+ bp_index = 0;
/*
* verify inode chunk if necessary
*/
if (ino_discovery) {
- while (!done) {
+ for (;;) {
/*
* make inode pointer
*/
- dino = XFS_MAKE_IPTR(mp, bp, icnt);
+ dino = XFS_MAKE_IPTR(mp, bplist[bp_index], cluster_offset);
agino = irec_offset + ino_rec->ino_startnum;
/*
irec_offset++;
icnt++;
+ cluster_offset++;
if (icnt == XFS_IALLOC_INODES(mp) &&
irec_offset == XFS_INODES_PER_CHUNK) {
* done! - finished up irec and block
* simultaneously
*/
- libxfs_putbuf(bp);
- done = 1;
break;
} else if (irec_offset == XFS_INODES_PER_CHUNK) {
/*
ASSERT(ino_rec->ino_startnum == agino + 1);
irec_offset = 0;
}
+ if (cluster_offset == inodes_per_cluster) {
+ bp_index++;
+ cluster_offset = 0;
+ }
}
/*
*/
if (!status) {
*bogus = 1;
- if (!done) /* already free'd */
- libxfs_putbuf(bp);
+ for (bp_index = 0; bp_index < cluster_count; bp_index++)
+ libxfs_putbuf(bplist[bp_index]);
+ free(bplist);
return(0);
}
ino_rec = first_irec;
irec_offset = 0;
- ibuf_offset = 0;
+ cluster_offset = 0;
+ bp_index = 0;
icnt = 0;
status = 0;
- done = 0;
-
- /* nathans TODO ... memory leak here?: */
-
- /*
- * get first block
- */
- bp = libxfs_readbuf(mp->m_dev,
- XFS_AGB_TO_DADDR(mp, agno, agbno),
- XFS_FSB_TO_BB(mp, XFS_IALLOC_BLOCKS(mp)), 0);
- if (!bp) {
- do_warn(_("can't read inode %llu, disk block %lld, "
- "cnt %d\n"), XFS_AGINO_TO_INO(mp, agno, agino),
- XFS_AGB_TO_DADDR(mp, agno, agbno),
- (int)XFS_FSB_TO_BB(mp, XFS_IALLOC_BLOCKS(mp)));
- return(1);
- }
}
/*
* mark block as an inode block in the incore bitmap
*/
- PREPAIR_RW_WRITE_LOCK(&per_ag_lock[agno]);
+ pthread_mutex_lock(&ag_locks[agno]);
switch (state = get_agbno_state(mp, agno, agbno)) {
- case XR_E_INO: /* already marked */
- break;
- case XR_E_UNKNOWN:
- case XR_E_FREE:
- case XR_E_FREE1:
- set_agbno_state(mp, agno, agbno, XR_E_INO);
- break;
- case XR_E_BAD_STATE:
- do_error(_("bad state in block map %d\n"), state);
- break;
- default:
- set_agbno_state(mp, agno, agbno, XR_E_MULT);
- do_warn(_("inode block %llu multiply claimed, state was %d\n"),
- XFS_AGB_TO_FSB(mp, agno, agbno), state);
- break;
+ case XR_E_INO: /* already marked */
+ break;
+ case XR_E_UNKNOWN:
+ case XR_E_FREE:
+ case XR_E_FREE1:
+ set_agbno_state(mp, agno, agbno, XR_E_INO);
+ break;
+ case XR_E_BAD_STATE:
+ do_error(_("bad state in block map %d\n"), state);
+ break;
+ default:
+ set_agbno_state(mp, agno, agbno, XR_E_MULT);
+ do_warn(_("inode block %llu multiply claimed, state was %d\n"),
+ XFS_AGB_TO_FSB(mp, agno, agbno), state);
+ break;
}
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
+ pthread_mutex_unlock(&ag_locks[agno]);
- while (!done) {
+ for (;;) {
/*
* make inode pointer
*/
- dino = XFS_MAKE_IPTR(mp, bp, icnt);
+ dino = XFS_MAKE_IPTR(mp, bplist[bp_index], cluster_offset);
agino = irec_offset + ino_rec->ino_startnum;
is_used = 3;
irec_offset++;
ibuf_offset++;
icnt++;
+ cluster_offset++;
if (icnt == XFS_IALLOC_INODES(mp) &&
irec_offset == XFS_INODES_PER_CHUNK) {
/*
* done! - finished up irec and block simultaneously
*/
- if (dirty && !no_modify)
- libxfs_writebuf(bp, 0);
- else
- libxfs_putbuf(bp);
-
- done = 1;
+ for (bp_index = 0; bp_index < cluster_count; bp_index++) {
+#ifdef XR_PF_TRACE
+ pftrace("put/writebuf %p (%llu) in AG %d", bplist[bp_index],
+ (long long)XFS_BUF_ADDR(bplist[bp_index]), agno);
+#endif
+ if (dirty && !no_modify)
+ libxfs_writebuf(bplist[bp_index], 0);
+ else
+ libxfs_putbuf(bplist[bp_index]);
+ }
+ free(bplist);
break;
} else if (ibuf_offset == mp->m_sb.sb_inopblock) {
/*
ibuf_offset = 0;
agbno++;
- PREPAIR_RW_WRITE_LOCK(&per_ag_lock[agno]);
+ pthread_mutex_lock(&ag_locks[agno]);
switch (state = get_agbno_state(mp, agno, agbno)) {
case XR_E_INO: /* already marked */
break;
XFS_AGB_TO_FSB(mp, agno, agbno), state);
break;
}
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
+ pthread_mutex_unlock(&ag_locks[agno]);
} else if (irec_offset == XFS_INODES_PER_CHUNK) {
/*
ASSERT(ino_rec->ino_startnum == agino + 1);
irec_offset = 0;
}
+ if (cluster_offset == inodes_per_cluster) {
+ bp_index++;
+ cluster_offset = 0;
+ }
}
return(0);
}
* phase 4 after we've run through and set the bitmap once.
*/
void
-process_aginodes(xfs_mount_t *mp, xfs_agnumber_t agno,
- int ino_discovery, int check_dups, int extra_attr_check)
+process_aginodes(
+ xfs_mount_t *mp,
+ prefetch_args_t *pf_args,
+ xfs_agnumber_t agno,
+ int ino_discovery,
+ int check_dups,
+ int extra_attr_check)
{
- int num_inos, bogus;
- ino_tree_node_t *ino_rec, *first_ino_rec, *prev_ino_rec;
- ino_tree_node_t *ino_ra;
-
- ino_ra = do_prefetch ? prefetch_inode_chunks(mp, agno, NULL) : NULL;
-
+ int num_inos, bogus;
+ ino_tree_node_t *ino_rec, *first_ino_rec, *prev_ino_rec;
+#ifdef XR_PF_TRACE
+ int count;
+#endif
first_ino_rec = ino_rec = findfirst_inode_rec(agno);
+
while (ino_rec != NULL) {
/*
* paranoia - step through inode records until we step
*/
num_inos = XFS_INODES_PER_CHUNK;
while (num_inos < XFS_IALLOC_INODES(mp) && ino_rec != NULL) {
- ASSERT(ino_rec != NULL);
/*
* inode chunks will always be aligned and sized
* correctly
ASSERT(num_inos == XFS_IALLOC_INODES(mp));
- if (do_prefetch && ino_ra && (first_ino_rec->ino_startnum >= ino_ra->ino_startnum))
- ino_ra = prefetch_inode_chunks(mp, agno, ino_ra);
+ if (pf_args) {
+ sem_post(&pf_args->ra_count);
+#ifdef XR_PF_TRACE
+ sem_getvalue(&pf_args->ra_count, &count);
+ pftrace("processing inode chunk %p in AG %d (sem count = %d)",
+ first_ino_rec, agno, count);
+#endif
+ }
if (process_inode_chunk(mp, agno, num_inos, first_ino_rec,
- ino_discovery, check_dups, extra_attr_check, &bogus)) {
+ ino_discovery, check_dups, extra_attr_check,
+ &bogus)) {
/* XXX - i/o error, we've got a problem */
abort();
}
return(NULLDFSBNO);
}
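The sem_post() added to process_aginodes() above is one half of a producer/consumer throttle: the prefetch thread spends a read-ahead credit for each inode chunk it reads, and the processing side hands a credit back for each chunk it consumes, so prefetch stays a bounded distance ahead of processing. A minimal sketch of that idea follows; MAX_AHEAD, read_next_chunk() and process_next_chunk() are illustrative stand-ins, not names from this patch.

#include <pthread.h>
#include <semaphore.h>

#define MAX_AHEAD	8		/* prefetch at most this many chunks ahead */

static sem_t	ra_count;		/* remaining read-ahead credits */

extern int	read_next_chunk(void);		/* hypothetical I/O producer */
extern int	process_next_chunk(void);	/* hypothetical consumer */

static void *
prefetch_thread(void *arg)
{
	/* each read spends one credit; blocks once MAX_AHEAD chunks ahead */
	while (sem_wait(&ra_count) == 0 && read_next_chunk())
		;
	return NULL;
}

static void
run_pipeline(void)
{
	pthread_t	tid;

	sem_init(&ra_count, 0, MAX_AHEAD);
	pthread_create(&tid, NULL, prefetch_thread, NULL);
	while (process_next_chunk())
		sem_post(&ra_count);	/* hand one credit back per chunk */
	sem_post(&ra_count);		/* let the reader see end-of-work */
	pthread_join(tid, NULL);
}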
-/*
- * process_bmbt_reclist_int is the most compute intensive
- * function in repair. The following macros reduce the
- * the large number of lock/unlock steps it would otherwise
- * call.
- */
-#define PROCESS_BMBT_DECL(type, var) type var
-#define PROCESS_BMBT_LOCK(agno) \
- if (do_parallel && (agno != locked_agno)) { \
- if (locked_agno != -1) /* release old ag lock */ \
- PREPAIR_RW_UNLOCK_NOTEST(&per_ag_lock[locked_agno]); \
- PREPAIR_RW_WRITE_LOCK_NOTEST(&per_ag_lock[agno]); \
- locked_agno = agno; \
+static int
+process_rt_rec(
+ xfs_mount_t *mp,
+ xfs_bmbt_rec_32_t *rp,
+ xfs_ino_t ino,
+ xfs_drfsbno_t *tot,
+ int check_dups)
+{
+ xfs_dfsbno_t b;
+ xfs_drtbno_t ext;
+ xfs_dfilblks_t c; /* count */
+ xfs_dfsbno_t s; /* start */
+ xfs_dfiloff_t o; /* offset */
+ int state;
+ int flag; /* extent flag */
+ int pwe; /* partially-written extent */
+
+ convert_extent(rp, &o, &s, &c, &flag);
+
+ /*
+ * check numeric validity of the extent
+ */
+ if (s >= mp->m_sb.sb_rblocks) {
+ do_warn(_("inode %llu - bad rt extent start block number "
+ "%llu, offset %llu\n"), ino, s, o);
+ return 1;
+ }
+ if (s + c - 1 >= mp->m_sb.sb_rblocks) {
+ do_warn(_("inode %llu - bad rt extent last block number %llu, "
+ "offset %llu\n"), ino, s + c - 1, o);
+ return 1;
+ }
+ if (s + c - 1 < s) {
+ do_warn(_("inode %llu - bad rt extent overflows - start %llu, "
+ "end %llu, offset %llu\n"),
+ ino, s, s + c - 1, o);
+ return 1;
+ }
+
+ /*
+ * verify that the blocks listed in the record
+ * are multiples of an extent
+ */
+ if (XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) == 0 &&
+ (s % mp->m_sb.sb_rextsize != 0 ||
+ c % mp->m_sb.sb_rextsize != 0)) {
+ do_warn(_("malformed rt inode extent [%llu %llu] (fs rtext "
+ "size = %u)\n"), s, c, mp->m_sb.sb_rextsize);
+ return 1;
+ }
+
+ /*
+ * set the appropriate number of extents
+ */
+ for (b = s; b < s + c; b += mp->m_sb.sb_rextsize) {
+ ext = (xfs_drtbno_t) b / mp->m_sb.sb_rextsize;
+ pwe = XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) && flag &&
+ (b % mp->m_sb.sb_rextsize != 0);
+
+ if (check_dups == 1) {
+ if (search_rt_dup_extent(mp, ext) && !pwe) {
+ do_warn(_("data fork in rt ino %llu claims "
+ "dup rt extent, off - %llu, "
+ "start - %llu, count %llu\n"),
+ ino, o, s, c);
+ return 1;
+ }
+ continue;
+ }
+
+ state = get_rtbno_state(mp, ext);
+
+ switch (state) {
+ case XR_E_FREE:
+ case XR_E_UNKNOWN:
+ set_rtbno_state(mp, ext, XR_E_INUSE);
+ break;
+
+ case XR_E_BAD_STATE:
+ do_error(_("bad state in rt block map %llu\n"),
+ ext);
+
+ case XR_E_FS_MAP:
+ case XR_E_INO:
+ case XR_E_INUSE_FS:
+ do_error(_("data fork in rt inode %llu found "
+ "metadata block %llu in rt bmap\n"),
+ ino, ext);
+
+ case XR_E_INUSE:
+ if (pwe)
+ break;
+
+ case XR_E_MULT:
+ set_rtbno_state(mp, ext, XR_E_MULT);
+ do_warn(_("data fork in rt inode %llu claims "
+ "used rt block %llu\n"),
+ ino, ext);
+ return 1;
+
+ case XR_E_FREE1:
+ default:
+ do_error(_("illegal state %d in rt block map "
+ "%llu\n"), state, b);
+ }
}
-#define PROCESS_BMBT_UNLOCK_RETURN(val) \
- do { \
- if (locked_agno != -1) \
- PREPAIR_RW_UNLOCK_NOTEST(&per_ag_lock[locked_agno]); \
- return (val); \
- } while (0)
+ /*
+ * bump up the block counter
+ */
+ *tot += c;
+
+ return 0;
+}
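process_rt_rec() claims realtime space in units of sb_rextsize blocks: file block b lands in rt extent b / sb_rextsize, and with the extent-flag feature an unaligned block inside an unwritten extent is treated as a partially-written extent (pwe) rather than a double claim. A small worked example of the mapping, with made-up geometry (not values from this patch):

#include <stdio.h>

int
main(void)
{
	unsigned long long	rextsize = 16;	/* assumed blocks per rt extent */
	unsigned long long	s = 40, c = 24;	/* assumed extent: blocks 40..63 */
	unsigned long long	b;

	/* stepping by rextsize visits each rt extent the range touches */
	for (b = s; b < s + c; b += rextsize)
		printf("block %llu -> rt extent %llu%s\n",
			b, b / rextsize,
			(b % rextsize) ? " (unaligned: candidate pwe)" : "");
	return 0;
}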
/*
* return 1 if inode should be cleared, 0 otherwise
int whichfork)
{
xfs_dfsbno_t b;
- xfs_drtbno_t ext;
xfs_dfilblks_t c; /* count */
xfs_dfilblks_t cp = 0; /* prev count */
xfs_dfsbno_t s; /* start */
int i;
int state;
int flag; /* extent flag */
- int pwe; /* partially-written extent */
xfs_dfsbno_t e;
xfs_agnumber_t agno;
xfs_agblock_t agbno;
- PROCESS_BMBT_DECL
- (xfs_agnumber_t, locked_agno=-1);
+ xfs_agnumber_t locked_agno = -1;
+ int error = 1;
if (whichfork == XFS_DATA_FORK)
forkname = _("data");
else
*last_key = o;
if (i > 0 && op + cp > o) {
- do_warn(
- _("bmap rec out of order, inode %llu entry %d "
- "[o s c] [%llu %llu %llu], %d [%llu %llu %llu]\n"),
+ do_warn(_("bmap rec out of order, inode %llu entry %d "
+ "[o s c] [%llu %llu %llu], %d [%llu %llu %llu]\n"),
ino, i, o, s, c, i-1, op, sp, cp);
- PROCESS_BMBT_UNLOCK_RETURN(1);
+ goto done;
}
op = o;
cp = c;
* check numeric validity of the extent
*/
if (c == 0) {
- do_warn(
- _("zero length extent (off = %llu, fsbno = %llu) in ino %llu\n"),
- o, s, ino);
- PROCESS_BMBT_UNLOCK_RETURN(1);
- }
- if (type == XR_INO_RTDATA) {
- if (s >= mp->m_sb.sb_rblocks) {
- do_warn(
- _("inode %llu - bad rt extent start block number %llu, offset %llu\n"),
- ino, s, o);
- PROCESS_BMBT_UNLOCK_RETURN(1);
- }
- if (s + c - 1 >= mp->m_sb.sb_rblocks) {
- do_warn(
- _("inode %llu - bad rt extent last block number %llu, offset %llu\n"),
- ino, s + c - 1, o);
- PROCESS_BMBT_UNLOCK_RETURN(1);
- }
- if (s + c - 1 < s) {
- do_warn(
- _("inode %llu - bad rt extent overflows - start %llu, end %llu, "
- "offset %llu\n"),
- ino, s, s + c - 1, o);
- PROCESS_BMBT_UNLOCK_RETURN(1);
- }
- } else {
- switch (verify_dfsbno_range(mp, s, c)) {
- case XR_DFSBNORANGE_VALID:
- break;
- case XR_DFSBNORANGE_BADSTART:
- do_warn(
- _("inode %llu - bad extent starting block number %llu, offset %llu\n"),
- ino, s, o);
- PROCESS_BMBT_UNLOCK_RETURN(1);
- case XR_DFSBNORANGE_BADEND:
- do_warn(
- _("inode %llu - bad extent last block number %llu, offset %llu\n"),
- ino, s + c - 1, o);
- PROCESS_BMBT_UNLOCK_RETURN(1);
- case XR_DFSBNORANGE_OVERFLOW:
- do_warn(
-
- _("inode %llu - bad extent overflows - start %llu, end %llu, "
- "offset %llu\n"),
- ino, s, s + c - 1, o);
- PROCESS_BMBT_UNLOCK_RETURN(1);
- }
- if (o >= fs_max_file_offset) {
- do_warn(
- _("inode %llu - extent offset too large - start %llu, count %llu, "
- "offset %llu\n"),
- ino, s, c, o);
- PROCESS_BMBT_UNLOCK_RETURN(1);
- }
+ do_warn(_("zero length extent (off = %llu, "
+ "fsbno = %llu) in ino %llu\n"), o, s, ino);
+ goto done;
}
- /*
- * realtime file data fork
- */
- if (type == XR_INO_RTDATA && whichfork == XFS_DATA_FORK) {
+ if (type == XR_INO_RTDATA && whichfork == XFS_DATA_FORK) {
/*
- * XXX - verify that the blocks listed in the record
- * are multiples of an extent
+ * realtime bitmaps don't use AG locks, so returning
+ * immediately is fine for this code path.
*/
- if (XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) == 0
- && (s % mp->m_sb.sb_rextsize != 0 ||
- c % mp->m_sb.sb_rextsize != 0)) {
- do_warn(
- _("malformed rt inode extent [%llu %llu] (fs rtext size = %u)\n"),
- s, c, mp->m_sb.sb_rextsize);
- PROCESS_BMBT_UNLOCK_RETURN(1);
- }
-
- /*
- * XXX - set the appropriate number of extents
- */
- for (b = s; b < s + c; b += mp->m_sb.sb_rextsize) {
- ext = (xfs_drtbno_t) b / mp->m_sb.sb_rextsize;
- if (XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) &&
- flag && (b % mp->m_sb.sb_rextsize != 0)) {
- pwe = 1;
- } else {
- pwe = 0;
- }
-
- if (check_dups == 1) {
- if (search_rt_dup_extent(mp, ext) &&
- !pwe) {
- do_warn(
- _("data fork in rt ino %llu claims dup rt extent, off - %llu, "
- "start - %llu, count %llu\n"),
- ino, o, s, c);
- PROCESS_BMBT_UNLOCK_RETURN(1);
- }
- continue;
- }
-
- state = get_rtbno_state(mp, ext);
-
- switch (state) {
- case XR_E_FREE:
-/* XXX - turn this back on after we
- run process_rtbitmap() in phase2
- do_warn(
- _("%s fork in rt ino %llu claims free rt block %llu\n"),
- forkname, ino, ext);
-*/
- /* fall through ... */
- case XR_E_UNKNOWN:
- set_rtbno_state(mp, ext, XR_E_INUSE);
- break;
- case XR_E_BAD_STATE:
- do_error(
- _("bad state in rt block map %llu\n"), ext);
- abort();
- break;
- case XR_E_FS_MAP:
- case XR_E_INO:
- case XR_E_INUSE_FS:
- do_error(
- _("%s fork in rt inode %llu found metadata block %llu in %s bmap\n"),
- forkname, ino, ext, ftype);
- case XR_E_INUSE:
- if (pwe)
- break;
- case XR_E_MULT:
- set_rtbno_state(mp, ext, XR_E_MULT);
- do_warn(
- _("%s fork in rt inode %llu claims used rt block %llu\n"),
- forkname, ino, ext);
- PROCESS_BMBT_UNLOCK_RETURN(1);
- case XR_E_FREE1:
- default:
- do_error(
- _("illegal state %d in %s block map %llu\n"),
- state, ftype, b);
- }
- }
-
- /*
- * bump up the block counter
- */
- *tot += c;
-
+ if (process_rt_rec(mp, rp, ino, tot, check_dups))
+ return 1;
/*
* skip rest of loop processing since that's
* all for regular file forks and attr forks
continue;
}
-
/*
* regular file data fork or attribute fork
*/
+ switch (verify_dfsbno_range(mp, s, c)) {
+ case XR_DFSBNORANGE_VALID:
+ break;
+
+ case XR_DFSBNORANGE_BADSTART:
+ do_warn(_("inode %llu - bad extent starting "
+ "block number %llu, offset %llu\n"),
+ ino, s, o);
+ goto done;
+
+ case XR_DFSBNORANGE_BADEND:
+ do_warn(_("inode %llu - bad extent last block "
+ "number %llu, offset %llu\n"),
+ ino, s + c - 1, o);
+ goto done;
+
+ case XR_DFSBNORANGE_OVERFLOW:
+ do_warn(_("inode %llu - bad extent overflows - "
+ "start %llu, end %llu, offset %llu\n"),
+ ino, s, s + c - 1, o);
+ goto done;
+ }
+ if (o >= fs_max_file_offset) {
+ do_warn(_("inode %llu - extent offset too large - "
+ "start %llu, count %llu, offset %llu\n"),
+ ino, s, c, o);
+ goto done;
+ }
+
if (blkmapp && *blkmapp)
blkmap_set_ext(blkmapp, o, s, c);
/*
agno = XFS_FSB_TO_AGNO(mp, s);
agbno = XFS_FSB_TO_AGBNO(mp, s);
e = s + c;
- PROCESS_BMBT_LOCK(agno);
- for (b = s; b < e; b++, agbno++) {
- if (check_dups == 1) {
- /*
- * if we're just checking the bmap for dups,
- * return if we find one, otherwise, continue
- * checking each entry without setting the
- * block bitmap
- */
+ if (agno != locked_agno) {
+ if (locked_agno != -1)
+ pthread_mutex_unlock(&ag_locks[locked_agno]);
+ pthread_mutex_lock(&ag_locks[agno]);
+ locked_agno = agno;
+ }
+
+ if (check_dups) {
+ /*
+ * if we're just checking the bmap for dups,
+ * return if we find one, otherwise, continue
+ * checking each entry without setting the
+ * block bitmap
+ */
+ for (b = s; b < e; b++, agbno++) {
if (search_dup_extent(mp, agno, agbno)) {
- do_warn(
- _("%s fork in ino %llu claims dup extent, off - %llu, "
- "start - %llu, cnt %llu\n"),
+ do_warn(_("%s fork in ino %llu claims "
+ "dup extent, off - %llu, "
+ "start - %llu, cnt %llu\n"),
forkname, ino, o, s, c);
- PROCESS_BMBT_UNLOCK_RETURN(1);
+ goto done;
}
- continue;
- }
-
- /* FIX FOR BUG 653709 -- EKN
- * realtime attribute fork, should be valid block number
- * in regular data space, not realtime partion.
- */
- if (type == XR_INO_RTDATA && whichfork == XFS_ATTR_FORK) {
- if (mp->m_sb.sb_agcount < agno)
- PROCESS_BMBT_UNLOCK_RETURN(1);
}
+ *tot += c;
+ continue;
+ }
- /* Process in chunks of 16 (XR_BB_UNIT/XR_BB)
+ for (b = s; b < e; b++, agbno++) {
+ /*
+ * Process in chunks of 16 (XR_BB_UNIT/XR_BB)
* for common XR_E_UNKNOWN to XR_E_INUSE transition
*/
if (((agbno & XR_BB_MASK) == 0) && ((s + c - b) >= (XR_BB_UNIT/XR_BB))) {
}
state = get_agbno_state(mp, agno, agbno);
+
switch (state) {
case XR_E_FREE:
case XR_E_FREE1:
- do_warn(
- _("%s fork in ino %llu claims free block %llu\n"),
+ do_warn(_("%s fork in ino %llu claims free "
+ "block %llu\n"),
forkname, ino, (__uint64_t) b);
/* fall through ... */
case XR_E_UNKNOWN:
set_agbno_state(mp, agno, agbno, XR_E_INUSE);
break;
+
case XR_E_BAD_STATE:
do_error(_("bad state in block map %llu\n"), b);
- abort();
- break;
+
case XR_E_FS_MAP:
case XR_E_INO:
case XR_E_INUSE_FS:
- do_warn(
- _("%s fork in inode %llu claims metadata block %llu\n"),
+ do_warn(_("%s fork in inode %llu claims "
+ "metadata block %llu\n"),
forkname, ino, (__uint64_t) b);
- PROCESS_BMBT_UNLOCK_RETURN(1);
+ goto done;
+
case XR_E_INUSE:
case XR_E_MULT:
set_agbno_state(mp, agno, agbno, XR_E_MULT);
- do_warn(
- _("%s fork in %s inode %llu claims used block %llu\n"),
+ do_warn(_("%s fork in %s inode %llu claims "
+ "used block %llu\n"),
forkname, ftype, ino, (__uint64_t) b);
- PROCESS_BMBT_UNLOCK_RETURN(1);
+ goto done;
+
default:
- do_error(
- _("illegal state %d in block map %llu\n"),
+ do_error(_("illegal state %d in block map %llu\n"),
state, b);
- abort();
}
}
*tot += c;
}
-
- PROCESS_BMBT_UNLOCK_RETURN(0);
+ error = 0;
+done:
+ if (locked_agno != -1)
+ pthread_mutex_unlock(&ag_locks[locked_agno]);
+ return error;
}
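The PROCESS_BMBT_LOCK/UNLOCK macros are replaced by an explicit locked_agno cache: all blocks of one extent (and usually of neighbouring extents) live in the same AG, so the per-AG mutex is only switched when the AG actually changes, and the done: label guarantees the lock is dropped on every exit path. A self-contained sketch of the pattern, with types and sizes assumed:

#include <pthread.h>

#define NAGS	16	/* illustrative AG count */

static pthread_mutex_t	ag_locks[NAGS];

static void
init_locks(void)
{
	int	i;

	for (i = 0; i < NAGS; i++)
		pthread_mutex_init(&ag_locks[i], NULL);
}

static void
mark_extents(const int *extent_agno, int nextents)
{
	int	i, locked_agno = -1;

	for (i = 0; i < nextents; i++) {
		int	agno = extent_agno[i];

		/* switch locks only when the AG changes */
		if (agno != locked_agno) {
			if (locked_agno != -1)
				pthread_mutex_unlock(&ag_locks[locked_agno]);
			pthread_mutex_lock(&ag_locks[agno]);
			locked_agno = agno;
		}
		/* ... update the incore block map for this extent ... */
	}
	if (locked_agno != -1)
		pthread_mutex_unlock(&ag_locks[locked_agno]);
}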
/*
#ifndef _XR_DINODE_H
#define _XR_DINODE_H
+#include "prefetch.h"
+
struct blkmap;
int
xfs_agnumber_t agno);
void
process_aginodes(xfs_mount_t *mp,
+ prefetch_args_t *pf_args,
xfs_agnumber_t agno,
int check_dirs,
int check_dups,
#include "dinode.h"
#include "dir.h"
#include "bmap.h"
-#include "prefetch.h"
#if XFS_DIR_LEAF_MAPSIZE >= XFS_ATTR_LEAF_MAPSIZE
#define XR_DA_LEAF_MAPSIZE XFS_DIR_LEAF_MAPSIZE
node = NULL;
da_cursor->active = 0;
- if (do_prefetch && (whichfork == XFS_DATA_FORK))
- prefetch_dir1(mp, bno, da_cursor);
-
do {
/*
* read in each block along the way and set up cursor
#include "dir2.h"
#include "bmap.h"
#include "prefetch.h"
+#include "progress.h"
/*
* Tag bad directory entries with this.
xfs_buf_t *bparray[4];
xfs_buf_t **bplist;
xfs_dabuf_t *dabuf;
- int i;
+ int i, j;
int off;
+ int nblocks;
+
+ /*
+ * due to limitations in libxfs_cache, we need to read the
+ * blocks in fsblock size chunks
+ */
+
+ for (i = 0, nblocks = 0; i < nex; i++)
+ nblocks += bmp[i].blockcount;
- if (nex > (sizeof(bparray)/sizeof(xfs_buf_t *))) {
+ if (nblocks > (sizeof(bparray)/sizeof(xfs_buf_t *))) {
- bplist = calloc(nex, sizeof(*bplist));
+ bplist = calloc(nblocks, sizeof(*bplist));
if (bplist == NULL) {
do_error(_("couldn't malloc dir2 buffer list\n"));
/* common case avoids calloc/free */
bplist = bparray;
}
- for (i = 0; i < nex; i++) {
- bplist[i] = libxfs_readbuf(mp->m_dev,
- XFS_FSB_TO_DADDR(mp, bmp[i].startblock),
- XFS_FSB_TO_BB(mp, bmp[i].blockcount), 0);
- if (!bplist[i])
- goto failed;
+ for (i = 0, j = 0; j < nex; j++) {
+ xfs_dfsbno_t bno;
+ int c;
+
+ bno = bmp[j].startblock;
+ for (c = 0; c < bmp[j].blockcount; c++, bno++) {
+#ifdef XR_PF_TRACE
+ pftrace("about to read off %llu",
+ (long long)XFS_FSB_TO_DADDR(mp, bno));
+#endif
+ bplist[i] = libxfs_readbuf(mp->m_dev,
+ XFS_FSB_TO_DADDR(mp, bno),
+ XFS_FSB_TO_BB(mp, 1), 0);
+ if (!bplist[i])
+ goto failed;
+#ifdef XR_PF_TRACE
+ pftrace("readbuf %p (%llu, %d)", bplist[i],
+ (long long)XFS_BUF_ADDR(bplist[i]),
+ XFS_BUF_COUNT(bplist[i]));
+#endif
+ i++;
+ }
}
- dabuf = malloc(XFS_DA_BUF_SIZE(nex));
+ ASSERT(i == nblocks);
+
+ dabuf = malloc(XFS_DA_BUF_SIZE(nblocks));
if (dabuf == NULL) {
do_error(_("couldn't malloc dir2 buffer header\n"));
exit(1);
}
dabuf->dirty = 0;
- dabuf->nbuf = nex;
+ dabuf->nbuf = nblocks;
- if (nex == 1) {
+ if (nblocks == 1) {
bp = bplist[0];
dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
dabuf->data = XFS_BUF_PTR(bp);
do_error(_("couldn't malloc dir2 buffer data\n"));
exit(1);
}
- for (i = off = 0; i < nex; i++, off += XFS_BUF_COUNT(bp)) {
+ for (i = off = 0; i < nblocks; i++, off += XFS_BUF_COUNT(bp)) {
bp = bplist[i];
bcopy(XFS_BUF_PTR(bp), (char *)dabuf->data + off,
XFS_BUF_COUNT(bp));
free(bplist);
return dabuf;
failed:
- for (i = 0; i < nex; i++)
+ for (i = 0; i < nblocks; i++)
libxfs_putbuf(bplist[i]);
if (bplist != bparray)
free(bplist);
bcopy(dabuf->bps, bplist, nbuf * sizeof(*bplist));
}
da_buf_done(dabuf);
- for (i = 0; i < nbuf; i++)
+ for (i = 0; i < nbuf; i++) {
+#ifdef XR_PF_TRACE
+ pftrace("putbuf %p (%llu)", bplist[i], (long long)XFS_BUF_ADDR(bplist[i]));
+#endif
libxfs_putbuf(bplist[i]);
+ }
if (bplist != &bp)
free(bplist);
}
sfp = &dip->di_u.di_dir2sf;
max_size = XFS_DFORK_DSIZE(dip, mp);
- num_entries = INT_GET(sfp->hdr.count, ARCH_CONVERT);
+ num_entries = sfp->hdr.count;
ino_dir_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT);
offset = XFS_DIR2_DATA_FIRST_OFFSET;
bad_offset = *repair = 0;
int t;
bmap_ext_t lbmp;
- if (do_prefetch)
- prefetch_dir2(mp, blkmap);
-
*repair = *dot = *dotdot = good = 0;
*parent = NULLFSINO;
ndbno = NULLDFILOFF;
+++ /dev/null
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <libxfs.h>
-#include "dir_stack.h"
-#include "err_protos.h"
-#include "threads.h"
-
-/*
- * a directory stack for holding directories while
- * we traverse filesystem hierarchy subtrees.
- * names are kind of misleading as this is really
- * implemented as an inode stack. so sue me...
- */
-
-static dir_stack_t dirstack_freelist;
-static int dirstack_init = 0;
-static pthread_mutex_t dirstack_mutex;
-static pthread_mutexattr_t dirstack_mutexattr;
-
-
-void
-dir_stack_init(dir_stack_t *stack)
-{
- stack->cnt = 0;
- stack->head = NULL;
-
- if (dirstack_init == 0) {
- dirstack_init = 1;
- PREPAIR_MTX_ATTR_INIT(&dirstack_mutexattr);
-#ifdef PTHREAD_MUTEX_SPINBLOCK_NP
- PREPAIR_MTX_ATTR_SET(&dirstack_mutexattr, PTHREAD_MUTEX_SPINBLOCK_NP);
-#endif
- PREPAIR_MTX_LOCK_INIT(&dirstack_mutex, &dirstack_mutexattr);
- dir_stack_init(&dirstack_freelist);
- }
-
- stack->cnt = 0;
- stack->head = NULL;
-
- return;
-}
-
-static void
-dir_stack_push(dir_stack_t *stack, dir_stack_elem_t *elem)
-{
- ASSERT(stack->cnt > 0 || (stack->cnt == 0 && stack->head == NULL));
-
- elem->next = stack->head;
- stack->head = elem;
- stack->cnt++;
-
- return;
-}
-
-static dir_stack_elem_t *
-dir_stack_pop(dir_stack_t *stack)
-{
- dir_stack_elem_t *elem;
-
- if (stack->cnt == 0) {
- ASSERT(stack->head == NULL);
- return(NULL);
- }
-
- elem = stack->head;
-
- ASSERT(elem != NULL);
-
- stack->head = elem->next;
- elem->next = NULL;
- stack->cnt--;
-
- return(elem);
-}
-
-void
-push_dir(dir_stack_t *stack, xfs_ino_t ino)
-{
- dir_stack_elem_t *elem;
-
- PREPAIR_MTX_LOCK(&dirstack_mutex);
- if (dirstack_freelist.cnt == 0) {
- if ((elem = malloc(sizeof(dir_stack_elem_t))) == NULL) {
- PREPAIR_MTX_UNLOCK(&dirstack_mutex);
- do_error(
- _("couldn't malloc dir stack element, try more swap\n"));
- exit(1);
- }
- } else {
- elem = dir_stack_pop(&dirstack_freelist);
- }
- PREPAIR_MTX_UNLOCK(&dirstack_mutex);
-
- elem->ino = ino;
-
- dir_stack_push(stack, elem);
-
- return;
-}
-
-xfs_ino_t
-pop_dir(dir_stack_t *stack)
-{
- dir_stack_elem_t *elem;
- xfs_ino_t ino;
-
- elem = dir_stack_pop(stack);
-
- if (elem == NULL)
- return(NULLFSINO);
-
- ino = elem->ino;
- elem->ino = NULLFSINO;
-
- PREPAIR_MTX_LOCK(&dirstack_mutex);
- dir_stack_push(&dirstack_freelist, elem);
- PREPAIR_MTX_UNLOCK(&dirstack_mutex);
-
- return(ino);
-}
+++ /dev/null
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-typedef struct dir_stack_elem {
- xfs_ino_t ino;
- struct dir_stack_elem *next;
-} dir_stack_elem_t;
-
-typedef struct dir_stack {
- int cnt;
- dir_stack_elem_t *head;
-} dir_stack_t;
-
-
-void dir_stack_init(dir_stack_t *stack);
-
-void push_dir(dir_stack_t *stack, xfs_ino_t ino);
-xfs_ino_t pop_dir(dir_stack_t *stack);
/* configuration vars -- fs geometry dependent */
EXTERN int inodes_per_block;
-EXTERN int inodes_per_cluster; /* inodes per inode buffer */
+EXTERN int inodes_per_cluster;
EXTERN unsigned int glob_agcount;
EXTERN int chunks_pblock; /* # of 64-ino chunks per allocation */
EXTERN int max_symlink_blocks;
extern size_t ts_dir_freemap_size;
extern size_t ts_attr_freemap_size;
-EXTERN pthread_rwlock_t *per_ag_lock;
+EXTERN pthread_mutex_t *ag_locks;
-EXTERN int report_interval;
-EXTERN __uint64_t *prog_rpt_done;
+EXTERN int report_interval;
+EXTERN __uint64_t *prog_rpt_done;
+
+#ifdef XR_PF_TRACE
+EXTERN FILE *pf_trace_file;
+#endif
EXTERN int ag_stride;
+EXTERN int thread_count;
#endif /* _XFS_REPAIR_GLOBAL_H */
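pf_trace_file backs the XR_PF_TRACE instrumentation threaded through the patch. The actual pftrace() definition is not shown in these hunks; a plausible shape for such a compile-time trace hook would be the following - an assumption for illustration, not the patch's code:

#ifdef XR_PF_TRACE
#include <stdio.h>
extern FILE	*pf_trace_file;
#define pftrace(fmt, args...) \
	fprintf(pf_trace_file, "%s: " fmt "\n", __FUNCTION__, ## args)
#else
#define pftrace(fmt, args...)	do { } while (0)
#endif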
size_t size = 0;
ba_bmap = (__uint64_t**)malloc(agno*sizeof(__uint64_t *));
- if (!ba_bmap) {
+ if (!ba_bmap)
do_error(_("couldn't allocate block map pointers\n"));
- return;
- }
- PREPAIR_RW_LOCK_ALLOC(per_ag_lock, agno);
+ ag_locks = malloc(agno * sizeof(pthread_mutex_t));
+ if (!ag_locks)
+ do_error(_("couldn't allocate block map locks\n"));
+
for (i = 0; i < agno; i++) {
size = roundup((numblocks+(NBBY/XR_BB)-1) / (NBBY/XR_BB),
sizeof(__uint64_t));
return;
}
bzero(ba_bmap[i], size);
- PREPAIR_RW_LOCK_INIT(&per_ag_lock[i], NULL);
+ pthread_mutex_init(&ag_locks[i], NULL);
}
if (rtblocks == 0) {
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#ifndef XFS_REPAIR_INCORE_H
+#define XFS_REPAIR_INCORE_H
+
+#include "avl.h"
/*
* contains definition information. implementation (code)
* is spread out in separate files.
#define add_inode_refchecked(ino, ino_rec, ino_offset) \
XFS_INOPROC_SET_PROC((ino_rec), (ino_offset))
#define is_inode_refchecked(ino, ino_rec, ino_offset) \
- (XFS_INOPROC_IS_PROC(ino_rec, ino_offset) == 0LL ? 0 : 1)
+ (XFS_INOPROC_IS_PROC(ino_rec, ino_offset) != 0LL)
#else
void add_inode_refchecked(xfs_ino_t ino,
ino_tree_node_t *ino_rec, int ino_offset);
} bmap_cursor_t;
void init_bm_cursor(bmap_cursor_t *cursor, int num_level);
+
+#endif /* XFS_REPAIR_INCORE_H */
/*
* locks.
*/
-static pthread_rwlock_t ext_flist_lock;
-static pthread_rwlock_t rt_ext_tree_lock;
-static pthread_rwlock_t rt_ext_flist_lock;
+static pthread_mutex_t ext_flist_lock;
+static pthread_mutex_t rt_ext_tree_lock;
+static pthread_mutex_t rt_ext_flist_lock;
/*
* extent tree stuff is avl trees of duplicate extents,
extent_tree_node_t *new;
extent_alloc_rec_t *rec;
- PREPAIR_RW_WRITE_LOCK(&ext_flist_lock);
+ pthread_mutex_lock(&ext_flist_lock);
if (ext_flist.cnt == 0) {
ASSERT(ext_flist.list == NULL);
ext_flist.list = (extent_tree_node_t *) new->avl_node.avl_nextino;
ext_flist.cnt--;
new->avl_node.avl_nextino = NULL;
- PREPAIR_RW_UNLOCK(&ext_flist_lock);
+ pthread_mutex_unlock(&ext_flist_lock);
/* initialize node */
void
release_extent_tree_node(extent_tree_node_t *node)
{
- PREPAIR_RW_WRITE_LOCK(&ext_flist_lock);
+ pthread_mutex_lock(&ext_flist_lock);
node->avl_node.avl_nextino = (avlnode_t *) ext_flist.list;
ext_flist.list = node;
ext_flist.cnt++;
- PREPAIR_RW_UNLOCK(&ext_flist_lock);
+ pthread_mutex_unlock(&ext_flist_lock);
return;
}
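The freelist locks become plain mutexes because both paths that used them - allocating a node and releasing one - took the old rwlock in write mode anyway, so the rwlock bought no read concurrency. A self-contained sketch of the mutex-protected freelist pattern, with an illustrative node type:

#include <pthread.h>
#include <stdlib.h>

typedef struct node {
	struct node	*next;
} node_t;

static node_t		*free_list;
static pthread_mutex_t	flist_lock = PTHREAD_MUTEX_INITIALIZER;

static node_t *
get_node(void)
{
	node_t	*n;

	pthread_mutex_lock(&flist_lock);
	if ((n = free_list) != NULL)
		free_list = n->next;
	pthread_mutex_unlock(&flist_lock);
	/* fall back to the heap when the freelist is empty */
	return n ? n : malloc(sizeof(*n));
}

static void
put_node(node_t *n)
{
	pthread_mutex_lock(&flist_lock);
	n->next = free_list;
	free_list = n;
	pthread_mutex_unlock(&flist_lock);
}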
* avl tree code doesn't handle dups so insert
* onto linked list in increasing startblock order
*
- * when called from mk_incore_fstree,
+ * when called from mk_incore_fstree,
* startblock is in increasing order.
* current is an "anchor" node.
* quick check if the new ext goes to the end.
- * if so, append at the end, using the last field
+ * if so, append at the end, using the last field
* of the "anchor".
*/
ASSERT(current->last != NULL);
return;
}
- /*
+ /*
* scan, to find the proper location for new entry.
* this scan is *very* expensive and gets worse
* with increasing entries.
rt_extent_tree_node_t *new;
rt_extent_alloc_rec_t *rec;
- PREPAIR_RW_WRITE_LOCK(&rt_ext_flist_lock);
+ pthread_mutex_lock(&rt_ext_flist_lock);
if (rt_ext_flist.cnt == 0) {
ASSERT(rt_ext_flist.list == NULL);
rt_ext_flist.list = (rt_extent_tree_node_t *) new->avl_node.avl_nextino;
rt_ext_flist.cnt--;
new->avl_node.avl_nextino = NULL;
- PREPAIR_RW_UNLOCK(&rt_ext_flist_lock);
+ pthread_mutex_unlock(&rt_ext_flist_lock);
/* initialize node */
xfs_drtbno_t new_startblock;
xfs_extlen_t new_blockcount;
- PREPAIR_RW_WRITE_LOCK(&rt_ext_tree_lock);
+ pthread_mutex_lock(&rt_ext_tree_lock);
avl64_findranges(rt_ext_tree_ptr, startblock - 1,
startblock + blockcount + 1,
(avl64node_t **) &first, (avl64node_t **) &last);
do_error(_("duplicate extent range\n"));
}
- PREPAIR_RW_UNLOCK(&rt_ext_tree_lock);
+ pthread_mutex_unlock(&rt_ext_tree_lock);
return;
}
*/
if (ext->rt_startblock <= startblock &&
ext->rt_blockcount >= blockcount) {
- PREPAIR_RW_UNLOCK(&rt_ext_tree_lock);
+ pthread_mutex_unlock(&rt_ext_tree_lock);
return;
}
/*
do_error(_("duplicate extent range\n"));
}
- PREPAIR_RW_UNLOCK(&rt_ext_tree_lock);
+ pthread_mutex_unlock(&rt_ext_tree_lock);
return;
}
{
int ret;
- PREPAIR_RW_READ_LOCK(&rt_ext_tree_lock);
+ pthread_mutex_lock(&rt_ext_tree_lock);
if (avl64_findrange(rt_ext_tree_ptr, bno) != NULL)
ret = 1;
else
ret = 0;
- PREPAIR_RW_UNLOCK(&rt_ext_tree_lock);
+ pthread_mutex_unlock(&rt_ext_tree_lock);
return(ret);
}
ba_list = NULL;
rt_ba_list = NULL;
- PREPAIR_RW_LOCK_INIT(&ext_flist_lock, NULL);
- PREPAIR_RW_LOCK_INIT(&rt_ext_tree_lock, NULL);
- PREPAIR_RW_LOCK_INIT(&rt_ext_flist_lock, NULL);
+ pthread_mutex_init(&ext_flist_lock, NULL);
+ pthread_mutex_init(&rt_ext_tree_lock, NULL);
+ pthread_mutex_init(&rt_ext_flist_lock, NULL);
if ((extent_tree_ptrs = malloc(agcount *
sizeof(avltree_desc_t *))) == NULL)
#include "threads.h"
#include "err_protos.h"
-static pthread_rwlock_t ino_flist_lock;
+static pthread_mutex_t ino_flist_lock;
extern avlnode_t *avl_firstino(avlnode_t *root);
/*
ino_tree_node_t *ino_rec;
avlnode_t *node;
- PREPAIR_RW_WRITE_LOCK(&ino_flist_lock);
+ pthread_mutex_lock(&ino_flist_lock);
if (ino_flist.cnt == 0) {
ASSERT(ino_flist.list == NULL);
ino_flist.cnt--;
node = &ino_rec->avl_node;
node->avl_nextino = node->avl_forw = node->avl_back = NULL;
- PREPAIR_RW_UNLOCK(&ino_flist_lock);
+ pthread_mutex_unlock(&ino_flist_lock);
/* initialize node */
ino_rec->avl_node.avl_forw = NULL;
ino_rec->avl_node.avl_back = NULL;
- PREPAIR_RW_WRITE_LOCK(&ino_flist_lock);
+ pthread_mutex_lock(&ino_flist_lock);
if (ino_flist.list != NULL) {
ASSERT(ino_flist.cnt > 0);
ino_rec->avl_node.avl_nextino = (avlnode_t *) ino_flist.list;
free(ino_rec->ino_un.ex_data);
}
- PREPAIR_RW_UNLOCK(&ino_flist_lock);
-
- return;
+ pthread_mutex_unlock(&ino_flist_lock);
}
/*
* set cache entry
*/
last_rec[agno] = ino_rec;
-
- return;
}
/*
clear_uncertain_ino_cache(xfs_agnumber_t agno)
{
last_rec[agno] = NULL;
-
- return;
}
free_inode_rec(xfs_agnumber_t agno, ino_tree_node_t *ino_rec)
{
free_ino_tree_node(ino_rec);
-
- return;
}
void
avl_findranges(inode_tree_ptrs[agno], start_ino,
end_ino, (avlnode_t **) first, (avlnode_t **) last);
- return;
}
/*
#endif
irec->ino_un.plist->pentries[target] = parent;
irec->ino_un.plist->pmask |= (1LL << offset);
-
- return;
}
xfs_ino_t
int i;
int agcount = mp->m_sb.sb_agcount;
- PREPAIR_RW_LOCK_INIT(&ino_flist_lock, NULL);
+ pthread_mutex_init(&ino_flist_lock, NULL);
if ((inode_tree_ptrs = malloc(agcount *
sizeof(avltree_desc_t *))) == NULL)
do_error(_("couldn't malloc inode tree descriptor table\n"));
bzero(last_rec, sizeof(ino_tree_node_t *) * agcount);
full_ino_ex_data = 0;
-
- return;
}
#ifdef XR_INO_REF_DEBUG
XFS_INOPROC_SET_PROC((ino_rec), (ino_offset));
ASSERT(is_inode_refchecked(ino, ino_rec, ino_offset));
-
- return;
}
int
#include "protos.h"
#include "err_protos.h"
#include "pthread.h"
+#include "avl.h"
+#include "dir.h"
+#include "incore.h"
#include "prefetch.h"
+#include "radix-tree.h"
#include <sys/resource.h>
static pthread_key_t dirbuf_key;
ts_create();
ts_init();
increase_rlimit();
- if (do_prefetch) {
- do_prefetch = libxfs_lio_init();
- if (do_prefetch)
- libxfs_lio_allocate();
- }
+ radix_tree_init();
}
#include "dinode.h"
#include "threads.h"
#include "progress.h"
+#include "prefetch.h"
/*
* walks an unlinked list, returns 1 on an error (bogus pointer) or
add_aginode_uncertain(agno, current_ino, 1);
agbno = XFS_AGINO_TO_AGBNO(mp, current_ino);
- PREPAIR_RW_WRITE_LOCK(&per_ag_lock[agno]);
+ pthread_mutex_lock(&ag_locks[agno]);
switch (state = get_agbno_state(mp,
agno, agbno)) {
case XR_E_UNKNOWN:
case XR_E_FREE1:
set_agbno_state(mp, agno, agbno,
XR_E_INO);
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
break;
case XR_E_BAD_STATE:
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
do_error(_(
"bad state in block map %d\n"),
state);
- abort();
break;
default:
/*
*/
set_agbno_state(mp, agno, agbno,
XR_E_INO);
- PREPAIR_RW_UNLOCK(&per_ag_lock[agno]);
break;
}
+ pthread_mutex_unlock(&ag_locks[agno]);
}
current_ino = dip->di_next_unlinked;
} else {
libxfs_putbuf(bp);
}
-void
-parallel_p3_process_aginodes(xfs_mount_t *mp, xfs_agnumber_t agno)
+static void
+process_ag_func(
+ work_queue_t *wq,
+ xfs_agnumber_t agno,
+ void *arg)
{
/*
* turn on directory processing (inode discovery) and
* attribute processing (extra_attr_check)
*/
+ wait_for_inode_prefetch(arg);
do_log(_(" - agno = %d\n"), agno);
- process_aginodes(mp, agno, 1, 0, 1);
+ process_aginodes(wq->mp, arg, agno, 1, 0, 1);
+ cleanup_inode_prefetch(arg);
+}
+
+static void
+process_ags(
+ xfs_mount_t *mp)
+{
+ int i, j;
+ xfs_agnumber_t agno;
+ work_queue_t *queues;
+ prefetch_args_t *pf_args[2];
+
+ queues = malloc(thread_count * sizeof(work_queue_t));
+
+ if (ag_stride) {
+ /*
+ * create one worker thread for each segment of the volume
+ */
+ for (i = 0, agno = 0; i < thread_count; i++) {
+ create_work_queue(&queues[i], mp, 1);
+ pf_args[0] = NULL;
+ for (j = 0; j < ag_stride && agno < mp->m_sb.sb_agcount;
+ j++, agno++) {
+ pf_args[0] = start_inode_prefetch(agno, 0, pf_args[0]);
+ queue_work(&queues[i], process_ag_func, agno, pf_args[0]);
+ }
+ }
+ /*
+ * wait for workers to complete
+ */
+ for (i = 0; i < thread_count; i++)
+ destroy_work_queue(&queues[i]);
+ } else {
+ queues[0].mp = mp;
+ pf_args[0] = start_inode_prefetch(0, 0, NULL);
+ for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+ pf_args[(~i) & 1] = start_inode_prefetch(i + 1, 0,
+ pf_args[i & 1]);
+ process_ag_func(&queues[0], i, pf_args[i & 1]);
+ }
+ }
+ free(queues);
}
void
phase3(xfs_mount_t *mp)
{
- int i, j;
+ int i, j;
do_log(_("Phase 3 - for each AG...\n"));
if (!no_modify)
" - process known inodes and perform inode discovery...\n"));
set_progress_msg(PROG_FMT_PROCESS_INO, (__uint64_t) mp->m_sb.sb_icount);
- if (ag_stride) {
- int steps = (mp->m_sb.sb_agcount + ag_stride - 1) / ag_stride;
- for (i = 0; i < steps; i++)
- for (j = i; j < mp->m_sb.sb_agcount; j += ag_stride)
- queue_work(parallel_p3_process_aginodes, mp, j);
- } else {
- for (i = 0; i < mp->m_sb.sb_agcount; i++)
- parallel_p3_process_aginodes(mp, i);
- }
- wait_for_workers();
+
+ process_ags(mp);
+
print_final_rpt();
/*
#include "dir2.h"
#include "threads.h"
#include "progress.h"
+#include "prefetch.h"
/*
}
-void
-parallel_p4_process_aginodes(xfs_mount_t *mp, xfs_agnumber_t agno)
+static void
+process_ag_func(
+ work_queue_t *wq,
+ xfs_agnumber_t agno,
+ void *arg)
{
+ wait_for_inode_prefetch(arg);
do_log(_(" - agno = %d\n"), agno);
- process_aginodes(mp, agno, 0, 1, 0);
+ process_aginodes(wq->mp, arg, agno, 0, 1, 0);
+ cleanup_inode_prefetch(arg);
/*
* now recycle the per-AG duplicate extent records
release_dup_extent_tree(agno);
}
+static void
+process_ags(
+ xfs_mount_t *mp)
+{
+ int i, j;
+ work_queue_t *queues;
+ prefetch_args_t *pf_args[2];
+
+ queues = malloc(thread_count * sizeof(work_queue_t));
+
+ if (!libxfs_bcache_overflowed()) {
+ queues[0].mp = mp;
+ create_work_queue(&queues[0], mp, libxfs_nproc());
+ for (i = 0; i < mp->m_sb.sb_agcount; i++)
+ queue_work(&queues[0], process_ag_func, i, NULL);
+ destroy_work_queue(&queues[0]);
+ } else {
+ if (ag_stride) {
+ /*
+ * create one worker thread for each segment of the volume
+ */
+ for (i = 0; i < thread_count; i++) {
+ create_work_queue(&queues[i], mp, 1);
+ pf_args[0] = NULL;
+ for (j = i; j < mp->m_sb.sb_agcount; j += ag_stride) {
+ pf_args[0] = start_inode_prefetch(j, 0, pf_args[0]);
+ queue_work(&queues[i], process_ag_func, j, pf_args[0]);
+ }
+ }
+ /*
+ * wait for workers to complete
+ */
+ for (i = 0; i < thread_count; i++)
+ destroy_work_queue(&queues[i]);
+ } else {
+ queues[0].mp = mp;
+ pf_args[0] = start_inode_prefetch(0, 0, NULL);
+ for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+ pf_args[(~i) & 1] = start_inode_prefetch(i + 1,
+ 0, pf_args[i & 1]);
+ process_ag_func(&queues[0], i, pf_args[i & 1]);
+ }
+ }
+ }
+ free(queues);
+}
+
+
void
phase4(xfs_mount_t *mp)
{
* and attribute processing is turned OFF since we did that
* already in phase 3.
*/
- if (ag_stride) {
- int steps = (mp->m_sb.sb_agcount + ag_stride - 1) / ag_stride;
- for (i = 0; i < steps; i++)
- for (j = i; j < mp->m_sb.sb_agcount; j += ag_stride)
- queue_work(parallel_p4_process_aginodes, mp, j);
- } else {
- for (i = 0; i < mp->m_sb.sb_agcount; i++)
- parallel_p4_process_aginodes(mp, i);
- }
-
- wait_for_workers();
+ process_ags(mp);
print_final_rpt();
/*
set_inode_used(irec, i);
}
-void
-phase5_function(xfs_mount_t *mp, xfs_agnumber_t agno)
+static void
+phase5_func(
+ xfs_mount_t *mp,
+ xfs_agnumber_t agno)
{
__uint64_t num_inos;
__uint64_t num_free_inos;
void
phase5(xfs_mount_t *mp)
{
- xfs_agnumber_t agno;
+ xfs_agnumber_t agno;
do_log(_("Phase 5 - rebuild AG headers and trees...\n"));
set_progress_msg(PROG_FMT_REBUILD_AG, (__uint64_t )glob_agcount);
if (sb_fdblocks_ag == NULL)
do_error(_("cannot alloc sb_fdblocks_ag buffers\n"));
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- queue_work(phase5_function, mp, agno);
- }
- wait_for_workers();
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++)
+ phase5_func(mp, agno);
+
print_final_rpt();
/* aggregate per ag counters */
#include "incore.h"
#include "dir.h"
#include "dir2.h"
-#include "dir_stack.h"
#include "protos.h"
#include "err_protos.h"
#include "dinode.h"
#include "prefetch.h"
#include "progress.h"
+#include "threads.h"
#include "versions.h"
static struct cred zerocr;
static struct fsxattr zerofsx;
static xfs_ino_t orphanage_ino;
-static xfs_inode_t *orphanage_ip;
/*
* Data structures and routines to keep track of directory entries
int hsize;
hsize = size / (16 * 4);
- if (hsize > 1024)
- hsize = 1024;
+ if (hsize > 65536)
+ hsize = 65536;
else if (hsize < 16)
hsize = 16;
if ((hashtab = calloc(DIR_HASH_TAB_SIZE(hsize), 1)) == NULL)
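With the 65536 clamp corrected above, the hash table is sized at roughly one bucket per 64 bytes of directory data, clamped to [16, 65536]. A worked example mirroring the patch's arithmetic (standalone, values assumed): a 1 MiB directory gets 1048576 / 64 = 16384 buckets.

static int
dir_hash_size(unsigned long long dirsize)
{
	int	hsize = dirsize / (16 * 4);	/* ~1 bucket per 64 bytes */

	if (hsize > 65536)
		hsize = 65536;
	else if (hsize < 16)
		hsize = 16;
	return hsize;
}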
xfs_ino_t ino, /* inode # to be moved */
int isa_dir) /* 1 if inode is a directory */
{
+ xfs_inode_t *orphanage_ip;
xfs_ino_t entry_ino_num;
xfs_inode_t *ino_p;
xfs_trans_t *tp;
fnamelen = snprintf(fname, sizeof(fname), "%llu",
(unsigned long long)ino);
- ASSERT(orphanage_ip != NULL);
+ err = libxfs_iget(mp, NULL, orphanage_ino, 0, &orphanage_ip, 0);
+ if (err)
+ do_error(_("%d - couldn't iget orphanage inode\n"), err);
/*
* Make sure the filename is unique in the lost+found
*/
* Returns the fsbno of the first (leftmost) block in the directory leaf.
* sets *bno to the directory block # corresponding to the returned fsbno.
*/
-xfs_dfsbno_t
+static xfs_dfsbno_t
map_first_dblock_fsbno(xfs_mount_t *mp,
xfs_ino_t ino,
xfs_inode_t *ip,
int i;
int error;
char *ftype;
- xfs_fsblock_t fblock2;
/*
* traverse down left-side of tree until we hit the
if (XFS_SB_VERSION_HASDIRV2(&mp->m_sb))
return(fsbno);
- if (do_prefetch) {
- fblock2 = NULLFSBLOCK;
- prefetch_p6_dir1(mp, ino, ip, 0, &fblock2);
- }
-
do {
/*
* walk down left side of btree, release buffers as you
*
* this routine can NOT be called if running in no modify mode
*/
-int
+static int
prune_lf_dir_entry(xfs_mount_t *mp, xfs_ino_t ino, xfs_inode_t *ip,
xfs_dahash_t *hashval)
{
* process a leaf block, also checks for .. entry
* and corrects it to match what we think .. should be
*/
-void
+static void
lf_block_dir_entry_check(xfs_mount_t *mp,
xfs_ino_t ino,
xfs_dir_leafblock_t *leaf,
int *dirty,
int *num_illegal,
int *need_dot,
- dir_stack_t *stack,
ino_tree_node_t *current_irec,
int current_ino_offset,
dir_hash_tab_t *hashtab,
} else if (parent == ino) {
add_inode_reached(irec, ino_offset);
add_inode_ref(current_irec, current_ino_offset);
-
- if (!do_prefetch && !is_inode_refchecked(lino, irec, ino_offset))
- push_dir(stack, lino);
} else {
junkit = 1;
do_warn(
* happen in file blocks. the inode size and other core info
* is already correct, it's just the leaf entries that get altered.
*/
-void
+static void
longform_dir_entry_check(xfs_mount_t *mp,
xfs_ino_t ino,
xfs_inode_t *ip,
int *num_illegal,
int *need_dot,
- dir_stack_t *stack,
ino_tree_node_t *irec,
int ino_offset,
dir_hash_tab_t *hashtab)
if (!skipit)
lf_block_dir_entry_check(mp, ino, leaf, &dirty,
- num_illegal, need_dot, stack, irec,
+ num_illegal, need_dot, irec,
ino_offset, hashtab, da_bno);
da_bno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT);
xfs_fileoff_t lastblock;
xfs_fsblock_t firstblock;
xfs_bmap_free_t flist;
- xfs_ino_t parentino;
- xfs_inode_t *pip;
+ xfs_inode_t pip;
int byhash;
dir_hash_ent_t *p;
int committed;
/*
* first attempt to locate the parent inode, if it can't be found,
- * we'll use the lost+found inode
+ * set it to the root inode and it'll be adjusted or fixed later
+ * if incorrect (the inode number here needs to be valid for the
+ * libxfs_dir2_init() call).
*/
byhash = DIR_HASH_FUNC(hashtab, libxfs_da_hashname((uchar_t*)"..", 2));
- parentino = orphanage_ino;
+ pip.i_ino = mp->m_sb.sb_rootino;
for (p = hashtab->byhash[byhash]; p; p = p->nextbyhash) {
if (p->namelen == 2 && p->name[0] == '.' && p->name[1] == '.') {
- parentino = p->inum;
+ pip.i_ino = p->inum;
break;
}
}
do_error(_("xfs_bmap_last_offset failed -- error - %d\n"),
error);
- /* re-init the directory to shortform */
- if ((error = libxfs_trans_iget(mp, tp, parentino, 0, 0, &pip))) {
- do_warn(
- _("couldn't iget parent inode %llu -- error - %d\n"),
- parentino, error);
- /* we'll try to use the orphanage ino then */
- parentino = orphanage_ino;
- if ((error = libxfs_trans_iget(mp, tp, parentino, 0, 0, &pip)))
- do_error(
- _("couldn't iget lost+found inode %llu -- error - %d\n"),
- parentino, error);
- }
-
/* free all data, leaf, node and freespace blocks */
if ((error = libxfs_bunmapi(tp, ip, 0, lastblock,
ASSERT(done);
- libxfs_dir2_init(tp, ip, pip);
+ libxfs_dir2_init(tp, ip, &pip);
error = libxfs_bmap_finish(&tp, &flist, firstblock, &committed);
xfs_inode_t *ip,
int *num_illegal,
int *need_dot,
- dir_stack_t *stack,
ino_tree_node_t *current_irec,
int current_ino_offset,
xfs_dabuf_t **bpp,
ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
inum = INT_GET(dep->inumber, ARCH_CONVERT);
lastfree = 0;
-
/*
* skip bogus entries (leading '/'). they'll be deleted
* later. must still log it, else we leak references to
libxfs_dir2_data_log_entry(tp, bp, dep);
continue;
}
+
bcopy(dep->name, fname, dep->namelen);
fname[dep->namelen] = '\0';
ASSERT(inum != NULLFSINO);
} else if (parent == ip->i_ino) {
add_inode_reached(irec, ino_offset);
add_inode_ref(current_irec, current_ino_offset);
- if (!do_prefetch && !is_inode_refchecked(inum, irec, ino_offset))
- push_dir(stack, inum);
} else {
junkit = 1;
do_warn(
/*
* Check contents of leaf-form block.
*/
-int
+static int
longform_dir2_check_leaf(
xfs_mount_t *mp,
xfs_inode_t *ip,
* Check contents of the node blocks (leaves)
* Looks for matching hash values for the data entries.
*/
-int
+static int
longform_dir2_check_node(
xfs_mount_t *mp,
xfs_inode_t *ip,
* destroy the entry and create a new one with recovered name/inode pairs.
* (ie. get libxfs to do all the grunt work)
*/
-void
+static void
longform_dir2_entry_check(xfs_mount_t *mp,
xfs_ino_t ino,
xfs_inode_t *ip,
int *num_illegal,
int *need_dot,
- dir_stack_t *stack,
ino_tree_node_t *irec,
int ino_offset,
dir_hash_tab_t *hashtab)
libxfs_dir2_isblock(NULL, ip, &isblock);
libxfs_dir2_isleaf(NULL, ip, &isleaf);
- if (do_prefetch && !isblock)
- prefetch_p6_dir2(mp, ip);
-
/* check directory "data" blocks (ie. name/inode pairs) */
for (da_bno = 0, next_da_bno = 0;
next_da_bno != NULLFILEOFF && da_bno < mp->m_dirleafblk;
continue; /* try and read all "data" blocks */
}
longform_dir2_entry_check_data(mp, ip, num_illegal, need_dot,
- stack, irec, ino_offset, &bplist[db], hashtab,
+ irec, ino_offset, &bplist[db], hashtab,
&freetab, da_bno, isblock);
}
fixit = (*num_illegal != 0) || dir2_is_badino(ino);
* shortform directory processing routines -- entry verification and
* bad entry deletion (pruning).
*/
-void
+static void
shortform_dir_entry_check(xfs_mount_t *mp,
xfs_ino_t ino,
xfs_inode_t *ip,
int *ino_dirty,
- dir_stack_t *stack,
ino_tree_node_t *current_irec,
int current_ino_offset,
dir_hash_tab_t *hashtab)
} else if (parent == ino) {
add_inode_reached(irec, ino_offset);
add_inode_ref(current_irec, current_ino_offset);
-
- if (!do_prefetch && !is_inode_refchecked(lino, irec,
- ino_offset))
- push_dir(stack, lino);
} else {
junkit = 1;
do_warn(_("entry \"%s\" in dir %llu not "
}
/* ARGSUSED */
-void
+static void
prune_sf_dir_entry(xfs_mount_t *mp, xfs_ino_t ino, xfs_inode_t *ip)
{
/* REFERENCED */
* shortform directory v2 processing routines -- entry verification and
* bad entry deletion (pruning).
*/
-void
+static void
shortform_dir2_entry_check(xfs_mount_t *mp,
xfs_ino_t ino,
xfs_inode_t *ip,
int *ino_dirty,
- dir_stack_t *stack,
ino_tree_node_t *current_irec,
int current_ino_offset,
dir_hash_tab_t *hashtab)
} else if (parent == ino) {
add_inode_reached(irec, ino_offset);
add_inode_ref(current_irec, current_ino_offset);
-
- if (!do_prefetch && !is_inode_refchecked(lino, irec,
- ino_offset))
- push_dir(stack, lino);
} else {
junkit = 1;
do_warn(_("entry \"%s\" in directory inode %llu"
}
/*
- * processes all directories reachable via the inodes on the stack
- * returns 0 if things are good, 1 if there's a problem
+ * process a single directory inode - check and fix all of its entries
*/
-void
-process_dirstack(xfs_mount_t *mp, dir_stack_t *stack)
+static void
+process_dir_inode(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ ino_tree_node_t *irec,
+ int ino_offset)
{
xfs_bmap_free_t flist;
xfs_fsblock_t first;
- xfs_ino_t ino;
xfs_inode_t *ip;
xfs_trans_t *tp;
xfs_dahash_t hashval;
- ino_tree_node_t *irec;
dir_hash_tab_t *hashtab;
- int ino_offset, need_dot, committed;
+ int need_dot, committed;
int dirty, num_illegal, error, nres;
/*
- * pull directory inode # off directory stack
- *
* open up directory inode, check all entries,
* then call prune_dir_entries to remove all
* remaining illegal directory entries.
*/
- while ((ino = pop_dir(stack)) != NULLFSINO) {
- irec = find_inode_rec(XFS_INO_TO_AGNO(mp, ino),
- XFS_INO_TO_AGINO(mp, ino));
- ASSERT(irec != NULL);
+ ASSERT(!is_inode_refchecked(ino, irec, ino_offset));
- ino_offset = XFS_INO_TO_AGINO(mp, ino) - irec->ino_startnum;
-
- ASSERT(!is_inode_refchecked(ino, irec, ino_offset));
-
- if ((error = libxfs_iget(mp, NULL, ino, 0, &ip, 0))) {
- if (!no_modify)
- do_error(
- _("couldn't map inode %llu, err = %d\n"),
- ino, error);
- else {
- do_warn(
- _("couldn't map inode %llu, err = %d\n"),
- ino, error);
- /*
- * see below for what we're doing if this
- * is root. Why do we need to do this here?
- * to ensure that the root doesn't show up
- * as being disconnected in the no_modify case.
- */
- if (mp->m_sb.sb_rootino == ino) {
- add_inode_reached(irec, 0);
- add_inode_ref(irec, 0);
- }
- }
-
- add_inode_refchecked(ino, irec, 0);
- continue;
- }
-
- need_dot = dirty = num_illegal = 0;
-
- if (mp->m_sb.sb_rootino == ino) {
+ error = libxfs_iget(mp, NULL, ino, 0, &ip, 0);
+ if (error) {
+ if (!no_modify)
+ do_error(_("couldn't map inode %llu, err = %d\n"),
+ ino, error);
+ else {
+ do_warn(_("couldn't map inode %llu, err = %d\n"),
+ ino, error);
/*
- * mark root inode reached and bump up
- * link count for root inode to account
- * for '..' entry since the root inode is
- * never reached by a parent. we know
- * that root's '..' is always good --
- * guaranteed by phase 3 and/or below.
+ * see below for what we're doing if this
+ * is root. Why do we need to do this here?
+ * to ensure that the root doesn't show up
+ * as being disconnected in the no_modify case.
*/
- add_inode_reached(irec, ino_offset);
+ if (mp->m_sb.sb_rootino == ino) {
+ add_inode_reached(irec, 0);
+ add_inode_ref(irec, 0);
+ }
}
- add_inode_refchecked(ino, irec, ino_offset);
+ add_inode_refchecked(ino, irec, 0);
+ return;
+ }
- hashtab = dir_hash_init(ip->i_d.di_size);
+ need_dot = dirty = num_illegal = 0;
+ if (mp->m_sb.sb_rootino == ino) {
/*
- * look for bogus entries
+ * mark root inode reached and bump up
+ * link count for root inode to account
+ * for '..' entry since the root inode is
+ * never reached by a parent. we know
+ * that root's '..' is always good --
+ * guaranteed by phase 3 and/or below.
*/
- switch (ip->i_d.di_format) {
+ add_inode_reached(irec, ino_offset);
+ }
+
+ add_inode_refchecked(ino, irec, ino_offset);
+
+ hashtab = dir_hash_init(ip->i_d.di_size);
+
+ /*
+ * look for bogus entries
+ */
+ switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
/*
if (XFS_SB_VERSION_HASDIRV2(&mp->m_sb))
longform_dir2_entry_check(mp, ino, ip,
&num_illegal, &need_dot,
- stack, irec,
- ino_offset,
+ irec, ino_offset,
hashtab);
else
longform_dir_entry_check(mp, ino, ip,
&num_illegal, &need_dot,
- stack, irec,
- ino_offset,
+ irec, ino_offset,
hashtab);
break;
+
case XFS_DINODE_FMT_LOCAL:
tp = libxfs_trans_alloc(mp, 0);
/*
if (XFS_SB_VERSION_HASDIRV2(&mp->m_sb))
shortform_dir2_entry_check(mp, ino, ip, &dirty,
- stack, irec,
- ino_offset,
+ irec, ino_offset,
hashtab);
else
shortform_dir_entry_check(mp, ino, ip, &dirty,
- stack, irec,
- ino_offset,
+ irec, ino_offset,
hashtab);
ASSERT(dirty == 0 || (dirty && !no_modify));
if (dirty) {
libxfs_trans_log_inode(tp, ip,
XFS_ILOG_CORE | XFS_ILOG_DDATA);
- libxfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES
- |XFS_TRANS_SYNC, 0);
+ libxfs_trans_commit(tp,
+ XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_SYNC, 0);
} else {
- libxfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+ libxfs_trans_cancel(tp,
+ XFS_TRANS_RELEASE_LOG_RES);
}
break;
+
default:
break;
- }
- dir_hash_done(hashtab);
+ }
+ dir_hash_done(hashtab);
- hashval = 0;
+ hashval = 0;
- /*
- * if we have to create a .. for /, do it now *before*
- * we delete the bogus entries, otherwise the directory
- * could transform into a shortform dir which would
- * probably cause the simulation to choke. Even
- * if the illegal entries get shifted around, it's ok
- * because the entries are structurally intact and in
- * in hash-value order so the simulation won't get confused
- * if it has to move them around.
- */
- if (!no_modify && need_root_dotdot &&
- ino == mp->m_sb.sb_rootino) {
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_LOCAL);
+ /*
+ * if we have to create a .. for /, do it now *before*
+ * we delete the bogus entries, otherwise the directory
+ * could transform into a shortform dir which would
+ * probably cause the simulation to choke. Even
+ * if the illegal entries get shifted around, it's ok
+ * because the entries are structurally intact and in
+ * hash-value order so the simulation won't get confused
+ * if it has to move them around.
+ */
+ if (!no_modify && need_root_dotdot && ino == mp->m_sb.sb_rootino) {
+ ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_LOCAL);
- do_warn(_("recreating root directory .. entry\n"));
+ do_warn(_("recreating root directory .. entry\n"));
- tp = libxfs_trans_alloc(mp, 0);
- ASSERT(tp != NULL);
+ tp = libxfs_trans_alloc(mp, 0);
+ ASSERT(tp != NULL);
- nres = XFS_MKDIR_SPACE_RES(mp, 2);
- error = libxfs_trans_reserve(tp, nres,
- XFS_MKDIR_LOG_RES(mp),
- 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_MKDIR_LOG_COUNT);
+ nres = XFS_MKDIR_SPACE_RES(mp, 2);
+ error = libxfs_trans_reserve(tp, nres, XFS_MKDIR_LOG_RES(mp),
+ 0, XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
+ if (error)
+ res_failed(error);
- if (error)
- res_failed(error);
+ libxfs_trans_ijoin(tp, ip, 0);
+ libxfs_trans_ihold(tp, ip);
- libxfs_trans_ijoin(tp, ip, 0);
- libxfs_trans_ihold(tp, ip);
+ XFS_BMAP_INIT(&flist, &first);
- XFS_BMAP_INIT(&flist, &first);
+ error = dir_createname(mp, tp, ip, "..", 2, ip->i_ino, &first,
+ &flist, nres);
+ if (error)
+ do_error(_("can't make \"..\" entry in root inode "
+ "%llu, createname error %d\n"), ino, error);
- if ((error = dir_createname(mp, tp, ip, "..", 2,
- ip->i_ino, &first, &flist, nres)))
- do_error(
-_("can't make \"..\" entry in root inode %llu, createname error %d\n"),
- ino, error);
+ libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ error = libxfs_bmap_finish(&tp, &flist, first, &committed);
+ ASSERT(error == 0);
+ libxfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES |
+ XFS_TRANS_SYNC, 0);
- error = libxfs_bmap_finish(&tp, &flist, first,
- &committed);
- ASSERT(error == 0);
- libxfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES
- |XFS_TRANS_SYNC, 0);
+ need_root_dotdot = 0;
+ } else if (need_root_dotdot && ino == mp->m_sb.sb_rootino) {
+ do_warn(_("would recreate root directory .. entry\n"));
+ }
- need_root_dotdot = 0;
- } else if (need_root_dotdot && ino == mp->m_sb.sb_rootino) {
- do_warn(_("would recreate root directory .. entry\n"));
+ /*
+ * delete any illegal entries -- which should only exist
+ * if the directory is a longform directory. bogus
+ * shortform directory entries were deleted in phase 4.
+ */
+ if (!no_modify && num_illegal > 0) {
+ ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_LOCAL);
+ ASSERT(!XFS_SB_VERSION_HASDIRV2(&mp->m_sb));
+
+ while (num_illegal > 0 && ip->i_d.di_format !=
+ XFS_DINODE_FMT_LOCAL) {
+ prune_lf_dir_entry(mp, ino, ip, &hashval);
+ num_illegal--;
}
/*
- * delete any illegal entries -- which should only exist
- * if the directory is a longform directory. bogus
- * shortform directory entries were deleted in phase 4.
- */
- if (!no_modify && num_illegal > 0) {
- ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_LOCAL);
- ASSERT(!XFS_SB_VERSION_HASDIRV2(&mp->m_sb));
-
- while (num_illegal > 0 && ip->i_d.di_format !=
- XFS_DINODE_FMT_LOCAL) {
- prune_lf_dir_entry(mp, ino, ip, &hashval);
- num_illegal--;
- }
-
+ * handle case where we've deleted so many
+ * entries that the directory has changed from
+ * a longform to a shortform directory. have
+ * to allocate a transaction since we're working
+ * with the incore data fork.
+ */
+ if (num_illegal > 0) {
+ ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL);
+ tp = libxfs_trans_alloc(mp, 0);
/*
- * handle case where we've deleted so many
- * entries that the directory has changed from
- * a longform to a shortform directory. have
- * to allocate a transaction since we're working
- * with the incore data fork.
- */
- if (num_illegal > 0) {
- ASSERT(ip->i_d.di_format ==
- XFS_DINODE_FMT_LOCAL);
- tp = libxfs_trans_alloc(mp, 0);
- /*
- * using the remove reservation is overkill
- * since at most we'll only need to log the
- * inode but it's easier than wedging a
- * new define in ourselves. 10 block fs
- * space reservation is also overkill but
- * what the heck...
- */
- nres = XFS_REMOVE_SPACE_RES(mp);
- error = libxfs_trans_reserve(tp, nres,
- XFS_REMOVE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_REMOVE_LOG_COUNT);
- if (error)
- res_failed(error);
+ * using the remove reservation is overkill
+ * since at most we'll only need to log the
+ * inode but it's easier than wedging a
+ * new define in ourselves. 10 block fs
+ * space reservation is also overkill but
+ * what the heck...
+ */
+ nres = XFS_REMOVE_SPACE_RES(mp);
+ error = libxfs_trans_reserve(tp, nres,
+ XFS_REMOVE_LOG_RES(mp), 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_REMOVE_LOG_COUNT);
+ if (error)
+ res_failed(error);
- libxfs_trans_ijoin(tp, ip, 0);
- libxfs_trans_ihold(tp, ip);
+ libxfs_trans_ijoin(tp, ip, 0);
+ libxfs_trans_ihold(tp, ip);
- prune_sf_dir_entry(mp, ino, ip);
+ prune_sf_dir_entry(mp, ino, ip);
- libxfs_trans_log_inode(tp, ip,
- XFS_ILOG_CORE | XFS_ILOG_DDATA);
- ASSERT(error == 0);
- libxfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES
- |XFS_TRANS_SYNC, 0);
- }
+ libxfs_trans_log_inode(tp, ip,
+ XFS_ILOG_CORE | XFS_ILOG_DDATA);
+ ASSERT(error == 0);
+ libxfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES
+ |XFS_TRANS_SYNC, 0);
}
+ }
+ /*
+ * if we need to create the '.' entry, do so only if
+ * the directory is a longform dir. if it's been
+ * turned into a shortform dir, then the inode is ok
+ * since shortform dirs have no '.' entry and the inode
+ * has already been committed by prune_lf_dir_entry().
+ */
+ if (need_dot) {
/*
- * if we need to create the '.' entry, do so only if
- * the directory is a longform dir. it it's been
- * turned into a shortform dir, then the inode is ok
- * since shortform dirs have no '.' entry and the inode
- * has already been committed by prune_lf_dir_entry().
+ * bump up our link count but don't
+ * bump up the inode link count. chances
+ * are good that even though we lost '.'
+ * the inode link counts reflect '.' so
+ * leave the inode link count alone and if
+ * it turns out to be wrong, we'll catch
+ * that in phase 7.
*/
- if (need_dot) {
+ add_inode_ref(irec, ino_offset);
+
+ if (no_modify) {
+ do_warn(_("would create missing \".\" entry in dir ino %llu\n"),
+ ino);
+ } else if (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) {
/*
- * bump up our link count but don't
- * bump up the inode link count. chances
- * are good that even though we lost '.'
- * the inode link counts reflect '.' so
- * leave the inode link count alone and if
- * it turns out to be wrong, we'll catch
- * that in phase 7.
+ * need to create . entry in longform dir.
*/
- add_inode_ref(irec, ino_offset);
+ do_warn(_("creating missing \".\" entry in dir ino %llu\n"),
+ ino);
- if (no_modify) {
- do_warn(
- _("would create missing \".\" entry in dir ino %llu\n"),
- ino);
- } else if (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) {
- /*
- * need to create . entry in longform dir.
- */
- do_warn(
- _("creating missing \".\" entry in dir ino %llu\n"),
- ino);
-
- tp = libxfs_trans_alloc(mp, 0);
- ASSERT(tp != NULL);
+ tp = libxfs_trans_alloc(mp, 0);
+ ASSERT(tp != NULL);
- nres = XFS_MKDIR_SPACE_RES(mp, 1);
- error = libxfs_trans_reserve(tp, nres,
- XFS_MKDIR_LOG_RES(mp),
- 0,
- XFS_TRANS_PERM_LOG_RES,
- XFS_MKDIR_LOG_COUNT);
+ nres = XFS_MKDIR_SPACE_RES(mp, 1);
+ error = libxfs_trans_reserve(tp, nres,
+ XFS_MKDIR_LOG_RES(mp),
+ 0,
+ XFS_TRANS_PERM_LOG_RES,
+ XFS_MKDIR_LOG_COUNT);
- if (error)
- res_failed(error);
+ if (error)
+ res_failed(error);
- libxfs_trans_ijoin(tp, ip, 0);
- libxfs_trans_ihold(tp, ip);
+ libxfs_trans_ijoin(tp, ip, 0);
+ libxfs_trans_ihold(tp, ip);
- XFS_BMAP_INIT(&flist, &first);
+ XFS_BMAP_INIT(&flist, &first);
- if ((error = dir_createname(mp, tp, ip, ".",
- 1, ip->i_ino, &first, &flist,
- nres)))
- do_error(
- _("can't make \".\" entry in dir ino %llu, createname error %d\n"),
- ino, error);
+ if ((error = dir_createname(mp, tp, ip, ".",
+ 1, ip->i_ino, &first, &flist,
+ nres)))
+ do_error(_("can't make \".\" entry in dir ino "
+ "%llu, createname error %d\n"),
+ ino, error);
- libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ libxfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = libxfs_bmap_finish(&tp, &flist, first,
- &committed);
- ASSERT(error == 0);
- libxfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES
- |XFS_TRANS_SYNC, 0);
- }
+ error = libxfs_bmap_finish(&tp, &flist, first,
+ &committed);
+ ASSERT(error == 0);
+ libxfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES
+ |XFS_TRANS_SYNC, 0);
}
-
- libxfs_iput(ip, 0);
}
+
+ libxfs_iput(ip, 0);
}
/*
static void
check_for_orphaned_inodes(
xfs_mount_t *mp,
+ xfs_agnumber_t agno,
ino_tree_node_t *irec)
{
int i;
- int err;
xfs_ino_t ino;
for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
if (is_inode_free(irec, i))
continue;
- if (!is_inode_reached(irec, i)) {
- ASSERT(inode_isadir(irec, i) ||
- num_inode_references(irec, i) == 0);
- ino = XFS_AGINO_TO_INO(mp, i, i + irec->ino_startnum);
- if (inode_isadir(irec, i))
- do_warn(_("disconnected dir inode %llu, "), ino);
- else
- do_warn(_("disconnected inode %llu, "), ino);
- if (!no_modify) {
- if (!orphanage_ino)
- orphanage_ino = mk_orphanage(mp);
- if (!orphanage_ip) {
- err = libxfs_iget(mp, NULL, orphanage_ino, 0, &orphanage_ip, 0);
- if (err)
- do_error(_("%d - couldn't iget orphanage inode\n"), err);
- }
- do_warn(_("moving to %s\n"), ORPHANAGE);
- mv_orphanage(mp, ino, inode_isadir(irec, i));
- } else {
- do_warn(_("would move to %s\n"), ORPHANAGE);
- }
- /*
- * for read-only case, even though the inode isn't
- * really reachable, set the flag (and bump our link
- * count) anyway to fool phase 7
- */
- add_inode_reached(irec, i);
+ if (is_inode_reached(irec, i))
+ continue;
+
+ ASSERT(inode_isadir(irec, i) ||
+ num_inode_references(irec, i) == 0);
+
+ ino = XFS_AGINO_TO_INO(mp, agno, i + irec->ino_startnum);
+ if (inode_isadir(irec, i))
+ do_warn(_("disconnected dir inode %llu, "), ino);
+ else
+ do_warn(_("disconnected inode %llu, "), ino);
+ if (!no_modify) {
+ if (!orphanage_ino)
+ orphanage_ino = mk_orphanage(mp);
+ do_warn(_("moving to %s\n"), ORPHANAGE);
+ mv_orphanage(mp, ino, inode_isadir(irec, i));
+ } else {
+ do_warn(_("would move to %s\n"), ORPHANAGE);
}
+ /*
+ * for read-only case, even though the inode isn't
+ * really reachable, set the flag (and bump our link
+ * count) anyway to fool phase 7
+ */
+ add_inode_reached(irec, i);
}
}
static void
-traverse_function(xfs_mount_t *mp, xfs_agnumber_t agno)
+traverse_function(
+ work_queue_t *wq,
+ xfs_agnumber_t agno,
+ void *arg)
{
- register ino_tree_node_t *irec;
- int j;
- xfs_ino_t ino;
- dir_stack_t stack;
+ ino_tree_node_t *irec;
+ int i;
+ prefetch_args_t *pf_args = arg;
+
+ wait_for_inode_prefetch(pf_args);
if (verbose)
do_log(_(" - agno = %d\n"), agno);
- dir_stack_init(&stack);
- irec = findfirst_inode_rec(agno);
-
- while (irec != NULL) {
- for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
- if (!inode_isadir(irec, j)) {
- ino = XFS_AGINO_TO_INO(mp, agno,
- irec->ino_startnum + j);
- if (mp->m_sb.sb_rootino != ino)
- continue;
- }
+ for (irec = findfirst_inode_rec(agno); irec; irec = next_ino_rec(irec)) {
+ if (irec->ino_isa_dir == 0)
+ continue;
- ino = XFS_AGINO_TO_INO(mp, agno,
- irec->ino_startnum + j);
+ if (pf_args)
+ sem_post(&pf_args->ra_count);
- push_dir(&stack, ino);
- process_dirstack(mp, &stack);
+ for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+ if (inode_isadir(irec, i))
+ process_dir_inode(wq->mp,
+ XFS_AGINO_TO_INO(wq->mp, agno,
+ irec->ino_startnum + i), irec, i);
}
- irec = next_ino_rec(irec);
}
- return;
+ cleanup_inode_prefetch(pf_args);
}
static void
-traverse_alt(xfs_mount_t *mp)
+traverse_ags(
+ xfs_mount_t *mp)
{
int i;
-
- set_progress_msg(PROG_FMT_TRAVERSAL, (__uint64_t) glob_agcount);
- for (i = 0; i < mp->m_sb.sb_agcount; i++) {
- traverse_function(mp, i);
- PROG_RPT_INC(prog_rpt_done[i], 1);
+ work_queue_t *queues;
+ prefetch_args_t *pf_args[2];
+
+ queues = malloc(thread_count * sizeof(work_queue_t));
+ queues[0].mp = mp;
+
+ if (!libxfs_bcache_overflowed()) {
+ /*create_work_queue(&queues[0], mp, libxfs_nproc());
+ for (i = 0; i < glob_agcount; i++)
+ queue_work(&queues[0], traverse_function, i, NULL);
+ destroy_work_queue(&queues[0]);*/
+ for (i = 0; i < glob_agcount; i++)
+ traverse_function(&queues[0], i, NULL);
+ } else {
+ /* TODO: AG stride support */
+ pf_args[0] = start_inode_prefetch(0, 1, NULL);
+ for (i = 0; i < glob_agcount; i++) {
+ pf_args[(~i) & 1] = start_inode_prefetch(i + 1, 1,
+ pf_args[i & 1]);
+ traverse_function(&queues[0], i, pf_args[i & 1]);
+ }
}
- print_final_rpt();
+ free(queues);
}
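/*
 * Illustration (not part of the patch): traverse_ags() pipelines prefetch
 * and processing with two slots -- while AG i is processed out of slot
 * (i & 1), slot ((~i) & 1) is refilled with AG i+1's prefetch, so the I/O
 * for the next AG overlaps the CPU work on the current one. A minimal,
 * self-contained sketch with hypothetical prefetch_ag()/process_ag() stubs:
 */
#include <stdio.h>

static int prefetch_ag(int agno, int nags)
{
	if (agno >= nags)
		return -1;		/* no AG past the last one */
	printf("prefetch AG %d\n", agno);
	return agno;
}

static void process_ag(int agno)
{
	printf("process  AG %d\n", agno);
}

int main(void)
{
	int nags = 4, pf[2], i;

	pf[0] = prefetch_ag(0, nags);
	for (i = 0; i < nags; i++) {
		pf[(~i) & 1] = prefetch_ag(i + 1, nags);	/* refill idle slot */
		if (pf[i & 1] == i)				/* consume this AG's slot */
			process_ag(i);
	}
	return 0;
}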
void
phase6(xfs_mount_t *mp)
{
- xfs_ino_t ino;
ino_tree_node_t *irec;
- dir_stack_t stack;
int i;
- int j;
- xfs_ino_t orphanage_ino;
bzero(&zerocr, sizeof(struct cred));
bzero(&zerofsx, sizeof(struct fsxattr));
}
}
- dir_stack_init(&stack);
-
mark_standalone_inodes(mp);
- /*
- * push root dir on stack, then go
- */
- if (!need_root_inode) {
- do_log(_(" - traversing filesystem starting at / ... \n"));
-
- if (do_prefetch) {
- traverse_alt(mp);
- } else {
- push_dir(&stack, mp->m_sb.sb_rootino);
- process_dirstack(mp, &stack);
- }
-
- do_log(_(" - traversal finished ... \n"));
- } else {
- ASSERT(no_modify != 0);
-
- do_log(
-_(" - root inode lost, cannot make new one in no modify mode ... \n"));
- do_log(
-_(" - skipping filesystem traversal from / ... \n"));
- }
-
- do_log(_(" - traversing all unattached subtrees ... \n"));
+ do_log(_(" - traversing filesystem ... \n"));
irec = find_inode_rec(XFS_INO_TO_AGNO(mp, mp->m_sb.sb_rootino),
XFS_INO_TO_AGINO(mp, mp->m_sb.sb_rootino));
}
/*
- * then process all unreached inodes
- * by walking incore inode tree
- *
- * get next unreached directory inode # from
- * incore list
- * push inode on dir stack
- * call process_dirstack
+ * then process all inodes by walking incore inode tree
*/
- for (i = 0; i < glob_agcount; i++) {
- irec = findfirst_inode_rec(i);
-
- if (irec == NULL)
- continue;
-
- while (irec != NULL) {
- for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
- if (!is_inode_confirmed(irec, j))
- continue;
- /*
- * skip directories that have already been
- * processed, even if they haven't been
- * reached. If they are reachable, we'll
- * pick them up when we process their parent.
- */
- ino = XFS_AGINO_TO_INO(mp, i,
- j + irec->ino_startnum);
- if (inode_isadir(irec, j) &&
- !is_inode_refchecked(ino,
- irec, j)) {
- push_dir(&stack, ino);
- process_dirstack(mp, &stack);
- }
- }
- irec = next_ino_rec(irec);
- }
- }
+ traverse_ags(mp);
do_log(_(" - traversals finished ... \n"));
do_log(_(" - moving disconnected inodes to %s ... \n"),
for (i = 0; i < glob_agcount; i++) {
irec = findfirst_inode_rec(i);
while (irec != NULL) {
- check_for_orphaned_inodes(mp, irec);
+ check_for_orphaned_inodes(mp, i, irec);
irec = next_ino_rec(irec);
}
}
#include "err_protos.h"
#include "dinode.h"
#include "versions.h"
-#include "prefetch.h"
#include "progress.h"
-#include "threads.h"
/* dinoc is a pointer to the IN-CORE dinode core */
static void
}
}
-static void
-phase7_alt_function(xfs_mount_t *mp, xfs_agnumber_t agno)
-{
- ino_tree_node_t *irec;
- int j;
- __uint32_t nrefs;
-
- /*
- * using the nlink values memorised during phase3/4, compare to the
- * nlink counted in phase 6, and if different, update on-disk.
- */
-
- irec = findfirst_inode_rec(agno);
-
- while (irec != NULL) {
- for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
- assert(is_inode_confirmed(irec, j));
-
- if (is_inode_free(irec, j))
- continue;
-
- assert(no_modify || is_inode_reached(irec, j));
- assert(no_modify || is_inode_referenced(irec, j));
-
- nrefs = num_inode_references(irec, j);
-
- if (get_inode_disk_nlinks(irec, j) != nrefs)
- update_inode_nlinks(mp, XFS_AGINO_TO_INO(mp,
- agno, irec->ino_startnum + j),
- nrefs);
- }
- irec = next_ino_rec(irec);
- PROG_RPT_INC(prog_rpt_done[agno], XFS_INODES_PER_CHUNK);
- }
-}
-
-static void
-phase7_alt(xfs_mount_t *mp)
-{
- int i;
-
- set_progress_msg(no_modify ? PROGRESS_FMT_VRFY_LINK : PROGRESS_FMT_CORR_LINK,
- (__uint64_t) mp->m_sb.sb_icount);
-
- for (i = 0; i < glob_agcount; i++) {
- queue_work(phase7_alt_function, mp, i);
- }
- wait_for_workers();
- print_final_rpt();
-}
-
void
phase7(xfs_mount_t *mp)
{
else
do_log(_("Phase 7 - verify link counts...\n"));
- if (do_prefetch) {
- phase7_alt(mp);
- return;
- }
-
/*
* for each ag, look at each inode 1 at a time. If the number of
* links is bad, reset it, log the inode core, commit the transaction
#include <libxfs.h>
-#include "prefetch.h"
-#include "aio.h"
+#include <pthread.h>
#include "avl.h"
#include "globals.h"
#include "agheader.h"
#include "dinode.h"
#include "bmap.h"
#include "versions.h"
+#include "threads.h"
+#include "prefetch.h"
+#include "progress.h"
+#include "radix-tree.h"
int do_prefetch = 1;
-ino_tree_node_t *
-prefetch_inode_chunks(xfs_mount_t *mp,
- xfs_agnumber_t agno,
- ino_tree_node_t *ino_ra)
-{
- xfs_agblock_t agbno;
- libxfs_lio_req_t *liop;
- int i;
+/*
+ * Performs prefetching by priming the libxfs cache with a dedicated thread
+ * that scans inodes and reads blocks ahead of the time they are required.
+ *
+ * Any I/O errors can be safely ignored.
+ */
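/*
 * A note on the mechanism (illustrative, not the patch's own code): the
 * queuing thread and I/O workers below coordinate through the classic
 * pthread condition-variable handshake -- set a flag under a mutex, signal,
 * and have waiters re-check the flag in a loop. A minimal generic sketch:
 */
#include <pthread.h>
#include <stdio.h>

struct handshake {
	pthread_mutex_t	lock;
	pthread_cond_t	cond;
	int		ready;			/* protected by lock */
};

static void signal_ready(struct handshake *h)
{
	pthread_mutex_lock(&h->lock);
	if (!h->ready) {
		h->ready = 1;
		pthread_cond_broadcast(&h->cond);
	}
	pthread_mutex_unlock(&h->lock);
}

static void *worker(void *arg)
{
	struct handshake *h = arg;

	pthread_mutex_lock(&h->lock);
	while (!h->ready)			/* wakeups can be spurious */
		pthread_cond_wait(&h->cond, &h->lock);
	pthread_mutex_unlock(&h->lock);
	printf("worker released\n");
	return NULL;
}

int main(void)
{
	struct handshake h = { PTHREAD_MUTEX_INITIALIZER,
			       PTHREAD_COND_INITIALIZER, 0 };
	pthread_t t;

	pthread_create(&t, NULL, worker, &h);
	signal_ready(&h);
	pthread_join(t, NULL);
	return 0;
}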
- if (libxfs_lio_ino_count == 0)
- return NULL;
+static xfs_mount_t *mp;
+static int mp_fd;
+static int pf_max_bytes;
+static int pf_max_bbs;
+static int pf_max_fsbs;
+static int pf_batch_bytes;
+static int pf_batch_fsbs;
- liop = (libxfs_lio_req_t *) libxfs_get_lio_buffer(LIBXFS_LIO_TYPE_INO);
- if (liop == NULL) {
- do_prefetch = 0;
- return NULL;
- }
+#define B_INODE 0x1000000
+#define B_META 0x2000000
- if (ino_ra == NULL)
- ino_ra = findfirst_inode_rec(agno);
-
- i = 0;
- while (ino_ra) {
- agbno = XFS_AGINO_TO_AGBNO(mp, ino_ra->ino_startnum);
- liop[i].blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
- liop[i].len = (int) XFS_FSB_TO_BB(mp, XFS_IALLOC_BLOCKS(mp));
- i++;
- ino_ra = next_ino_rec(ino_ra);
- if (i >= libxfs_lio_ino_count)
- break;
+#define DEF_BATCH_BYTES 0x10000
+
+#define MAX_BUFS 128
+
+#define IO_THRESHOLD (MAX_BUFS * PF_THREAD_COUNT)
+
+typedef enum pf_which {
+ PF_PRIMARY,
+ PF_SECONDARY,
+ PF_META_ONLY
+} pf_which_t;
+
+
+static inline void
+pf_start_processing(
+ prefetch_args_t *args)
+{
+ if (!args->can_start_processing) {
+#ifdef XR_PF_TRACE
+ pftrace("signalling processing for AG %d", args->agno);
+#endif
+ args->can_start_processing = 1;
+ pthread_cond_signal(&args->start_processing);
}
- if (i) {
- if (libxfs_readbuf_list(mp->m_dev, i, (void *) liop, LIBXFS_LIO_TYPE_INO) == -1)
- do_prefetch = 0;
+}
+
+static inline void
+pf_start_io_workers(
+ prefetch_args_t *args)
+{
+ if (!args->can_start_reading) {
+#ifdef XR_PF_TRACE
+ pftrace("signalling reading for AG %d", args->agno);
+#endif
+ args->can_start_reading = 1;
+ pthread_cond_broadcast(&args->start_reading);
}
- libxfs_put_lio_buffer((void *) liop);
- return (ino_ra);
}
+
static void
-prefetch_node(
- xfs_mount_t *mp,
- xfs_buf_t *bp,
- da_bt_cursor_t *da_cursor)
+pf_queue_io(
+ prefetch_args_t *args,
+ xfs_fsblock_t fsbno,
+ int blen,
+ int flag)
{
- xfs_da_intnode_t *node;
- libxfs_lio_req_t *liop;
- int i;
- xfs_dfsbno_t fsbno;
-
- node = (xfs_da_intnode_t *)XFS_BUF_PTR(bp);
- if (INT_GET(node->hdr.count, ARCH_CONVERT) <= 1)
- return;
+ xfs_buf_t *bp;
- if ((liop = (libxfs_lio_req_t *) libxfs_get_lio_buffer(LIBXFS_LIO_TYPE_DIR)) == NULL) {
+ bp = libxfs_getbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, fsbno),
+ XFS_FSB_TO_BB(mp, blen));
+ if (bp->b_flags & LIBXFS_B_UPTODATE) {
+ libxfs_putbuf(bp);
return;
}
+ bp->b_flags |= flag;
- for (i = 0; i < INT_GET(node->hdr.count, ARCH_CONVERT); i++) {
- if (i == libxfs_lio_dir_count)
- break;
+ pthread_mutex_lock(&args->lock);
- fsbno = blkmap_get(da_cursor->blkmap, INT_GET(node->btree[i].before, ARCH_CONVERT));
- if (fsbno == NULLDFSBNO) {
- libxfs_put_lio_buffer((void *) liop);
- return;
+ if (fsbno > args->last_bno_read) {
+ radix_tree_insert(&args->primary_io_queue, fsbno, bp);
+ if (flag == B_META)
+ radix_tree_tag_set(&args->primary_io_queue, fsbno, 0);
+ else {
+ args->inode_bufs_queued++;
+ if (args->inode_bufs_queued == IO_THRESHOLD)
+ pf_start_io_workers(args);
}
-
- liop[i].blkno = XFS_FSB_TO_DADDR(mp, fsbno);
- liop[i].len = XFS_FSB_TO_BB(mp, 1);
+#ifdef XR_PF_TRACE
+ pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to "
+ "primary queue (inode_bufs_queued = %d, last_bno = %lu)",
+ flag == B_INODE ? 'I' : 'M', bp,
+ (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
+ args->inode_bufs_queued, args->last_bno_read);
+#endif
+ } else {
+#ifdef XR_PF_TRACE
+ pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to "
+ "secondary queue (last_bno = %lu)",
+ flag == B_INODE ? 'I' : 'M', bp,
+ (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
+ args->last_bno_read);
+#endif
+ ASSERT(flag == B_META);
+ radix_tree_insert(&args->secondary_io_queue, fsbno, bp);
}
- if (i > 1) {
- if (libxfs_readbuf_list(mp->m_dev, i, (void *) liop, LIBXFS_LIO_TYPE_DIR) == -1)
- do_prefetch = 0;
- }
+ pf_start_processing(args);
- libxfs_put_lio_buffer((void *) liop);
- return;
+ pthread_mutex_unlock(&args->lock);
}
-void
-prefetch_dir1(
- xfs_mount_t *mp,
- xfs_dablk_t bno,
- da_bt_cursor_t *da_cursor)
+static int
+pf_read_bmbt_reclist(
+ prefetch_args_t *args,
+ xfs_bmbt_rec_t *rp,
+ int numrecs)
{
- xfs_da_intnode_t *node;
- xfs_buf_t *bp;
- xfs_dfsbno_t fsbno;
int i;
+ xfs_dfsbno_t s; /* start */
+ xfs_dfilblks_t c; /* count */
+ xfs_dfiloff_t o; /* offset */
+ xfs_dfilblks_t cp = 0; /* prev count */
+ xfs_dfiloff_t op = 0; /* prev offset */
+ int flag; /* extent flag */
+
+ for (i = 0; i < numrecs; i++, rp++) {
+ convert_extent((xfs_bmbt_rec_32_t*)rp, &o, &s, &c, &flag);
+
+ if (((i > 0) && (op + cp > o)) || (c == 0) ||
+ (o >= fs_max_file_offset))
+ return 0;
+
+ if (!verify_dfsbno(mp, s) || !verify_dfsbno(mp, s + c - 1))
+ return 0;
+
+ if (!args->dirs_only && ((o + c) >= mp->m_dirfreeblk))
+ break; /* only Phase 6 reads the free blocks */
+
+ op = o;
+ cp = c;
+
+ while (c) {
+#ifdef XR_PF_TRACE
+ pftrace("queuing dir extent in AG %d", args->agno);
+#endif
+ pf_queue_io(args, s, 1, B_META);
+ c--;
+ s++;
+ }
+ }
+ return 1;
+}
- fsbno = blkmap_get(da_cursor->blkmap, bno);
- if (fsbno == NULLDFSBNO)
- return;
+/*
+ * simplified version of the main scan_lbtree. Returns 0 to stop.
+ */
+
+static int
+pf_scan_lbtree(
+ xfs_dfsbno_t dbno,
+ int level,
+ int isadir,
+ prefetch_args_t *args,
+ int (*func)(xfs_btree_lblock_t *block,
+ int level,
+ int isadir,
+ prefetch_args_t *args))
+{
+ xfs_buf_t *bp;
+ int rc;
- bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, fsbno),
+ bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
XFS_FSB_TO_BB(mp, 1), 0);
+ if (!bp)
+ return 0;
- if (bp == NULL)
- return;
+ rc = (*func)((xfs_btree_lblock_t *)XFS_BUF_PTR(bp), level - 1, isadir, args);
+ libxfs_putbuf(bp);
- node = (xfs_da_intnode_t *)XFS_BUF_PTR(bp);
- if (INT_GET(node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC) {
- libxfs_putbuf(bp);
- return;
+ return rc;
+}
+
+static int
+pf_scanfunc_bmap(
+ xfs_btree_lblock_t *block,
+ int level,
+ int isadir,
+ prefetch_args_t *args)
+{
+ xfs_bmbt_rec_t *rp;
+ xfs_bmbt_ptr_t *pp;
+ int numrecs;
+ int i;
+ xfs_dfsbno_t dbno;
+
+ /*
+ * do some validation on the block contents
+ */
+ if ((be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC) ||
+ (be16_to_cpu(block->bb_level) != level))
+ return 0;
+
+ numrecs = be16_to_cpu(block->bb_numrecs);
+
+ if (level == 0) {
+ if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
+ return 0;
+
+ rp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
+ block, 1, mp->m_bmap_dmxr[0]);
+
+ return pf_read_bmbt_reclist(args, rp, numrecs);
}
- prefetch_node(mp, bp, da_cursor);
+ if (numrecs > mp->m_bmap_dmxr[1])
+ return 0;
- /* skip prefetching if next level is leaf level */
- if (INT_GET(node->hdr.level, ARCH_CONVERT) > 1) {
- for (i = 0; i < INT_GET(node->hdr.count, ARCH_CONVERT); i++) {
- prefetch_dir1(mp,
- INT_GET(node->btree[i].before, ARCH_CONVERT),
- da_cursor);
- }
+ pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, 1,
+ mp->m_bmap_dmxr[1]);
+
+ for (i = 0; i < numrecs; i++) {
+ dbno = be64_to_cpu(pp[i]);
+ if (!verify_dfsbno(mp, dbno))
+ return 0;
+ if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
+ return 0;
}
-
- libxfs_putbuf(bp);
- return;
+ return 1;
}
-void
-prefetch_dir2(
- xfs_mount_t *mp,
- blkmap_t *blkmap)
+
+static void
+pf_read_btinode(
+ prefetch_args_t *args,
+ xfs_dinode_t *dino,
+ int isadir)
{
- xfs_dfiloff_t dbno;
- xfs_dfiloff_t pdbno;
- bmap_ext_t *bmp;
- int nex;
- int i, j, t;
- libxfs_lio_req_t *liop;
-
- liop = (libxfs_lio_req_t *) libxfs_get_lio_buffer(LIBXFS_LIO_TYPE_DIR);
- if (liop == NULL)
+ xfs_bmdr_block_t *dib;
+ xfs_bmbt_ptr_t *pp;
+ int i;
+ int level;
+ int numrecs;
+ int dsize;
+ xfs_dfsbno_t dbno;
+
+ dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);
+
+ level = be16_to_cpu(dib->bb_level);
+ numrecs = be16_to_cpu(dib->bb_numrecs);
+
+ if ((numrecs == 0) || (level == 0) ||
+ (level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
return;
+ /*
+ * use bmdr/dfork_dsize since the root block is in the data fork
+ */
+ if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
+ return;
+
+ dsize = XFS_DFORK_DSIZE(dino, mp);
+ pp = XFS_BTREE_PTR_ADDR(dsize, xfs_bmdr, dib, 1,
+ XFS_BTREE_BLOCK_MAXRECS(dsize, xfs_bmdr, 0));
- pdbno = NULLDFILOFF; /* previous dbno is NULLDFILOFF */
- i = 0;
- while ((dbno = blkmap_next_off(blkmap, pdbno, &t)) < mp->m_dirfreeblk) {
- if (i == libxfs_lio_dir_count)
+ for (i = 0; i < numrecs; i++) {
+ dbno = be64_to_cpu(pp[i]);
+ if (!verify_dfsbno(mp, dbno))
break;
- if (dbno == NULLDFILOFF)
+ if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
break;
- if (mp->m_dirblkfsbs == 1) {
- xfs_dfsbno_t blk;
+ }
+}
+
+static void
+pf_read_exinode(
+ prefetch_args_t *args,
+ xfs_dinode_t *dino)
+{
+ pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
+ be32_to_cpu(dino->di_core.di_nextents));
+}
- /* avoid bmp realloc/free overhead, use blkmap_get */
- blk = blkmap_get(blkmap, dbno);
- if (blk == NULLDFSBNO)
+static void
+pf_read_inode_dirs(
+ prefetch_args_t *args,
+ xfs_buf_t *bp)
+{
+ xfs_dinode_t *dino;
+ int icnt = 0;
+ xfs_dinode_core_t *dinoc;
+
+ for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) {
+ dino = XFS_MAKE_IPTR(mp, bp, icnt);
+ dinoc = &dino->di_core;
+
+ /*
+ * We only prefetch directory contents (for extent-format
+ * directories) and btree nodes (for btree-format inodes)
+ */
+ if (dinoc->di_format <= XFS_DINODE_FMT_LOCAL ||
+ (dinoc->di_format == XFS_DINODE_FMT_EXTENTS &&
+ (be16_to_cpu(dinoc->di_mode) & S_IFMT) != S_IFDIR))
+ continue;
+
+ /*
+ * do some checks on the inode to see if we can prefetch
+ * its directory data. It's a cut down version of
+ * process_dinode_int() in dinode.c.
+ */
+ if (dinoc->di_format > XFS_DINODE_FMT_BTREE)
+ continue;
+
+ if (be16_to_cpu(dinoc->di_magic) != XFS_DINODE_MAGIC)
+ continue;
+
+ if (!XFS_DINODE_GOOD_VERSION(dinoc->di_version) ||
+ (!fs_inode_nlink && dinoc->di_version >
+ XFS_DINODE_VERSION_1))
+ continue;
+
+ if (be64_to_cpu(dinoc->di_size) <= XFS_DFORK_DSIZE(dino, mp))
+ continue;
+
+ if ((dinoc->di_forkoff != 0) &&
+ (dinoc->di_forkoff >= (XFS_LITINO(mp) >> 3)))
+ continue;
+
+ switch (dinoc->di_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ pf_read_exinode(args, dino);
break;
- pdbno = dbno;
- liop[i].blkno = XFS_FSB_TO_DADDR(mp, blk);
- liop[i].len = (int) XFS_FSB_TO_BB(mp, 1);
- i++;
- }
- else if (mp->m_dirblkfsbs > 1) {
- nex = blkmap_getn(blkmap, dbno, mp->m_dirblkfsbs, &bmp, NULL);
- if (nex == 0)
+ case XFS_DINODE_FMT_BTREE:
+ pf_read_btinode(args, dino, (be16_to_cpu(
+ dinoc->di_mode) & S_IFMT) == S_IFDIR);
break;
- pdbno = dbno + mp->m_dirblkfsbs - 1;
- for (j = 0; j < nex; j++) {
- liop[i].blkno = XFS_FSB_TO_DADDR(mp, bmp[j].startblock);
- liop[i].len = (int) XFS_FSB_TO_BB(mp, bmp[j].blockcount);
- i++;
- if (i == libxfs_lio_dir_count)
- break; /* for loop */
- }
- free(bmp);
- }
- else {
- do_error("invalid mp->m_dirblkfsbs %d\n", mp->m_dirblkfsbs);
}
}
- if (i > 1) {
- if (libxfs_readbuf_list(mp->m_dev, i, (void *) liop, LIBXFS_LIO_TYPE_DIR) == -1)
- do_prefetch = 0;
- }
- libxfs_put_lio_buffer((void *) liop);
}
+/*
+ * pf_batch_read must be called with args->lock held.
+ */
+
static void
-prefetch_p6_node(
- xfs_mount_t *mp,
- xfs_inode_t *ip,
- xfs_buf_t *bp)
+pf_batch_read(
+ prefetch_args_t *args,
+ pf_which_t which,
+ void *buf)
{
- xfs_da_intnode_t *node;
- libxfs_lio_req_t *liop;
+ struct radix_tree_root *queue;
+ xfs_buf_t *bplist[MAX_BUFS];
+ unsigned int num;
+ off64_t first_off, last_off, next_off;
+ int len, size;
int i;
- xfs_fsblock_t fblock;
- xfs_dfsbno_t fsbno;
- xfs_bmbt_irec_t map;
- int nmap;
- int error;
-
- node = (xfs_da_intnode_t *)XFS_BUF_PTR(bp);
- if (INT_GET(node->hdr.count, ARCH_CONVERT) <= 1)
- return;
-
- if ((liop = (libxfs_lio_req_t *) libxfs_get_lio_buffer(LIBXFS_LIO_TYPE_DIR)) == NULL) {
- return;
- }
-
- fblock = NULLFSBLOCK;
-
- for (i = 0; i < INT_GET(node->hdr.count, ARCH_CONVERT); i++) {
- if (i == libxfs_lio_dir_count)
- break;
+ int inode_bufs;
+ unsigned long fsbno;
+ char *pbuf;
+
+ queue = (which != PF_SECONDARY) ? &args->primary_io_queue
+ : &args->secondary_io_queue;
+
+ while (radix_tree_lookup_first(queue, &fsbno) != NULL) {
+
+ if (which != PF_META_ONLY) {
+ num = radix_tree_gang_lookup_ex(queue,
+ (void**)&bplist[0], fsbno,
+ fsbno + pf_max_fsbs, MAX_BUFS);
+ ASSERT(num > 0);
+ ASSERT(XFS_FSB_TO_DADDR(mp, fsbno) ==
+ XFS_BUF_ADDR(bplist[0]));
+ } else {
+ num = radix_tree_gang_lookup_tag(queue,
+ (void**)&bplist[0], fsbno,
+ MAX_BUFS / 4, 0);
+ if (num == 0)
+ return;
+ }
- nmap = 1;
- error = libxfs_bmapi(NULL, ip, (xfs_fileoff_t)
- INT_GET(node->btree[i].before, ARCH_CONVERT), 1,
- XFS_BMAPI_METADATA, &fblock, 0,
- &map, &nmap, NULL);
+ /*
+ * do a big read if 25% of the potential buffer is useful;
+ * otherwise find as many close-together blocks as possible
+ * and read them in one read
+ */
+ first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
+ last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
+ XFS_BUF_SIZE(bplist[num-1]);
+ while (last_off - first_off > pf_max_bytes) {
+ num--;
+ last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
+ XFS_BUF_SIZE(bplist[num-1]);
+ }
+ if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
+ /*
+ * not enough blocks for one big read, so determine
+ * the number of blocks that are close enough.
+ */
+ last_off = first_off + XFS_BUF_SIZE(bplist[0]);
+ for (i = 1; i < num; i++) {
+ next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
+ XFS_BUF_SIZE(bplist[i]);
+ if (next_off - last_off > pf_batch_bytes)
+ break;
+ last_off = next_off;
+ }
+ num = i;
+ }
- if (error || (nmap != 1)) {
- libxfs_put_lio_buffer((void *) liop);
- return;
+ for (i = 0; i < num; i++) {
+ if (radix_tree_delete(queue, XFS_DADDR_TO_FSB(mp,
+ XFS_BUF_ADDR(bplist[i]))) == NULL)
+ do_error(_("prefetch corruption\n"));
}
- if ((fsbno = map.br_startblock) == HOLESTARTBLOCK) {
- libxfs_put_lio_buffer((void *) liop);
- return;
+ if (which == PF_PRIMARY) {
+ for (inode_bufs = 0, i = 0; i < num; i++) {
+ if (bplist[i]->b_flags & B_INODE)
+ inode_bufs++;
+ }
+ args->inode_bufs_queued -= inode_bufs;
+ if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
+ pf_batch_fsbs)
+ args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
+ }
+#ifdef XR_PF_TRACE
+ pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
+ (long long)XFS_BUF_ADDR(bplist[0]),
+ (long long)XFS_BUF_ADDR(bplist[num-1]), num,
+ (which != PF_SECONDARY) ? "pri" : "sec", args->agno,
+ args->last_bno_read, args->inode_bufs_queued);
+#endif
+ pthread_mutex_unlock(&args->lock);
+
+ /*
+ * now read the data and put it into the xfs_buf_t's
+ */
+ len = pread64(mp_fd, buf, (int)(last_off - first_off), first_off);
+ if (len > 0) {
+ /*
+ * go through the xfs_buf_t list copying from the
+ * read buffer into the xfs_buf_t's and release them.
+ */
+ last_off = first_off;
+ for (i = 0; i < num; i++) {
+
+ pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
+ size = XFS_BUF_SIZE(bplist[i]);
+ if (len < size)
+ break;
+ memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size);
+ bplist[i]->b_flags |= LIBXFS_B_UPTODATE;
+ len -= size;
+ if (bplist[i]->b_flags & B_INODE)
+ pf_read_inode_dirs(args, bplist[i]);
+ }
+ }
+ for (i = 0; i < num; i++) {
+#ifdef XR_PF_TRACE
+ pftrace("putbuf %c %p (%llu) in AG %d",
+ bplist[i]->b_flags & B_INODE ? 'I' : 'M',
+ bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
+ args->agno);
+#endif
+ libxfs_putbuf(bplist[i]);
+ }
+ pthread_mutex_lock(&args->lock);
+ if (which != PF_SECONDARY) {
+#ifdef XR_PF_TRACE
+ pftrace("inode_bufs_queued for AG %d = %d", args->agno,
+ args->inode_bufs_queued);
+#endif
+ /*
+ * if the primary inode queue is running low, process metadata
+ * in both queues to avoid I/O starvation, as the processing
+ * thread would otherwise be left waiting for a metadata
+ * buffer
+ */
+ if (which == PF_PRIMARY && !args->queuing_done &&
+ args->inode_bufs_queued < IO_THRESHOLD) {
+#ifdef XR_PF_TRACE
+ pftrace("reading metadata bufs from primary queue for AG %d",
+ args->agno);
+#endif
+ pf_batch_read(args, PF_META_ONLY, buf);
+#ifdef XR_PF_TRACE
+ pftrace("reading bufs from secondary queue for AG %d",
+ args->agno);
+#endif
+ pf_batch_read(args, PF_SECONDARY, buf);
+ }
}
- liop[i].blkno = XFS_FSB_TO_DADDR(mp, fsbno);
- liop[i].len = XFS_FSB_TO_BB(mp, 1);
}
+}
+
+static void *
+pf_io_worker(
+ void *param)
+{
+ prefetch_args_t *args = param;
+ void *buf = memalign(libxfs_device_alignment(),
+ pf_max_bytes);
+
+ if (buf == NULL)
+ return NULL;
- if (i > 1) {
- if (libxfs_readbuf_list(mp->m_dev, i, (void *) liop, LIBXFS_LIO_TYPE_DIR) == -1)
- do_prefetch = 0;
+ pthread_mutex_lock(&args->lock);
+ while (!args->queuing_done || args->primary_io_queue.height) {
+
+#ifdef XR_PF_TRACE
+ pftrace("waiting to start prefetch I/O for AG %d", args->agno);
+#endif
+ while (!args->can_start_reading && !args->queuing_done)
+ pthread_cond_wait(&args->start_reading, &args->lock);
+#ifdef XR_PF_TRACE
+ pftrace("starting prefetch I/O for AG %d", args->agno);
+#endif
+ pf_batch_read(args, PF_PRIMARY, buf);
+ pf_batch_read(args, PF_SECONDARY, buf);
+
+#ifdef XR_PF_TRACE
+ pftrace("ran out of bufs to prefetch for AG %d", args->agno);
+#endif
+ if (!args->queuing_done)
+ args->can_start_reading = 0;
}
+ pthread_mutex_unlock(&args->lock);
- libxfs_put_lio_buffer((void *) liop);
- return;
+ free(buf);
+
+#ifdef XR_PF_TRACE
+ pftrace("finished prefetch I/O for AG %d", args->agno);
+#endif
+ return NULL;
}
-void
-prefetch_p6_dir1(
- xfs_mount_t *mp,
- xfs_ino_t ino,
- xfs_inode_t *ip,
- xfs_dablk_t da_bno,
- xfs_fsblock_t *fblockp)
+static int
+pf_create_prefetch_thread(
+ prefetch_args_t *args);
+
+static void *
+pf_queuing_worker(
+ void *param)
{
- xfs_da_intnode_t *node;
- xfs_buf_t *bp;
- xfs_dfsbno_t fsbno;
- xfs_bmbt_irec_t map;
- int nmap;
+ prefetch_args_t *args = param;
+ int num_inos;
+ ino_tree_node_t *irec;
+ ino_tree_node_t *cur_irec;
+ int blks_per_cluster;
+ int inos_per_cluster;
+ xfs_agblock_t bno;
int i;
- int error;
-
- nmap = 1;
- error = libxfs_bmapi(NULL, ip, (xfs_fileoff_t) da_bno, 1,
- XFS_BMAPI_METADATA, fblockp, 0,
- &map, &nmap, NULL);
- if (error || (nmap != 1)) {
- return;
+ int err;
+
+ blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
+ if (blks_per_cluster == 0)
+ blks_per_cluster = 1;
+ inos_per_cluster = blks_per_cluster * mp->m_sb.sb_inopblock;
+
+ for (i = 0; i < PF_THREAD_COUNT; i++) {
+ err = pthread_create(&args->io_threads[i], NULL,
+ pf_io_worker, args);
+ if (err != 0) {
+ do_warn(_("failed to create prefetch thread: %s\n"),
+ strerror(err));
+ if (i == 0) {
+ pf_start_processing(args);
+ return NULL;
+ }
+ /*
+ * since we have at least one I/O thread, use them for
+ * prefetch
+ */
+ break;
+ }
}
- if ((fsbno = map.br_startblock) == HOLESTARTBLOCK)
- return;
+#ifdef XR_PF_TRACE
+ pftrace("starting prefetch for AG %d", args->agno);
+#endif
- bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, fsbno),
- XFS_FSB_TO_BB(mp, 1), 0);
+ for (irec = findfirst_inode_rec(args->agno); irec != NULL;
+ irec = next_ino_rec(irec)) {
- if (bp == NULL)
- return;
+ cur_irec = irec;
+ num_inos = XFS_INODES_PER_CHUNK;
+ while (num_inos < XFS_IALLOC_INODES(mp) && irec != NULL) {
+ irec = next_ino_rec(irec);
+ num_inos += XFS_INODES_PER_CHUNK;
+ }
- node = (xfs_da_intnode_t *)XFS_BUF_PTR(bp);
- if (INT_GET(node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC) {
- libxfs_putbuf(bp);
- return;
+ if (args->dirs_only && cur_irec->ino_isa_dir == 0)
+ continue;
+#ifdef XR_PF_TRACE
+ sem_getvalue(&args->ra_count, &i);
+ pftrace("queuing irec %p in AG %d, sem count = %d",
+ irec, args->agno, i);
+#endif
+ sem_wait(&args->ra_count);
+
+ num_inos = 0;
+ bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
+
+ do {
+ pf_queue_io(args, XFS_AGB_TO_FSB(mp, args->agno, bno),
+ blks_per_cluster, B_INODE);
+ bno += blks_per_cluster;
+ num_inos += inos_per_cluster;
+ } while (num_inos < XFS_IALLOC_INODES(mp));
}
- prefetch_p6_node(mp, ip, bp);
+ pthread_mutex_lock(&args->lock);
- /* skip prefetching if next level is leaf level */
- if (INT_GET(node->hdr.level, ARCH_CONVERT) > 1) {
- for (i = 0; i < INT_GET(node->hdr.count, ARCH_CONVERT); i++) {
- (void) prefetch_p6_dir1(mp, ino, ip,
- INT_GET(node->btree[i].before, ARCH_CONVERT),
- fblockp);
- }
- }
-
- libxfs_putbuf(bp);
- return;
+#ifdef XR_PF_TRACE
+ pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
+ args->agno, args->inode_bufs_queued);
+#endif
+ args->queuing_done = 1;
+ pf_start_io_workers(args);
+ pf_start_processing(args);
+ pthread_mutex_unlock(&args->lock);
+
+ /* now wait for the readers to finish */
+ for (i = 0; i < PF_THREAD_COUNT; i++)
+ if (args->io_threads[i])
+ pthread_join(args->io_threads[i], NULL);
+
+#ifdef XR_PF_TRACE
+ pftrace("prefetch for AG %d finished", args->agno);
+#endif
+ pthread_mutex_lock(&args->lock);
+
+ ASSERT(args->primary_io_queue.height == 0);
+ ASSERT(args->secondary_io_queue.height == 0);
+
+ args->prefetch_done = 1;
+ if (args->next_args)
+ pf_create_prefetch_thread(args->next_args);
+
+ pthread_mutex_unlock(&args->lock);
+
+ return NULL;
}
-#define NMAPP 4
+static int
+pf_create_prefetch_thread(
+ prefetch_args_t *args)
+{
+ int err;
+
+#ifdef XR_PF_TRACE
+ pftrace("creating queue thread for AG %d", args->agno);
+#endif
+ err = pthread_create(&args->queuing_thread, NULL,
+ pf_queuing_worker, args);
+ if (err != 0) {
+ do_warn(_("failed to create prefetch thread: %s\n"),
+ strerror(err));
+ cleanup_inode_prefetch(args);
+ }
+
+ return err == 0;
+}
void
-prefetch_p6_dir2(
- xfs_mount_t *mp,
- xfs_inode_t *ip)
+init_prefetch(
+ xfs_mount_t *pmp)
{
- xfs_fileoff_t da_bno;
- xfs_fileoff_t next_da_bno;
- int i, j;
- libxfs_lio_req_t *liop;
- xfs_fsblock_t fsb;
- int nfsb;
- int error;
-
- if ((liop = (libxfs_lio_req_t *) libxfs_get_lio_buffer(LIBXFS_LIO_TYPE_DIR)) == NULL) {
- return;
- }
- i = 0;
- for (da_bno = 0, next_da_bno = 0; next_da_bno != NULLFILEOFF; da_bno = next_da_bno) {
- if (i == libxfs_lio_dir_count)
- break;
- next_da_bno = da_bno + mp->m_dirblkfsbs - 1;
- if (libxfs_bmap_next_offset(NULL, ip, &next_da_bno, XFS_DATA_FORK))
- break;
+ mp = pmp;
+ mp_fd = libxfs_device_to_fd(mp->m_dev);
+ pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
+ pf_max_bbs = pf_max_bytes >> BBSHIFT;
+ pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
+ pf_batch_bytes = DEF_BATCH_BYTES;
+ pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
+}
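/*
 * A worked example of the sizing above, assuming a 4 KiB page size and a
 * 4 KiB filesystem block (sb_blocklog = 12, BBSHIFT = 9):
 *
 *	pf_max_bytes   = 4096 << 7       = 512 KiB largest prefetch read
 *	pf_max_bbs     = 524288 >> 9     = 1024 basic blocks
 *	pf_max_fsbs    = 524288 >> 12    = 128 filesystem blocks
 *	pf_batch_bytes = DEF_BATCH_BYTES = 64 KiB
 *	pf_batch_fsbs  = 65536 >> 13     = 8 fs blocks per small batch
 */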
- if (mp->m_dirblkfsbs == 1) {
- if ((error = libxfs_bmapi_single(NULL, ip, XFS_DATA_FORK, &fsb, da_bno)) != 0) {
- libxfs_put_lio_buffer((void *) liop);
- do_prefetch = 0;
- do_warn("phase6 prefetch: cannot bmap single block err = %d\n", error);
- return;
- }
- if (fsb == NULLFSBLOCK) {
- libxfs_put_lio_buffer((void *) liop);
- return;
- }
+prefetch_args_t *
+start_inode_prefetch(
+ xfs_agnumber_t agno,
+ int dirs_only,
+ prefetch_args_t *prev_args)
+{
+ prefetch_args_t *args;
- liop[i].blkno = XFS_FSB_TO_DADDR(mp, fsb);
- liop[i].len = XFS_FSB_TO_BB(mp, 1);
- i++;
- }
- else if ((nfsb = mp->m_dirblkfsbs) > 1) {
- xfs_fsblock_t firstblock;
- xfs_bmbt_irec_t map[NMAPP];
- xfs_bmbt_irec_t *mapp;
- int nmap;
-
- if (nfsb > NMAPP) {
- mapp = malloc(sizeof(*mapp) * nfsb);
- if (mapp == NULL) {
- libxfs_put_lio_buffer((void *) liop);
- do_prefetch = 0;
- do_warn("phase6 prefetch: cannot allocate mem for map\n");
- return;
- }
- }
- else {
- mapp = map;
- }
- firstblock = NULLFSBLOCK;
- nmap = nfsb;
- if ((error = libxfs_bmapi(NULL, ip, da_bno,
- nfsb,
- XFS_BMAPI_METADATA | XFS_BMAPI_AFLAG(XFS_DATA_FORK),
- &firstblock, 0, mapp, &nmap, NULL))) {
- libxfs_put_lio_buffer((void *) liop);
- do_prefetch = 0;
- do_warn("phase6 prefetch: cannot bmap err = %d\n", error);
- return;
- }
- for (j = 0; j < nmap; j++) {
- liop[i].blkno = XFS_FSB_TO_DADDR(mp, mapp[j].br_startblock);
- liop[i].len = (int)XFS_FSB_TO_BB(mp, mapp[j].br_blockcount);
- i++;
- if (i == libxfs_lio_dir_count)
- break; /* for loop */
- }
- if (mapp != map)
- free(mapp);
+ if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
+ return NULL;
- }
- else {
- do_error("phase6: invalid mp->m_dirblkfsbs %d\n", mp->m_dirblkfsbs);
- }
- }
- if (i > 1) {
- if (libxfs_readbuf_list(mp->m_dev, i, (void *) liop, LIBXFS_LIO_TYPE_DIR) == -1)
- do_prefetch = 0;
+ args = calloc(1, sizeof(prefetch_args_t));
+
+ INIT_RADIX_TREE(&args->primary_io_queue, 0);
+ INIT_RADIX_TREE(&args->secondary_io_queue, 0);
+ pthread_mutex_init(&args->lock, NULL);
+ pthread_cond_init(&args->start_reading, NULL);
+ pthread_cond_init(&args->start_processing, NULL);
+ args->agno = agno;
+ args->dirs_only = dirs_only;
+
+ /*
+ * use only 1/8 of the libxfs cache as we are only counting inodes
+ * and not any other associated metadata like directories
+ */
+
+ sem_init(&args->ra_count, 0, libxfs_bcache->c_maxcount / thread_count /
+ (XFS_IALLOC_BLOCKS(mp) / (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog)) / 8);
+
+ if (!prev_args) {
+ if (!pf_create_prefetch_thread(args))
+ return NULL;
+ } else {
+ pthread_mutex_lock(&prev_args->lock);
+ if (prev_args->prefetch_done) {
+ if (!pf_create_prefetch_thread(args))
+ args = NULL;
+ } else
+ prev_args->next_args = args;
+ pthread_mutex_unlock(&prev_args->lock);
}
- libxfs_put_lio_buffer((void *) liop);
+
+ return args;
}
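/*
 * A worked example of the ra_count sizing above, with purely illustrative
 * numbers: a 32768-buffer cache (c_maxcount), thread_count = 4, 4 fs blocks
 * per inode chunk (XFS_IALLOC_BLOCKS) and 2-block inode clusters give
 *
 *	32768 / 4 / (4 / 2) / 8 = 512
 *
 * inode chunks that may be queued ahead of processing for one AG.
 */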
void
-prefetch_sb(xfs_mount_t *mp, xfs_agnumber_t agno)
+wait_for_inode_prefetch(
+ prefetch_args_t *args)
{
- libxfs_lio_req_t *liop;
-
- if ((liop = (libxfs_lio_req_t *) libxfs_get_lio_buffer(LIBXFS_LIO_TYPE_RAW)) == NULL) {
- do_prefetch = 0;
+ if (args == NULL)
return;
+
+ pthread_mutex_lock(&args->lock);
+
+ while (!args->can_start_processing) {
+#ifdef XR_PF_TRACE
+ pftrace("waiting to start processing AG %d", args->agno);
+#endif
+ pthread_cond_wait(&args->start_processing, &args->lock);
}
+#ifdef XR_PF_TRACE
+ pftrace("can start processing AG %d", args->agno);
+#endif
+ pthread_mutex_unlock(&args->lock);
+}
- liop[0].blkno = XFS_AG_DADDR(mp, agno, XFS_SB_DADDR);
- liop[1].blkno = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp));
- liop[2].blkno = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
- liop[0].len = XFS_FSS_TO_BB(mp, 1);
- liop[1].len = XFS_FSS_TO_BB(mp, 1);
- liop[2].len = XFS_FSS_TO_BB(mp, 1);
- if (libxfs_readbuf_list(mp->m_dev, 3, (void *) liop, LIBXFS_LIO_TYPE_RAW) == -1)
- do_prefetch = 0;
+void
+cleanup_inode_prefetch(
+ prefetch_args_t *args)
+{
+ if (args == NULL)
+ return;
- libxfs_put_lio_buffer((void *) liop);
+#ifdef XR_PF_TRACE
+ pftrace("waiting AG %d prefetch to finish", args->agno);
+#endif
+ if (args->queuing_thread)
+ pthread_join(args->queuing_thread, NULL);
+
+#ifdef XR_PF_TRACE
+ pftrace("AG %d prefetch done", args->agno);
+#endif
+ pthread_mutex_destroy(&args->lock);
+ pthread_cond_destroy(&args->start_reading);
+ pthread_cond_destroy(&args->start_processing);
+ sem_destroy(&args->ra_count);
+
+ free(args);
}
+#ifdef XR_PF_TRACE
+
void
-prefetch_roots(xfs_mount_t *mp, xfs_agnumber_t agno,
- xfs_agf_t *agf, xfs_agi_t *agi)
+_pftrace(const char *func, const char *msg, ...)
{
- int i;
- libxfs_lio_req_t *liop;
+ char buf[200];
+ struct timeval tv;
+ va_list args;
- if ((liop = (libxfs_lio_req_t *) libxfs_get_lio_buffer(LIBXFS_LIO_TYPE_RAW)) == NULL) {
- do_prefetch = 0;
- return;
- }
+ gettimeofday(&tv, NULL);
- i = 0;
- if (agf->agf_roots[XFS_BTNUM_BNO] != 0 &&
- verify_agbno(mp, agno, agf->agf_roots[XFS_BTNUM_BNO])) {
- liop[i].blkno = XFS_AGB_TO_DADDR(mp, agno, agf->agf_roots[XFS_BTNUM_BNO]);
- liop[i].len = XFS_FSB_TO_BB(mp, 1);
- i++;
- }
- if (agf->agf_roots[XFS_BTNUM_CNT] != 0 &&
- verify_agbno(mp, agno, agf->agf_roots[XFS_BTNUM_CNT])) {
- liop[i].blkno = XFS_AGB_TO_DADDR(mp, agno, agf->agf_roots[XFS_BTNUM_CNT]);
- liop[i].len = XFS_FSB_TO_BB(mp, 1);
- i++;
- }
- if (agi->agi_root != 0 && verify_agbno(mp, agno, agi->agi_root)) {
- liop[i].blkno = XFS_AGB_TO_DADDR(mp, agno, agi->agi_root);
- liop[i].len = XFS_FSB_TO_BB(mp, 1);
- i++;
- }
- if (i > 1) {
- if (libxfs_readbuf_list(mp->m_dev, i, (void *) liop, LIBXFS_LIO_TYPE_RAW) == -1)
- do_prefetch = 0;
- }
+ va_start(args, msg);
+ vsnprintf(buf, sizeof(buf), msg, args);
+ buf[sizeof(buf)-1] = '\0';
+ va_end(args);
- libxfs_put_lio_buffer((void *) liop);
+ fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec, func, buf);
}
+
+#endif
#ifndef _XFS_REPAIR_PREFETCH_H
#define _XFS_REPAIR_PREFETCH_H
-struct blkmap;
-struct da_bt_cursor;
-struct xfs_mount;
-
-extern int do_prefetch;
-
-struct ino_tree_node *prefetch_inode_chunks(
- struct xfs_mount *,
- xfs_agnumber_t,
- struct ino_tree_node *);
-
-extern void prefetch_dir1(
- struct xfs_mount *mp,
- xfs_dablk_t bno,
- struct da_bt_cursor *da_cursor);
-
-extern void prefetch_dir2(
- struct xfs_mount *mp,
- struct blkmap *blkmap);
-
-extern void prefetch_p6_dir1(
- struct xfs_mount *mp,
- xfs_ino_t ino,
- struct xfs_inode *ip,
- xfs_dablk_t da_bno,
- xfs_fsblock_t *fblockp);
-
-extern void prefetch_p6_dir2(
- struct xfs_mount *mp,
- struct xfs_inode *ip);
-
-extern void prefetch_sb(
- struct xfs_mount *mp,
- xfs_agnumber_t agno);
-
-extern void prefetch_roots(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agf_t *agf,
- xfs_agi_t *agi);
+#include <semaphore.h>
+#include "incore.h"
+#include "radix-tree.h"
+
+
+extern int do_prefetch;
+
+#define PF_THREAD_COUNT 4
+
+typedef struct prefetch_args {
+ pthread_mutex_t lock;
+ pthread_t queuing_thread;
+ pthread_t io_threads[PF_THREAD_COUNT];
+ struct radix_tree_root primary_io_queue;
+ struct radix_tree_root secondary_io_queue;
+ pthread_cond_t start_reading;
+ pthread_cond_t start_processing;
+ int agno;
+ int dirs_only;
+ volatile int can_start_reading;
+ volatile int can_start_processing;
+ volatile int prefetch_done;
+ volatile int queuing_done;
+ volatile int inode_bufs_queued;
+ volatile xfs_fsblock_t last_bno_read;
+ sem_t ra_count;
+ struct prefetch_args *next_args;
+} prefetch_args_t;
+
+
+
+void
+init_prefetch(
+ xfs_mount_t *pmp);
+
+prefetch_args_t *
+start_inode_prefetch(
+ xfs_agnumber_t agno,
+ int dirs_only,
+ prefetch_args_t *prev_args);
+
+void
+wait_for_inode_prefetch(
+ prefetch_args_t *args);
+
+void
+cleanup_inode_prefetch(
+ prefetch_args_t *args);
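/*
 * Intended call sequence, mirroring traverse_ags() in phase 6; chaining
 * prev_args lets the next AG's prefetch start as soon as the previous one
 * finishes queuing:
 *
 *	prefetch_args_t *pf = start_inode_prefetch(agno, dirs_only, prev);
 *	wait_for_inode_prefetch(pf);	// blocks until processing can start
 *	... process the AG, sem_post(&pf->ra_count) per inode chunk ...
 *	cleanup_inode_prefetch(pf);	// joins the queuing thread, frees pf
 */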
+
+
+#ifdef XR_PF_TRACE
+#define pftrace(msg...) _pftrace(__FUNCTION__, ## msg)
+void _pftrace(const char *, const char *, ...);
+#endif
#endif /* _XFS_REPAIR_PREFETCH_H */
#include <libxfs.h>
-#include "progress.h"
#include "globals.h"
+#include "progress.h"
#include "err_protos.h"
#include <signal.h>
time_t start;
time_t end;
time_t duration;
- __uint64_t item_counts[4];
+ __uint64_t item_counts[4];
} phase_times_t;
static phase_times_t phase_times[8];
/*
* Specify a repeating timer that fires each MSG_INTERVAL seconds.
*/
-
+
timespec.it_value.tv_sec = msgp->interval;
timespec.it_value.tv_nsec = 0;
timespec.it_interval.tv_sec = msgp->interval;
set_progress_msg (int report, __uint64_t total)
{
- if (!do_parallel)
+ if (!ag_stride)
return (0);
if (pthread_mutex_lock(&global_msgs.mutex))
__uint64_t sum;
msg_block_t *msgp = &global_msgs;
char msgbuf[DURATION_BUF_SIZE];
-
- if (!do_parallel)
+
+ if (!ag_stride)
return 0;
if (pthread_mutex_lock(&global_msgs.mutex))
time_t now;
struct tm *tmp;
+ if (verbose > 1)
+ cache_report(stderr, "libxfs_bcache", libxfs_bcache);
+
now = time(NULL);
if (end) {
}
strcat(buf, temp);
}
-
+
}
if (length >= ONEMINUTE) {
minutes = (length - sum) / ONEMINUTE;
strcat(buf, _(", "));
strcat(buf, temp);
}
-
+
return(buf);
}
#define PROG_FMT_REBUILD_AG 9 /* Phase 5 */
#define PROG_FMT_TRAVERSAL 10 /* Phase 6 */
-#define PROG_FMT_TRAVERSSUB 11
-#define PROG_FMT_DISCONINODE 12
+#define PROG_FMT_TRAVERSSUB 11
+#define PROG_FMT_DISCONINODE 12
#define PROGRESS_FMT_CORR_LINK 13 /* Phase 7 */
#define PROGRESS_FMT_VRFY_LINK 14
extern char *duration(int val, char *buf);
extern int do_parallel;
-#define PROG_RPT_INC(a,b) if (do_parallel && prog_rpt_done) (a) += (b)
+#define PROG_RPT_INC(a,b) if (ag_stride && prog_rpt_done) (a) += (b)
#endif /* _XFS_REPAIR_PROGRESS_RPT_H_ */
--- /dev/null
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <libxfs.h>
+#include "radix-tree.h"
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+#define RADIX_TREE_MAP_SHIFT 6
+#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
+
+#ifdef RADIX_TREE_TAGS
+#define RADIX_TREE_TAG_LONGS \
+ ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+#endif
+
+struct radix_tree_node {
+ unsigned int count;
+ void *slots[RADIX_TREE_MAP_SIZE];
+#ifdef RADIX_TREE_TAGS
+ unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
+#endif
+};
+
+struct radix_tree_path {
+ struct radix_tree_node *node;
+ int offset;
+};
+
+#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
+
+static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH];
+
+/*
+ * Radix tree node cache.
+ */
+
+#define radix_tree_node_alloc(r) ((struct radix_tree_node *) \
+ calloc(1, sizeof(struct radix_tree_node)))
+#define radix_tree_node_free(n) free(n)
+
+#ifdef RADIX_TREE_TAGS
+
+static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
+ int offset)
+{
+ *((__uint32_t *)node->tags[tag] + (offset >> 5)) |= (1 << (offset & 31));
+}
+
+static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
+ int offset)
+{
+ __uint32_t *p = (__uint32_t*)node->tags[tag] + (offset >> 5);
+ __uint32_t m = 1 << (offset & 31);
+ *p &= ~m;
+}
+
+static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
+ int offset)
+{
+ return 1 & (((const __uint32_t *)node->tags[tag])[offset >> 5] >> (offset & 31));
+}
+
+/*
+ * Returns 1 if any slot in the node has this tag set.
+ * Otherwise returns 0.
+ */
+static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
+{
+ int idx;
+ for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+ if (node->tags[tag][idx])
+ return 1;
+ }
+ return 0;
+}
+
+#endif
+
+/*
+ * Return the maximum key which can be stored into a
+ * radix tree with height HEIGHT.
+ */
+static inline unsigned long radix_tree_maxindex(unsigned int height)
+{
+ return height_to_maxindex[height];
+}
+
+/*
+ * Extend a radix tree so it can store key @index.
+ */
+static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
+{
+ struct radix_tree_node *node;
+ unsigned int height;
+#ifdef RADIX_TREE_TAGS
+ char tags[RADIX_TREE_MAX_TAGS];
+ int tag;
+#endif
+
+ /* Figure out what the height should be. */
+ height = root->height + 1;
+ while (index > radix_tree_maxindex(height))
+ height++;
+
+ if (root->rnode == NULL) {
+ root->height = height;
+ goto out;
+ }
+
+#ifdef RADIX_TREE_TAGS
+ /*
+ * Prepare the tag status of the top-level node for propagation
+ * into the newly-pushed top-level node(s)
+ */
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+ tags[tag] = 0;
+ if (any_tag_set(root->rnode, tag))
+ tags[tag] = 1;
+ }
+#endif
+ do {
+ if (!(node = radix_tree_node_alloc(root)))
+ return -ENOMEM;
+
+ /* Increase the height. */
+ node->slots[0] = root->rnode;
+
+#ifdef RADIX_TREE_TAGS
+ /* Propagate the aggregated tag info into the new root */
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+ if (tags[tag])
+ tag_set(node, tag, 0);
+ }
+#endif
+ node->count = 1;
+ root->rnode = node;
+ root->height++;
+ } while (height > root->height);
+out:
+ return 0;
+}
+
+/**
+ * radix_tree_insert - insert into a radix tree
+ * @root: radix tree root
+ * @index: index key
+ * @item: item to insert
+ *
+ * Insert an item into the radix tree at position @index.
+ */
+int radix_tree_insert(struct radix_tree_root *root,
+ unsigned long index, void *item)
+{
+ struct radix_tree_node *node = NULL, *slot;
+ unsigned int height, shift;
+ int offset;
+ int error;
+
+ /* Make sure the tree is high enough. */
+ if ((!index && !root->rnode) ||
+ index > radix_tree_maxindex(root->height)) {
+ error = radix_tree_extend(root, index);
+ if (error)
+ return error;
+ }
+
+ slot = root->rnode;
+ height = root->height;
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+
+ offset = 0; /* uninitialised var warning */
+ do {
+ if (slot == NULL) {
+ /* Have to add a child node. */
+ if (!(slot = radix_tree_node_alloc(root)))
+ return -ENOMEM;
+ if (node) {
+ node->slots[offset] = slot;
+ node->count++;
+ } else
+ root->rnode = slot;
+ }
+
+ /* Go a level down */
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ node = slot;
+ slot = node->slots[offset];
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ } while (height > 0);
+
+ if (slot != NULL)
+ return -EEXIST;
+
+ ASSERT(node);
+ node->count++;
+ node->slots[offset] = item;
+#ifdef RADIX_TREE_TAGS
+ ASSERT(!tag_get(node, 0, offset));
+ ASSERT(!tag_get(node, 1, offset));
+#endif
+ return 0;
+}
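/*
 * Illustrative use of the insert/lookup/delete API (radix_tree_init() must
 * have been called first; INIT_RADIX_TREE() is assumed to come from the
 * accompanying radix-tree.h, as used by start_inode_prefetch()):
 */
static void radix_example(void)
{
	struct radix_tree_root tree;
	static int payload;

	INIT_RADIX_TREE(&tree, 0);
	if (radix_tree_insert(&tree, 42, &payload) != 0)
		return;				/* -EEXIST or -ENOMEM */
	ASSERT(radix_tree_lookup(&tree, 42) == &payload);
	ASSERT(radix_tree_delete(&tree, 42) == &payload);
}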
+
+static inline void **__lookup_slot(struct radix_tree_root *root,
+ unsigned long index)
+{
+ unsigned int height, shift;
+ struct radix_tree_node **slot;
+
+ height = root->height;
+ if (index > radix_tree_maxindex(height))
+ return NULL;
+
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+ slot = &root->rnode;
+
+ while (height > 0) {
+ if (*slot == NULL)
+ return NULL;
+
+ slot = (struct radix_tree_node **)
+ ((*slot)->slots +
+ ((index >> shift) & RADIX_TREE_MAP_MASK));
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+
+ return (void **)slot;
+}
+
+/**
+ * radix_tree_lookup_slot - lookup a slot in a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Lookup the slot corresponding to the position @index in the radix tree
+ * @root. This is useful for update-if-exists operations.
+ */
+void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
+{
+ return __lookup_slot(root, index);
+}
+
+/**
+ * radix_tree_lookup - perform lookup operation on a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Lookup the item at the position @index in the radix tree @root.
+ */
+void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+{
+ void **slot;
+
+ slot = __lookup_slot(root, index);
+ return slot != NULL ? *slot : NULL;
+}
+
+/**
+ * radix_tree_lookup_first - find the first item and index key in the radix tree
+ * @root: radix tree root
+ * @index: where the first index will be placed
+ *
+ * Returns the first entry and index key in the radix tree @root.
+ */
+void *radix_tree_lookup_first(struct radix_tree_root *root, unsigned long *index)
+{
+ unsigned int height, shift;
+ struct radix_tree_node *slot;
+ unsigned long i;
+
+ height = root->height;
+ *index = 0;
+ if (height == 0)
+ return NULL;
+
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+ slot = root->rnode;
+
+ for (; height > 1; height--) {
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+ if (slot->slots[i] != NULL)
+ break;
+ }
+ ASSERT(i < RADIX_TREE_MAP_SIZE);
+
+ *index |= (i << shift);
+ shift -= RADIX_TREE_MAP_SHIFT;
+ slot = slot->slots[i];
+ }
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+ if (slot->slots[i] != NULL) {
+ *index |= i;
+ return slot->slots[i];
+ }
+ }
+ return NULL;
+}
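/*
 * pf_batch_read() drives its queues with this function: repeatedly take the
 * lowest-keyed entry, remove it, and act on it. A minimal drain loop over a
 * hypothetical consume() callback:
 */
extern void consume(unsigned long key, void *item);	/* hypothetical */

static void radix_drain(struct radix_tree_root *tree)
{
	unsigned long key;
	void *item;

	while ((item = radix_tree_lookup_first(tree, &key)) != NULL) {
		radix_tree_delete(tree, key);
		consume(key, item);
	}
}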
+
+#ifdef RADIX_TREE_TAGS
+
+/**
+ * radix_tree_tag_set - set a tag on a radix tree node
+ * @root: radix tree root
+ * @index: index key
+ * @tag: tag index
+ *
+ * Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ * corresponding to @index in the radix tree, from
+ * the root all the way down to the leaf node.
+ *
+ * Returns the address of the tagged item. Setting a tag on a not-present
+ * item is a bug.
+ */
+void *radix_tree_tag_set(struct radix_tree_root *root,
+ unsigned long index, unsigned int tag)
+{
+ unsigned int height, shift;
+ struct radix_tree_node *slot;
+
+ height = root->height;
+ if (index > radix_tree_maxindex(height))
+ return NULL;
+
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ slot = root->rnode;
+
+ while (height > 0) {
+ int offset;
+
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ if (!tag_get(slot, tag, offset))
+ tag_set(slot, tag, offset);
+ slot = slot->slots[offset];
+ ASSERT(slot != NULL);
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+
+ return slot;
+}
+
+/**
+ * radix_tree_tag_clear - clear a tag on a radix tree node
+ * @root: radix tree root
+ * @index: index key
+ * @tag: tag index
+ *
+ * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ * corresponding to @index in the radix tree. If
+ * this causes the leaf node to have no tags set then clear the tag in the
+ * next-to-leaf node, etc.
+ *
+ * Returns the address of the tagged item on success, else NULL; i.e. it
+ * has the same return value and semantics as radix_tree_lookup().
+ */
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+ unsigned long index, unsigned int tag)
+{
+ struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+ struct radix_tree_node *slot;
+ unsigned int height, shift;
+ void *ret = NULL;
+
+ height = root->height;
+ if (index > radix_tree_maxindex(height))
+ goto out;
+
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ pathp->node = NULL;
+ slot = root->rnode;
+
+ while (height > 0) {
+ int offset;
+
+ if (slot == NULL)
+ goto out;
+
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ pathp[1].offset = offset;
+ pathp[1].node = slot;
+ slot = slot->slots[offset];
+ pathp++;
+ shift -= RADIX_TREE_MAP_SHIFT;
+ height--;
+ }
+
+ ret = slot;
+ if (ret == NULL)
+ goto out;
+
+ do {
+ if (!tag_get(pathp->node, tag, pathp->offset))
+ goto out;
+ tag_clear(pathp->node, tag, pathp->offset);
+ if (any_tag_set(pathp->node, tag))
+ goto out;
+ pathp--;
+ } while (pathp->node);
+out:
+ return ret;
+}
+
+#endif
+
+static unsigned int
+__lookup(struct radix_tree_root *root, void **results, unsigned long index,
+ unsigned int max_items, unsigned long *next_index)
+{
+ unsigned int nr_found = 0;
+ unsigned int shift, height;
+ struct radix_tree_node *slot;
+ unsigned long i;
+
+ height = root->height;
+ if (height == 0)
+ goto out;
+
+ shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+ slot = root->rnode;
+
+ for ( ; height > 1; height--) {
+
+ for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
+ i < RADIX_TREE_MAP_SIZE; i++) {
+ if (slot->slots[i] != NULL)
+ break;
+ index &= ~((1UL << shift) - 1);
+ index += 1UL << shift;
+ if (index == 0)
+ goto out; /* 32-bit wraparound */
+ }
+ if (i == RADIX_TREE_MAP_SIZE)
+ goto out;
+
+ shift -= RADIX_TREE_MAP_SHIFT;
+ slot = slot->slots[i];
+ }
+
+ /* Bottom level: grab some items */
+ for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
+ index++;
+ if (slot->slots[i]) {
+ results[nr_found++] = slot->slots[i];
+ if (nr_found == max_items)
+ goto out;
+ }
+ }
+out:
+ *next_index = index;
+ return nr_found;
+}
+
+/**
+ * radix_tree_gang_lookup - perform multiple lookup on a radix tree
+ * @root: radix tree root
+ * @results: where the results of the lookup are placed
+ * @first_index: start the lookup from this key
+ * @max_items: place up to this many items at *results
+ *
+ * Performs an index-ascending scan of the tree for present items. Places
+ * them at *@results and returns the number of items which were placed at
+ * *@results.
+ *
+ * The implementation is naive.
+ */
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items)
+{
+ const unsigned long max_index = radix_tree_maxindex(root->height);
+ unsigned long cur_index = first_index;
+ unsigned int ret = 0;
+
+ while (ret < max_items) {
+ unsigned int nr_found;
+ unsigned long next_index; /* Index of next search */
+
+ if (cur_index > max_index)
+ break;
+ nr_found = __lookup(root, results + ret, cur_index,
+ max_items - ret, &next_index);
+ ret += nr_found;
+ if (next_index == 0)
+ break;
+ cur_index = next_index;
+ }
+ return ret;
+}
+
+/**
+ * radix_tree_gang_lookup_ex - perform multiple lookup on a radix tree
+ * @root: radix tree root
+ * @results: where the results of the lookup are placed
+ * @first_index: start the lookup from this key
+ * @last_index: don't lookup past this key
+ * @max_items: place up to this many items at *results
+ *
+ * Performs an index-ascending scan of the tree for present items, starting
+ * at @first_index and stopping before @last_index, up to @max_items. Places
+ * them at *@results and returns the number of items which were placed
+ * at *@results.
+ *
+ * The implementation is naive.
+ */
+unsigned int
+radix_tree_gang_lookup_ex(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned long last_index,
+ unsigned int max_items)
+{
+ const unsigned long max_index = radix_tree_maxindex(root->height);
+ unsigned long cur_index = first_index;
+ unsigned int ret = 0;
+
+ while (ret < max_items && cur_index < last_index) {
+ unsigned int nr_found;
+ unsigned long next_index; /* Index of next search */
+
+ if (cur_index > max_index)
+ break;
+ nr_found = __lookup(root, results + ret, cur_index,
+ max_items - ret, &next_index);
+ ret += nr_found;
+ if (next_index == 0)
+ break;
+ cur_index = next_index;
+ }
+ return ret;
+}
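/*
 * This is the variant pf_batch_read() uses: gather a run of queued buffers
 * whose keys start within pf_max_fsbs of the lowest queued block. A bounded
 * batch sketch (MAX_BATCH is illustrative):
 */
#define MAX_BATCH 128

static unsigned int radix_batch(struct radix_tree_root *tree,
		unsigned long first, unsigned long span, void **items)
{
	/* up to MAX_BATCH items with keys in [first, first + span) */
	return radix_tree_gang_lookup_ex(tree, items, first, first + span,
			MAX_BATCH);
}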
+
+#ifdef RADIX_TREE_TAGS
+
+static unsigned int
+__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
+ unsigned int max_items, unsigned long *next_index, unsigned int tag)
+{
+ unsigned int nr_found = 0;
+ unsigned int shift;
+ unsigned int height = root->height;
+ struct radix_tree_node *slot;
+
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ slot = root->rnode;
+
+ while (height > 0) {
+ unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+ for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
+ if (tag_get(slot, tag, i)) {
+ ASSERT(slot->slots[i] != NULL);
+ break;
+ }
+ index &= ~((1UL << shift) - 1);
+ index += 1UL << shift;
+ if (index == 0)
+ goto out; /* 32-bit wraparound */
+ }
+ if (i == RADIX_TREE_MAP_SIZE)
+ goto out;
+ height--;
+ if (height == 0) { /* Bottom level: grab some items */
+ unsigned long j = index & RADIX_TREE_MAP_MASK;
+
+ for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
+ index++;
+ if (tag_get(slot, tag, j)) {
+ ASSERT(slot->slots[j] != NULL);
+ results[nr_found++] = slot->slots[j];
+ if (nr_found == max_items)
+ goto out;
+ }
+ }
+ }
+ shift -= RADIX_TREE_MAP_SHIFT;
+ slot = slot->slots[i];
+ }
+out:
+ *next_index = index;
+ return nr_found;
+}
+
+/**
+ * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
+ * based on a tag
+ * @root: radix tree root
+ * @results: where the results of the lookup are placed
+ * @first_index: start the lookup from this key
+ * @max_items: place up to this many items at *results
+ * @tag: the tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ * Performs an index-ascending scan of the tree for present items which
+ * have the tag indexed by @tag set. Places the items at *@results and
+ * returns the number of items which were placed at *@results.
+ */
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items,
+ unsigned int tag)
+{
+ const unsigned long max_index = radix_tree_maxindex(root->height);
+ unsigned long cur_index = first_index;
+ unsigned int ret = 0;
+
+ while (ret < max_items) {
+ unsigned int nr_found;
+ unsigned long next_index; /* Index of next search */
+
+ if (cur_index > max_index)
+ break;
+ nr_found = __lookup_tag(root, results + ret, cur_index,
+ max_items - ret, &next_index, tag);
+ ret += nr_found;
+ if (next_index == 0)
+ break;
+ cur_index = next_index;
+ }
+ return ret;
+}
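/*
 * pf_queue_io() tags metadata entries with tag 0 so pf_batch_read() can
 * later pull only those. A sketch of that pairing, assuming an otherwise
 * empty tree:
 */
static void tag_example(struct radix_tree_root *tree, unsigned long key,
		void *item)
{
	void *results[16];
	unsigned int n;

	radix_tree_insert(tree, key, item);
	radix_tree_tag_set(tree, key, 0);	/* e.g. mark as metadata */

	/* collect up to 16 items tagged 0, scanning from key 0 upwards */
	n = radix_tree_gang_lookup_tag(tree, results, 0, 16, 0);
	ASSERT(n == 1 && results[0] == item);
}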
+
+#endif
+
+/**
+ * radix_tree_shrink - shrink height of a radix tree to minimal
+ * @root: radix tree root
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root)
+{
+ /* try to shrink tree height */
+ while (root->height > 1 &&
+ root->rnode->count == 1 &&
+ root->rnode->slots[0]) {
+ struct radix_tree_node *to_free = root->rnode;
+
+ root->rnode = to_free->slots[0];
+ root->height--;
+ /* must only free zeroed nodes into the slab */
+#ifdef RADIX_TREE_TAGS
+ tag_clear(to_free, 0, 0);
+ tag_clear(to_free, 1, 0);
+#endif
+ to_free->slots[0] = NULL;
+ to_free->count = 0;
+ radix_tree_node_free(to_free);
+ }
+}
+
+/**
+ * radix_tree_delete - delete an item from a radix tree
+ * @root: radix tree root
+ * @index: index key
+ *
+ * Remove the item at @index from the radix tree rooted at @root.
+ *
+ * Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+{
+ struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+ struct radix_tree_path *orig_pathp;
+ struct radix_tree_node *slot;
+ unsigned int height, shift;
+ void *ret = NULL;
+#ifdef RADIX_TREE_TAGS
+ char tags[RADIX_TREE_MAX_TAGS];
+ int nr_cleared_tags;
+ int tag;
+#endif
+ int offset;
+
+ height = root->height;
+ if (index > radix_tree_maxindex(height))
+ goto out;
+
+ shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+ pathp->node = NULL;
+ slot = root->rnode;
+
+ for ( ; height > 0; height--) {
+ if (slot == NULL)
+ goto out;
+
+ pathp++;
+ offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+ pathp->offset = offset;
+ pathp->node = slot;
+ slot = slot->slots[offset];
+ shift -= RADIX_TREE_MAP_SHIFT;
+ }
+
+ ret = slot;
+ if (ret == NULL)
+ goto out;
+
+ orig_pathp = pathp;
+
+#ifdef RADIX_TREE_TAGS
+ /*
+ * Clear all tags associated with the just-deleted item
+ */
+ nr_cleared_tags = 0;
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+ tags[tag] = 1;
+ if (tag_get(pathp->node, tag, pathp->offset)) {
+ tag_clear(pathp->node, tag, pathp->offset);
+ if (!any_tag_set(pathp->node, tag)) {
+ tags[tag] = 0;
+ nr_cleared_tags++;
+ }
+ }
+ }
+
+ for (pathp--; nr_cleared_tags && pathp->node; pathp--) {
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+ if (tags[tag])
+ continue;
+
+ tag_clear(pathp->node, tag, pathp->offset);
+ if (any_tag_set(pathp->node, tag)) {
+ tags[tag] = 1;
+ nr_cleared_tags--;
+ }
+ }
+ }
+#endif
+ /* Now free the nodes we do not need anymore */
+ for (pathp = orig_pathp; pathp->node; pathp--) {
+ pathp->node->slots[pathp->offset] = NULL;
+ pathp->node->count--;
+
+ if (pathp->node->count) {
+ if (pathp->node == root->rnode)
+ radix_tree_shrink(root);
+ goto out;
+ }
+
+ /* Node with zero slots in use so free it */
+ radix_tree_node_free(pathp->node);
+ }
+ root->rnode = NULL;
+ root->height = 0;
+out:
+ return ret;
+}
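+
+/*
+ * Usage sketch (names hypothetical): the return value is the deleted
+ * item itself, so lookup-and-remove is a single call:
+ *
+ *	struct my_obj *obj = radix_tree_delete(&my_tree, key);
+ *	if (obj)
+ *		free_my_obj(obj);
+ */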
+
+#ifdef RADIX_TREE_TAGS
+/**
+ * radix_tree_tagged - test whether any items in the tree are tagged
+ * @root: radix tree root
+ * @tag: tag to test
+ */
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
+{
+	struct radix_tree_node *rnode;
+
+	rnode = root->rnode;
+ if (!rnode)
+ return 0;
+ return any_tag_set(rnode, tag);
+}
+#endif
+
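+/*
+ * __maxindex returns the highest index a tree of the given height can
+ * hold. As a worked example, assuming RADIX_TREE_MAP_SHIFT is 6 on a
+ * 64-bit system: __maxindex(0) = 0, __maxindex(1) = 63,
+ * __maxindex(2) = 4095; once height * RADIX_TREE_MAP_SHIFT reaches
+ * RADIX_TREE_INDEX_BITS the result saturates at ~0UL.
+ */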
+static unsigned long __maxindex(unsigned int height)
+{
+ unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
+ unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
+
+ if (tmp >= RADIX_TREE_INDEX_BITS)
+ index = ~0UL;
+ return index;
+}
+
+static void radix_tree_init_maxindex(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
+ height_to_maxindex[i] = __maxindex(i);
+}
+
+void radix_tree_init(void)
+{
+ radix_tree_init_maxindex();
+}
--- /dev/null
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef __XFS_SUPPORT_RADIX_TREE_H__
+#define __XFS_SUPPORT_RADIX_TREE_H__
+
+#define RADIX_TREE_TAGS
+
+struct radix_tree_root {
+ unsigned int height;
+ struct radix_tree_node *rnode;
+};
+
+#define RADIX_TREE_INIT(mask) { \
+ .height = 0, \
+ .rnode = NULL, \
+}
+
+#define RADIX_TREE(name, mask) \
+ struct radix_tree_root name = RADIX_TREE_INIT(mask)
+
+#define INIT_RADIX_TREE(root, mask) \
+do { \
+ (root)->height = 0; \
+ (root)->rnode = NULL; \
+} while (0)
+
+#ifdef RADIX_TREE_TAGS
+#define RADIX_TREE_MAX_TAGS 2
+#endif
+
+int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void *radix_tree_lookup_first(struct radix_tree_root *, unsigned long *);
+void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items);
+unsigned int
+radix_tree_gang_lookup_ex(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned long last_index,
+ unsigned int max_items);
+
+void radix_tree_init(void);
+
+#ifdef RADIX_TREE_TAGS
+void *radix_tree_tag_set(struct radix_tree_root *root,
+ unsigned long index, unsigned int tag);
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+ unsigned long index, unsigned int tag);
+int radix_tree_tag_get(struct radix_tree_root *root,
+ unsigned long index, unsigned int tag);
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+ unsigned long first_index, unsigned int max_items,
+ unsigned int tag);
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
+#endif
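+
+/*
+ * Usage sketch of the basic API (names and error handling are
+ * illustrative only):
+ *
+ *	struct radix_tree_root tree;
+ *	void *item;
+ *
+ *	INIT_RADIX_TREE(&tree, 0);
+ *	if (radix_tree_insert(&tree, 42, my_item))
+ *		return;	/* out of memory, or slot already occupied */
+ *	item = radix_tree_lookup(&tree, 42);	/* -> my_item */
+ *	item = radix_tree_delete(&tree, 42);	/* -> my_item, now removed */
+ */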
+
+#endif /* __XFS_SUPPORT_RADIX_TREE_H__ */
#include "scan.h"
#include "versions.h"
#include "bmap.h"
-#include "prefetch.h"
#include "progress.h"
extern int verify_set_agheader(xfs_mount_t *mp, xfs_buf_t *sbuf, xfs_sb_t *sb,
agi_dirty = agf_dirty = sb_dirty = 0;
- if (do_prefetch)
- prefetch_sb(mp, agno);
-
sbbuf = libxfs_readbuf(mp->m_dev, XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
XFS_FSS_TO_BB(mp, 1), 0);
if (!sbbuf) {
scan_freelist(agf);
- if (do_prefetch)
- prefetch_roots(mp, agno, agf, agi);
-
if (INT_GET(agf->agf_roots[XFS_BTNUM_BNO], ARCH_CONVERT) != 0 &&
verify_agbno(mp, agno,
INT_GET(agf->agf_roots[XFS_BTNUM_BNO], ARCH_CONVERT)))
#include <libxfs.h>
-#include "pthread.h"
-#include "signal.h"
+#include <pthread.h>
+#include <signal.h>
#include "threads.h"
#include "err_protos.h"
#include "protos.h"
+#include "globals.h"
-int do_parallel = 1;
-int thread_count;
-
-/* A quantum of work */
-typedef struct work_s {
- struct work_s *next;
- disp_func_t *function;
- xfs_mount_t *mp;
- xfs_agnumber_t agno;
-} work_t;
-
-typedef struct work_queue_s {
- work_t *next;
- work_t *last;
- int active_threads;
- int work_count;
- pthread_cond_t mcv; /* main thread conditional variable */
- pthread_cond_t wcv; /* worker threads conditional variable */
- pthread_mutex_t mutex;
-} work_queue_t;
-
-static work_queue_t work_queue;
-static pthread_t *work_threads;
-
-static void *worker_thread(void *arg);
-
-static void
-init_workers(work_queue_t *wq, int nw)
+static void *
+worker_thread(void *arg)
{
- int err;
- pthread_mutexattr_t mtxattr;
-
- memset(wq, 0, sizeof(work_queue_t));
- wq->active_threads = nw;
-
- pthread_cond_init(&wq->mcv, NULL);
- pthread_cond_init(&wq->wcv, NULL);
- pthread_mutexattr_init(&mtxattr);
-
-#ifdef PTHREAD_MUTEX_SPINBLOCK_NP
- /* NP - Non Portable - Irix */
- if ((err = pthread_mutexattr_settype(&mtxattr,
- PTHREAD_MUTEX_SPINBLOCK_NP)) > 0) {
- do_error(_("init_workers: thread 0x%x: pthread_mutexattr_settype error %d: %s\n"),
- pthread_self(), err, strerror(err));
- }
-#endif
-#ifdef PTHREAD_MUTEX_FAST_NP
- /* NP - Non Portable - Linux */
- if ((err = pthread_mutexattr_settype(&mtxattr,
- PTHREAD_MUTEX_FAST_NP)) > 0) {
- do_error(_("init_workers: thread 0x%x: pthread_mutexattr_settype error %d: %s\n"),
- pthread_self(), err, strerror(err));
- }
-#endif
- if ((err = pthread_mutex_init(&wq->mutex, &mtxattr)) > 0) {
- do_error(_("init_workers: thread 0x%x: pthread_mutex_init error %d: %s\n"),
- pthread_self(), err, strerror(err));
- }
-}
+ work_queue_t *wq;
+ work_item_t *wi;
-static void
-quiesce_workers(work_queue_t *wq)
-{
- int err;
+ wq = (work_queue_t*)arg;
- if ((err = pthread_mutex_lock(&wq->mutex)) > 0)
- do_error(_("quiesce_workers: thread 0x%x: pthread_mutex_lock error %d: %s\n"),
- pthread_self(), err, strerror(err));
- if (wq->active_threads > 0) {
- if ((err = pthread_cond_wait(&wq->mcv, &wq->mutex)) > 0)
- do_error(_("quiesce_workers: thread 0x%x: pthread_cond_wait error %d: %s\n"),
- pthread_self(), err, strerror(err));
- }
- ASSERT(wq->active_threads == 0);
- if ((err = pthread_mutex_unlock(&wq->mutex)) > 0)
- do_error(_("quiesce_workers: thread 0x%x: pthread_mutex_unlock error %d: %s\n"),
- pthread_self(), err, strerror(err));
-}
+ /*
+	 * Loop pulling work from the passed-in work queue.
+ * Check for notification to exit after every chunk of work.
+ */
+ while (1) {
+ pthread_mutex_lock(&wq->lock);
-static void
-start_workers(work_queue_t *wq, unsigned thcnt, pthread_attr_t *attrp)
-{
- int err;
- unsigned long i;
+ /*
+ * Wait for work.
+ */
+ while (wq->next_item == NULL && !wq->terminate) {
+ ASSERT(wq->item_count == 0);
+ pthread_cond_wait(&wq->wakeup, &wq->lock);
+ }
+ if (wq->next_item == NULL && wq->terminate) {
+ pthread_mutex_unlock(&wq->lock);
+ break;
+ }
- init_workers(wq, thcnt);
+ /*
+ * Dequeue work from the head of the list.
+ */
+ ASSERT(wq->item_count > 0);
+ wi = wq->next_item;
+ wq->next_item = wi->next;
+ wq->item_count--;
- if ((work_threads = (pthread_t *)malloc(sizeof(pthread_t) * thcnt)) == NULL)
- do_error(_("cannot malloc %ld bytes for work_threads array\n"),
- sizeof(pthread_t) * thcnt);
+ pthread_mutex_unlock(&wq->lock);
- /*
- ** Create worker threads
- */
- for (i = 0; i < thcnt; i++) {
- err = pthread_create(&work_threads[i], attrp, worker_thread, (void *) i);
- if(err > 0) {
- do_error(_("cannot create worker threads, status = [%d] %s\n"),
- err, strerror(err));
- }
+ (wi->function)(wi->queue, wi->agno, wi->arg);
+ free(wi);
}
- do_log(_(" - creating %d worker thread(s)\n"), thcnt);
- /*
- ** Wait for all worker threads to initialize
- */
- quiesce_workers(wq);
+ return NULL;
}
void
thread_init(void)
{
- int status;
- pthread_attr_t attr;
sigset_t blocked;
- if (do_parallel == 0)
- return;
- if (thread_count == 0)
- thread_count = 2 * libxfs_nproc();
-
- if ((status = pthread_attr_init(&attr)) != 0)
- do_error(_("status from pthread_attr_init: %d"),status);
-
- if ((status = pthread_setconcurrency(thread_count)) != 0)
- do_error(_("Status from pthread_setconcurrency(%d): %d"), thread_count, status);
-
/*
* block delivery of progress report signal to all threads
*/
	sigemptyset(&blocked);
	sigaddset(&blocked, SIGHUP);
sigaddset(&blocked, SIGALRM);
pthread_sigmask(SIG_BLOCK, &blocked, NULL);
-
- start_workers(&work_queue, thread_count, &attr);
}
-/*
- * Dequeue from the head of the list.
- * wq->mutex held.
- */
-static work_t *
-dequeue(work_queue_t *wq)
+
+void
+create_work_queue(
+ work_queue_t *wq,
+ xfs_mount_t *mp,
+ int nworkers)
{
- work_t *wp;
+ int err;
+ int i;
- ASSERT(wq->work_count > 0);
- wp = wq->next;
- wq->next = wp->next;
- wq->work_count--;
- if (wq->next == NULL) {
- ASSERT(wq->work_count == 0);
- wq->last = NULL;
- }
- wp->next = NULL;
- return (wp);
-}
+ memset(wq, 0, sizeof(work_queue_t));
-static void *
-worker_thread(void *arg)
-{
- work_queue_t *wq;
- work_t *wp;
- int err;
- unsigned long myid;
+ pthread_cond_init(&wq->wakeup, NULL);
+ pthread_mutex_init(&wq->lock, NULL);
- wq = &work_queue;
- myid = (unsigned long) arg;
- ts_init();
- libxfs_lio_allocate();
+ wq->mp = mp;
+ wq->thread_count = nworkers;
+	wq->threads = malloc(nworkers * sizeof(pthread_t));
+	if (wq->threads == NULL)
+		do_error(_("cannot allocate worker thread array, error = [%d] %s\n"),
+			errno, strerror(errno));
+ wq->terminate = 0;
- /*
- * Loop pulling work from the global work queue.
- * Check for notification to exit after every chunk of work.
- */
- while (1) {
- if ((err = pthread_mutex_lock(&wq->mutex)) > 0)
- do_error(_("work_thread%d: thread 0x%x: pthread_mutex_lock error %d: %s\n"),
- myid, pthread_self(), err, strerror(err));
- /*
- * Wait for work.
- */
- while (wq->next == NULL) {
- ASSERT(wq->work_count == 0);
- /*
- * Last thread going to idle sleep must wakeup
- * the master thread. Same mutex is used to lock
- * around two different condition variables.
- */
- wq->active_threads--;
- ASSERT(wq->active_threads >= 0);
- if (!wq->active_threads) {
- if ((err = pthread_cond_signal(&wq->mcv)) > 0)
- do_error(_("work_thread%d: thread 0x%x: pthread_cond_signal error %d: %s\n"),
- myid, pthread_self(), err, strerror(err));
- }
- if ((err = pthread_cond_wait(&wq->wcv, &wq->mutex)) > 0)
- do_error(_("work_thread%d: thread 0x%x: pthread_cond_wait error %d: %s\n"),
- myid, pthread_self(), err, strerror(err));
- wq->active_threads++;
+ for (i = 0; i < nworkers; i++) {
+ err = pthread_create(&wq->threads[i], NULL, worker_thread, wq);
+ if (err != 0) {
+ do_error(_("cannot create worker threads, error = [%d] %s\n"),
+ err, strerror(err));
}
- /*
- * Dequeue work from the head of the list.
- */
- ASSERT(wq->work_count > 0);
- wp = dequeue(wq);
- if ((err = pthread_mutex_unlock(&wq->mutex)) > 0)
- do_error(_("work_thread%d: thread 0x%x: pthread_mutex_unlock error %d: %s\n"),
- myid, pthread_self(), err, strerror(err));
- /*
- * Do the work.
- */
- (wp->function)(wp->mp, wp->agno);
-
- free(wp);
}
- /* NOT REACHED */
- pthread_exit(NULL);
- return (NULL);
}
-int
-queue_work(disp_func_t func, xfs_mount_t *mp, xfs_agnumber_t agno)
+void
+queue_work(
+ work_queue_t *wq,
+ work_func_t func,
+ xfs_agnumber_t agno,
+ void *arg)
{
- work_queue_t *wq;
- work_t *wp;
+ work_item_t *wi;
- if (do_parallel == 0) {
- func(mp, agno);
- return 0;
- }
- wq = &work_queue;
- /*
- * Get memory for a new work structure.
- */
- if ((wp = (work_t *)memalign(8, sizeof(work_t))) == NULL)
- return (ENOMEM);
- /*
- * Initialize the new work structure.
- */
- wp->function = func;
- wp->mp = mp;
- wp->agno = agno;
+ wi = (work_item_t *)malloc(sizeof(work_item_t));
+ if (wi == NULL)
+ do_error(_("cannot allocate worker item, error = [%d] %s\n"),
+ errno, strerror(errno));
+
+ wi->function = func;
+ wi->agno = agno;
+ wi->arg = arg;
+ wi->queue = wq;
+ wi->next = NULL;
/*
* Now queue the new work structure to the work queue.
*/
- if (wq->next == NULL) {
- wq->next = wp;
+ pthread_mutex_lock(&wq->lock);
+ if (wq->next_item == NULL) {
+ wq->next_item = wi;
+ ASSERT(wq->item_count == 0);
+ pthread_cond_signal(&wq->wakeup);
} else {
- wq->last->next = wp;
+ wq->last_item->next = wi;
}
- wq->last = wp;
- wp->next = NULL;
- wq->work_count++;
-
- return (0);
+ wq->last_item = wi;
+ wq->item_count++;
+ pthread_mutex_unlock(&wq->lock);
}
void
-wait_for_workers(void)
+destroy_work_queue(
+ work_queue_t *wq)
{
- int err;
- work_queue_t *wq;
+ int i;
- if (do_parallel == 0)
- return;
- wq = &work_queue;
- if ((err = pthread_mutex_lock(&wq->mutex)) > 0)
- do_error(_("wait_for_workers: thread 0x%x: pthread_mutex_lock error %d: %s\n"),
- pthread_self(), err, strerror(err));
+ pthread_mutex_lock(&wq->lock);
+ wq->terminate = 1;
+ pthread_mutex_unlock(&wq->lock);
- ASSERT(wq->active_threads == 0);
- if (wq->work_count > 0) {
- /* get the workers going */
- if ((err = pthread_cond_broadcast(&wq->wcv)) > 0)
- do_error(_("wait_for_workers: thread 0x%x: pthread_cond_broadcast error %d: %s\n"),
- pthread_self(), err, strerror(err));
- /* and wait for them */
- if ((err = pthread_cond_wait(&wq->mcv, &wq->mutex)) > 0)
- do_error(_("wait_for_workers: thread 0x%x: pthread_cond_wait error %d: %s\n"),
- pthread_self(), err, strerror(err));
- }
- ASSERT(wq->active_threads == 0);
- ASSERT(wq->work_count == 0);
+ pthread_cond_broadcast(&wq->wakeup);
+
+ for (i = 0; i < wq->thread_count; i++)
+ pthread_join(wq->threads[i], NULL);
- if ((err = pthread_mutex_unlock(&wq->mutex)) > 0)
- do_error(_("wait_for_workers: thread 0x%x: pthread_mutex_unlock error %d: %s\n"),
- pthread_self(), err, strerror(err));
+ free(wq->threads);
+ pthread_mutex_destroy(&wq->lock);
+ pthread_cond_destroy(&wq->wakeup);
}
#ifndef _XFS_REPAIR_THREADS_H_
#define _XFS_REPAIR_THREADS_H_
-extern int do_parallel;
-extern int thread_count;
-/*
-** locking variants - rwlock/mutex
-*/
-#define PREPAIR_RW_LOCK_ATTR PTHREAD_PROCESS_PRIVATE
-
-#define PREPAIR_RW_LOCK_ALLOC(lkp, n) \
- if (do_parallel) { \
- lkp = malloc(n*sizeof(pthread_rwlock_t)); \
- if (lkp == NULL) \
- do_error("cannot alloc %d locks\n", n); \
- /* NO RETURN */ \
- }
-#define PREPAIR_RW_LOCK_INIT(l,a) if (do_parallel) pthread_rwlock_init((l),(a))
-#define PREPAIR_RW_READ_LOCK(l) if (do_parallel) pthread_rwlock_rdlock((l))
-#define PREPAIR_RW_WRITE_LOCK(l) if (do_parallel) pthread_rwlock_wrlock((l))
-#define PREPAIR_RW_UNLOCK(l) if (do_parallel) pthread_rwlock_unlock((l))
-#define PREPAIR_RW_WRITE_LOCK_NOTEST(l) pthread_rwlock_wrlock((l))
-#define PREPAIR_RW_UNLOCK_NOTEST(l) pthread_rwlock_unlock((l))
-#define PREPAIR_RW_LOCK_DELETE(l) if (do_parallel) pthread_rwlock_destroy((l))
-
-#define PREPAIR_MTX_LOCK_INIT(m, a) if (do_parallel) pthread_mutex_init((m), (a))
-#define PREPAIR_MTX_ATTR_INIT(a) if (do_parallel) pthread_mutexattr_init((a))
-#define PREPAIR_MTX_ATTR_SET(a, l) if (do_parallel) pthread_mutexattr_settype((a), l)
-#define PREPAIR_MTX_LOCK(m) if (do_parallel) pthread_mutex_lock(m)
-#define PREPAIR_MTX_UNLOCK(m) if (do_parallel) pthread_mutex_unlock(m)
-
-
-typedef void disp_func_t(xfs_mount_t *mp, xfs_agnumber_t agno);
-extern int queue_work(disp_func_t func, xfs_mount_t *mp, xfs_agnumber_t agno);
-extern void wait_for_workers(void);
+void thread_init(void);
+
+struct work_queue;
+
+typedef void work_func_t(struct work_queue *, xfs_agnumber_t, void *);
+
+typedef struct work_item {
+ struct work_item *next;
+ work_func_t *function;
+ struct work_queue *queue;
+ xfs_agnumber_t agno;
+ void *arg;
+} work_item_t;
+
+typedef struct work_queue {
+ work_item_t *next_item;
+ work_item_t *last_item;
+ int item_count;
+ int thread_count;
+ pthread_t *threads;
+ xfs_mount_t *mp;
+ pthread_mutex_t lock;
+ pthread_cond_t wakeup;
+ int terminate;
+} work_queue_t;
+
+void
+create_work_queue(
+ work_queue_t *wq,
+ xfs_mount_t *mp,
+ int nworkers);
+
+void
+queue_work(
+ work_queue_t *wq,
+ work_func_t func,
+ xfs_agnumber_t agno,
+ void *arg);
+
+void
+destroy_work_queue(
+ work_queue_t *wq);
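+
+/*
+ * Typical use, sketched with a hypothetical per-AG worker function;
+ * create the queue, feed it one item per AG, then destroy it.
+ * destroy_work_queue() lets the workers drain the queue and joins
+ * them, so it doubles as the wait-for-completion point:
+ *
+ *	static void
+ *	scan_ag_worker(struct work_queue *wq, xfs_agnumber_t agno, void *arg)
+ *	{
+ *		do_some_ag_work(wq->mp, agno);
+ *	}
+ *
+ *	work_queue_t wq;
+ *	xfs_agnumber_t agno;
+ *
+ *	create_work_queue(&wq, mp, thread_count);
+ *	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++)
+ *		queue_work(&wq, scan_ag_worker, agno, NULL);
+ *	destroy_work_queue(&wq);
+ */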
#endif /* _XFS_REPAIR_THREADS_H_ */
"ihash",
#define BHASH_SIZE 3
"bhash",
-#define PREFETCH_INO_CNT 4
- "pfino",
-#define PREFETCH_DIR_CNT 5
- "pfdir",
-#define PREFETCH_AIO_CNT 6
- "pfaio",
-#define AG_STRIDE 7
+#define AG_STRIDE 4
"ag_stride",
NULL
};
+static int ihash_option_used;
+static int bhash_option_used;
+
static void
usage(void)
{
pre_65_beta = 0;
fs_shared_allowed = 1;
ag_stride = 0;
- thread_count = 0;
- do_parallel = 0;
+ thread_count = 1;
report_interval = PROG_RPT_DEFAULT;
/*
break;
case IHASH_SIZE:
libxfs_ihash_size = (int) strtol(val, 0, 0);
+ ihash_option_used = 1;
break;
case BHASH_SIZE:
libxfs_bhash_size = (int) strtol(val, 0, 0);
- break;
- case PREFETCH_INO_CNT:
- libxfs_lio_ino_count = (int) strtol(val, 0, 0);
- break;
- case PREFETCH_DIR_CNT:
- libxfs_lio_dir_count = (int) strtol(val, 0, 0);
- break;
- case PREFETCH_AIO_CNT:
- libxfs_lio_aio_count = (int) strtol(val, 0, 0);
+ bhash_option_used = 1;
break;
case AG_STRIDE:
ag_stride = (int) strtol(val, 0, 0);
printf(_("%s version %s\n"), progname, VERSION);
exit(0);
case 'P':
- do_prefetch ^= 1;
- break;
- case 'M':
- do_parallel ^= 1;
+ do_prefetch = 0;
break;
case 't':
report_interval = (int) strtol(optarg, 0, 0);
bindtextdomain(PACKAGE, LOCALEDIR);
textdomain(PACKAGE);
+#ifdef XR_PF_TRACE
+ pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
+ setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
+#endif
+
temp_mp = &xfs_m;
setbuf(stdout, NULL);
process_args(argc, argv);
xfs_init(&x);
+ msgbuf = malloc(DURATION_BUF_SIZE);
+
timestamp(PHASE_START, 0, NULL);
timestamp(PHASE_END, 0, NULL);
inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
if (ag_stride) {
- do_parallel = 1;
- thread_count = (mp->m_sb.sb_agcount + ag_stride - 1) / ag_stride;
+ thread_count = (glob_agcount + ag_stride - 1) / ag_stride;
thread_init();
}
- if (do_parallel && report_interval) {
+ if (ag_stride && report_interval) {
init_progress_rpt();
- msgbuf = malloc(DURATION_BUF_SIZE);
if (msgbuf) {
do_log(_(" - reporting progress in intervals of %s\n"),
duration(report_interval, msgbuf));
- free(msgbuf);
}
}
+ /*
+ * Adjust libxfs cache sizes based on system memory,
+ * filesystem size and inode count.
+ *
+	 * We'll set the cache size to use three quarters of physical
+	 * memory, minus the space used by the inode AVL tree and the
+	 * block usage map.
+	 *
+	 * The inode AVL tree takes approximately 4 bytes per inode;
+	 * the block usage map currently takes 1 byte per 2 blocks.
+ *
+ * We assume most blocks will be inode clusters.
+ *
+ * Calculations are done in kilobyte units.
+ */
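+	/*
+	 * As a worked example with hypothetical round numbers: for a
+	 * filesystem with 10 million inodes, 100 million blocks and
+	 * 2GB of memory, imem is about 39000K (4 bytes per inode) and
+	 * dmem about 48800K (1 byte per 2 blocks); 3/4 of 2GB is
+	 * 1572864K, leaving roughly 1485000K for the buffer cache.
+	 */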
+
+ if (!bhash_option_used) {
+ unsigned long mem_used;
+ unsigned long phys_mem;
+
+ libxfs_icache_purge();
+ libxfs_bcache_purge();
+ cache_destroy(libxfs_icache);
+ cache_destroy(libxfs_bcache);
+
+ mem_used = (mp->m_sb.sb_icount >> (10 - 2)) +
+ (mp->m_sb.sb_dblocks >> (10 + 1));
+ phys_mem = libxfs_physmem() * 3 / 4;
+
+ if (verbose > 1)
+ do_log(_(" - icount = %llu, imem = %lu, "
+ "dblock = %llu, dmem = %lu\n"),
+ mp->m_sb.sb_icount, mp->m_sb.sb_icount >> (10 - 2),
+ mp->m_sb.sb_dblocks, mp->m_sb.sb_dblocks >> (10 + 1));
+
+ if (phys_mem <= mem_used) {
+ /*
+ * Turn off prefetch and minimise libxfs cache if
+ * physical memory is deemed insufficient
+ */
+ do_prefetch = 0;
+ libxfs_bhash_size = 64;
+ } else {
+ phys_mem -= mem_used;
+ if (phys_mem >= (1 << 30))
+ phys_mem = 1 << 30;
+ libxfs_bhash_size = phys_mem / (HASH_CACHE_RATIO *
+ (mp->m_inode_cluster_size >> 10));
+ if (libxfs_bhash_size < 512)
+ libxfs_bhash_size = 512;
+ }
+
+ if (verbose)
+ do_log(_(" - block cache size set to %d entries\n"),
+ libxfs_bhash_size * HASH_CACHE_RATIO);
+
+ if (!ihash_option_used)
+ libxfs_ihash_size = libxfs_bhash_size;
+
+ libxfs_icache = cache_init(libxfs_ihash_size,
+ &libxfs_icache_operations);
+ libxfs_bcache = cache_init(libxfs_bhash_size,
+ &libxfs_bcache_operations);
+ }
+
/*
* calculate what mkfs would do to this filesystem
*/
phase2(mp);
timestamp(PHASE_END, 2, NULL);
+ if (do_prefetch)
+ init_prefetch(mp);
+
phase3(mp);
timestamp(PHASE_END, 3, NULL);
phase4(mp);
timestamp(PHASE_END, 4, NULL);
- /* XXX: nathans - something in phase4 ain't playing by */
- /* the buffer cache rules.. why doesn't IRIX hit this? */
- libxfs_bcache_flush();
-
if (no_modify)
printf(_("No modify flag set, skipping phase 5\n"));
else {
phase6(mp);
timestamp(PHASE_END, 6, NULL);
- libxfs_bcache_flush();
-
phase7(mp);
timestamp(PHASE_END, 7, NULL);
} else {
}
}
- if (do_parallel && report_interval)
+ if (ag_stride && report_interval)
stop_progress_rpt();
if (no_modify) {
return(0);
}
- /*
- * Done, flush all cached buffers and inodes.
- */
- libxfs_icache_purge();
- libxfs_bcache_purge();
-
/*
* Clear the quota flags if they're on.
*/
libxfs_writebuf(sbp, 0);
+ /*
+	 * Done, flush all cached buffers to disk.
+ */
+ libxfs_bcache_flush();
+
libxfs_umount(mp);
if (x.rtdev)
libxfs_device_close(x.rtdev);
if (verbose)
summary_report();
do_log(_("done\n"));
+#ifdef XR_PF_TRACE
+ fclose(pf_trace_file);
+#endif
return (0);
}