From: Alex Elder Date: Wed, 25 Nov 2009 23:44:40 +0000 (-0600) Subject: Revert "3.0.5 release" and some of its preceding commits. X-Git-Tag: v3.1.0~16^2~1 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=62bdee3872d0de9a673b1f4809165db96cc470b3;p=thirdparty%2Fxfsprogs-dev.git Revert "3.0.5 release" and some of its preceding commits. This reverts 11 commits that followed merge 15a60a5...: b0567f1 3.0.5 release 24d9757 add lpath_to_handle to libhandle bad0fe5 repair: add missing locking in scanfunc_bmap 2098754 repair: optimize duplicate extent tracking 241ea1c repair: switch block usage bitmap to a btree af20fe6 repair: cleanup alloc/free/reset of the block... add8f66 repair: cleanup helpers for tracking block usage da9398d repair: track logical to physical block mapping... d081a36 repair: clean up prefetch tracing d93f8b2 repair: use single prefetch queue eb26465 repair: use a btree instead of a radix tree for... Signed-off-by: Alex Elder --- diff --git a/VERSION b/VERSION index 9dff34aa3..601881826 100644 --- a/VERSION +++ b/VERSION @@ -3,5 +3,5 @@ # PKG_MAJOR=3 PKG_MINOR=0 -PKG_REVISION=5 +PKG_REVISION=4 PKG_BUILD=1 diff --git a/doc/CHANGES b/doc/CHANGES index 2e4d2ac6f..515c04c00 100644 --- a/doc/CHANGES +++ b/doc/CHANGES @@ -1,19 +1,3 @@ -xfsprogs-3.0.5 (23 October 2009) - - Use btrees in xfs_repair in a number of critical data - structures, in place of bitmaps and radix trees, resulting - in reduced memory and CPU requirements for large file - systems. - - Various other performance improvements in xfs_repair. - - Add a new function lpath_to_handle() to libhandle, which - allows symlinks to be handled more robustly. - - Tweak the code so a single scanfunc_allocbt() can be used - in place of the two nearly-identical functions used before. - - Add support for discarding blocks to mkfs (along with a - command-line option to avoid its use if desired). - - Allow use of libblkid from util-linux if it is available, - for determining device geometry. - - A few configuration and build improvements. - xfsprogs-3.0.4 (17 September 2009) - Fix a memory leak in xfsprogs. - Increase hash chain length in xfsprogs when running out of memory. diff --git a/include/handle.h b/include/handle.h index 3f1a137f7..b211a2f45 100644 --- a/include/handle.h +++ b/include/handle.h @@ -27,8 +27,6 @@ struct attrlist_cursor; struct parent; extern int path_to_handle (char *__path, void **__hanp, size_t *__hlen); -extern int lpath_to_handle (char *__fspath, char *__path, - void **__hanp, size_t *__hlen); extern int path_to_fshandle (char *__path, void **__fshanp, size_t *__fshlen); extern int handle_to_fshandle (void *__hanp, size_t __hlen, void **__fshanp, size_t *__fshlen); diff --git a/libhandle/handle.c b/libhandle/handle.c index 6c9380de3..627679748 100644 --- a/libhandle/handle.c +++ b/libhandle/handle.c @@ -110,30 +110,17 @@ path_to_handle( char *path, /* input, path to convert */ void **hanp, /* output, pointer to data */ size_t *hlen) /* output, size of returned data */ -{ - return lpath_to_handle(path, path, hanp, hlen); -} - -/* Like path_to_handle, but reliable for paths which are either dangling - * symlinks or symlinks whose targets are not in XFS filesystems. 
- */ -int -lpath_to_handle( - char *fspath, /* input, path in filesystem */ - char *path, /* input, path to convert */ - void **hanp, /* output, pointer to data */ - size_t *hlen) /* output, size of returned data */ { int fd; int result; comarg_t obj; - fd = open(fspath, O_RDONLY); + fd = open(path, O_RDONLY); if (fd < 0) return -1; obj.path = path; - result = obj_to_handle(fspath, fd, XFS_IOC_PATH_TO_HANDLE, + result = obj_to_handle(path, fd, XFS_IOC_PATH_TO_HANDLE, obj, hanp, hlen); close(fd); return result; diff --git a/repair/Makefile b/repair/Makefile index fa96df52a..a80ea41fd 100644 --- a/repair/Makefile +++ b/repair/Makefile @@ -9,15 +9,15 @@ LSRCFILES = README LTCOMMAND = xfs_repair -HFILES = agheader.h attr_repair.h avl.h avl64.h bmap.h btree.h \ - dinode.h dir.h dir2.h err_protos.h globals.h incore.h protos.h rt.h \ - progress.h scan.h versions.h prefetch.h threads.h +HFILES = agheader.h attr_repair.h avl.h avl64.h bmap.h dinode.h dir.h \ + dir2.h err_protos.h globals.h incore.h protos.h rt.h \ + progress.h scan.h versions.h prefetch.h radix-tree.h threads.h -CFILES = agheader.c attr_repair.c avl.c avl64.c bmap.c btree.c \ - dino_chunks.c dinode.c dir.c dir2.c globals.c incore.c \ +CFILES = agheader.c attr_repair.c avl.c avl64.c bmap.c dino_chunks.c \ + dinode.c dir.c dir2.c globals.c incore.c \ incore_bmc.c init.c incore_ext.c incore_ino.c phase1.c \ phase2.c phase3.c phase4.c phase5.c phase6.c phase7.c \ - progress.c prefetch.c rt.c sb.c scan.c threads.c \ + progress.c prefetch.c radix-tree.c rt.c sb.c scan.c threads.c \ versions.c xfs_repair.c LLDLIBS = $(LIBXFS) $(LIBXLOG) $(LIBUUID) $(LIBRT) $(LIBPTHREAD) @@ -32,7 +32,9 @@ include $(BUILDRULES) # # Tracing flags: +# -DXR_BMAP_DBG incore block bitmap debugging # -DXR_INODE_TRACE inode processing +# -DXR_BMAP_TRACE bmap btree processing # -DXR_DIR_TRACE directory processing # -DXR_DUP_TRACE duplicate extent processing # -DXR_BCNT_TRACE incore bcnt freespace btree building diff --git a/repair/bmap.c b/repair/bmap.c index 79b9f79f4..05d5da89b 100644 --- a/repair/bmap.c +++ b/repair/bmap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2001,2005,2008 Silicon Graphics, Inc. + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -21,46 +21,106 @@ #include "bmap.h" /* - * Track the logical to physical block mapping for inodes. - * - * Repair only processes one inode at a given time per thread, and the - * block map does not have to outlive the processing of a single inode. - * - * The combination of those factors means we can use pthreads thread-local - * storage to store the block map, and we can re-use the allocation over - * and over again. + * Block mapping code taken from xfs_db. + */ + +/* + * Append an extent to the block entry. */ +void +blkent_append( + blkent_t **entp, + xfs_dfsbno_t b, + xfs_dfilblks_t c) +{ + blkent_t *ent; + size_t size; + int i; -pthread_key_t dblkmap_key; -pthread_key_t ablkmap_key; + ent = *entp; + size = BLKENT_SIZE(c + ent->nblks); + if ((*entp = ent = realloc(ent, size)) == NULL) { + do_warn(_("realloc failed in blkent_append (%u bytes)\n"), + size); + return; + } + for (i = 0; i < c; i++) + ent->blks[ent->nblks + i] = b + i; + ent->nblks += c; +} + +/* + * Make a new block entry. 
+ */ +blkent_t * +blkent_new( + xfs_dfiloff_t o, + xfs_dfsbno_t b, + xfs_dfilblks_t c) +{ + blkent_t *ent; + int i; + + if ((ent = malloc(BLKENT_SIZE(c))) == NULL) { + do_warn(_("malloc failed in blkent_new (%u bytes)\n"), + BLKENT_SIZE(c)); + return ent; + } + ent->nblks = c; + ent->startoff = o; + for (i = 0; i < c; i++) + ent->blks[i] = b + i; + return ent; +} +/* + * Prepend an extent to the block entry. + */ +void +blkent_prepend( + blkent_t **entp, + xfs_dfsbno_t b, + xfs_dfilblks_t c) +{ + int i; + blkent_t *newent; + blkent_t *oldent; + + oldent = *entp; + if ((newent = malloc(BLKENT_SIZE(oldent->nblks + c))) == NULL) { + do_warn(_("malloc failed in blkent_prepend (%u bytes)\n"), + BLKENT_SIZE(oldent->nblks + c)); + *entp = newent; + return; + } + newent->nblks = oldent->nblks + c; + newent->startoff = oldent->startoff - c; + for (i = 0; i < c; i++) + newent->blks[i] = b + c; + for (; i < oldent->nblks + c; i++) + newent->blks[i] = oldent->blks[i - c]; + free(oldent); + *entp = newent; +} + +/* + * Allocate a block map. + */ blkmap_t * blkmap_alloc( - xfs_extnum_t nex, - int whichfork) + xfs_extnum_t nex) { - pthread_key_t key; blkmap_t *blkmap; - ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK); - if (nex < 1) nex = 1; - - key = whichfork ? ablkmap_key : dblkmap_key; - blkmap = pthread_getspecific(key); - if (!blkmap || blkmap->naexts < nex) { - blkmap = realloc(blkmap, BLKMAP_SIZE(nex)); - if (!blkmap) { - do_warn(_("malloc failed in blkmap_alloc (%u bytes)\n"), - BLKMAP_SIZE(nex)); - return NULL; - } - pthread_setspecific(key, blkmap); - blkmap->naexts = nex; + if ((blkmap = malloc(BLKMAP_SIZE(nex))) == NULL) { + do_warn(_("malloc failed in blkmap_alloc (%u bytes)\n"), + BLKMAP_SIZE(nex)); + return blkmap; } - - blkmap->nexts = 0; + blkmap->naents = nex; + blkmap->nents = 0; return blkmap; } @@ -71,7 +131,14 @@ void blkmap_free( blkmap_t *blkmap) { - /* nothing to do! - keep the memory around for the next inode */ + blkent_t **entp; + xfs_extnum_t i; + + if (blkmap == NULL) + return; + for (i = 0, entp = blkmap->ents; i < blkmap->nents; i++, entp++) + free(*entp); + free(blkmap); } /* @@ -82,18 +149,20 @@ blkmap_get( blkmap_t *blkmap, xfs_dfiloff_t o) { - bmap_ext_t *ext = blkmap->exts; + blkent_t *ent; + blkent_t **entp; int i; - for (i = 0; i < blkmap->nexts; i++, ext++) { - if (o >= ext->startoff && o < ext->startoff + ext->blockcount) - return ext->startblock + (o - ext->startoff); + for (i = 0, entp = blkmap->ents; i < blkmap->nents; i++, entp++) { + ent = *entp; + if (o >= ent->startoff && o < ent->startoff + ent->nblks) + return ent->blks[o - ent->startoff]; } return NULLDFSBNO; } /* - * Get a chunk of entries from a block map - only used for reading dirv2 blocks + * Get a chunk of entries from a block map. */ int blkmap_getn( @@ -103,62 +172,93 @@ blkmap_getn( bmap_ext_t **bmpp, bmap_ext_t *bmpp_single) { - bmap_ext_t *bmp = NULL; - bmap_ext_t *ext; + bmap_ext_t *bmp; + blkent_t *ent; + xfs_dfiloff_t ento; + blkent_t **entp; int i; int nex; if (nb == 1) { - /* + /* * in the common case, when mp->m_dirblkfsbs == 1, * avoid additional malloc/free overhead */ bmpp_single->startblock = blkmap_get(blkmap, o); - goto single_ext; + bmpp_single->blockcount = 1; + bmpp_single->startoff = 0; + bmpp_single->flag = 0; + *bmpp = bmpp_single; + return (bmpp_single->startblock != NULLDFSBNO) ? 
1 : 0; } - ext = blkmap->exts; - nex = 0; - for (i = 0; i < blkmap->nexts; i++, ext++) { - - if (ext->startoff >= o + nb) + for (i = nex = 0, bmp = NULL, entp = blkmap->ents; + i < blkmap->nents; + i++, entp++) { + ent = *entp; + if (ent->startoff >= o + nb) break; - if (ext->startoff + ext->blockcount <= o) + if (ent->startoff + ent->nblks <= o) continue; - - /* - * if all the requested blocks are in one extent (also common), - * use the bmpp_single option as well - */ - if (!bmp && o >= ext->startoff && - o + nb <= ext->startoff + ext->blockcount) { - bmpp_single->startblock = - ext->startblock + (o - ext->startoff); - goto single_ext; + for (ento = ent->startoff; + ento < ent->startoff + ent->nblks && ento < o + nb; + ento++) { + if (ento < o) + continue; + if (bmp && + bmp[nex - 1].startoff + bmp[nex - 1].blockcount == + ento && + bmp[nex - 1].startblock + bmp[nex - 1].blockcount == + ent->blks[ento - ent->startoff]) + bmp[nex - 1].blockcount++; + else { + bmp = realloc(bmp, ++nex * sizeof(*bmp)); + if (bmp == NULL) { + do_warn(_("blkmap_getn realloc failed" + " (%u bytes)\n"), + nex * sizeof(*bmp)); + continue; + } + bmp[nex - 1].startoff = ento; + bmp[nex - 1].startblock = + ent->blks[ento - ent->startoff]; + bmp[nex - 1].blockcount = 1; + bmp[nex - 1].flag = 0; + } } - - /* - * rare case - multiple extents for a single dir block - */ - bmp = malloc(nb * sizeof(bmap_ext_t)); - if (!bmp) - do_error(_("blkmap_getn malloc failed (%u bytes)\n"), - nb * sizeof(bmap_ext_t)); - - bmp[nex].startblock = ext->startblock + (o - ext->startoff); - bmp[nex].blockcount = MIN(nb, ext->blockcount - - (bmp[nex].startblock - ext->startblock)); - o += bmp[nex].blockcount; - nb -= bmp[nex].blockcount; - nex++; } *bmpp = bmp; return nex; +} + +/* + * Make a block map larger. + */ +void +blkmap_grow( + blkmap_t **blkmapp, + blkent_t **entp, + blkent_t *newent) +{ + blkmap_t *blkmap; + size_t size; + int i; + int idx; -single_ext: - bmpp_single->blockcount = nb; - bmpp_single->startoff = 0; /* not even used by caller! */ - *bmpp = bmpp_single; - return (bmpp_single->startblock != NULLDFSBNO) ? 
1 : 0; + blkmap = *blkmapp; + idx = (int)(entp - blkmap->ents); + if (blkmap->naents == blkmap->nents) { + size = BLKMAP_SIZE(blkmap->nents + 1); + if ((*blkmapp = blkmap = realloc(blkmap, size)) == NULL) { + do_warn(_("realloc failed in blkmap_grow (%u bytes)\n"), + size); + return; + } + blkmap->naents++; + } + for (i = blkmap->nents; i > idx; i--) + blkmap->ents[i] = blkmap->ents[i - 1]; + blkmap->ents[idx] = newent; + blkmap->nents++; } /* @@ -168,12 +268,12 @@ xfs_dfiloff_t blkmap_last_off( blkmap_t *blkmap) { - bmap_ext_t *ext; + blkent_t *ent; - if (!blkmap->nexts) + if (!blkmap->nents) return NULLDFILOFF; - ext = blkmap->exts + blkmap->nexts - 1; - return ext->startoff + ext->blockcount; + ent = blkmap->ents[blkmap->nents - 1]; + return ent->startoff + ent->nblks; } /* @@ -185,45 +285,73 @@ blkmap_next_off( xfs_dfiloff_t o, int *t) { - bmap_ext_t *ext; + blkent_t *ent; + blkent_t **entp; - if (!blkmap->nexts) + if (!blkmap->nents) return NULLDFILOFF; if (o == NULLDFILOFF) { *t = 0; - return blkmap->exts[0].startoff; + ent = blkmap->ents[0]; + return ent->startoff; } - ext = blkmap->exts + *t; - if (o < ext->startoff + ext->blockcount - 1) + entp = &blkmap->ents[*t]; + ent = *entp; + if (o < ent->startoff + ent->nblks - 1) return o + 1; - if (*t >= blkmap->nexts - 1) + entp++; + if (entp >= &blkmap->ents[blkmap->nents]) return NULLDFILOFF; (*t)++; - return ext[1].startoff; + ent = *entp; + return ent->startoff; } /* - * Make a block map larger. + * Set a block value in a block map. */ -static blkmap_t * -blkmap_grow( - blkmap_t **blkmapp) +void +blkmap_set_blk( + blkmap_t **blkmapp, + xfs_dfiloff_t o, + xfs_dfsbno_t b) { - pthread_key_t key = dblkmap_key; - blkmap_t *blkmap = *blkmapp; + blkmap_t *blkmap; + blkent_t *ent; + blkent_t **entp; + blkent_t *nextent; - if (pthread_getspecific(key) != blkmap) { - key = ablkmap_key; - ASSERT(pthread_getspecific(key) == blkmap); + blkmap = *blkmapp; + for (entp = blkmap->ents; entp < &blkmap->ents[blkmap->nents]; entp++) { + ent = *entp; + if (o < ent->startoff - 1) { + ent = blkent_new(o, b, 1); + blkmap_grow(blkmapp, entp, ent); + return; + } + if (o == ent->startoff - 1) { + blkent_prepend(entp, b, 1); + return; + } + if (o >= ent->startoff && o < ent->startoff + ent->nblks) { + ent->blks[o - ent->startoff] = b; + return; + } + if (o > ent->startoff + ent->nblks) + continue; + blkent_append(entp, b, 1); + if (entp == &blkmap->ents[blkmap->nents - 1]) + return; + ent = *entp; + nextent = entp[1]; + if (ent->startoff + ent->nblks < nextent->startoff) + return; + blkent_append(entp, nextent->blks[0], nextent->nblks); + blkmap_shrink(blkmap, &entp[1]); + return; } - - blkmap->naexts += 4; - blkmap = realloc(blkmap, BLKMAP_SIZE(blkmap->naexts)); - if (blkmap == NULL) - do_error(_("realloc failed in blkmap_grow\n")); - *blkmapp = blkmap; - pthread_setspecific(key, blkmap); - return blkmap; + ent = blkent_new(o, b, 1); + blkmap_grow(blkmapp, entp, ent); } /* @@ -236,23 +364,46 @@ blkmap_set_ext( xfs_dfsbno_t b, xfs_dfilblks_t c) { - blkmap_t *blkmap = *blkmapp; + blkmap_t *blkmap; + blkent_t *ent; + blkent_t **entp; xfs_extnum_t i; - if (blkmap->nexts == blkmap->naexts) - blkmap = blkmap_grow(blkmapp); - - for (i = 0; i < blkmap->nexts; i++) { - if (blkmap->exts[i].startoff > o) { - memmove(blkmap->exts + i + 1, - blkmap->exts + i, - sizeof(bmap_ext_t) * (blkmap->nexts - i)); - break; - } + blkmap = *blkmapp; + if (!blkmap->nents) { + blkmap->ents[0] = blkent_new(o, b, c); + blkmap->nents = 1; + return; + } + entp = &blkmap->ents[blkmap->nents - 
1]; + ent = *entp; + if (ent->startoff + ent->nblks == o) { + blkent_append(entp, b, c); + return; + } + if (ent->startoff + ent->nblks < o) { + ent = blkent_new(o, b, c); + blkmap_grow(blkmapp, &blkmap->ents[blkmap->nents], ent); + return; } + for (i = 0; i < c; i++) + blkmap_set_blk(blkmapp, o + i, b + i); +} + +/* + * Make a block map smaller. + */ +void +blkmap_shrink( + blkmap_t *blkmap, + blkent_t **entp) +{ + int i; + int idx; - blkmap->exts[i].startoff = o; - blkmap->exts[i].startblock = b; - blkmap->exts[i].blockcount = c; - blkmap->nexts++; + free(*entp); + idx = (int)(entp - blkmap->ents); + for (i = idx + 1; i < blkmap->nents; i++) + blkmap->ents[i] = blkmap->ents[i - 1]; + blkmap->nents--; } diff --git a/repair/bmap.h b/repair/bmap.h index 58abf95fd..eba1799f5 100644 --- a/repair/bmap.h +++ b/repair/bmap.h @@ -16,41 +16,59 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef _XFS_REPAIR_BMAP_H -#define _XFS_REPAIR_BMAP_H +/* + * Block mapping code taken from xfs_db. + */ /* - * Extent descriptor. + * Block map entry. */ -typedef struct bmap_ext { +typedef struct blkent { xfs_dfiloff_t startoff; - xfs_dfsbno_t startblock; - xfs_dfilblks_t blockcount; -} bmap_ext_t; + xfs_dfilblks_t nblks; + xfs_dfsbno_t blks[1]; +} blkent_t; +#define BLKENT_SIZE(n) \ + (offsetof(blkent_t, blks) + (sizeof(xfs_dfsbno_t) * (n))) /* * Block map. */ typedef struct blkmap { - int naexts; - int nexts; - bmap_ext_t exts[1]; + int naents; + int nents; + blkent_t *ents[1]; } blkmap_t; - #define BLKMAP_SIZE(n) \ - (offsetof(blkmap_t, exts) + (sizeof(bmap_ext_t) * (n))) - -blkmap_t *blkmap_alloc(xfs_extnum_t nex, int whichfork); -void blkmap_free(blkmap_t *blkmap); + (offsetof(blkmap_t, ents) + (sizeof(blkent_t *) * (n))) -void blkmap_set_ext(blkmap_t **blkmapp, xfs_dfiloff_t o, - xfs_dfsbno_t b, xfs_dfilblks_t c); +/* + * Extent descriptor. + */ +typedef struct bmap_ext { + xfs_dfiloff_t startoff; + xfs_dfsbno_t startblock; + xfs_dfilblks_t blockcount; + int flag; +} bmap_ext_t; +void blkent_append(blkent_t **entp, xfs_dfsbno_t b, + xfs_dfilblks_t c); +blkent_t *blkent_new(xfs_dfiloff_t o, xfs_dfsbno_t b, xfs_dfilblks_t c); +void blkent_prepend(blkent_t **entp, xfs_dfsbno_t b, + xfs_dfilblks_t c); +blkmap_t *blkmap_alloc(xfs_extnum_t); +void blkmap_free(blkmap_t *blkmap); xfs_dfsbno_t blkmap_get(blkmap_t *blkmap, xfs_dfiloff_t o); int blkmap_getn(blkmap_t *blkmap, xfs_dfiloff_t o, - xfs_dfilblks_t nb, bmap_ext_t **bmpp, + xfs_dfilblks_t nb, bmap_ext_t **bmpp, bmap_ext_t *bmpp_single); +void blkmap_grow(blkmap_t **blkmapp, blkent_t **entp, + blkent_t *newent); xfs_dfiloff_t blkmap_last_off(blkmap_t *blkmap); xfs_dfiloff_t blkmap_next_off(blkmap_t *blkmap, xfs_dfiloff_t o, int *t); - -#endif /* _XFS_REPAIR_BMAP_H */ +void blkmap_set_blk(blkmap_t **blkmapp, xfs_dfiloff_t o, + xfs_dfsbno_t b); +void blkmap_set_ext(blkmap_t **blkmapp, xfs_dfiloff_t o, + xfs_dfsbno_t b, xfs_dfilblks_t c); +void blkmap_shrink(blkmap_t *blkmap, blkent_t **entp); diff --git a/repair/btree.c b/repair/btree.c deleted file mode 100644 index f91f96bc0..000000000 --- a/repair/btree.c +++ /dev/null @@ -1,1234 +0,0 @@ -/* - * Copyright (c) 2007, Silicon Graphics, Inc. Barry Naujok - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include -#include "btree.h" - - -#define BTREE_KEY_MAX 7 -#define BTREE_KEY_MIN (BTREE_KEY_MAX / 2) - -#define BTREE_PTR_MAX (BTREE_KEY_MAX + 1) - -struct btree_node { - unsigned long num_keys; - unsigned long keys[BTREE_KEY_MAX]; - struct btree_node * ptrs[BTREE_PTR_MAX]; -}; - -struct btree_cursor { - struct btree_node *node; - int index; -}; - -struct btree_root { - struct btree_node *root_node; - struct btree_cursor *cursor; /* track path to end leaf */ - int height; - /* lookup cache */ - int keys_valid; /* set if the cache is valid */ - unsigned long cur_key; - unsigned long next_key; - void *next_value; - unsigned long prev_key; - void *prev_value; -#ifdef BTREE_STATS - struct btree_stats { - unsigned long num_items; - unsigned long max_items; - int alloced; - int cache_hits; - int cache_misses; - int lookup; - int find; - int key_update; - int value_update; - int insert; - int delete; - int inc_height; - int dec_height; - int shift_prev; - int shift_next; - int split; - int merge_prev; - int merge_next; - int balance_prev; - int balance_next; - } stats; -#endif -}; - - -static struct btree_node * -btree_node_alloc(void) -{ - return calloc(1, sizeof(struct btree_node)); -} - -static void -btree_node_free( - struct btree_node *node) -{ - free(node); -} - -static void -btree_free_nodes( - struct btree_node *node, - int level) -{ - int i; - - if (level) - for (i = 0; i <= node->num_keys; i++) - btree_free_nodes(node->ptrs[i], level - 1); - btree_node_free(node); -} - -static void -__btree_init( - struct btree_root *root) -{ - memset(root, 0, sizeof(struct btree_root)); - root->height = 1; - root->cursor = calloc(1, sizeof(struct btree_cursor)); - root->root_node = btree_node_alloc(); - ASSERT(root->root_node); -#ifdef BTREE_STATS - root->stats.max_items = 1; - root->stats.alloced += 1; -#endif -} - -static void -__btree_free( - struct btree_root *root) -{ - btree_free_nodes(root->root_node, root->height - 1); - free(root->cursor); - root->height = 0; - root->cursor = NULL; - root->root_node = NULL; -} - -void -btree_init( - struct btree_root **root) -{ - *root = calloc(1, sizeof(struct btree_root)); - __btree_init(*root); -} - -void -btree_clear( - struct btree_root *root) -{ - __btree_free(root); - __btree_init(root); -} - -void -btree_destroy( - struct btree_root *root) -{ - __btree_free(root); - free(root); -} - -int -btree_is_empty( - struct btree_root *root) -{ - return root->root_node->num_keys == 0; -} - -static inline void -btree_invalidate_cursor( - struct btree_root *root) -{ - root->cursor[0].node = NULL; - root->keys_valid = 0; -} - -static inline unsigned long -btree_key_of_cursor( - struct btree_cursor *cursor, - int height) -{ - while (cursor->node->num_keys == cursor->index && --height > 0) - cursor++; - return cursor->node->keys[cursor->index]; -} - -static void * -btree_get_prev( - struct btree_root *root, - unsigned long *key) -{ - struct btree_cursor *cur = root->cursor; - int level = 0; - struct btree_node *node; - - if (cur->index > 0) { - if (key) - *key = cur->node->keys[cur->index - 1]; - return 
cur->node->ptrs[cur->index - 1]; - } - - /* else need to go up and back down the tree to find the previous */ - - while (cur->index == 0) { - if (++level == root->height) - return NULL; - cur++; - } - - /* the key is in the current level */ - if (key) - *key = cur->node->keys[cur->index - 1]; - - /* descend back down the right side to get the pointer */ - node = cur->node->ptrs[cur->index - 1]; - while (level--) - node = node->ptrs[node->num_keys]; - return node; -} - -static void * -btree_get_next( - struct btree_root *root, - unsigned long *key) -{ - struct btree_cursor *cur = root->cursor; - int level = 0; - struct btree_node *node; - - while (cur->index == cur->node->num_keys) { - if (++level == root->height) - return NULL; - cur++; - } - if (level == 0) { - if (key) { - cur->index++; - *key = btree_key_of_cursor(cur, root->height); - cur->index--; - } - return cur->node->ptrs[cur->index + 1]; - } - - node = cur->node->ptrs[cur->index + 1]; - while (--level > 0) - node = node->ptrs[0]; - if (key) - *key = node->keys[0]; - return node->ptrs[0]; -} - -/* - * Lookup/Search functions - */ - -static int -btree_do_search( - struct btree_root *root, - unsigned long key) -{ - unsigned long k = 0; - struct btree_cursor *cur = root->cursor + root->height; - struct btree_node *node = root->root_node; - int height = root->height; - int key_found = 0; - int i; - - while (--height >= 0) { - cur--; - for (i = 0; i < node->num_keys; i++) - if (node->keys[i] >= key) { - k = node->keys[i]; - key_found = 1; - break; - } - cur->node = node; - cur->index = i; - node = node->ptrs[i]; - } - root->keys_valid = key_found; - if (!key_found) - return 0; - - root->cur_key = k; - root->next_value = NULL; /* do on-demand next value lookup */ - root->prev_value = btree_get_prev(root, &root->prev_key); - return 1; -} - -static int -btree_search( - struct btree_root *root, - unsigned long key) -{ - if (root->keys_valid && key <= root->cur_key && - (!root->prev_value || key > root->prev_key)) { -#ifdef BTREE_STATS - root->stats.cache_hits++; -#endif - return 1; - } -#ifdef BTREE_STATS - root->stats.cache_misses++; -#endif - return btree_do_search(root, key); -} - -void * -btree_find( - struct btree_root *root, - unsigned long key, - unsigned long *actual_key) -{ -#ifdef BTREE_STATS - root->stats.find += 1; -#endif - if (!btree_search(root, key)) - return NULL; - - if (actual_key) - *actual_key = root->cur_key; - return root->cursor->node->ptrs[root->cursor->index]; -} - -void * -btree_lookup( - struct btree_root *root, - unsigned long key) -{ -#ifdef BTREE_STATS - root->stats.lookup += 1; -#endif - if (!btree_search(root, key) || root->cur_key != key) - return NULL; - return root->cursor->node->ptrs[root->cursor->index]; -} - -void * -btree_peek_prev( - struct btree_root *root, - unsigned long *key) -{ - if (!root->keys_valid) - return NULL; - if (key) - *key = root->prev_key; - return root->prev_value; -} - -void * -btree_peek_next( - struct btree_root *root, - unsigned long *key) -{ - if (!root->keys_valid) - return NULL; - if (!root->next_value) - root->next_value = btree_get_next(root, &root->next_key); - if (key) - *key = root->next_key; - return root->next_value; -} - -static void * -btree_move_cursor_to_next( - struct btree_root *root, - unsigned long *key) -{ - struct btree_cursor *cur = root->cursor; - int level = 0; - - while (cur->index == cur->node->num_keys) { - if (++level == root->height) - return NULL; - cur++; - } - cur->index++; - if (level == 0) { - if (key) - *key = btree_key_of_cursor(cur, 
root->height); - return cur->node->ptrs[cur->index]; - } - - while (--level >= 0) { - root->cursor[level].node = cur->node->ptrs[cur->index]; - root->cursor[level].index = 0; - cur--; - } - if (key) - *key = cur->node->keys[0]; - return cur->node->ptrs[0]; -} - -void * -btree_lookup_next( - struct btree_root *root, - unsigned long *key) -{ - void *value; - - if (!root->keys_valid) - return NULL; - - root->prev_key = root->cur_key; - root->prev_value = root->cursor->node->ptrs[root->cursor->index]; - - value = btree_move_cursor_to_next(root, &root->cur_key); - if (!value) { - btree_invalidate_cursor(root); - return NULL; - } - root->next_value = NULL; /* on-demand next value fetch */ - if (key) - *key = root->cur_key; - return value; -} - -static void * -btree_move_cursor_to_prev( - struct btree_root *root, - unsigned long *key) -{ - struct btree_cursor *cur = root->cursor; - int level = 0; - - while (cur->index == 0) { - if (++level == root->height) - return NULL; - cur++; - } - cur->index--; - if (key) /* the key is in the current level */ - *key = cur->node->keys[cur->index]; - while (level > 0) { - level--; - root->cursor[level].node = cur->node->ptrs[cur->index]; - root->cursor[level].index = root->cursor[level].node->num_keys; - cur--; - } - return cur->node->ptrs[cur->index]; -} - -void * -btree_lookup_prev( - struct btree_root *root, - unsigned long *key) -{ - void *value; - - if (!root->keys_valid) - return NULL; - - value = btree_move_cursor_to_prev(root, &root->cur_key); - if (!value) - return NULL; - root->prev_value = btree_get_prev(root, &root->prev_key); - root->next_value = NULL; /* on-demand next value fetch */ - if (key) - *key = root->cur_key; - return value; -} - -void * -btree_uncached_lookup( - struct btree_root *root, - unsigned long key) -{ - /* cursor-less (ie. uncached) lookup */ - int height = root->height - 1; - struct btree_node *node = root->root_node; - int i; - int key_found = 0; - - while (height >= 0) { - for (i = 0; i < node->num_keys; i++) - if (node->keys[i] >= key) { - key_found = node->keys[i] == key; - break; - } - node = node->ptrs[i]; - height--; - } - return key_found ? 
node : NULL; -} - -/* Update functions */ - -static inline void -btree_update_node_key( - struct btree_root *root, - struct btree_cursor *cursor, - int level, - unsigned long new_key) -{ - int i; - -#ifdef BTREE_STATS - root->stats.key_update += 1; -#endif - - cursor += level; - for (i = level; i < root->height; i++) { - if (cursor->index < cursor->node->num_keys) { - cursor->node->keys[cursor->index] = new_key; - break; - } - cursor++; - } -} - -int -btree_update_key( - struct btree_root *root, - unsigned long old_key, - unsigned long new_key) -{ - if (!btree_search(root, old_key) || root->cur_key != old_key) - return ENOENT; - - if (root->next_value && new_key >= root->next_key) - return EINVAL; - - if (root->prev_value && new_key <= root->prev_key) - return EINVAL; - - btree_update_node_key(root, root->cursor, 0, new_key); - - return 0; -} - -int -btree_update_value( - struct btree_root *root, - unsigned long key, - void *new_value) -{ - if (!new_value) - return EINVAL; - - if (!btree_search(root, key) || root->cur_key != key) - return ENOENT; - -#ifdef BTREE_STATS - root->stats.value_update += 1; -#endif - root->cursor->node->ptrs[root->cursor->index] = new_value; - - return 0; -} - -/* - * Cursor modification functions - used for inserting and deleting - */ - -static struct btree_cursor * -btree_copy_cursor_prev( - struct btree_root *root, - struct btree_cursor *dest_cursor, - int level) -{ - struct btree_cursor *src_cur = root->cursor + level; - struct btree_cursor *dst_cur; - int l = level; - int i; - - if (level >= root->height) - return NULL; - - while (src_cur->index == 0) { - if (++l >= root->height) - return NULL; - src_cur++; - } - for (i = l; i < root->height; i++) - dest_cursor[i] = *src_cur++; - - dst_cur = dest_cursor + l; - dst_cur->index--; - while (l-- >= level) { - dest_cursor[l].node = dst_cur->node->ptrs[dst_cur->index]; - dest_cursor[l].index = dest_cursor[l].node->num_keys; - dst_cur--; - } - return dest_cursor; -} - -static struct btree_cursor * -btree_copy_cursor_next( - struct btree_root *root, - struct btree_cursor *dest_cursor, - int level) -{ - struct btree_cursor *src_cur = root->cursor + level; - struct btree_cursor *dst_cur; - int l = level; - int i; - - if (level >= root->height) - return NULL; - - while (src_cur->index == src_cur->node->num_keys) { - if (++l >= root->height) - return NULL; - src_cur++; - } - for (i = l; i < root->height; i++) - dest_cursor[i] = *src_cur++; - - dst_cur = dest_cursor + l; - dst_cur->index++; - while (l-- >= level) { - dest_cursor[l].node = dst_cur->node->ptrs[dst_cur->index]; - dest_cursor[l].index = 0; - dst_cur--; - } - return dest_cursor; -} - -/* - * Shift functions - * - * Tries to move items in the current leaf to its sibling if it has space. - * Used in both insert and delete functions. - * Returns the number of items shifted. 
- */ - -static int -btree_shift_to_prev( - struct btree_root *root, - int level, - struct btree_cursor *prev_cursor, - int num_children) -{ - struct btree_node *node; - struct btree_node *prev_node; - int num_remain; /* # of keys left in "node" */ - unsigned long key; - int i; - - if (!prev_cursor || !num_children) - return 0; - - prev_node = prev_cursor[level].node; - node = root->cursor[level].node; - - ASSERT(num_children > 0 && num_children <= node->num_keys + 1); - - if ((prev_node->num_keys + num_children) > BTREE_KEY_MAX) - return 0; - -#ifdef BTREE_STATS - root->stats.shift_prev += 1; -#endif - - num_remain = node->num_keys - num_children; - ASSERT(num_remain == -1 || num_remain >= BTREE_KEY_MIN); - - /* shift parent keys around */ - level++; - if (num_remain > 0) - key = node->keys[num_children - 1]; - else - key = btree_key_of_cursor(root->cursor + level, - root->height - level); - while (prev_cursor[level].index == prev_cursor[level].node->num_keys) { - level++; - ASSERT(level < root->height); - } - prev_node->keys[prev_node->num_keys] = - prev_cursor[level].node->keys[prev_cursor[level].index]; - prev_cursor[level].node->keys[prev_cursor[level].index] = key; - - /* copy pointers and keys to the end of the prev node */ - for (i = 0; i < num_children - 1; i++) { - prev_node->keys[prev_node->num_keys + 1 + i] = node->keys[i]; - prev_node->ptrs[prev_node->num_keys + 1 + i] = node->ptrs[i]; - } - prev_node->ptrs[prev_node->num_keys + 1 + i] = node->ptrs[i]; - prev_node->num_keys += num_children; - - /* move remaining pointers/keys to start of node */ - if (num_remain >= 0) { - for (i = 0; i < num_remain; i++) { - node->keys[i] = node->keys[num_children + i]; - node->ptrs[i] = node->ptrs[num_children + i]; - } - node->ptrs[i] = node->ptrs[num_children + i]; - node->num_keys = num_remain; - } else - node->num_keys = 0; - - return num_children; -} - -static int -btree_shift_to_next( - struct btree_root *root, - int level, - struct btree_cursor *next_cursor, - int num_children) -{ - struct btree_node *node; - struct btree_node *next_node; - int num_remain; /* # of children left in node */ - int i; - - if (!next_cursor || !num_children) - return 0; - - node = root->cursor[level].node; - next_node = next_cursor[level].node; - - ASSERT(num_children > 0 && num_children <= node->num_keys + 1); - - if ((next_node->num_keys + num_children) > BTREE_KEY_MAX) - return 0; - - num_remain = node->num_keys + 1 - num_children; - ASSERT(num_remain == 0 || num_remain > BTREE_KEY_MIN); - -#ifdef BTREE_STATS - root->stats.shift_next += 1; -#endif - - /* make space for "num_children" items at beginning of next-leaf */ - i = next_node->num_keys; - next_node->ptrs[num_children + i] = next_node->ptrs[i]; - while (--i >= 0) { - next_node->keys[num_children + i] = next_node->keys[i]; - next_node->ptrs[num_children + i] = next_node->ptrs[i]; - } - - /* update keys in parent and next node from parent */ - do { - level++; - ASSERT(level < root->height); - } while (root->cursor[level].index == root->cursor[level].node->num_keys); - - next_node->keys[num_children - 1] = - root->cursor[level].node->keys[root->cursor[level].index]; - root->cursor[level].node->keys[root->cursor[level].index] = - node->keys[node->num_keys - num_children]; - - /* copy last "num_children" items from node into start of next-node */ - for (i = 0; i < num_children - 1; i++) { - next_node->keys[i] = node->keys[num_remain + i]; - next_node->ptrs[i] = node->ptrs[num_remain + i]; - } - next_node->ptrs[i] = node->ptrs[num_remain + i]; - 
next_node->num_keys += num_children; - - if (num_remain > 0) - node->num_keys -= num_children; - else - node->num_keys = 0; - - return num_children; -} - -/* - * Insertion functions - */ - -static struct btree_node * -btree_increase_height( - struct btree_root *root) -{ - struct btree_node *new_root; - struct btree_cursor *new_cursor; - - new_cursor = realloc(root->cursor, (root->height + 1) * - sizeof(struct btree_cursor)); - if (!new_cursor) - return NULL; - root->cursor = new_cursor; - - new_root = btree_node_alloc(); - if (!new_root) - return NULL; - -#ifdef BTREE_STATS - root->stats.alloced += 1; - root->stats.inc_height += 1; - root->stats.max_items *= BTREE_PTR_MAX; -#endif - - new_root->ptrs[0] = root->root_node; - root->root_node = new_root; - - root->cursor[root->height].node = new_root; - root->cursor[root->height].index = 0; - - root->height++; - - return new_root; -} - -static int -btree_insert_item( - struct btree_root *root, - int level, - unsigned long key, - void *value); - - -static struct btree_node * -btree_split( - struct btree_root *root, - int level, - unsigned long key, - int *index) -{ - struct btree_node *node = root->cursor[level].node; - struct btree_node *new_node; - int i; - - new_node = btree_node_alloc(); - if (!new_node) - return NULL; - - if (btree_insert_item(root, level + 1, node->keys[BTREE_KEY_MIN], - new_node) != 0) { - btree_node_free(new_node); - return NULL; - } - -#ifdef BTREE_STATS - root->stats.alloced += 1; - root->stats.split += 1; -#endif - - for (i = 0; i < BTREE_KEY_MAX - BTREE_KEY_MIN - 1; i++) { - new_node->keys[i] = node->keys[BTREE_KEY_MIN + 1 + i]; - new_node->ptrs[i] = node->ptrs[BTREE_KEY_MIN + 1 + i]; - } - new_node->ptrs[i] = node->ptrs[BTREE_KEY_MIN + 1 + i]; - new_node->num_keys = BTREE_KEY_MAX - BTREE_KEY_MIN - 1; - - node->num_keys = BTREE_KEY_MIN; - if (key < node->keys[BTREE_KEY_MIN]) - return node; /* index doesn't change */ - - /* insertion point is in new node... 
*/ - *index -= BTREE_KEY_MIN + 1; - return new_node; -} - -static int -btree_insert_shift_to_prev( - struct btree_root *root, - int level, - int *index) -{ - struct btree_cursor tmp_cursor[root->height]; - int n; - - if (*index <= 0) - return -1; - - if (!btree_copy_cursor_prev(root, tmp_cursor, level + 1)) - return -1; - - n = MIN(*index, (BTREE_PTR_MAX - tmp_cursor[level].node->num_keys) / 2); - if (!n || !btree_shift_to_prev(root, level, tmp_cursor, n)) - return -1; - - *index -= n; - return 0; -} - -static int -btree_insert_shift_to_next( - struct btree_root *root, - int level, - int *index) -{ - struct btree_cursor tmp_cursor[root->height]; - int n; - - if (*index >= BTREE_KEY_MAX) - return -1; - - if (!btree_copy_cursor_next(root, tmp_cursor, level + 1)) - return -1; - - n = MIN(BTREE_KEY_MAX - *index, - (BTREE_PTR_MAX - tmp_cursor[level].node->num_keys) / 2); - if (!n || !btree_shift_to_next(root, level, tmp_cursor, n)) - return -1; - return 0; -} - -static int -btree_insert_item( - struct btree_root *root, - int level, - unsigned long key, - void *value) -{ - struct btree_node *node = root->cursor[level].node; - int index = root->cursor[level].index; - int i; - - if (node->num_keys == BTREE_KEY_MAX) { - if (btree_insert_shift_to_prev(root, level, &index) == 0) - goto insert; - if (btree_insert_shift_to_next(root, level, &index) == 0) - goto insert; - if (level == root->height - 1) { - if (!btree_increase_height(root)) - return ENOMEM; - } - node = btree_split(root, level, key, &index); - if (!node) - return ENOMEM; - } -insert: - ASSERT(index <= node->num_keys); - - i = node->num_keys; - node->ptrs[i + 1] = node->ptrs[i]; - while (--i >= index) { - node->keys[i + 1] = node->keys[i]; - node->ptrs[i + 1] = node->ptrs[i]; - } - - node->num_keys++; - node->keys[index] = key; - - if (level == 0) - node->ptrs[index] = value; - else - node->ptrs[index + 1] = value; - - return 0; -} - - - -int -btree_insert( - struct btree_root *root, - unsigned long key, - void *value) -{ - int result; - - if (!value) - return EINVAL; - - if (btree_search(root, key) && root->cur_key == key) - return EEXIST; - -#ifdef BTREE_STATS - root->stats.insert += 1; - root->stats.num_items += 1; -#endif - - result = btree_insert_item(root, 0, key, value); - - btree_invalidate_cursor(root); - - return result; -} - - -/* - * Deletion functions - * - * Rather more complicated as deletions has 4 ways to go once a node - * ends up with less than the minimum number of keys: - * - move remainder to previous node - * - move remainder to next node - * (both will involve a parent deletion which may recurse) - * - balance by moving some items from previous node - * - balance by moving some items from next node - */ - -static void -btree_decrease_height( - struct btree_root *root) -{ - struct btree_node *old_root = root->root_node; - - ASSERT(old_root->num_keys == 0); - -#ifdef BTREE_STATS - root->stats.alloced -= 1; - root->stats.dec_height += 1; - root->stats.max_items /= BTREE_PTR_MAX; -#endif - root->root_node = old_root->ptrs[0]; - btree_node_free(old_root); - root->height--; -} - -static int -btree_merge_with_prev( - struct btree_root *root, - int level, - struct btree_cursor *prev_cursor) -{ - if (!prev_cursor) - return 0; - - if (!btree_shift_to_prev(root, level, prev_cursor, - root->cursor[level].node->num_keys + 1)) - return 0; - -#ifdef BTREE_STATS - root->stats.merge_prev += 1; -#endif - return 1; -} - -static int -btree_merge_with_next( - struct btree_root *root, - int level, - struct btree_cursor *next_cursor) -{ 
- if (!next_cursor) - return 0; - - if (!btree_shift_to_next(root, level, next_cursor, - root->cursor[level].node->num_keys + 1)) - return 0; - -#ifdef BTREE_STATS - root->stats.merge_next += 1; -#endif - return 1; -} - -static int -btree_balance_with_prev( - struct btree_root *root, - int level, - struct btree_cursor *prev_cursor) -{ - struct btree_cursor *root_cursor = root->cursor; - - if (!prev_cursor) - return 0; - ASSERT(prev_cursor[level].node->num_keys > BTREE_KEY_MIN); - -#ifdef BTREE_STATS - root->stats.balance_prev += 1; -#endif - /* - * Move some nodes from the prev node into the current node. - * As the shift operation is a right shift and is relative to - * the root cursor, make the root cursor the prev cursor and - * pass in the root cursor as the next cursor. - */ - - root->cursor = prev_cursor; - if (!btree_shift_to_next(root, level, root_cursor, - (prev_cursor[level].node->num_keys + 1 - BTREE_KEY_MIN) / 2)) - abort(); - root->cursor = root_cursor; - - return 1; -} - -static int -btree_balance_with_next( - struct btree_root *root, - int level, - struct btree_cursor *next_cursor) -{ - struct btree_cursor *root_cursor = root->cursor; - - if (!next_cursor) - return 0; - assert(next_cursor[level].node->num_keys > BTREE_KEY_MIN); - -#ifdef btree_stats - root->stats.balance_next += 1; -#endif - /* - * move some nodes from the next node into the current node. - * as the shift operation is a left shift and is relative to - * the root cursor, make the root cursor the next cursor and - * pass in the root cursor as the prev cursor. - */ - - root->cursor = next_cursor; - if (!btree_shift_to_prev(root, level, root_cursor, - (next_cursor[level].node->num_keys + 1 - BTREE_KEY_MIN) / 2)) - abort(); - root->cursor = root_cursor; - - return 1; - -} - -static void -btree_delete_key( - struct btree_root *root, - int level); - -/* - * btree_delete_node: - * - * Return 0 if it's done or 1 if the next level needs to be collapsed - */ -static void -btree_delete_node( - struct btree_root *root, - int level) -{ - struct btree_cursor prev_cursor[root->height]; - struct btree_cursor next_cursor[root->height]; - struct btree_cursor *pc; - struct btree_cursor *nc; - - /* - * the node has underflowed, grab or merge keys/items from a - * neighbouring node. 
- */ - - if (level == root->height - 1) { - if (level > 0 && root->root_node->num_keys == 0) - btree_decrease_height(root); - return; - } - - pc = btree_copy_cursor_prev(root, prev_cursor, level + 1); - if (!btree_merge_with_prev(root, level, pc)) { - nc = btree_copy_cursor_next(root, next_cursor, level + 1); - if (!btree_merge_with_next(root, level, nc)) { - /* merging failed, try redistrubution */ - if (!btree_balance_with_prev(root, level, pc) && - !btree_balance_with_next(root, level, nc)) - abort(); - return; /* when balancing, then the node isn't freed */ - } - } - -#ifdef BTREE_STATS - root->stats.alloced -= 1; -#endif - btree_node_free(root->cursor[level].node); - - btree_delete_key(root, level + 1); -} - -static void -btree_delete_key( - struct btree_root *root, - int level) -{ - struct btree_node *node = root->cursor[level].node; - int index = root->cursor[level].index; - - node->num_keys--; - if (index <= node->num_keys) { - /* - * if not deleting the last item, shift higher items down - * to cover the item being deleted - */ - while (index < node->num_keys) { - node->keys[index] = node->keys[index + 1]; - node->ptrs[index] = node->ptrs[index + 1]; - index++; - } - node->ptrs[index] = node->ptrs[index + 1]; - } else { - /* - * else update the associated parent key as the last key - * in the leaf has changed - */ - btree_update_node_key(root, root->cursor, level + 1, - node->keys[node->num_keys]); - } - /* - * if node underflows, either merge with sibling or rebalance - * with sibling. - */ - if (node->num_keys < BTREE_KEY_MIN) - btree_delete_node(root, level); -} - -void * -btree_delete( - struct btree_root *root, - unsigned long key) -{ - void *value; - - value = btree_lookup(root, key); - if (!value) - return NULL; - -#ifdef BTREE_STATS - root->stats.delete += 1; - root->stats.num_items -= 1; -#endif - - btree_delete_key(root, 0); - - btree_invalidate_cursor(root); - - return value; -} - -#ifdef BTREE_STATS -void -btree_print_stats( - struct btree_root *root, - FILE *f) -{ - unsigned long max_items = root->stats.max_items * - (root->root_node->num_keys + 1); - - fprintf(f, "\tnum_items = %lu, max_items = %lu (%lu%%)\n", - root->stats.num_items, max_items, - root->stats.num_items * 100 / max_items); - fprintf(f, "\talloced = %d nodes, %lu bytes, %lu bytes per item\n", - root->stats.alloced, - root->stats.alloced * sizeof(struct btree_node), - root->stats.alloced * sizeof(struct btree_node) / - root->stats.num_items); - fprintf(f, "\tlookup = %d\n", root->stats.lookup); - fprintf(f, "\tfind = %d\n", root->stats.find); - fprintf(f, "\tcache_hits = %d\n", root->stats.cache_hits); - fprintf(f, "\tcache_misses = %d\n", root->stats.cache_misses); - fprintf(f, "\tkey_update = %d\n", root->stats.key_update); - fprintf(f, "\tvalue_update = %d\n", root->stats.value_update); - fprintf(f, "\tinsert = %d\n", root->stats.insert); - fprintf(f, "\tshift_prev = %d\n", root->stats.shift_prev); - fprintf(f, "\tshift_next = %d\n", root->stats.shift_next); - fprintf(f, "\tsplit = %d\n", root->stats.split); - fprintf(f, "\tinc_height = %d\n", root->stats.inc_height); - fprintf(f, "\tdelete = %d\n", root->stats.delete); - fprintf(f, "\tmerge_prev = %d\n", root->stats.merge_prev); - fprintf(f, "\tmerge_next = %d\n", root->stats.merge_next); - fprintf(f, "\tbalance_prev = %d\n", root->stats.balance_prev); - fprintf(f, "\tbalance_next = %d\n", root->stats.balance_next); - fprintf(f, "\tdec_height = %d\n", root->stats.dec_height); -} -#endif diff --git a/repair/btree.h b/repair/btree.h deleted file mode 
100644 index aff950415..000000000 --- a/repair/btree.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2007 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef _BTREE_H -#define _BTREE_H - - -struct btree_root; - -void -btree_init( - struct btree_root **root); - -void -btree_destroy( - struct btree_root *root); - -int -btree_is_empty( - struct btree_root *root); - -void * -btree_lookup( - struct btree_root *root, - unsigned long key); - -void * -btree_find( - struct btree_root *root, - unsigned long key, - unsigned long *actual_key); - -void * -btree_peek_prev( - struct btree_root *root, - unsigned long *key); - -void * -btree_peek_next( - struct btree_root *root, - unsigned long *key); - -void * -btree_lookup_next( - struct btree_root *root, - unsigned long *key); - -void * -btree_lookup_prev( - struct btree_root *root, - unsigned long *key); - -int -btree_insert( - struct btree_root *root, - unsigned long key, - void *value); - -void * -btree_delete( - struct btree_root *root, - unsigned long key); - -int -btree_update_key( - struct btree_root *root, - unsigned long old_key, - unsigned long new_key); - -int -btree_update_value( - struct btree_root *root, - unsigned long key, - void *new_value); - -void -btree_clear( - struct btree_root *root); - -#ifdef BTREE_STATS -void -btree_print_stats( - struct btree_root *root, - FILE *f); -#endif - -#endif /* _BTREE_H */ diff --git a/repair/dino_chunks.c b/repair/dino_chunks.c index 734e9a839..4ccf804ed 100644 --- a/repair/dino_chunks.c +++ b/repair/dino_chunks.c @@ -118,7 +118,6 @@ verify_inode_chunk(xfs_mount_t *mp, int i; int j; int state; - xfs_extlen_t blen; agno = XFS_INO_TO_AGNO(mp, ino); agino = XFS_INO_TO_AGINO(mp, ino); @@ -152,8 +151,7 @@ verify_inode_chunk(xfs_mount_t *mp, pthread_mutex_lock(&ag_locks[agno]); - state = get_bmap(agno, agbno); - switch (state) { + switch (state = get_agbno_state(mp, agno, agbno)) { case XR_E_INO: do_warn( _("uncertain inode block %d/%d already known\n"), @@ -162,7 +160,7 @@ verify_inode_chunk(xfs_mount_t *mp, case XR_E_UNKNOWN: case XR_E_FREE1: case XR_E_FREE: - set_bmap(agno, agbno, XR_E_INO); + set_agbno_state(mp, agno, agbno, XR_E_INO); break; case XR_E_MULT: case XR_E_INUSE: @@ -174,14 +172,14 @@ verify_inode_chunk(xfs_mount_t *mp, do_warn( _("inode block %d/%d multiply claimed, (state %d)\n"), agno, agbno, state); - set_bmap(agno, agbno, XR_E_MULT); + set_agbno_state(mp, agno, agbno, XR_E_MULT); pthread_mutex_unlock(&ag_locks[agno]); return(0); default: do_warn( _("inode block %d/%d bad state, (state %d)\n"), agno, agbno, state); - set_bmap(agno, agbno, XR_E_INO); + set_agbno_state(mp, agno, agbno, XR_E_INO); break; } @@ -434,11 +432,9 @@ verify_inode_chunk(xfs_mount_t *mp, * entry or an iunlinked pointer */ pthread_mutex_lock(&ag_locks[agno]); - for (cur_agbno = chunk_start_agbno; - cur_agbno < chunk_stop_agbno; - cur_agbno += blen) { - state 
= get_bmap_ext(agno, cur_agbno, chunk_stop_agbno, &blen); - switch (state) { + for (j = 0, cur_agbno = chunk_start_agbno; + cur_agbno < chunk_stop_agbno; cur_agbno++) { + switch (state = get_agbno_state(mp, agno, cur_agbno)) { case XR_E_MULT: case XR_E_INUSE: case XR_E_INUSE_FS: @@ -446,9 +442,9 @@ verify_inode_chunk(xfs_mount_t *mp, do_warn( _("inode block %d/%d multiply claimed, (state %d)\n"), agno, cur_agbno, state); - set_bmap_ext(agno, cur_agbno, blen, XR_E_MULT); - pthread_mutex_unlock(&ag_locks[agno]); - return 0; + set_agbno_state(mp, agno, cur_agbno, XR_E_MULT); + j = 1; + break; case XR_E_INO: do_error( _("uncertain inode block overlap, agbno = %d, ino = %llu\n"), @@ -457,6 +453,11 @@ verify_inode_chunk(xfs_mount_t *mp, default: break; } + + if (j) { + pthread_mutex_unlock(&ag_locks[agno]); + return(0); + } } pthread_mutex_unlock(&ag_locks[agno]); @@ -484,10 +485,8 @@ verify_inode_chunk(xfs_mount_t *mp, pthread_mutex_lock(&ag_locks[agno]); for (cur_agbno = chunk_start_agbno; - cur_agbno < chunk_stop_agbno; - cur_agbno += blen) { - state = get_bmap_ext(agno, cur_agbno, chunk_stop_agbno, &blen); - switch (state) { + cur_agbno < chunk_stop_agbno; cur_agbno++) { + switch (state = get_agbno_state(mp, agno, cur_agbno)) { case XR_E_INO: do_error( _("uncertain inode block %llu already known\n"), @@ -496,7 +495,7 @@ verify_inode_chunk(xfs_mount_t *mp, case XR_E_UNKNOWN: case XR_E_FREE1: case XR_E_FREE: - set_bmap_ext(agno, cur_agbno, blen, XR_E_INO); + set_agbno_state(mp, agno, cur_agbno, XR_E_INO); break; case XR_E_MULT: case XR_E_INUSE: @@ -510,7 +509,7 @@ verify_inode_chunk(xfs_mount_t *mp, do_warn( _("inode block %d/%d bad state, (state %d)\n"), agno, cur_agbno, state); - set_bmap_ext(agno, cur_agbno, blen, XR_E_INO); + set_agbno_state(mp, agno, cur_agbno, XR_E_INO); break; } } @@ -630,9 +629,10 @@ process_inode_chunk( cluster_count * sizeof(xfs_buf_t*)); for (bp_index = 0; bp_index < cluster_count; bp_index++) { +#ifdef XR_PF_TRACE pftrace("about to read off %llu in AG %d", (long long)XFS_AGB_TO_DADDR(mp, agno, agbno), agno); - +#endif bplist[bp_index] = libxfs_readbuf(mp->m_dev, XFS_AGB_TO_DADDR(mp, agno, agbno), XFS_FSB_TO_BB(mp, blks_per_cluster), 0); @@ -650,9 +650,11 @@ process_inode_chunk( } agbno += blks_per_cluster; +#ifdef XR_PF_TRACE pftrace("readbuf %p (%llu, %d) in AG %d", bplist[bp_index], (long long)XFS_BUF_ADDR(bplist[bp_index]), XFS_BUF_COUNT(bplist[bp_index]), agno); +#endif } agbno = XFS_AGINO_TO_AGBNO(mp, first_irec->ino_startnum); @@ -743,23 +745,22 @@ process_inode_chunk( * mark block as an inode block in the incore bitmap */ pthread_mutex_lock(&ag_locks[agno]); - state = get_bmap(agno, agbno); - switch (state) { - case XR_E_INO: /* already marked */ - break; - case XR_E_UNKNOWN: - case XR_E_FREE: - case XR_E_FREE1: - set_bmap(agno, agbno, XR_E_INO); - break; - case XR_E_BAD_STATE: - do_error(_("bad state in block map %d\n"), state); - break; - default: - set_bmap(agno, agbno, XR_E_MULT); - do_warn(_("inode block %llu multiply claimed, state was %d\n"), - XFS_AGB_TO_FSB(mp, agno, agbno), state); - break; + switch (state = get_agbno_state(mp, agno, agbno)) { + case XR_E_INO: /* already marked */ + break; + case XR_E_UNKNOWN: + case XR_E_FREE: + case XR_E_FREE1: + set_agbno_state(mp, agno, agbno, XR_E_INO); + break; + case XR_E_BAD_STATE: + do_error(_("bad state in block map %d\n"), state); + break; + default: + set_agbno_state(mp, agno, agbno, XR_E_MULT); + do_warn(_("inode block %llu multiply claimed, state was %d\n"), + XFS_AGB_TO_FSB(mp, agno, agbno), state); 
+ break; } pthread_mutex_unlock(&ag_locks[agno]); @@ -905,10 +906,10 @@ process_inode_chunk( * done! - finished up irec and block simultaneously */ for (bp_index = 0; bp_index < cluster_count; bp_index++) { - pftrace("put/writebuf %p (%llu) in AG %d", - bplist[bp_index], (long long) - XFS_BUF_ADDR(bplist[bp_index]), agno); - +#ifdef XR_PF_TRACE + pftrace("put/writebuf %p (%llu) in AG %d", bplist[bp_index], + (long long)XFS_BUF_ADDR(bplist[bp_index]), agno); +#endif if (dirty && !no_modify) libxfs_writebuf(bplist[bp_index], 0); else @@ -925,21 +926,20 @@ process_inode_chunk( agbno++; pthread_mutex_lock(&ag_locks[agno]); - state = get_bmap(agno, agbno); - switch (state) { + switch (state = get_agbno_state(mp, agno, agbno)) { case XR_E_INO: /* already marked */ break; case XR_E_UNKNOWN: case XR_E_FREE: case XR_E_FREE1: - set_bmap(agno, agbno, XR_E_INO); + set_agbno_state(mp, agno, agbno, XR_E_INO); break; case XR_E_BAD_STATE: do_error(_("bad state in block map %d\n"), state); break; default: - set_bmap(agno, agbno, XR_E_MULT); + set_agbno_state(mp, agno, agbno, XR_E_MULT); do_warn(_("inode block %llu multiply claimed, " "state was %d\n"), XFS_AGB_TO_FSB(mp, agno, agbno), state); diff --git a/repair/dinode.c b/repair/dinode.c index bf04c6ee5..9da721be4 100644 --- a/repair/dinode.c +++ b/repair/dinode.c @@ -524,7 +524,6 @@ process_rt_rec( /* * set the appropriate number of extents - * this iterates block by block, this can be optimised using extents */ for (b = irec->br_startblock; b < irec->br_startblock + irec->br_blockcount; b += mp->m_sb.sb_rextsize) { @@ -546,33 +545,40 @@ process_rt_rec( continue; } - state = get_rtbmap(ext); + state = get_rtbno_state(mp, ext); + switch (state) { - case XR_E_FREE: - case XR_E_UNKNOWN: - set_rtbmap(ext, XR_E_INUSE); - break; - case XR_E_BAD_STATE: - do_error(_("bad state in rt block map %llu\n"), ext); - case XR_E_FS_MAP: - case XR_E_INO: - case XR_E_INUSE_FS: - do_error(_("data fork in rt inode %llu found " - "metadata block %llu in rt bmap\n"), - ino, ext); - case XR_E_INUSE: - if (pwe) + case XR_E_FREE: + case XR_E_UNKNOWN: + set_rtbno_state(mp, ext, XR_E_INUSE); break; - case XR_E_MULT: - set_rtbmap(ext, XR_E_MULT); - do_warn(_("data fork in rt inode %llu claims " - "used rt block %llu\n"), + + case XR_E_BAD_STATE: + do_error(_("bad state in rt block map %llu\n"), + ext); + + case XR_E_FS_MAP: + case XR_E_INO: + case XR_E_INUSE_FS: + do_error(_("data fork in rt inode %llu found " + "metadata block %llu in rt bmap\n"), ino, ext); - return 1; - case XR_E_FREE1: - default: - do_error(_("illegal state %d in rt block map " - "%llu\n"), state, b); + + case XR_E_INUSE: + if (pwe) + break; + + case XR_E_MULT: + set_rtbno_state(mp, ext, XR_E_MULT); + do_warn(_("data fork in rt inode %llu claims " + "used rt block %llu\n"), + ino, ext); + return 1; + + case XR_E_FREE1: + default: + do_error(_("illegal state %d in rt block map " + "%llu\n"), state, b); } } @@ -615,10 +621,9 @@ process_bmbt_reclist_int( char *forkname; int i; int state; + xfs_dfsbno_t e; xfs_agnumber_t agno; xfs_agblock_t agbno; - xfs_agblock_t ebno; - xfs_extlen_t blen; xfs_agnumber_t locked_agno = -1; int error = 1; @@ -720,7 +725,7 @@ process_bmbt_reclist_int( */ agno = XFS_FSB_TO_AGNO(mp, irec.br_startblock); agbno = XFS_FSB_TO_AGBNO(mp, irec.br_startblock); - ebno = agbno + irec.br_blockcount; + e = irec.br_startblock + irec.br_blockcount; if (agno != locked_agno) { if (locked_agno != -1) pthread_mutex_unlock(&ag_locks[locked_agno]); @@ -735,23 +740,38 @@ process_bmbt_reclist_int( * checking 
each entry without setting the * block bitmap */ - if (search_dup_extent(agno, agbno, ebno)) { - do_warn(_("%s fork in ino %llu claims " - "dup extent, off - %llu, " - "start - %llu, cnt %llu\n"), - forkname, ino, irec.br_startoff, - irec.br_startblock, - irec.br_blockcount); - goto done; + for (b = irec.br_startblock; b < e; b++, agbno++) { + if (search_dup_extent(mp, agno, agbno)) { + do_warn(_("%s fork in ino %llu claims " + "dup extent, off - %llu, " + "start - %llu, cnt %llu\n"), + forkname, ino, irec.br_startoff, + irec.br_startblock, + irec.br_blockcount); + goto done; + } } *tot += irec.br_blockcount; continue; } - for (b = irec.br_startblock; - agbno < ebno; - b += blen, agbno += blen) { - state = get_bmap_ext(agno, agbno, ebno, &blen); + for (b = irec.br_startblock; b < e; b++, agbno++) { + /* + * Process in chunks of 16 (XR_BB_UNIT/XR_BB) + * for common XR_E_UNKNOWN to XR_E_INUSE transition + */ + if (((agbno & XR_BB_MASK) == 0) && ((irec.br_startblock + irec.br_blockcount - b) >= (XR_BB_UNIT/XR_BB))) { + if (ba_bmap[agno][agbno>>XR_BB] == XR_E_UNKNOWN_LL) { + ba_bmap[agno][agbno>>XR_BB] = XR_E_INUSE_LL; + agbno += (XR_BB_UNIT/XR_BB) - 1; + b += (XR_BB_UNIT/XR_BB) - 1; + continue; + } + + } + + state = get_agbno_state(mp, agno, agbno); + switch (state) { case XR_E_FREE: case XR_E_FREE1: @@ -760,7 +780,7 @@ process_bmbt_reclist_int( forkname, ino, (__uint64_t) b); /* fall through ... */ case XR_E_UNKNOWN: - set_bmap_ext(agno, agbno, blen, XR_E_INUSE); + set_agbno_state(mp, agno, agbno, XR_E_INUSE); break; case XR_E_BAD_STATE: @@ -776,7 +796,7 @@ process_bmbt_reclist_int( case XR_E_INUSE: case XR_E_MULT: - set_bmap_ext(agno, agbno, blen, XR_E_MULT); + set_agbno_state(mp, agno, agbno, XR_E_MULT); do_warn(_("%s fork in %s inode %llu claims " "used block %llu\n"), forkname, ftype, ino, (__uint64_t) b); @@ -2030,7 +2050,7 @@ process_inode_data_fork( *nextents = 1; if (dinoc->di_format != XFS_DINODE_FMT_LOCAL && type != XR_INO_RTDATA) - *dblkmap = blkmap_alloc(*nextents, XFS_DATA_FORK); + *dblkmap = blkmap_alloc(*nextents); *nextents = 0; switch (dinoc->di_format) { @@ -2152,14 +2172,14 @@ process_inode_attr_fork( err = process_lclinode(mp, agno, ino, dino, XFS_ATTR_FORK); break; case XFS_DINODE_FMT_EXTENTS: - ablkmap = blkmap_alloc(*anextents, XFS_ATTR_FORK); + ablkmap = blkmap_alloc(*anextents); *anextents = 0; err = process_exinode(mp, agno, ino, dino, type, dirty, atotblocks, anextents, &ablkmap, XFS_ATTR_FORK, check_dups); break; case XFS_DINODE_FMT_BTREE: - ablkmap = blkmap_alloc(*anextents, XFS_ATTR_FORK); + ablkmap = blkmap_alloc(*anextents); *anextents = 0; err = process_btinode(mp, agno, ino, dino, type, dirty, atotblocks, anextents, &ablkmap, diff --git a/repair/dir2.c b/repair/dir2.c index d0739fd40..2723e3b92 100644 --- a/repair/dir2.c +++ b/repair/dir2.c @@ -103,19 +103,21 @@ da_read_buf( bplist = bparray; } for (i = 0; i < nex; i++) { +#ifdef XR_PF_TRACE pftrace("about to read off %llu (len = %d)", (long long)XFS_FSB_TO_DADDR(mp, bmp[i].startblock), XFS_FSB_TO_BB(mp, bmp[i].blockcount)); - +#endif bplist[i] = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, bmp[i].startblock), XFS_FSB_TO_BB(mp, bmp[i].blockcount), 0); if (!bplist[i]) goto failed; - +#ifdef XR_PF_TRACE pftrace("readbuf %p (%llu, %d)", bplist[i], (long long)XFS_BUF_ADDR(bplist[i]), XFS_BUF_COUNT(bplist[i])); +#endif } dabuf = malloc(XFS_DA_BUF_SIZE(nex)); if (dabuf == NULL) { @@ -246,8 +248,10 @@ da_brelse( } da_buf_done(dabuf); for (i = 0; i < nbuf; i++) { +#ifdef XR_PF_TRACE pftrace("putbuf %p (%llu)", 
bplist[i], (long long)XFS_BUF_ADDR(bplist[i])); +#endif libxfs_putbuf(bplist[i]); } if (bplist != &bp) @@ -534,7 +538,7 @@ verify_final_dir2_path(xfs_mount_t *mp, /* * bail out if this is the root block (top of tree) */ - if (this_level >= cursor->active) + if (this_level >= cursor->active) return(0); /* * set hashvalue to correctl reflect the now-validated @@ -1421,7 +1425,7 @@ process_dir2_data( * numbers. Do NOT touch the name until after we've computed * the hashvalue and done a namecheck() on the name. * - * Conditions must either set clearino to zero or set + * Conditions must either set clearino to zero or set * clearreason why it's being cleared. */ if (!ino_discovery && ent_ino == BADFSINO) { @@ -1452,7 +1456,7 @@ process_dir2_data( if (ino_discovery) { add_inode_uncertain(mp, ent_ino, 0); clearino = 0; - } else + } else clearreason = _("non-existent"); } else { /* diff --git a/repair/globals.h b/repair/globals.h index 5fb8149b4..9a78caee5 100644 --- a/repair/globals.h +++ b/repair/globals.h @@ -156,6 +156,11 @@ EXTERN int chunks_pblock; /* # of 64-ino chunks per allocation */ EXTERN int max_symlink_blocks; EXTERN __int64_t fs_max_file_offset; +/* block allocation bitmaps */ + +EXTERN __uint64_t **ba_bmap; /* see incore.h */ +EXTERN __uint64_t *rt_ba_bmap; /* see incore.h */ + /* realtime info */ EXTERN xfs_rtword_t *btmcompute; @@ -194,6 +199,10 @@ EXTERN pthread_mutex_t *ag_locks; EXTERN int report_interval; EXTERN __uint64_t *prog_rpt_done; +#ifdef XR_PF_TRACE +EXTERN FILE *pf_trace_file; +#endif + EXTERN int ag_stride; EXTERN int thread_count; diff --git a/repair/incore.c b/repair/incore.c index 682a3db86..27604e27c 100644 --- a/repair/incore.c +++ b/repair/incore.c @@ -18,7 +18,6 @@ #include #include "avl.h" -#include "btree.h" #include "globals.h" #include "incore.h" #include "agheader.h" @@ -26,296 +25,245 @@ #include "err_protos.h" #include "threads.h" -/* - * The following manages the in-core bitmap of the entire filesystem - * using extents in a btree. - * - * The btree items will point to one of the state values below, - * rather than storing the value itself in the pointer. - */ -static int states[16] = - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +/* ba bmap setupstuff. setting/getting state is in incore.h */ -static struct btree_root **ag_bmap; - -static void -update_bmap( - struct btree_root *bmap, - unsigned long offset, - xfs_extlen_t blen, - void *new_state) +void +setup_bmap(xfs_agnumber_t agno, xfs_agblock_t numblocks, xfs_drtbno_t rtblocks) { - unsigned long end = offset + blen; - int *cur_state; - unsigned long cur_key; - int *next_state; - unsigned long next_key; - int *prev_state; - - cur_state = btree_find(bmap, offset, &cur_key); - if (!cur_state) - return; + int i; + size_t size = 0; - if (offset == cur_key) { - /* if the start is the same as the "item" extent */ - if (cur_state == new_state) - return; - - /* - * Note: this may be NULL if we are updating the map for - * the superblock. 
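(For orientation, since the case analysis in update_bmap() below is terse: the
map stores one btree item per run of blocks sharing a state, keyed by the run's
first block, so an update over [offset, end) only ever touches the boundary
keys. A worked example with illustrative values, not taken from the patch:

	{0: INUSE_FS, 4: UNKNOWN, 100: BAD_STATE}
	set [10,12) to INUSE		-- case #2: split the UNKNOWN run
	{0: INUSE_FS, 4: UNKNOWN, 10: INUSE, 12: UNKNOWN, 100: BAD_STATE}
	set [4,10) to INUSE		-- case #8: merge with the next run
	{0: INUSE_FS, 4: INUSE, 12: UNKNOWN, 100: BAD_STATE})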
- */ - prev_state = btree_peek_prev(bmap, NULL); - - next_state = btree_peek_next(bmap, &next_key); - if (next_key > end) { - /* different end */ - if (new_state == prev_state) { - /* #1: prev has same state, move offset up */ - btree_update_key(bmap, offset, end); - return; - } - - /* #4: insert new extent after, update current value */ - btree_update_value(bmap, offset, new_state); - btree_insert(bmap, end, cur_state); - return; - } + ba_bmap = (__uint64_t**)malloc(agno*sizeof(__uint64_t *)); + if (!ba_bmap) + do_error(_("couldn't allocate block map pointers\n")); + ag_locks = malloc(agno * sizeof(pthread_mutex_t)); + if (!ag_locks) + do_error(_("couldn't allocate block map locks\n")); - /* same end (and same start) */ - if (new_state == next_state) { - /* next has same state */ - if (new_state == prev_state) { - /* #3: merge prev & next */ - btree_delete(bmap, offset); - btree_delete(bmap, end); - return; - } - - /* #8: merge next */ - btree_update_value(bmap, offset, new_state); - btree_delete(bmap, end); - return; - } + for (i = 0; i < agno; i++) { + size = roundup((numblocks+(NBBY/XR_BB)-1) / (NBBY/XR_BB), + sizeof(__uint64_t)); - /* same start, same end, next has different state */ - if (new_state == prev_state) { - /* #5: prev has same state */ - btree_delete(bmap, offset); + ba_bmap[i] = (__uint64_t*)memalign(sizeof(__uint64_t), size); + if (!ba_bmap[i]) { + do_error(_("couldn't allocate block map, size = %d\n"), + numblocks); return; } + memset(ba_bmap[i], 0, size); + pthread_mutex_init(&ag_locks[i], NULL); + } - /* #6: update value only */ - btree_update_value(bmap, offset, new_state); + if (rtblocks == 0) { + rt_ba_bmap = NULL; return; } - /* different start, offset is in the middle of "cur" */ - prev_state = btree_peek_prev(bmap, NULL); - ASSERT(prev_state != NULL); - if (prev_state == new_state) - return; + size = roundup(rtblocks / (NBBY/XR_BB), sizeof(__uint64_t)); - if (end == cur_key) { - /* end is at the same point as the current extent */ - if (new_state == cur_state) { - /* #7: move next extent down */ - btree_update_key(bmap, end, offset); + rt_ba_bmap=(__uint64_t*)memalign(sizeof(__uint64_t), size); + if (!rt_ba_bmap) { + do_error( + _("couldn't allocate realtime block map, size = %llu\n"), + rtblocks); return; - } - - /* #9: different start, same end, add new extent */ - btree_insert(bmap, offset, new_state); - return; } - /* #2: insert an extent into the middle of another extent */ - btree_insert(bmap, offset, new_state); - btree_insert(bmap, end, prev_state); + /* + * start all real-time as free blocks + */ + set_bmap_rt(rtblocks); + + return; } +/* ARGSUSED */ void -set_bmap_ext( - xfs_agnumber_t agno, - xfs_agblock_t agbno, - xfs_extlen_t blen, - int state) +teardown_rt_bmap(xfs_mount_t *mp) { - update_bmap(ag_bmap[agno], agbno, blen, &states[state]); + if (rt_ba_bmap != NULL) { + free(rt_ba_bmap); + rt_ba_bmap = NULL; + } + + return; } -int -get_bmap_ext( - xfs_agnumber_t agno, - xfs_agblock_t agbno, - xfs_agblock_t maxbno, - xfs_extlen_t *blen) +/* ARGSUSED */ +void +teardown_ag_bmap(xfs_mount_t *mp, xfs_agnumber_t agno) { - int *statep; - unsigned long key; - - statep = btree_find(ag_bmap[agno], agbno, &key); - if (!statep) - return -1; - - if (key == agbno) { - if (blen) { - if (!btree_peek_next(ag_bmap[agno], &key)) - return -1; - *blen = MIN(maxbno, key) - agbno; - } - return *statep; - } + ASSERT(ba_bmap[agno] != NULL); - statep = btree_peek_prev(ag_bmap[agno], NULL); - if (!statep) - return -1; - if (blen) - *blen = MIN(maxbno, key) - agbno; + 
free(ba_bmap[agno]); + ba_bmap[agno] = NULL; - return *statep; + return; } -static uint64_t *rt_bmap; -static size_t rt_bmap_size; - -/* block records fit into __uint64_t's units */ -#define XR_BB_UNIT 64 /* number of bits/unit */ -#define XR_BB 4 /* bits per block record */ -#define XR_BB_NUM (XR_BB_UNIT/XR_BB) /* number of records per unit */ -#define XR_BB_MASK 0xF /* block record mask */ - -/* - * these work in real-time extents (e.g. fsbno == rt extent number) - */ -int -get_rtbmap( - xfs_drtbno_t bno) +/* ARGSUSED */ +void +teardown_bmap_finish(xfs_mount_t *mp) { - return (*(rt_bmap + bno / XR_BB_NUM) >> - ((bno % XR_BB_NUM) * XR_BB)) & XR_BB_MASK; + free(ba_bmap); + ba_bmap = NULL; + + return; } void -set_rtbmap( - xfs_drtbno_t bno, - int state) +teardown_bmap(xfs_mount_t *mp) { - *(rt_bmap + bno / XR_BB_NUM) = - ((*(rt_bmap + bno / XR_BB_NUM) & - (~((__uint64_t) XR_BB_MASK << ((bno % XR_BB_NUM) * XR_BB)))) | - (((__uint64_t) state) << ((bno % XR_BB_NUM) * XR_BB))); + xfs_agnumber_t i; + + for (i = 0; i < mp->m_sb.sb_agcount; i++) { + teardown_ag_bmap(mp, i); + } + + teardown_rt_bmap(mp); + teardown_bmap_finish(mp); + + return; } -static void -reset_rt_bmap(void) +/* + * block map initialization routines -- realtime, log, fs + */ +void +set_bmap_rt(xfs_drtbno_t num) { - if (rt_bmap) - memset(rt_bmap, 0x22, rt_bmap_size); /* XR_E_FREE */ + xfs_drtbno_t j; + xfs_drtbno_t size; + + /* + * for now, initialize all realtime blocks to be free + * (state == XR_E_FREE) + */ + size = howmany(num / (NBBY/XR_BB), sizeof(__uint64_t)); + + for (j = 0; j < size; j++) + rt_ba_bmap[j] = 0x2222222222222222LL; + + return; } -static void -init_rt_bmap( - xfs_mount_t *mp) +void +set_bmap_log(xfs_mount_t *mp) { - if (mp->m_sb.sb_rextents == 0) + xfs_dfsbno_t logend, i; + + if (mp->m_sb.sb_logstart == 0) return; - rt_bmap_size = roundup(mp->m_sb.sb_rextents / (NBBY / XR_BB), - sizeof(__uint64_t)); + logend = mp->m_sb.sb_logstart + mp->m_sb.sb_logblocks; - rt_bmap = memalign(sizeof(__uint64_t), rt_bmap_size); - if (!rt_bmap) { - do_error( - _("couldn't allocate realtime block map, size = %llu\n"), - mp->m_sb.sb_rextents); - return; + for (i = mp->m_sb.sb_logstart; i < logend ; i++) { + set_fsbno_state(mp, i, XR_E_INUSE_FS); } + + return; } -static void -free_rt_bmap(xfs_mount_t *mp) +void +set_bmap_fs(xfs_mount_t *mp) { - free(rt_bmap); - rt_bmap = NULL; -} + xfs_agnumber_t i; + xfs_agblock_t j; + xfs_agblock_t end; + + /* + * AG header is 4 sectors + */ + end = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize); + + for (i = 0; i < mp->m_sb.sb_agcount; i++) + for (j = 0; j < end; j++) + set_agbno_state(mp, i, j, XR_E_INUSE_FS); + return; +} +#if 0 void -reset_bmaps(xfs_mount_t *mp) +set_bmap_fs_bt(xfs_mount_t *mp) { - xfs_agnumber_t agno; - xfs_agblock_t ag_size; - int ag_hdr_block; - - ag_hdr_block = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize); - ag_size = mp->m_sb.sb_agblocks; - - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - if (agno == mp->m_sb.sb_agcount - 1) - ag_size = (xfs_extlen_t)(mp->m_sb.sb_dblocks - - (xfs_drfsbno_t)mp->m_sb.sb_agblocks * agno); -#ifdef BTREE_STATS - if (btree_find(ag_bmap[agno], 0, NULL)) { - printf("ag_bmap[%d] btree stats:\n", i); - btree_print_stats(ag_bmap[agno], stdout); - } -#endif + xfs_agnumber_t i; + xfs_agblock_t j; + xfs_agblock_t begin; + xfs_agblock_t end; + + begin = bnobt_root; + end = inobt_root + 1; + + for (i = 0; i < mp->m_sb.sb_agcount; i++) { /* - * We always insert an item for the first block having a - * given state. 
So the code below means: - * - * block 0..ag_hdr_block-1: XR_E_INUSE_FS - * ag_hdr_block..ag_size: XR_E_UNKNOWN - * ag_size... XR_E_BAD_STATE + * account for btree roots */ - btree_clear(ag_bmap[agno]); - btree_insert(ag_bmap[agno], 0, &states[XR_E_INUSE_FS]); - btree_insert(ag_bmap[agno], - ag_hdr_block, &states[XR_E_UNKNOWN]); - btree_insert(ag_bmap[agno], ag_size, &states[XR_E_BAD_STATE]); + for (j = begin; j < end; j++) + set_agbno_state(mp, i, j, XR_E_INUSE_FS); } - if (mp->m_sb.sb_logstart != 0) { - set_bmap_ext(XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart), - XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart), - mp->m_sb.sb_logblocks, XR_E_INUSE_FS); - } - - reset_rt_bmap(); + return; } +#endif void -init_bmaps(xfs_mount_t *mp) +incore_init(xfs_mount_t *mp) { - xfs_agnumber_t i; + int agcount = mp->m_sb.sb_agcount; + extern void incore_ino_init(xfs_mount_t *); + extern void incore_ext_init(xfs_mount_t *); - ag_bmap = calloc(mp->m_sb.sb_agcount, sizeof(struct btree_root *)); - if (!ag_bmap) - do_error(_("couldn't allocate block map btree roots\n")); + /* init block alloc bmap */ - ag_locks = calloc(mp->m_sb.sb_agcount, sizeof(pthread_mutex_t)); - if (!ag_locks) - do_error(_("couldn't allocate block map locks\n")); + setup_bmap(agcount, mp->m_sb.sb_agblocks, mp->m_sb.sb_rextents); + incore_ino_init(mp); + incore_ext_init(mp); - for (i = 0; i < mp->m_sb.sb_agcount; i++) { - btree_init(&ag_bmap[i]); - pthread_mutex_init(&ag_locks[i], NULL); - } + /* initialize random globals now that we know the fs geometry */ - init_rt_bmap(mp); - reset_bmaps(mp); + inodes_per_block = mp->m_sb.sb_inopblock; + + return; } -void -free_bmaps(xfs_mount_t *mp) +#if defined(XR_BMAP_TRACE) || defined(XR_BMAP_DBG) +int +get_agbno_state(xfs_mount_t *mp, xfs_agnumber_t agno, + xfs_agblock_t ag_blockno) { - xfs_agnumber_t i; + __uint64_t *addr; - for (i = 0; i < mp->m_sb.sb_agcount; i++) - btree_destroy(ag_bmap[i]); - free(ag_bmap); - ag_bmap = NULL; + addr = ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM; - free_rt_bmap(mp); + return((*addr >> (((ag_blockno)%XR_BB_NUM)*XR_BB)) & XR_BB_MASK); } + +void set_agbno_state(xfs_mount_t *mp, xfs_agnumber_t agno, + xfs_agblock_t ag_blockno, int state) +{ + __uint64_t *addr; + + addr = ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM; + + *addr = (((*addr) & + (~((__uint64_t) XR_BB_MASK << (((ag_blockno)%XR_BB_NUM)*XR_BB)))) | + (((__uint64_t) (state)) << (((ag_blockno)%XR_BB_NUM)*XR_BB))); +} + +int +get_fsbno_state(xfs_mount_t *mp, xfs_dfsbno_t blockno) +{ + return(get_agbno_state(mp, XFS_FSB_TO_AGNO(mp, blockno), + XFS_FSB_TO_AGBNO(mp, blockno))); +} + +void +set_fsbno_state(xfs_mount_t *mp, xfs_dfsbno_t blockno, int state) +{ + set_agbno_state(mp, XFS_FSB_TO_AGNO(mp, blockno), + XFS_FSB_TO_AGBNO(mp, blockno), state); + + return; +} +#endif diff --git a/repair/incore.h b/repair/incore.h index 99853fb0b..a22ef0fb0 100644 --- a/repair/incore.h +++ b/repair/incore.h @@ -20,40 +20,97 @@ #define XFS_REPAIR_INCORE_H #include "avl.h" - - /* * contains definition information. implementation (code) * is spread out in separate files. */ /* - * block map -- track state of each filesystem block. + * block bit map defs -- track state of each filesystem block. + * ba_bmap is an array of bitstrings declared in the globals.h file. + * the bitstrings are broken up into 64-bit chunks. one bitstring per AG. 
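+ * each block record is XR_BB (4) bits wide, so XR_BB_NUM (16) records pack
+ * into every __uint64_t. as an illustrative sketch (not in the original
+ * submission), reading the state of block agbno amounts to:
+ *
+ *	word  = ba_bmap[agno][agbno / XR_BB_NUM];
+ *	state = (word >> ((agbno % XR_BB_NUM) * XR_BB)) & XR_BB_MASK;
+ *
+ * which is exactly what the get_agbno_state() macro below expands to.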
*/ +#define BA_BMAP_SIZE(x) (howmany(x, 4)) -void init_bmaps(xfs_mount_t *mp); -void reset_bmaps(xfs_mount_t *mp); -void free_bmaps(xfs_mount_t *mp); +void set_bmap_rt(xfs_drfsbno_t numblocks); +void set_bmap_log(xfs_mount_t *mp); +void set_bmap_fs(xfs_mount_t *mp); +void teardown_bmap(xfs_mount_t *mp); -void set_bmap_ext(xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t blen, int state); -int get_bmap_ext(xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_agblock_t maxbno, xfs_extlen_t *blen); +void teardown_rt_bmap(xfs_mount_t *mp); +void teardown_ag_bmap(xfs_mount_t *mp, xfs_agnumber_t agno); +void teardown_bmap_finish(xfs_mount_t *mp); -void set_rtbmap(xfs_drtbno_t bno, int state); -int get_rtbmap(xfs_drtbno_t bno); +/* blocks are numbered from zero */ -static inline void -set_bmap(xfs_agnumber_t agno, xfs_agblock_t agbno, int state) -{ - set_bmap_ext(agno, agbno, 1, state); -} +/* block records fit into __uint64_t's units */ + +#define XR_BB_UNIT 64 /* number of bits/unit */ +#define XR_BB 4 /* bits per block record */ +#define XR_BB_NUM (XR_BB_UNIT/XR_BB) /* number of records per unit */ +#define XR_BB_MASK 0xF /* block record mask */ + +/* + * bitstring ops -- set/get block states, either in filesystem + * bno's or in agbno's. turns out that fsbno addressing is + * more convenient when dealing with bmap extracted addresses + * and agbno addressing is more convenient when dealing with + * meta-data extracted addresses. So the fsbno versions use + * mtype (which can be one of the block map types above) to + * set the correct block map while the agbno versions assume + * you want to use the regular block map. + */ + +#if defined(XR_BMAP_TRACE) || defined(XR_BMAP_DBG) +/* + * implemented as functions for debugging purposes + */ +int get_agbno_state(xfs_mount_t *mp, xfs_agnumber_t agno, + xfs_agblock_t ag_blockno); +void set_agbno_state(xfs_mount_t *mp, xfs_agnumber_t agno, + xfs_agblock_t ag_blockno, int state); + +int get_fsbno_state(xfs_mount_t *mp, xfs_dfsbno_t blockno); +void set_fsbno_state(xfs_mount_t *mp, xfs_dfsbno_t blockno, int state); +#else +/* + * implemented as macros for performance purposes + */ + +#define get_agbno_state(mp, agno, ag_blockno) \ + ((int) (*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) \ + >> (((ag_blockno)%XR_BB_NUM)*XR_BB)) \ + & XR_BB_MASK) +#define set_agbno_state(mp, agno, ag_blockno, state) \ + *(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) = \ + ((*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM) & \ + (~((__uint64_t) XR_BB_MASK << (((ag_blockno)%XR_BB_NUM)*XR_BB)))) | \ + (((__uint64_t) (state)) << (((ag_blockno)%XR_BB_NUM)*XR_BB))) + +#define get_fsbno_state(mp, blockno) \ + get_agbno_state(mp, XFS_FSB_TO_AGNO(mp, (blockno)), \ + XFS_FSB_TO_AGBNO(mp, (blockno))) +#define set_fsbno_state(mp, blockno, state) \ + set_agbno_state(mp, XFS_FSB_TO_AGNO(mp, (blockno)), \ + XFS_FSB_TO_AGBNO(mp, (blockno)), (state)) + + +#define get_agbno_rec(mp, agno, ag_blockno) \ + (*(ba_bmap[(agno)] + (ag_blockno)/XR_BB_NUM)) +#endif /* XR_BMAP_TRACE */ + +/* + * these work in real-time extents (e.g. 
fsbno == rt extent number) + */ +#define get_rtbno_state(mp, fsbno) \ + ((*(rt_ba_bmap + (fsbno)/XR_BB_NUM) >> \ + (((fsbno)%XR_BB_NUM)*XR_BB)) & XR_BB_MASK) +#define set_rtbno_state(mp, fsbno, state) \ + *(rt_ba_bmap + (fsbno)/XR_BB_NUM) = \ + ((*(rt_ba_bmap + (fsbno)/XR_BB_NUM) & \ + (~((__uint64_t) XR_BB_MASK << (((fsbno)%XR_BB_NUM)*XR_BB)))) | \ + (((__uint64_t) (state)) << (((fsbno)%XR_BB_NUM)*XR_BB))) -static inline int -get_bmap(xfs_agnumber_t agno, xfs_agblock_t agbno) -{ - return get_bmap_ext(agno, agbno, agbno + 1, NULL); -} /* * extent tree definitions @@ -170,11 +227,23 @@ get_bcnt_extent(xfs_agnumber_t agno, xfs_agblock_t startblock, /* * duplicate extent tree functions */ +void add_dup_extent(xfs_agnumber_t agno, + xfs_agblock_t startblock, + xfs_extlen_t blockcount); + +extern avltree_desc_t **extent_tree_ptrs; +/* ARGSUSED */ +static inline int +search_dup_extent(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agblock_t agbno) +{ + ASSERT(agno < glob_agcount); + + if (avl_findrange(extent_tree_ptrs[agno], agbno) != NULL) + return(1); + + return(0); +} -int add_dup_extent(xfs_agnumber_t agno, xfs_agblock_t startblock, - xfs_extlen_t blockcount); -int search_dup_extent(xfs_agnumber_t agno, - xfs_agblock_t start_agbno, xfs_agblock_t end_agbno); void add_rt_dup_extent(xfs_drtbno_t startblock, xfs_extlen_t blockcount); @@ -202,7 +271,6 @@ void release_agbcnt_extent_tree(xfs_agnumber_t agno); */ void free_rt_dup_extent_tree(xfs_mount_t *mp); -void incore_ext_init(xfs_mount_t *); /* * per-AG extent trees shutdown routine -- all (bno, bcnt and dup) * at once. this one actually frees the memory instead of just recyling @@ -210,8 +278,6 @@ void incore_ext_init(xfs_mount_t *); */ void incore_ext_teardown(xfs_mount_t *mp); -void incore_ino_init(xfs_mount_t *); - /* * inode definitions */ diff --git a/repair/incore_ext.c b/repair/incore_ext.c index a362e5a6a..d0b8cdc4b 100644 --- a/repair/incore_ext.c +++ b/repair/incore_ext.c @@ -18,7 +18,6 @@ #include #include "avl.h" -#include "btree.h" #include "globals.h" #include "incore.h" #include "agheader.h" @@ -73,8 +72,8 @@ static rt_ext_flist_t rt_ext_flist; static avl64tree_desc_t *rt_ext_tree_ptr; /* dup extent tree for rt */ -static struct btree_root **dup_extent_trees; /* per ag dup extent trees */ - +avltree_desc_t **extent_tree_ptrs; /* array of extent tree ptrs */ + /* one per ag for dups */ static avltree_desc_t **extent_bno_ptrs; /* * array of extent tree ptrs * one per ag for free extents @@ -100,48 +99,6 @@ static pthread_mutex_t ext_flist_lock; static pthread_mutex_t rt_ext_tree_lock; static pthread_mutex_t rt_ext_flist_lock; -/* - * duplicate extent tree functions - */ - -void -release_dup_extent_tree( - xfs_agnumber_t agno) -{ - btree_clear(dup_extent_trees[agno]); -} - -int -add_dup_extent( - xfs_agnumber_t agno, - xfs_agblock_t startblock, - xfs_extlen_t blockcount) -{ -#ifdef XR_DUP_TRACE - fprintf(stderr, "Adding dup extent - %d/%d %d\n", agno, startblock, - blockcount); -#endif - return btree_insert(dup_extent_trees[agno], startblock, - (void *)(uintptr_t)(startblock + blockcount)); -} - -int -search_dup_extent( - xfs_agnumber_t agno, - xfs_agblock_t start_agbno, - xfs_agblock_t end_agbno) -{ - unsigned long bno; - - if (!btree_find(dup_extent_trees[agno], start_agbno, &bno)) - return 0; /* this really shouldn't happen */ - if (bno < end_agbno) - return 1; - return (uintptr_t)btree_peek_prev(dup_extent_trees[agno], NULL) > - start_agbno; -} - - /* * extent tree stuff is avl trees of duplicate extents, * sorted in order by block 
number. there is one tree per ag. @@ -253,6 +210,14 @@ release_extent_tree(avltree_desc_t *tree) /* * top-level (visible) routines */ +void +release_dup_extent_tree(xfs_agnumber_t agno) +{ + release_extent_tree(extent_tree_ptrs[agno]); + + return; +} + void release_agbno_extent_tree(xfs_agnumber_t agno) { @@ -557,6 +522,93 @@ get_bcnt_extent(xfs_agnumber_t agno, xfs_agblock_t startblock, return(ext); } +/* + * the next 2 routines manage the trees of duplicate extents -- 1 tree + * per AG + */ +void +add_dup_extent(xfs_agnumber_t agno, xfs_agblock_t startblock, + xfs_extlen_t blockcount) +{ + extent_tree_node_t *first, *last, *ext, *next_ext; + xfs_agblock_t new_startblock; + xfs_extlen_t new_blockcount; + + ASSERT(agno < glob_agcount); + +#ifdef XR_DUP_TRACE + fprintf(stderr, "Adding dup extent - %d/%d %d\n", agno, startblock, blockcount); +#endif + avl_findranges(extent_tree_ptrs[agno], startblock - 1, + startblock + blockcount + 1, + (avlnode_t **) &first, (avlnode_t **) &last); + /* + * find adjacent and overlapping extent blocks + */ + if (first == NULL && last == NULL) { + /* nothing, just make and insert new extent */ + + ext = mk_extent_tree_nodes(startblock, blockcount, XR_E_MULT); + + if (avl_insert(extent_tree_ptrs[agno], + (avlnode_t *) ext) == NULL) { + do_error(_("duplicate extent range\n")); + } + + return; + } + + ASSERT(first != NULL && last != NULL); + + /* + * find the new composite range, delete old extent nodes + * as we go + */ + new_startblock = startblock; + new_blockcount = blockcount; + + for (ext = first; + ext != (extent_tree_node_t *) last->avl_node.avl_nextino; + ext = next_ext) { + /* + * preserve the next inorder node + */ + next_ext = (extent_tree_node_t *) ext->avl_node.avl_nextino; + /* + * just bail if the new extent is contained within an old one + */ + if (ext->ex_startblock <= startblock && + ext->ex_blockcount >= blockcount) + return; + /* + * now check for overlaps and adjacent extents + */ + if (ext->ex_startblock + ext->ex_blockcount >= startblock + || ext->ex_startblock <= startblock + blockcount) { + + if (ext->ex_startblock < new_startblock) + new_startblock = ext->ex_startblock; + + if (ext->ex_startblock + ext->ex_blockcount > + new_startblock + new_blockcount) + new_blockcount = ext->ex_startblock + + ext->ex_blockcount - + new_startblock; + + avl_delete(extent_tree_ptrs[agno], (avlnode_t *) ext); + continue; + } + } + + ext = mk_extent_tree_nodes(new_startblock, new_blockcount, XR_E_MULT); + + if (avl_insert(extent_tree_ptrs[agno], (avlnode_t *) ext) == NULL) { + do_error(_("duplicate extent range\n")); + } + + return; +} + static __psunsigned_t avl_ext_start(avlnode_t *node) { @@ -852,9 +904,10 @@ incore_ext_init(xfs_mount_t *mp) pthread_mutex_init(&rt_ext_tree_lock, NULL); pthread_mutex_init(&rt_ext_flist_lock, NULL); - dup_extent_trees = calloc(agcount, sizeof(struct btree_root *)); - if (!dup_extent_trees) - do_error(_("couldn't malloc dup extent tree descriptor table\n")); + if ((extent_tree_ptrs = malloc(agcount * + sizeof(avltree_desc_t *))) == NULL) + do_error( + _("couldn't malloc dup extent tree descriptor table\n")); if ((extent_bno_ptrs = malloc(agcount * sizeof(avltree_desc_t *))) == NULL) @@ -867,6 +920,10 @@ incore_ext_init(xfs_mount_t *mp) _("couldn't malloc free by-bcnt extent tree descriptor table\n")); for (i = 0; i < agcount; i++) { + if ((extent_tree_ptrs[i] = + malloc(sizeof(avltree_desc_t))) == NULL) + do_error( + _("couldn't malloc dup extent tree descriptor\n")); if ((extent_bno_ptrs[i] = 
malloc(sizeof(avltree_desc_t))) == NULL) do_error( @@ -878,7 +935,7 @@ incore_ext_init(xfs_mount_t *mp) } for (i = 0; i < agcount; i++) { - btree_init(&dup_extent_trees[i]); + avl_init_tree(extent_tree_ptrs[i], &avl_extent_tree_ops); avl_init_tree(extent_bno_ptrs[i], &avl_extent_tree_ops); avl_init_tree(extent_bcnt_ptrs[i], &avl_extent_bcnt_tree_ops); } @@ -907,18 +964,18 @@ incore_ext_teardown(xfs_mount_t *mp) free(cur); for (i = 0; i < mp->m_sb.sb_agcount; i++) { - btree_destroy(dup_extent_trees[i]); + free(extent_tree_ptrs[i]); free(extent_bno_ptrs[i]); free(extent_bcnt_ptrs[i]); } - free(dup_extent_trees); free(extent_bcnt_ptrs); free(extent_bno_ptrs); + free(extent_tree_ptrs); - dup_extent_trees = NULL; - extent_bcnt_ptrs = NULL; - extent_bno_ptrs = NULL; + extent_bcnt_ptrs = extent_bno_ptrs = extent_tree_ptrs = NULL; + + return; } int diff --git a/repair/init.c b/repair/init.c index 654c406f0..7e5052c48 100644 --- a/repair/init.c +++ b/repair/init.c @@ -24,24 +24,20 @@ #include "pthread.h" #include "avl.h" #include "dir.h" -#include "bmap.h" #include "incore.h" #include "prefetch.h" +#include "radix-tree.h" #include -/* TODO: dirbuf/freemap key usage is completely b0rked - only used for dirv1 */ static pthread_key_t dirbuf_key; static pthread_key_t dir_freemap_key; static pthread_key_t attr_freemap_key; -extern pthread_key_t dblkmap_key; -extern pthread_key_t ablkmap_key; - static void ts_alloc(pthread_key_t key, unsigned n, size_t size) { void *voidp; - voidp = calloc(n, size); + voidp = malloc((n)*(size)); if (voidp == NULL) { do_error(_("ts_alloc: cannot allocate thread specific storage\n")); /* NO RETURN */ @@ -57,9 +53,6 @@ ts_create(void) pthread_key_create(&dirbuf_key, NULL); pthread_key_create(&dir_freemap_key, NULL); pthread_key_create(&attr_freemap_key, NULL); - - pthread_key_create(&dblkmap_key, NULL); - pthread_key_create(&ablkmap_key, NULL); } void @@ -158,5 +151,5 @@ xfs_init(libxfs_init_t *args) ts_create(); ts_init(); increase_rlimit(); - pftrace_init(); + radix_tree_init(); } diff --git a/repair/phase2.c b/repair/phase2.c index b2f2d624d..170a1953f 100644 --- a/repair/phase2.c +++ b/repair/phase2.c @@ -109,6 +109,7 @@ void phase2(xfs_mount_t *mp) { xfs_agnumber_t i; + xfs_agblock_t b; int j; ino_tree_node_t *ino_rec; @@ -133,6 +134,12 @@ phase2(xfs_mount_t *mp) do_log(_(" - scan filesystem freespace and inode maps...\n")); + /* + * account for space used by ag headers and log if internal + */ + set_bmap_log(mp); + set_bmap_fs(mp); + bad_ino_btree = 0; set_progress_msg(PROG_FMT_SCAN_AG, (__uint64_t) glob_agcount); @@ -168,8 +175,11 @@ phase2(xfs_mount_t *mp) /* * also mark blocks */ - set_bmap_ext(0, XFS_INO_TO_AGBNO(mp, mp->m_sb.sb_rootino), - mp->m_ialloc_blks, XR_E_INO); + for (b = 0; b < mp->m_ialloc_blks; b++) { + set_agbno_state(mp, 0, + b + XFS_INO_TO_AGBNO(mp, mp->m_sb.sb_rootino), + XR_E_INO); + } } else { do_log(_(" - found root inode chunk\n")); diff --git a/repair/phase3.c b/repair/phase3.c index 32e855cb5..c36a1c56a 100644 --- a/repair/phase3.c +++ b/repair/phase3.c @@ -61,8 +61,14 @@ walk_unlinked_list(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agino_t start_ino) agbno = XFS_AGINO_TO_AGBNO(mp, current_ino); pthread_mutex_lock(&ag_locks[agno]); - state = get_bmap(agno, agbno); - switch (state) { + switch (state = get_agbno_state(mp, + agno, agbno)) { + case XR_E_UNKNOWN: + case XR_E_FREE: + case XR_E_FREE1: + set_agbno_state(mp, agno, agbno, + XR_E_INO); + break; case XR_E_BAD_STATE: do_error(_( "bad state in block map %d\n"), @@ -79,7 +85,8 @@ 
walk_unlinked_list(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agino_t start_ino) * anyway, hopefully without * losing too much other data */ - set_bmap(agno, agbno, XR_E_INO); + set_agbno_state(mp, agno, agbno, + XR_E_INO); break; } pthread_mutex_unlock(&ag_locks[agno]); diff --git a/repair/phase4.c b/repair/phase4.c index aaef1f9e6..ebea37883 100644 --- a/repair/phase4.c +++ b/repair/phase4.c @@ -192,7 +192,8 @@ phase4(xfs_mount_t *mp) xfs_agnumber_t i; xfs_agblock_t j; xfs_agblock_t ag_end; - xfs_extlen_t blen; + xfs_agblock_t extent_start; + xfs_extlen_t extent_len; int ag_hdr_len = 4 * mp->m_sb.sb_sectsize; int ag_hdr_block; int bstate; @@ -225,13 +226,30 @@ phase4(xfs_mount_t *mp) ag_end = (i < mp->m_sb.sb_agcount - 1) ? mp->m_sb.sb_agblocks : mp->m_sb.sb_dblocks - (xfs_drfsbno_t) mp->m_sb.sb_agblocks * i; - + extent_start = extent_len = 0; /* * set up duplicate extent list for this ag */ - for (j = ag_hdr_block; j < ag_end; j += blen) { - bstate = get_bmap_ext(i, j, ag_end, &blen); - switch (bstate) { + for (j = ag_hdr_block; j < ag_end; j++) { + + /* Process in chunks of 16 (XR_BB_UNIT/XR_BB) */ + if ((extent_start == 0) && ((j & XR_BB_MASK) == 0)) { + switch(ba_bmap[i][j>>XR_BB]) { + case XR_E_UNKNOWN_LL: + case XR_E_FREE1_LL: + case XR_E_FREE_LL: + case XR_E_INUSE_LL: + case XR_E_INUSE_FS_LL: + case XR_E_INO_LL: + case XR_E_FS_MAP_LL: + j += (XR_BB_UNIT/XR_BB) - 1; + continue; + } + } + + bstate = get_agbno_state(mp, i, j); + + switch (bstate) { case XR_E_BAD_STATE: default: do_warn( @@ -245,13 +263,37 @@ phase4(xfs_mount_t *mp) case XR_E_INUSE_FS: case XR_E_INO: case XR_E_FS_MAP: + if (extent_start == 0) + continue; + else { + /* + * add extent and reset extent state + */ + add_dup_extent(i, extent_start, + extent_len); + extent_start = 0; + extent_len = 0; + } break; case XR_E_MULT: - add_dup_extent(i, j, blen); + if (extent_start == 0) { + extent_start = j; + extent_len = 1; + } else if (extent_len == MAXEXTLEN) { + add_dup_extent(i, extent_start, + extent_len); + extent_start = j; + extent_len = 1; + } else + extent_len++; break; } } - + /* + * catch tail-case, extent hitting the end of the ag + */ + if (extent_start != 0) + add_dup_extent(i, extent_start, extent_len); PROG_RPT_INC(prog_rpt_done[i], 1); } print_final_rpt(); @@ -263,7 +305,9 @@ phase4(xfs_mount_t *mp) rt_len = 0; for (bno = 0; bno < mp->m_sb.sb_rextents; bno++) { - bstate = get_rtbmap(bno); + + bstate = get_rtbno_state(mp, bno); + switch (bstate) { case XR_E_BAD_STATE: default: @@ -314,7 +358,19 @@ phase4(xfs_mount_t *mp) /* * initialize bitmaps for all AGs */ - reset_bmaps(mp); + for (i = 0; i < mp->m_sb.sb_agcount; i++) { + /* + * now reset the bitmap for all ags + */ + memset(ba_bmap[i], 0, + roundup((mp->m_sb.sb_agblocks+(NBBY/XR_BB)-1)/(NBBY/XR_BB), + sizeof(__uint64_t))); + for (j = 0; j < ag_hdr_block; j++) + set_agbno_state(mp, i, j, XR_E_INUSE_FS); + } + set_bmap_rt(mp->m_sb.sb_rextents); + set_bmap_log(mp); + set_bmap_fs(mp); do_log(_(" - check for inodes claiming duplicate blocks...\n")); set_progress_msg(PROG_FMT_DUP_BLOCKS, (__uint64_t) mp->m_sb.sb_icount); diff --git a/repair/phase5.c b/repair/phase5.c index d6a0f6a53..26f5aa22c 100644 --- a/repair/phase5.c +++ b/repair/phase5.c @@ -88,8 +88,10 @@ mk_incore_fstree(xfs_mount_t *mp, xfs_agnumber_t agno) xfs_agblock_t agbno; xfs_agblock_t ag_end; uint free_blocks; - xfs_extlen_t blen; - int bstate; +#ifdef XR_BLD_FREE_TRACE + int old_state; + int state = XR_E_BAD_STATE; +#endif /* * scan the bitmap for the ag looking for continuous @@ -118,10 +120,30 
@@ mk_incore_fstree(xfs_mount_t *mp, xfs_agnumber_t agno) * ok, now find the number of extents, keep track of the * largest extent. */ - for (agbno = 0; agbno < ag_end; agbno += blen) { - bstate = get_bmap_ext(agno, agbno, ag_end, &blen); - if (bstate < XR_E_INUSE) { - free_blocks += blen; + for (agbno = 0; agbno < ag_end; agbno++) { +#if 0 + old_state = state; + state = get_agbno_state(mp, agno, agbno); + if (state != old_state) { + fprintf(stderr, "agbno %u - new state is %d\n", + agbno, state); + } +#endif + /* Process in chunks of 16 (XR_BB_UNIT/XR_BB) */ + if ((in_extent == 0) && ((agbno & XR_BB_MASK) == 0)) { + /* testing >= XR_E_INUSE */ + switch (ba_bmap[agno][agbno>>XR_BB]) { + case XR_E_INUSE_LL: + case XR_E_INUSE_FS_LL: + case XR_E_INO_LL: + case XR_E_FS_MAP_LL: + agbno += (XR_BB_UNIT/XR_BB) - 1; + continue; + } + + } + if (get_agbno_state(mp, agno, agbno) < XR_E_INUSE) { + free_blocks++; if (in_extent == 0) { /* * found the start of a free extent @@ -129,9 +151,9 @@ mk_incore_fstree(xfs_mount_t *mp, xfs_agnumber_t agno) in_extent = 1; num_extents++; extent_start = agbno; - extent_len = blen; + extent_len = 1; } else { - extent_len += blen; + extent_len++; } } else { if (in_extent) { @@ -1442,6 +1464,11 @@ phase5_func( agno); } + /* + * done with the AG bitmap, toss it... + */ + teardown_ag_bmap(mp, agno); + /* * ok, now set up the btree cursors for the * on-disk btrees (includs pre-allocating all @@ -1628,6 +1655,7 @@ phase5(xfs_mount_t *mp) _(" - generate realtime summary info and bitmap...\n")); rtinit(mp); generate_rtinfo(mp, btmcompute, sumcompute); + teardown_rt_bmap(mp); } do_log(_(" - reset superblock...\n")); diff --git a/repair/phase6.c b/repair/phase6.c index d0560631f..becedbd27 100644 --- a/repair/phase6.c +++ b/repair/phase6.c @@ -3661,6 +3661,11 @@ phase6(xfs_mount_t *mp) do_log(_("Phase 6 - check inode connectivity...\n")); + if (!no_modify) + teardown_bmap_finish(mp); + else + teardown_bmap(mp); + incore_ext_teardown(mp); add_ino_ex_data(mp); diff --git a/repair/prefetch.c b/repair/prefetch.c index aaa56d181..2c78db0d7 100644 --- a/repair/prefetch.c +++ b/repair/prefetch.c @@ -1,7 +1,6 @@ #include #include #include "avl.h" -#include "btree.h" #include "globals.h" #include "agheader.h" #include "incore.h" @@ -15,6 +14,7 @@ #include "threads.h" #include "prefetch.h" #include "progress.h" +#include "radix-tree.h" int do_prefetch = 1; @@ -83,8 +83,9 @@ pf_start_processing( prefetch_args_t *args) { if (!args->can_start_processing) { +#ifdef XR_PF_TRACE pftrace("signalling processing for AG %d", args->agno); - +#endif args->can_start_processing = 1; pthread_cond_signal(&args->start_processing); } @@ -95,8 +96,9 @@ pf_start_io_workers( prefetch_args_t *args) { if (!args->can_start_reading) { +#ifdef XR_PF_TRACE pftrace("signalling reading for AG %d", args->agno); - +#endif args->can_start_reading = 1; pthread_cond_broadcast(&args->start_reading); } @@ -126,24 +128,35 @@ pf_queue_io( pthread_mutex_lock(&args->lock); - btree_insert(args->io_queue, fsbno, bp); - if (fsbno > args->last_bno_read) { - if (B_IS_INODE(flag)) { + radix_tree_insert(&args->primary_io_queue, fsbno, bp); + if (!B_IS_INODE(flag)) + radix_tree_tag_set(&args->primary_io_queue, fsbno, 0); + else { args->inode_bufs_queued++; if (args->inode_bufs_queued == IO_THRESHOLD) pf_start_io_workers(args); } +#ifdef XR_PF_TRACE + pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to " + "primary queue (inode_bufs_queued = %d, last_bno = %lu)", + B_IS_INODE(flag) ? 
'I' : 'M', bp, + (long long)XFS_BUF_ADDR(bp), args->agno, fsbno, + args->inode_bufs_queued, args->last_bno_read); +#endif } else { +#ifdef XR_PF_TRACE + pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to " + "secondary queue (last_bno = %lu)", + B_IS_INODE(flag) ? 'I' : 'M', bp, + (long long)XFS_BUF_ADDR(bp), args->agno, fsbno, + args->last_bno_read); +#endif ASSERT(!B_IS_INODE(flag)); XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2); + radix_tree_insert(&args->secondary_io_queue, fsbno, bp); } - pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue" - "(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ? - 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno, - args->inode_bufs_queued, args->last_bno_read); - pf_start_processing(args); pthread_mutex_unlock(&args->lock); @@ -181,9 +194,9 @@ pf_read_bmbt_reclist( while (irec.br_blockcount) { unsigned int len; - +#ifdef XR_PF_TRACE pftrace("queuing dir extent in AG %d", args->agno); - +#endif len = (irec.br_blockcount > mp->m_dirblkfsbs) ? mp->m_dirblkfsbs : irec.br_blockcount; pf_queue_io(args, irec.br_startblock, len, B_DIR_META); @@ -394,6 +407,7 @@ pf_batch_read( pf_which_t which, void *buf) { + struct radix_tree_root *queue; xfs_buf_t *bplist[MAX_BUFS]; unsigned int num; off64_t first_off, last_off, next_off; @@ -401,28 +415,27 @@ pf_batch_read( int i; int inode_bufs; unsigned long fsbno; - unsigned long max_fsbno; char *pbuf; - for (;;) { - num = 0; - if (which == PF_SECONDARY) { - bplist[0] = btree_find(args->io_queue, 0, &fsbno); - max_fsbno = MIN(fsbno + pf_max_fsbs, - args->last_bno_read); + queue = (which != PF_SECONDARY) ? &args->primary_io_queue + : &args->secondary_io_queue; + + while (radix_tree_lookup_first(queue, &fsbno) != NULL) { + + if (which != PF_META_ONLY) { + num = radix_tree_gang_lookup_ex(queue, + (void**)&bplist[0], fsbno, + fsbno + pf_max_fsbs, MAX_BUFS); + ASSERT(num > 0); + ASSERT(XFS_FSB_TO_DADDR(mp, fsbno) == + XFS_BUF_ADDR(bplist[0])); } else { - bplist[0] = btree_find(args->io_queue, - args->last_bno_read, &fsbno); - max_fsbno = fsbno + pf_max_fsbs; - } - while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) { - if (which != PF_META_ONLY || - !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num]))) - num++; - bplist[num] = btree_lookup_next(args->io_queue, &fsbno); + num = radix_tree_gang_lookup_tag(queue, + (void**)&bplist[0], fsbno, + MAX_BUFS / 4, 0); + if (num == 0) + return; } - if (!num) - return; /* * do a big read if 25% of the potential buffer is useful, @@ -431,22 +444,21 @@ pf_batch_read( */ first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0])); last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) + - XFS_BUF_SIZE(bplist[num-1]); + XFS_BUF_SIZE(bplist[num-1]); while (last_off - first_off > pf_max_bytes) { num--; - last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR( - bplist[num-1])) + XFS_BUF_SIZE(bplist[num-1]); + last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) + + XFS_BUF_SIZE(bplist[num-1]); } - if (num < ((last_off - first_off) >> - (mp->m_sb.sb_blocklog + 3))) { + if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) { /* * not enough blocks for one big read, so determine * the number of blocks that are close enough. 
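(To unpack the 25% heuristic above: shifting the byte span right by
sb_blocklog + 3 expresses it in units of eight filesystem blocks, so the
clustering path is taken only when there is less than one queued buffer per
eight blocks of span. For example, 16 buffers spread over a 512-block span
fall short, since 512 / 8 = 64 > 16, and are clustered into smaller reads
instead of one sweep.)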
*/ last_off = first_off + XFS_BUF_SIZE(bplist[0]); for (i = 1; i < num; i++) { - next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR( - bplist[i])) + XFS_BUF_SIZE(bplist[i]); + next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) + + XFS_BUF_SIZE(bplist[i]); if (next_off - last_off > pf_batch_bytes) break; last_off = next_off; @@ -455,7 +467,7 @@ pf_batch_read( } for (i = 0; i < num; i++) { - if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp, + if (radix_tree_delete(queue, XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bplist[i]))) == NULL) do_error(_("prefetch corruption\n")); } @@ -509,16 +521,20 @@ pf_batch_read( } } for (i = 0; i < num; i++) { +#ifdef XR_PF_TRACE pftrace("putbuf %c %p (%llu) in AG %d", B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M', bplist[i], (long long)XFS_BUF_ADDR(bplist[i]), args->agno); +#endif libxfs_putbuf(bplist[i]); } pthread_mutex_lock(&args->lock); if (which != PF_SECONDARY) { +#ifdef XR_PF_TRACE pftrace("inode_bufs_queued for AG %d = %d", args->agno, args->inode_bufs_queued); +#endif /* * if primary inode queue running low, process metadata * in boths queues to avoid I/O starvation as the @@ -527,14 +543,15 @@ pf_batch_read( */ if (which == PF_PRIMARY && !args->queuing_done && args->inode_bufs_queued < IO_THRESHOLD) { +#ifdef XR_PF_TRACE pftrace("reading metadata bufs from primary queue for AG %d", args->agno); - +#endif pf_batch_read(args, PF_META_ONLY, buf); - +#ifdef XR_PF_TRACE pftrace("reading bufs from secondary queue for AG %d", args->agno); - +#endif pf_batch_read(args, PF_SECONDARY, buf); } } @@ -553,20 +570,22 @@ pf_io_worker( return NULL; pthread_mutex_lock(&args->lock); - while (!args->queuing_done || btree_find(args->io_queue, 0, NULL)) { + while (!args->queuing_done || args->primary_io_queue.height) { +#ifdef XR_PF_TRACE pftrace("waiting to start prefetch I/O for AG %d", args->agno); - +#endif while (!args->can_start_reading && !args->queuing_done) pthread_cond_wait(&args->start_reading, &args->lock); - +#ifdef XR_PF_TRACE pftrace("starting prefetch I/O for AG %d", args->agno); - +#endif pf_batch_read(args, PF_PRIMARY, buf); pf_batch_read(args, PF_SECONDARY, buf); +#ifdef XR_PF_TRACE pftrace("ran out of bufs to prefetch for AG %d", args->agno); - +#endif if (!args->queuing_done) args->can_start_reading = 0; } @@ -574,8 +593,9 @@ pf_io_worker( free(buf); +#ifdef XR_PF_TRACE pftrace("finished prefetch I/O for AG %d", args->agno); - +#endif return NULL; } @@ -617,7 +637,10 @@ pf_queuing_worker( break; } } + +#ifdef XR_PF_TRACE pftrace("starting prefetch for AG %d", args->agno); +#endif for (irec = findfirst_inode_rec(args->agno); irec != NULL; irec = next_ino_rec(irec)) { @@ -654,9 +677,10 @@ pf_queuing_worker( pthread_mutex_lock(&args->lock); +#ifdef XR_PF_TRACE pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)", args->agno, args->inode_bufs_queued); - +#endif args->queuing_done = 1; pf_start_io_workers(args); pf_start_processing(args); @@ -667,11 +691,13 @@ pf_queuing_worker( if (args->io_threads[i]) pthread_join(args->io_threads[i], NULL); +#ifdef XR_PF_TRACE pftrace("prefetch for AG %d finished", args->agno); - +#endif pthread_mutex_lock(&args->lock); - ASSERT(btree_find(args->io_queue, 0, NULL) == NULL); + ASSERT(args->primary_io_queue.height == 0); + ASSERT(args->secondary_io_queue.height == 0); args->prefetch_done = 1; if (args->next_args) @@ -688,8 +714,9 @@ pf_create_prefetch_thread( { int err; +#ifdef XR_PF_TRACE pftrace("creating queue thread for AG %d", args->agno); - +#endif err = pthread_create(&args->queuing_thread, NULL, 
pf_queuing_worker, args); if (err != 0) { @@ -728,7 +755,8 @@ start_inode_prefetch( args = calloc(1, sizeof(prefetch_args_t)); - btree_init(&args->io_queue); + INIT_RADIX_TREE(&args->primary_io_queue, 0); + INIT_RADIX_TREE(&args->secondary_io_queue, 0); if (pthread_mutex_init(&args->lock, NULL) != 0) do_error(_("failed to initialize prefetch mutex\n")); if (pthread_cond_init(&args->start_reading, NULL) != 0) @@ -776,12 +804,14 @@ wait_for_inode_prefetch( pthread_mutex_lock(&args->lock); while (!args->can_start_processing) { +#ifdef XR_PF_TRACE pftrace("waiting to start processing AG %d", args->agno); - +#endif pthread_cond_wait(&args->start_processing, &args->lock); } +#ifdef XR_PF_TRACE pftrace("can start processing AG %d", args->agno); - +#endif pthread_mutex_unlock(&args->lock); } @@ -792,39 +822,25 @@ cleanup_inode_prefetch( if (args == NULL) return; +#ifdef XR_PF_TRACE pftrace("waiting AG %d prefetch to finish", args->agno); - +#endif if (args->queuing_thread) pthread_join(args->queuing_thread, NULL); +#ifdef XR_PF_TRACE pftrace("AG %d prefetch done", args->agno); - +#endif pthread_mutex_destroy(&args->lock); pthread_cond_destroy(&args->start_reading); pthread_cond_destroy(&args->start_processing); sem_destroy(&args->ra_count); - btree_destroy(args->io_queue); free(args); } #ifdef XR_PF_TRACE -static FILE *pf_trace_file; - -void -pftrace_init(void) -{ - pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w"); - setvbuf(pf_trace_file, NULL, _IOLBF, 1024); -} - -void -pftrace_done(void) -{ - fclose(pf_trace_file); -} - void _pftrace(const char *func, const char *msg, ...) { @@ -839,8 +855,7 @@ _pftrace(const char *func, const char *msg, ...) buf[sizeof(buf)-1] = '\0'; va_end(args); - fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec, - func, buf); + fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec, func, buf); } #endif diff --git a/repair/prefetch.h b/repair/prefetch.h index 44a406c01..60ba96646 100644 --- a/repair/prefetch.h +++ b/repair/prefetch.h @@ -3,6 +3,7 @@ #include #include "incore.h" +#include "radix-tree.h" extern int do_prefetch; @@ -13,7 +14,8 @@ typedef struct prefetch_args { pthread_mutex_t lock; pthread_t queuing_thread; pthread_t io_threads[PF_THREAD_COUNT]; - struct btree_root *io_queue; + struct radix_tree_root primary_io_queue; + struct radix_tree_root secondary_io_queue; pthread_cond_t start_reading; pthread_cond_t start_processing; int agno; @@ -50,15 +52,8 @@ cleanup_inode_prefetch( #ifdef XR_PF_TRACE -void pftrace_init(void); -void pftrace_done(void); - #define pftrace(msg...) _pftrace(__FUNCTION__, ## msg) void _pftrace(const char *, const char *, ...); -#else -static inline void pftrace_init(void) { }; -static inline void pftrace_done(void) { }; -static inline void pftrace(const char *msg, ...) { }; #endif #endif /* _XFS_REPAIR_PREFETCH_H */ diff --git a/repair/radix-tree.c b/repair/radix-tree.c new file mode 100644 index 000000000..36a6324d8 --- /dev/null +++ b/repair/radix-tree.c @@ -0,0 +1,805 @@ +/* + * Copyright (C) 2001 Momchil Velikov + * Portions Copyright (C) 2001 Christoph Hellwig + * Copyright (C) 2005 SGI, Christoph Lameter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2, or (at + * your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <libxfs.h>
+#include "radix-tree.h"
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+#define RADIX_TREE_MAP_SHIFT	6
+#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
+
+#ifdef RADIX_TREE_TAGS
+#define RADIX_TREE_TAG_LONGS	\
+	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+#endif
+
+struct radix_tree_node {
+	unsigned int	count;
+	void		*slots[RADIX_TREE_MAP_SIZE];
+#ifdef RADIX_TREE_TAGS
+	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
+#endif
+};
+
+struct radix_tree_path {
+	struct radix_tree_node *node;
+	int offset;
+};
+
+#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
+
+static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH];
+
+/*
+ * Radix tree node cache.
+ */
+
+#define radix_tree_node_alloc(r)	((struct radix_tree_node *) \
+		calloc(1, sizeof(struct radix_tree_node)))
+#define radix_tree_node_free(n)	free(n)
+
+#ifdef RADIX_TREE_TAGS
+
+static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	*((__uint32_t *)node->tags[tag] + (offset >> 5)) |= (1 << (offset & 31));
+}
+
+static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	__uint32_t *p = (__uint32_t*)node->tags[tag] + (offset >> 5);
+	__uint32_t m = 1 << (offset & 31);
+	*p &= ~m;
+}
+
+static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	return 1 & (((const __uint32_t *)node->tags[tag])[offset >> 5] >> (offset & 31));
+}
+
+/*
+ * Returns 1 if any slot in the node has this tag set.
+ * Otherwise returns 0.
+ */
+static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
+{
+	int idx;
+	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+		if (node->tags[tag][idx])
+			return 1;
+	}
+	return 0;
+}
+
+#endif
+
+/*
+ * Return the maximum key which can be stored in a
+ * radix tree with height HEIGHT.
+ */
+static inline unsigned long radix_tree_maxindex(unsigned int height)
+{
+	return height_to_maxindex[height];
+}
+
+/*
+ * Extend a radix tree so it can store key @index.
+ */
+static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
+{
+	struct radix_tree_node *node;
+	unsigned int height;
+#ifdef RADIX_TREE_TAGS
+	char tags[RADIX_TREE_MAX_TAGS];
+	int tag;
+#endif
+
+	/* Figure out what the height should be. */
+	height = root->height + 1;
+	while (index > radix_tree_maxindex(height))
+		height++;
+
+	if (root->rnode == NULL) {
+		root->height = height;
+		goto out;
+	}
+
+#ifdef RADIX_TREE_TAGS
+	/*
+	 * Prepare the tag status of the top-level node for propagation
+	 * into the newly-pushed top-level node(s)
+	 */
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+		tags[tag] = 0;
+		if (any_tag_set(root->rnode, tag))
+			tags[tag] = 1;
+	}
+#endif
+	do {
+		if (!(node = radix_tree_node_alloc(root)))
+			return -ENOMEM;
+
+		/* Increase the height.  */
+		node->slots[0] = root->rnode;
+
+#ifdef RADIX_TREE_TAGS
+		/* Propagate the aggregated tag info into the new root */
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+			if (tags[tag])
+				tag_set(node, tag, 0);
+		}
+#endif
+		node->count = 1;
+		root->rnode = node;
+		root->height++;
+	} while (height > root->height);
+out:
+	return 0;
+}
+
+/**
+ * radix_tree_insert - insert into a radix tree
+ * @root:	radix tree root
+ * @index:	index key
+ * @item:	item to insert
+ *
+ * Insert an item into the radix tree at position @index.
+ */
+int radix_tree_insert(struct radix_tree_root *root,
+			unsigned long index, void *item)
+{
+	struct radix_tree_node *node = NULL, *slot;
+	unsigned int height, shift;
+	int offset;
+	int error;
+
+	/* Make sure the tree is high enough. */
+	if ((!index && !root->rnode) ||
+			index > radix_tree_maxindex(root->height)) {
+		error = radix_tree_extend(root, index);
+		if (error)
+			return error;
+	}
+
+	slot = root->rnode;
+	height = root->height;
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+
+	offset = 0;			/* uninitialised var warning */
+	do {
+		if (slot == NULL) {
+			/* Have to add a child node. */
+			if (!(slot = radix_tree_node_alloc(root)))
+				return -ENOMEM;
+			if (node) {
+				node->slots[offset] = slot;
+				node->count++;
+			} else
+				root->rnode = slot;
+		}
+
+		/* Go a level down */
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		node = slot;
+		slot = node->slots[offset];
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	} while (height > 0);
+
+	if (slot != NULL)
+		return -EEXIST;
+
+	ASSERT(node);
+	node->count++;
+	node->slots[offset] = item;
+#ifdef RADIX_TREE_TAGS
+	ASSERT(!tag_get(node, 0, offset));
+	ASSERT(!tag_get(node, 1, offset));
+#endif
+	return 0;
+}
+
+static inline void **__lookup_slot(struct radix_tree_root *root,
+				   unsigned long index)
+{
+	unsigned int height, shift;
+	struct radix_tree_node **slot;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		return NULL;
+
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+	slot = &root->rnode;
+
+	while (height > 0) {
+		if (*slot == NULL)
+			return NULL;
+
+		slot = (struct radix_tree_node **)
+			((*slot)->slots +
+				((index >> shift) & RADIX_TREE_MAP_MASK));
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	return (void **)slot;
+}
+
+/**
+ * radix_tree_lookup_slot - lookup a slot in a radix tree
+ * @root:	radix tree root
+ * @index:	index key
+ *
+ * Lookup the slot corresponding to the position @index in the radix tree
+ * @root. This is useful for update-if-exists operations.
+ */
+void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
+{
+	return __lookup_slot(root, index);
+}
+
+/**
+ * radix_tree_lookup - perform lookup operation on a radix tree
+ * @root:	radix tree root
+ * @index:	index key
+ *
+ * Lookup the item at the position @index in the radix tree @root.
+ */
+void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+{
+	void **slot;
+
+	slot = __lookup_slot(root, index);
+	return slot != NULL ? *slot : NULL;
+}
+
+/**
+ * radix_tree_lookup_first - find the first index key in the radix tree
+ * @root:	radix tree root
+ * @index:	where the first index will be placed
+ *
+ * Returns the first entry and index key in the radix tree @root.
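+ *
+ * Usage sketch (illustrative, not part of the submitted file); this
+ * mirrors the drain loop structure in pf_batch_read():
+ *
+ *	unsigned long fsbno;
+ *	void *bp;
+ *
+ *	while ((bp = radix_tree_lookup_first(&queue, &fsbno)) != NULL) {
+ *		consume(bp);			-- hypothetical callback
+ *		radix_tree_delete(&queue, fsbno);
+ *	}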
+ */ +void *radix_tree_lookup_first(struct radix_tree_root *root, unsigned long *index) +{ + unsigned int height, shift; + struct radix_tree_node *slot; + unsigned long i; + + height = root->height; + *index = 0; + if (height == 0) + return NULL; + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + for (; height > 1; height--) { + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (slot->slots[i] != NULL) + break; + } + ASSERT(i < RADIX_TREE_MAP_SIZE); + + *index |= (i << shift); + shift -= RADIX_TREE_MAP_SHIFT; + slot = slot->slots[i]; + } + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (slot->slots[i] != NULL) { + *index |= i; + return slot->slots[i]; + } + } + return NULL; +} + +#ifdef RADIX_TREE_TAGS + +/** + * radix_tree_tag_set - set a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index + * + * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) + * corresponding to @index in the radix tree. From + * the root all the way down to the leaf node. + * + * Returns the address of the tagged item. Setting a tag on a not-present + * item is a bug. + */ +void *radix_tree_tag_set(struct radix_tree_root *root, + unsigned long index, unsigned int tag) +{ + unsigned int height, shift; + struct radix_tree_node *slot; + + height = root->height; + if (index > radix_tree_maxindex(height)) + return NULL; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + while (height > 0) { + int offset; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + if (!tag_get(slot, tag, offset)) + tag_set(slot, tag, offset); + slot = slot->slots[offset]; + ASSERT(slot != NULL); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + return slot; +} + +/** + * radix_tree_tag_clear - clear a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index + * + * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) + * corresponding to @index in the radix tree. If + * this causes the leaf node to have no tags set then clear the tag in the + * next-to-leaf node, etc. + * + * Returns the address of the tagged item on success, else NULL. ie: + * has the same return value and semantics as radix_tree_lookup(). 
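+ *
+ * For context, the tag bits are what this patch uses to pick metadata
+ * buffers out of the prefetch queue; condensed from pf_queue_io() and
+ * pf_batch_read() (illustrative, not a verbatim quote):
+ *
+ *	radix_tree_insert(&q, fsbno, bp);
+ *	if (!B_IS_INODE(flag))
+ *		radix_tree_tag_set(&q, fsbno, 0);	-- tag 0 = metadata
+ *	...
+ *	num = radix_tree_gang_lookup_tag(&q, (void **)bplist,
+ *					 fsbno, MAX_BUFS / 4, 0);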
+ */ +void *radix_tree_tag_clear(struct radix_tree_root *root, + unsigned long index, unsigned int tag) +{ + struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; + struct radix_tree_node *slot; + unsigned int height, shift; + void *ret = NULL; + + height = root->height; + if (index > radix_tree_maxindex(height)) + goto out; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + pathp->node = NULL; + slot = root->rnode; + + while (height > 0) { + int offset; + + if (slot == NULL) + goto out; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + pathp[1].offset = offset; + pathp[1].node = slot; + slot = slot->slots[offset]; + pathp++; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + ret = slot; + if (ret == NULL) + goto out; + + do { + if (!tag_get(pathp->node, tag, pathp->offset)) + goto out; + tag_clear(pathp->node, tag, pathp->offset); + if (any_tag_set(pathp->node, tag)) + goto out; + pathp--; + } while (pathp->node); +out: + return ret; +} + +#endif + +static unsigned int +__lookup(struct radix_tree_root *root, void **results, unsigned long index, + unsigned int max_items, unsigned long *next_index) +{ + unsigned int nr_found = 0; + unsigned int shift, height; + struct radix_tree_node *slot; + unsigned long i; + + height = root->height; + if (height == 0) + goto out; + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + for ( ; height > 1; height--) { + + for (i = (index >> shift) & RADIX_TREE_MAP_MASK ; + i < RADIX_TREE_MAP_SIZE; i++) { + if (slot->slots[i] != NULL) + break; + index &= ~((1UL << shift) - 1); + index += 1UL << shift; + if (index == 0) + goto out; /* 32-bit wraparound */ + } + if (i == RADIX_TREE_MAP_SIZE) + goto out; + + shift -= RADIX_TREE_MAP_SHIFT; + slot = slot->slots[i]; + } + + /* Bottom level: grab some items */ + for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { + index++; + if (slot->slots[i]) { + results[nr_found++] = slot->slots[i]; + if (nr_found == max_items) + goto out; + } + } +out: + *next_index = index; + return nr_found; +} + +/** + * radix_tree_gang_lookup - perform multiple lookup on a radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * + * Performs an index-ascending scan of the tree for present items. Places + * them at *@results and returns the number of items which were placed at + * *@results. + * + * The implementation is naive. 
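+ *
+ * Only items are returned, never their indices, so a caller iterating
+ * in batches must recover the key from the item itself to advance its
+ * cursor. Sketch (illustrative; key_of() is a hypothetical accessor):
+ *
+ *	unsigned long cur = 0;
+ *	unsigned int n;
+ *
+ *	while ((n = radix_tree_gang_lookup(&q, items, cur, 16)) > 0) {
+ *		...process items[0..n-1]...
+ *		cur = key_of(items[n - 1]) + 1;
+ *	}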
+ */ +unsigned int +radix_tree_gang_lookup(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items) +{ + const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long cur_index = first_index; + unsigned int ret = 0; + + while (ret < max_items) { + unsigned int nr_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + nr_found = __lookup(root, results + ret, cur_index, + max_items - ret, &next_index); + ret += nr_found; + if (next_index == 0) + break; + cur_index = next_index; + } + return ret; +} + +/** + * radix_tree_gang_lookup_ex - perform multiple lookup on a radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @last_index: don't lookup past this key + * @max_items: place up to this many items at *results + * + * Performs an index-ascending scan of the tree for present items starting + * @first_index until @last_index up to as many as @max_items. Places + * them at *@results and returns the number of items which were placed + * at *@results. + * + * The implementation is naive. + */ +unsigned int +radix_tree_gang_lookup_ex(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned long last_index, + unsigned int max_items) +{ + const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long cur_index = first_index; + unsigned int ret = 0; + + while (ret < max_items && cur_index < last_index) { + unsigned int nr_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + nr_found = __lookup(root, results + ret, cur_index, + max_items - ret, &next_index); + ret += nr_found; + if (next_index == 0) + break; + cur_index = next_index; + } + return ret; +} + +#ifdef RADIX_TREE_TAGS + +static unsigned int +__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, + unsigned int max_items, unsigned long *next_index, unsigned int tag) +{ + unsigned int nr_found = 0; + unsigned int shift; + unsigned int height = root->height; + struct radix_tree_node *slot; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + while (height > 0) { + unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; + + for ( ; i < RADIX_TREE_MAP_SIZE; i++) { + if (tag_get(slot, tag, i)) { + ASSERT(slot->slots[i] != NULL); + break; + } + index &= ~((1UL << shift) - 1); + index += 1UL << shift; + if (index == 0) + goto out; /* 32-bit wraparound */ + } + if (i == RADIX_TREE_MAP_SIZE) + goto out; + height--; + if (height == 0) { /* Bottom level: grab some items */ + unsigned long j = index & RADIX_TREE_MAP_MASK; + + for ( ; j < RADIX_TREE_MAP_SIZE; j++) { + index++; + if (tag_get(slot, tag, j)) { + ASSERT(slot->slots[j] != NULL); + results[nr_found++] = slot->slots[j]; + if (nr_found == max_items) + goto out; + } + } + } + shift -= RADIX_TREE_MAP_SHIFT; + slot = slot->slots[i]; + } +out: + *next_index = index; + return nr_found; +} + +/** + * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree + * based on a tag + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * @tag: the tag index (< RADIX_TREE_MAX_TAGS) + * + * Performs an index-ascending scan of the tree for present items which + * have the tag indexed by @tag set. 
+
+/**
+ * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
+ *	based on a tag
+ *	@root:		radix tree root
+ *	@results:	where the results of the lookup are placed
+ *	@first_index:	start the lookup from this key
+ *	@max_items:	place up to this many items at *results
+ *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ * Performs an index-ascending scan of the tree for present items which
+ * have the tag indexed by @tag set.  Places the items at *@results and
+ * returns the number of items which were placed at *@results.
+ */
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+			unsigned long first_index, unsigned int max_items,
+			unsigned int tag)
+{
+	const unsigned long max_index = radix_tree_maxindex(root->height);
+	unsigned long cur_index = first_index;
+	unsigned int ret = 0;
+
+	while (ret < max_items) {
+		unsigned int nr_found;
+		unsigned long next_index;	/* Index of next search */
+
+		if (cur_index > max_index)
+			break;
+		nr_found = __lookup_tag(root, results + ret, cur_index,
+					max_items - ret, &next_index, tag);
+		ret += nr_found;
+		if (next_index == 0)
+			break;
+		cur_index = next_index;
+	}
+	return ret;
+}
+
+#endif
+
+/**
+ * radix_tree_shrink - shrink height of a radix tree to minimal
+ *	@root:		radix tree root
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root)
+{
+	/* try to shrink tree height */
+	while (root->height > 1 &&
+			root->rnode->count == 1 &&
+			root->rnode->slots[0]) {
+		struct radix_tree_node *to_free = root->rnode;
+
+		root->rnode = to_free->slots[0];
+		root->height--;
+		/* must only free zeroed nodes into the slab */
+#ifdef RADIX_TREE_TAGS
+		tag_clear(to_free, 0, 0);
+		tag_clear(to_free, 1, 0);
+#endif
+		to_free->slots[0] = NULL;
+		to_free->count = 0;
+		radix_tree_node_free(to_free);
+	}
+}
+
+/**
+ * radix_tree_delete - delete an item from a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *
+ * Remove the item at @index from the radix tree rooted at @root.
+ *
+ * Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+{
+	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+	struct radix_tree_path *orig_pathp;
+	struct radix_tree_node *slot;
+	unsigned int height, shift;
+	void *ret = NULL;
+#ifdef RADIX_TREE_TAGS
+	char tags[RADIX_TREE_MAX_TAGS];
+	int nr_cleared_tags;
+	int tag;
+#endif
+	int offset;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		goto out;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	pathp->node = NULL;
+	slot = root->rnode;
+
+	for ( ; height > 0; height--) {
+		if (slot == NULL)
+			goto out;
+
+		pathp++;
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		pathp->offset = offset;
+		pathp->node = slot;
+		slot = slot->slots[offset];
+		shift -= RADIX_TREE_MAP_SHIFT;
+	}
+
+	ret = slot;
+	if (ret == NULL)
+		goto out;
+
+	orig_pathp = pathp;
+
+#ifdef RADIX_TREE_TAGS
+	/*
+	 * Clear all tags associated with the just-deleted item
+	 */
+	nr_cleared_tags = 0;
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+		tags[tag] = 1;
+		if (tag_get(pathp->node, tag, pathp->offset)) {
+			tag_clear(pathp->node, tag, pathp->offset);
+			if (!any_tag_set(pathp->node, tag)) {
+				tags[tag] = 0;
+				nr_cleared_tags++;
+			}
+		}
+	}
+
+	for (pathp--; nr_cleared_tags && pathp->node; pathp--) {
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+			if (tags[tag])
+				continue;
+
+			tag_clear(pathp->node, tag, pathp->offset);
+			if (any_tag_set(pathp->node, tag)) {
+				tags[tag] = 1;
+				nr_cleared_tags--;
+			}
+		}
+	}
+#endif
+	/* Now free the nodes we do not need anymore */
+	for (pathp = orig_pathp; pathp->node; pathp--) {
+		pathp->node->slots[pathp->offset] = NULL;
+		pathp->node->count--;
+
+		if (pathp->node->count) {
+			if (pathp->node == root->rnode)
+				radix_tree_shrink(root);
+			goto out;
+		}
+
+		/* Node with zero slots in use so free it */
+		radix_tree_node_free(pathp->node);
+	}
+
+	root->rnode = NULL;
+	root->height = 0;
+out:
+	return ret;
+}
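Together with insert and lookup (declared in repair/radix-tree.h, added
below), delete completes the usual pointer-store round trip. A small
self-contained check, assuming radix_tree_insert() returns zero on success
as in the kernel original:

	#include <assert.h>
	#include <stddef.h>
	#include "radix-tree.h"

	int
	main(void)
	{
		struct radix_tree_root	tree;
		static char		payload;

		radix_tree_init();
		INIT_RADIX_TREE(&tree, 0);

		assert(radix_tree_insert(&tree, 42, &payload) == 0);
		assert(radix_tree_lookup(&tree, 42) == &payload);

		/* delete hands back the stored pointer, NULL if absent */
		assert(radix_tree_delete(&tree, 42) == &payload);
		assert(radix_tree_lookup(&tree, 42) == NULL);
		assert(radix_tree_delete(&tree, 42) == NULL);
		return 0;
	}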
+
+#ifdef RADIX_TREE_TAGS
+/**
+ * radix_tree_tagged - test whether any items in the tree are tagged
+ *	@root:		radix tree root
+ *	@tag:		tag to test
+ */
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
+{
+	struct radix_tree_node *rnode;
+	rnode = root->rnode;
+	if (!rnode)
+		return 0;
+	return any_tag_set(rnode, tag);
+}
+#endif
+
+static unsigned long __maxindex(unsigned int height)
+{
+	unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
+	unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
+
+	if (tmp >= RADIX_TREE_INDEX_BITS)
+		index = ~0UL;
+	return index;
+}
+
+static void radix_tree_init_maxindex(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
+		height_to_maxindex[i] = __maxindex(i);
+}
+
+void radix_tree_init(void)
+{
+	radix_tree_init_maxindex();
+}
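The bit-twiddling in __maxindex() is a guarded way of computing
2^(height * RADIX_TREE_MAP_SHIFT) - 1 without ever shifting by the full
word width. A sketch that prints the table for small heights, again
assuming the 6-bit fanout (heights are kept small enough here that the
saturation branch never fires):

	#include <stdio.h>

	#define MAP_SHIFT	6
	#define INDEX_BITS	(8 * sizeof(unsigned long))

	static unsigned long
	maxindex(unsigned int height)
	{
		unsigned int	tmp = height * MAP_SHIFT;
		unsigned long	index = (~0UL >> (INDEX_BITS - tmp - 1)) >> 1;

		/* saturate once the index no longer fits in a word */
		if (tmp >= INDEX_BITS)
			index = ~0UL;
		return index;
	}

	int
	main(void)
	{
		unsigned int	h;

		for (h = 0; h <= 4; h++)	/* 4 * 6 = 24 bits, safe */
			printf("height %u covers indexes 0..%lu\n",
				h, maxindex(h));
		return 0;
	}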
diff --git a/repair/radix-tree.h b/repair/radix-tree.h
new file mode 100644
index 000000000..e16e08d5f
--- /dev/null
+++ b/repair/radix-tree.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef __XFS_SUPPORT_RADIX_TREE_H__
+#define __XFS_SUPPORT_RADIX_TREE_H__
+
+#define RADIX_TREE_TAGS
+
+struct radix_tree_root {
+	unsigned int		height;
+	struct radix_tree_node	*rnode;
+};
+
+#define RADIX_TREE_INIT(mask)	{					\
+	.height = 0,							\
+	.rnode = NULL,							\
+}
+
+#define RADIX_TREE(name, mask) \
+	struct radix_tree_root name = RADIX_TREE_INIT(mask)
+
+#define INIT_RADIX_TREE(root, mask)					\
+do {									\
+	(root)->height = 0;						\
+	(root)->rnode = NULL;						\
+} while (0)
+
+#ifdef RADIX_TREE_TAGS
+#define RADIX_TREE_MAX_TAGS 2
+#endif
+
+int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void *radix_tree_lookup_first(struct radix_tree_root *, unsigned long *);
+void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+			unsigned long first_index, unsigned int max_items);
+unsigned int
+radix_tree_gang_lookup_ex(struct radix_tree_root *root, void **results,
+			unsigned long first_index, unsigned long last_index,
+			unsigned int max_items);
+
+void radix_tree_init(void);
+
+#ifdef RADIX_TREE_TAGS
+void *radix_tree_tag_set(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag);
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag);
+int radix_tree_tag_get(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag);
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+			unsigned long first_index, unsigned int max_items,
+			unsigned int tag);
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
+#endif
+
+#endif /* __XFS_SUPPORT_RADIX_TREE_H__ */
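The tag calls declared above let a caller mark a subset of stored items and
retrieve just that subset in bulk; RADIX_TREE_MAX_TAGS allows two
independent tag bits per slot. A usage sketch with arbitrary indexes and
tag 0 (not taken from the repair code itself):

	#include <assert.h>
	#include "radix-tree.h"

	static char a, b;

	int
	main(void)
	{
		struct radix_tree_root	tree;
		void			*found[4];

		radix_tree_init();
		INIT_RADIX_TREE(&tree, 0);

		radix_tree_insert(&tree, 10, &a);
		radix_tree_insert(&tree, 20, &b);

		/* tag one item, then ask for tagged items only */
		radix_tree_tag_set(&tree, 20, 0);
		assert(radix_tree_tagged(&tree, 0));
		assert(radix_tree_gang_lookup_tag(&tree, found, 0, 4, 0) == 1);
		assert(found[0] == &b);

		radix_tree_tag_clear(&tree, 20, 0);
		assert(!radix_tree_tagged(&tree, 0));
		return 0;
	}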
diff --git a/repair/rt.c b/repair/rt.c
index 7645128de..7036aa2b2 100644
--- a/repair/rt.c
+++ b/repair/rt.c
@@ -91,7 +91,7 @@ generate_rtinfo(xfs_mount_t *mp,
 	bits = 0;
 	for (i = 0; i < sizeof(xfs_rtword_t) * NBBY &&
 			extno < mp->m_sb.sb_rextents; i++, extno++) {
-		if (get_rtbmap(extno) == XR_E_FREE) {
+		if (get_rtbno_state(mp, extno) == XR_E_FREE) {
 			sb_frextents++;
 			bits |= freebit;
@@ -218,7 +218,7 @@ process_rtbitmap(xfs_mount_t *mp,
 			bit < bitsperblock && extno < mp->m_sb.sb_rextents;
 			bit++, extno++) {
 		if (xfs_isset(words, bit)) {
-			set_rtbmap(extno, XR_E_FREE);
+			set_rtbno_state(mp, extno, XR_E_FREE);
 			sb_frextents++;
 			if (prevbit == 0) {
 				start_bmbno = bmbno;
diff --git a/repair/scan.c b/repair/scan.c
index f2bf86307..18ac38513 100644
--- a/repair/scan.c
+++ b/repair/scan.c
@@ -148,9 +148,6 @@ scanfunc_bmap(
 	xfs_dfiloff_t		last_key;
 	char			*forkname;
 	int			numrecs;
-	xfs_agnumber_t		agno;
-	xfs_agblock_t		agbno;
-	int			state;
 
 	if (whichfork == XFS_DATA_FORK)
 		forkname = _("data");
@@ -232,16 +229,11 @@ _("bad back (left) sibling pointer (saw %llu should be NULL (0))\n"
 		bm_cursor->level[level].right_fsbno =
 			be64_to_cpu(block->bb_u.l.bb_rightsib);
 
-		agno = XFS_FSB_TO_AGNO(mp, bno);
-		agbno = XFS_FSB_TO_AGBNO(mp, bno);
-
-		pthread_mutex_lock(&ag_locks[agno]);
-		state = get_bmap(agno, agbno);
-		switch (state) {
+		switch (get_fsbno_state(mp, bno)) {
 		case XR_E_UNKNOWN:
 		case XR_E_FREE1:
 		case XR_E_FREE:
-			set_bmap(agno, agbno, XR_E_INUSE);
+			set_fsbno_state(mp, bno, XR_E_INUSE);
 			break;
 		case XR_E_FS_MAP:
 		case XR_E_INUSE:
@@ -253,17 +245,19 @@ _("bad back (left) sibling pointer (saw %llu should be NULL (0))\n"
 			 * we made it here, the block probably
 			 * contains btree data.
 			 */
-			set_bmap(agno, agbno, XR_E_MULT);
+			set_fsbno_state(mp, bno, XR_E_MULT);
 			do_warn(
 	_("inode 0x%llx bmap block 0x%llx claimed, state is %d\n"),
-				ino, (__uint64_t) bno, state);
+				ino, (__uint64_t) bno,
+				get_fsbno_state(mp, bno));
 			break;
 		case XR_E_MULT:
 		case XR_E_INUSE_FS:
-			set_bmap(agno, agbno, XR_E_MULT);
+			set_fsbno_state(mp, bno, XR_E_MULT);
 			do_warn(
 	_("inode 0x%llx bmap block 0x%llx claimed, state is %d\n"),
-				ino, (__uint64_t) bno, state);
+				ino, (__uint64_t) bno,
+				get_fsbno_state(mp, bno));
 			/*
 			 * if we made it to here, this is probably a bmap block
 			 * that is being used by *another* file as a bmap block
@@ -278,19 +272,18 @@ _("bad back (left) sibling pointer (saw %llu should be NULL (0))\n"
 		default:
 			do_warn(
 	_("bad state %d, inode 0x%llx bmap block 0x%llx\n"),
-				state, ino, (__uint64_t) bno);
+				get_fsbno_state(mp, bno),
+				ino, (__uint64_t) bno);
 			break;
 		}
-		pthread_mutex_unlock(&ag_locks[agno]);
 	} else {
 		/*
 		 * attribute fork for realtime files is in the regular
 		 * filesystem
 		 */
		if (type != XR_INO_RTDATA || whichfork != XFS_DATA_FORK) {
-			if (search_dup_extent(XFS_FSB_TO_AGNO(mp, bno),
-					XFS_FSB_TO_AGBNO(mp, bno),
-					XFS_FSB_TO_AGBNO(mp, bno) + 1))
+			if (search_dup_extent(mp, XFS_FSB_TO_AGNO(mp, bno),
+					XFS_FSB_TO_AGBNO(mp, bno)))
 				return(1);
 		} else {
 			if (search_rt_dup_extent(mp, bno))
@@ -485,15 +478,19 @@ scanfunc_allocbt(
 	/*
 	 * check for btree blocks multiply claimed
 	 */
-	state = get_bmap(agno, bno);
-	if (state != XR_E_UNKNOWN) {
-		set_bmap(agno, bno, XR_E_MULT);
+	state = get_agbno_state(mp, agno, bno);
+
+	switch (state) {
+	case XR_E_UNKNOWN:
+		set_agbno_state(mp, agno, bno, XR_E_FS_MAP);
+		break;
+	default:
+		set_agbno_state(mp, agno, bno, XR_E_MULT);
 		do_warn(
_("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
 			name, state, agno, bno, suspect);
 		return;
 	}
-	set_bmap(agno, bno, XR_E_FS_MAP);
 
 	numrecs = be16_to_cpu(block->bb_numrecs);
 
@@ -514,7 +511,7 @@ _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
 	rp = XFS_ALLOC_REC_ADDR(mp, block, 1);
 	for (i = 0; i < numrecs; i++) {
 		xfs_agblock_t		b, end;
-		xfs_extlen_t		len, blen;
+		xfs_extlen_t		len;
 
 		b = be32_to_cpu(rp[i].ar_startblock);
 		len = be32_to_cpu(rp[i].ar_blockcount);
@@ -527,11 +524,12 @@ _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
 		if (!verify_agbno(mp, agno, end - 1))
 			continue;
 
-		for ( ; b < end; b += blen) {
-			state = get_bmap_ext(agno, b, end, &blen);
+		for ( ; b < end; b++) {
+			state = get_agbno_state(mp, agno, b);
 			switch (state) {
 			case XR_E_UNKNOWN:
-				set_bmap(agno, b, XR_E_FREE1);
+				set_agbno_state(mp, agno, b,
+						XR_E_FREE1);
 				break;
 			case XR_E_FREE1:
 				/*
@@ -539,15 +537,14 @@ _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
 				 * FREE1 blocks later
 				 */
 				if (magic == XFS_ABTC_MAGIC) {
-					set_bmap_ext(agno, b, blen,
-						     XR_E_FREE);
+					set_agbno_state(mp, agno, b,
+							XR_E_FREE);
 					break;
 				}
 			default:
 				do_warn(
-	_("block (%d,%d-%d) multiply claimed by %s space tree, state - %d\n"),
-				agno, b, b + blen - 1,
-				name, state);
+	_("block (%d,%d) multiply claimed by %s space tree, state - %d\n"),
+					agno, b, name, state);
 				break;
 			}
 		}
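The FREE1/FREE dance in scanfunc_allocbt() exists because every free extent
must appear in both the by-block (bno) and by-size (cnt) freespace btrees:
the first sighting marks a block FREE1, only the matching sighting from the
cnt btree promotes it to FREE, and anything else is a multiple claim. A toy
model of that state machine (the array and record values below are made up;
the real code drives this from btree records):

	#include <stdio.h>

	enum state { E_UNKNOWN, E_FREE1, E_FREE };

	static enum state bmap[16];	/* stand-in for the AG block map */

	static void
	scan_free_record(int start, int len, int is_cnt_tree)
	{
		int	b;

		for (b = start; b < start + len; b++) {
			switch (bmap[b]) {
			case E_UNKNOWN:		/* first claim: bno btree */
				bmap[b] = E_FREE1;
				break;
			case E_FREE1:		/* matching claim: cnt btree */
				if (is_cnt_tree) {
					bmap[b] = E_FREE;
					break;
				}
				/* FALLTHROUGH */
			default:
				printf("block %d multiply claimed\n", b);
				break;
			}
		}
	}

	int
	main(void)
	{
		scan_free_record(2, 4, 0);	/* bno record covering 2..5 */
		scan_free_record(2, 4, 1);	/* matching cnt record */
		scan_free_record(5, 1, 0);	/* overlapping claim: reported */
		return 0;
	}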
@@ -703,14 +700,13 @@ _("bad ending inode # (%llu (0x%x 0x%x)) in ino rec, skipping rec\n"),
 				j < XFS_INODES_PER_CHUNK;
 				j += mp->m_sb.sb_inopblock) {
 			agbno = XFS_AGINO_TO_AGBNO(mp, ino + j);
-
-			state = get_bmap(agno, agbno);
+			state = get_agbno_state(mp, agno, agbno);
 			if (state == XR_E_UNKNOWN) {
-				set_bmap(agno, agbno, XR_E_INO);
+				set_agbno_state(mp, agno, agbno, XR_E_INO);
 			} else if (state == XR_E_INUSE_FS && agno == 0 &&
 					ino + j >= first_prealloc_ino &&
 					ino + j < last_prealloc_ino) {
-				set_bmap(agno, agbno, XR_E_INO);
+				set_agbno_state(mp, agno, agbno, XR_E_INO);
 			} else {
 				do_warn(
 	_("inode chunk claims used block, inobt block - agno %d, bno %d, inopb %d\n"),
 					agno, bno, mp->m_sb.sb_inopblock);
@@ -847,15 +843,16 @@ scanfunc_ino(
 	 * check for btree blocks multiply claimed, any unknown/free state
 	 * is ok in the bitmap block.
 	 */
-	state = get_bmap(agno, bno);
+	state = get_agbno_state(mp, agno, bno);
+
 	switch (state) {
 	case XR_E_UNKNOWN:
 	case XR_E_FREE1:
 	case XR_E_FREE:
-		set_bmap(agno, bno, XR_E_FS_MAP);
+		set_agbno_state(mp, agno, bno, XR_E_FS_MAP);
 		break;
 	default:
-		set_bmap(agno, bno, XR_E_MULT);
+		set_agbno_state(mp, agno, bno, XR_E_MULT);
 		do_warn(
 _("inode btree block claimed (state %d), agno %d, bno %d, suspect %d\n"),
 			state, agno, bno, suspect);
@@ -957,7 +954,7 @@ scan_freelist(
 	if (XFS_SB_BLOCK(mp) != XFS_AGFL_BLOCK(mp) &&
 	    XFS_AGF_BLOCK(mp) != XFS_AGFL_BLOCK(mp) &&
 	    XFS_AGI_BLOCK(mp) != XFS_AGFL_BLOCK(mp))
-		set_bmap(agno, XFS_AGFL_BLOCK(mp), XR_E_FS_MAP);
+		set_agbno_state(mp, agno, XFS_AGFL_BLOCK(mp), XR_E_FS_MAP);
 
 	if (be32_to_cpu(agf->agf_flcount) == 0)
 		return;
@@ -975,7 +972,7 @@ scan_freelist(
 	for (;;) {
 		bno = be32_to_cpu(agfl->agfl_bno[i]);
 		if (verify_agbno(mp, agno, bno))
-			set_bmap(agno, bno, XR_E_FREE);
+			set_agbno_state(mp, agno, bno, XR_E_FREE);
 		else
 			do_warn(_("bad agbno %u in agfl, agno %d\n"),
 				bno, agno);
diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c
index e36eeae9a..8bf20bbfd 100644
--- a/repair/xfs_repair.c
+++ b/repair/xfs_repair.c
@@ -39,6 +39,7 @@ extern void phase4(xfs_mount_t *);
 extern void phase5(xfs_mount_t *);
 extern void phase6(xfs_mount_t *);
 extern void phase7(xfs_mount_t *);
+extern void incore_init(xfs_mount_t *);
 
 #define XR_MAX_SECT_SIZE	(64 * 1024)
@@ -535,6 +536,11 @@ main(int argc, char **argv)
 	bindtextdomain(PACKAGE, LOCALEDIR);
 	textdomain(PACKAGE);
 
+#ifdef XR_PF_TRACE
+	pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
+	setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
+#endif
+
 	temp_mp = &xfs_m;
 	setbuf(stdout, NULL);
@@ -687,14 +693,9 @@ main(int argc, char **argv)
 	calc_mkfs(mp);
 
 	/*
-	 * initialize block alloc map
+	 * check sb filesystem stats and initialize in-core data structures
 	 */
-	init_bmaps(mp);
-	incore_ino_init(mp);
-	incore_ext_init(mp);
-
-	/* initialize random globals now that we know the fs geometry */
-	inodes_per_block = mp->m_sb.sb_inopblock;
+	incore_init(mp);
 
 	if (parse_sb_version(&mp->m_sb)) {
 		do_warn(
@@ -722,11 +723,6 @@ main(int argc, char **argv)
 	}
 	timestamp(PHASE_END, 5, NULL);
 
-	/*
-	 * Done with the block usage maps, toss them...
-	 */
-	free_bmaps(mp);
-
 	if (!bad_ino_btree) {
 		phase6(mp);
 		timestamp(PHASE_END, 6, NULL);
@@ -848,7 +844,8 @@ _("Note - stripe unit (%d) and width (%d) fields have been reset.\n"
 	if (verbose)
 		summary_report();
 	do_log(_("done\n"));
-	pftrace_done();
-
+#ifdef XR_PF_TRACE
+	fclose(pf_trace_file);
+#endif
 	return (0);
 }
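The xfs_repair.c hunks restore compile-time gating of prefetch tracing:
built with -DXR_PF_TRACE the trace file is opened line-buffered at startup
and closed before exit, and without the flag none of the trace code is
compiled in. A reduced sketch of that pattern (the pftrace() wrapper here
is illustrative; only pf_trace_file and the /tmp path appear in the patch):

	#include <stdio.h>
	#include <stdarg.h>

	#ifdef XR_PF_TRACE
	static FILE *pf_trace_file;

	static void
	pftrace(const char *fmt, ...)
	{
		va_list	ap;

		va_start(ap, fmt);
		vfprintf(pf_trace_file, fmt, ap);
		va_end(ap);
	}
	#else
	#define pftrace(...)	((void)0)	/* compiles away entirely */
	#endif

	int
	main(void)
	{
	#ifdef XR_PF_TRACE
		pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
		setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
	#endif
		pftrace("prefetch trace enabled\n");
	#ifdef XR_PF_TRACE
		fclose(pf_trace_file);
	#endif
		return 0;
	}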