From: Tom Yu Date: Mon, 8 Aug 2016 13:15:42 +0000 (-0400) Subject: Refactor btree recursive traversal code X-Git-Tag: krb5-1.15-beta1~88 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=590af488351316b6f4ddb70ce67a2730b4fcc40d;p=thirdparty%2Fkrb5.git Refactor btree recursive traversal code Previous releases had a nonstandard entry point (bt_rseq) into libdb2 to perform recursive traversal of a btree database that might be corrupt so that an operator could attempt data recovery. This entry point became inaccessible to user commands after krb5-1.5 due to integration of the DAL. Refactor the recursive traversal code into the existing btree sequential traversal code, accepting new movement flags R_RNEXT (recursively advance to next item) and R_RPREV. Add commands to the libdb2 btree test program bttest to exercise this functionality. Fix up the existing "rlist" command of bttest to use the updated interface. ticket: 8476 (new) subject: Restore recursive dump functionality --- diff --git a/src/plugins/kdb/db2/libdb2/btree/bt_seq.c b/src/plugins/kdb/db2/libdb2/btree/bt_seq.c index b39d89edd1..2c8c2de96c 100644 --- a/src/plugins/kdb/db2/libdb2/btree/bt_seq.c +++ b/src/plugins/kdb/db2/libdb2/btree/bt_seq.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2002 by the Massachusetts Institute of Technology. + * Copyright (C) 2002, 2016 by the Massachusetts Institute of Technology. * All rights reserved. * * Export of this software from the United States of America may @@ -77,6 +77,9 @@ static int __bt_first __P((BTREE *, const DBT *, EPG *, int *)); static int __bt_seqadv __P((BTREE *, EPG *, int)); static int __bt_seqset __P((BTREE *, EPG *, DBT *, int)); +static int bt_rseq_next(BTREE *, EPG *); +static int bt_rseq_prev(BTREE *, EPG *); + /* * Sequential scan support. * @@ -124,6 +127,8 @@ __bt_seq(dbp, key, data, flags) switch (flags) { case R_NEXT: case R_PREV: + case R_RNEXT: + case R_RPREV: if (F_ISSET(&t->bt_cursor, CURS_INIT)) { status = __bt_seqadv(t, &e, flags); break; @@ -202,6 +207,8 @@ __bt_seqset(t, ep, key, flags) return (__bt_first(t, key, ep, &exact)); case R_FIRST: /* First record. */ case R_NEXT: + case R_RNEXT: + BT_CLR(t); /* Walk down the left-hand side of the tree. */ for (pg = P_ROOT;;) { if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) @@ -216,6 +223,7 @@ __bt_seqset(t, ep, key, flags) if (h->flags & (P_BLEAF | P_RLEAF)) break; pg = GETBINTERNAL(h, 0)->pgno; + BT_PUSH(t, h->pgno, 0); mpool_put(t->bt_mp, h, 0); } ep->page = h; @@ -223,6 +231,8 @@ __bt_seqset(t, ep, key, flags) break; case R_LAST: /* Last record. */ case R_PREV: + case R_RPREV: + BT_CLR(t); /* Walk down the right-hand side of the tree. */ for (pg = P_ROOT;;) { if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) @@ -237,6 +247,7 @@ __bt_seqset(t, ep, key, flags) if (h->flags & (P_BLEAF | P_RLEAF)) break; pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; + BT_PUSH(t, h->pgno, NEXTINDEX(h) - 1); mpool_put(t->bt_mp, h, 0); } @@ -330,6 +341,7 @@ __bt_seqadv(t, ep, flags) */ switch (flags) { case R_NEXT: /* Next record. */ + case R_RNEXT: /* * The cursor was deleted in duplicate records, and moved * forward to a record that has yet to be returned. Clear @@ -339,6 +351,11 @@ __bt_seqadv(t, ep, flags) goto usecurrent; idx = c->pg.index; if (++idx == NEXTINDEX(h)) { + if (flags == R_RNEXT) { + ep->page = h; + ep->index = idx; + return (bt_rseq_next(t, ep)); + } pg = h->nextpg; mpool_put(t->bt_mp, h, 0); if (pg == P_INVALID) @@ -349,6 +366,7 @@ __bt_seqadv(t, ep, flags) } break; case R_PREV: /* Previous record. */ + case R_RPREV: /* * The cursor was deleted in duplicate records, and moved * backward to a record that has yet to be returned. Clear @@ -362,6 +380,11 @@ usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE); } idx = c->pg.index; if (idx == 0) { + if (flags == R_RPREV) { + ep->page = h; + ep->index = idx; + return (bt_rseq_prev(t, ep)); + } pg = h->prevpg; mpool_put(t->bt_mp, h, 0); if (pg == P_INVALID) @@ -379,6 +402,84 @@ usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE); return (RET_SUCCESS); } +/* + * Get the first item on the next page, but by going up and down the tree. + */ +static int +bt_rseq_next(BTREE *t, EPG *ep) +{ + PAGE *h; + indx_t idx; + EPGNO *up; + db_pgno_t pg; + + h = ep->page; + idx = ep->index; + do { + /* Move up the tree. */ + up = BT_POP(t); + mpool_put(t->bt_mp, h, 0); + /* Did we hit the right edge of the root? */ + if (up == NULL) + return (RET_SPECIAL); + if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL) + return (RET_ERROR); + idx = up->index; + } while (++idx == NEXTINDEX(h)); + + while (!(h->flags & (P_BLEAF | P_RLEAF))) { + /* Move back down the tree. */ + BT_PUSH(t, h->pgno, idx); + pg = GETBINTERNAL(h, idx)->pgno; + mpool_put(t->bt_mp, h, 0); + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + idx = 0; + } + ep->page = h; + ep->index = idx; + return (RET_SUCCESS); +} + +/* + * Get the last item on the previous page, but by going up and down the tree. + */ +static int +bt_rseq_prev(BTREE *t, EPG *ep) +{ + PAGE *h; + indx_t idx; + EPGNO *up; + db_pgno_t pg; + + h = ep->page; + idx = ep->index; + do { + /* Move up the tree. */ + up = BT_POP(t); + mpool_put(t->bt_mp, h, 0); + /* Did we hit the left edge of the root? */ + if (up == NULL) + return (RET_SPECIAL); + if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL) + return (RET_ERROR); + idx = up->index; + } while (idx == 0); + --idx; + while (!(h->flags & (P_BLEAF | P_RLEAF))) { + /* Move back down the tree. */ + BT_PUSH(t, h->pgno, idx); + pg = GETBINTERNAL(h, idx)->pgno; + mpool_put(t->bt_mp, h, 0); + if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) + return (RET_ERROR); + idx = NEXTINDEX(h) - 1; + } + ep->page = h; + ep->index = idx; + return (RET_SUCCESS); +} + /* * __bt_first -- * Find the first entry. @@ -513,389 +614,3 @@ __bt_setcur(t, pgno, idx) t->bt_cursor.pg.index = idx; F_SET(&t->bt_cursor, CURS_INIT); } - -/* Recursive descent cursor. */ -typedef struct rcursor_ { - CURSOR cursor; - size_t ssize; - EPGNO *stack; - EPGNO *sp; -} RCURSOR; -#define RCURSOR_MINSS 64 - -static int bt_rcinit(void **); -static void bt_rcdestroy(void **); -static int bt_rcpush(RCURSOR *, db_pgno_t, u_int); -static EPGNO *bt_rcpop(RCURSOR *); -static void bt_rcclr(RCURSOR *); -static int bt_rcgrowstk(RCURSOR *); -static int bt_rseqset(BTREE *, EPG *, DBT *, RCURSOR *, int); -static int bt_rseqadv(BTREE *, EPG *, RCURSOR *, int); - -static int -bt_rcinit(curs) - void **curs; -{ - RCURSOR *rc; - - rc = *curs = malloc(sizeof(RCURSOR)); - if (rc == NULL) { - errno = ENOMEM; - return RET_ERROR; - } - memset(rc, 0, sizeof(*rc)); - - rc->ssize = RCURSOR_MINSS; - rc->stack = malloc(rc->ssize * sizeof(EPGNO)); - if (rc->stack == NULL) { - free(rc); - errno = ENOMEM; - return RET_ERROR; - } - bt_rcclr(rc); - return RET_SUCCESS; -} - -static void -bt_rcdestroy(curs) - void **curs; -{ - RCURSOR *rc; - - rc = *curs; - free(rc->stack); - free(rc); - *curs = NULL; -} - -static int -bt_rcpush(rc, p, i) - RCURSOR *rc; - db_pgno_t p; - u_int i; -{ - int status; - - rc->sp->pgno = p; - rc->sp->index = i; - if (++rc->sp > rc->stack + rc->ssize) { - status = bt_rcgrowstk(rc); - if (status != RET_SUCCESS) - return status; - } - return RET_SUCCESS; -} - -static EPGNO * -bt_rcpop(rc) - RCURSOR *rc; -{ - return (rc->sp == rc->stack) ? NULL : --rc->sp; -} - -static void -bt_rcclr(rc) - RCURSOR *rc; -{ - rc->sp = rc->stack; -} - -static int -bt_rcgrowstk(rc) - RCURSOR *rc; -{ - size_t osize; - EPGNO *e; - - osize = rc->ssize; - rc->ssize *= 2; - e = realloc(rc->stack, rc->ssize * sizeof(EPGNO)); - if (e == NULL) { - rc->ssize = osize; - errno = ENOMEM; - return RET_ERROR; - } - rc->stack = e; - return RET_SUCCESS; -} - -/* - * bt_rseq -- - * Like __bt_seq but does recursive descent tree traversal - * instead of using the prev/next pointers. - */ -int -bt_rseq(dbp, key, data, curs, flags) - const DB *dbp; - DBT *key, *data; - void **curs; - u_int flags; -{ - RCURSOR *rc; - BTREE *t; - EPG e; - int status; - - t = dbp->internal; - - /* Toss any page pinned across calls. */ - if (t->bt_pinned != NULL) { - mpool_put(t->bt_mp, t->bt_pinned, 0); - t->bt_pinned = NULL; - } - - if (curs == NULL) { - errno = EINVAL; - return RET_ERROR; - } - if (*curs == NULL) { - status = bt_rcinit(curs); - if (status != RET_SUCCESS) - return RET_ERROR; - } - rc = *curs; - - /* - * If scan unitialized as yet, or starting at a specific record, set - * the scan to a specific key. Both bt_rseqset and bt_rseqadv pin - * the page the cursor references if they're successful. - */ - switch (flags) { - case R_NEXT: - case R_PREV: - if (F_ISSET(&rc->cursor, CURS_INIT)) { - status = bt_rseqadv(t, &e, rc, flags); - break; - } - /* FALLTHROUGH */ - case R_FIRST: - case R_LAST: - case R_CURSOR: - status = bt_rseqset(t, &e, key, rc, flags); - break; - default: - errno = EINVAL; - return (RET_ERROR); - } - - if (status == RET_SUCCESS) { - status = - __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0); - - /* - * If the user is doing concurrent access, we copied the - * key/data, toss the page. - */ - if (F_ISSET(t, B_DB_LOCK)) - mpool_put(t->bt_mp, e.page, 0); - else - t->bt_pinned = e.page; - } else if (status == RET_SPECIAL) - bt_rcdestroy(curs); - return (status); -} - -/* - * bt_rseqset -- - * Set the sequential scan to a specific key. - * - * Parameters: - * t: tree - * ep: storage for returned key - * key: key for initial scan position - * rc: recursion cursor - * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV - * - * Side effects: - * Pins the page the cursor references. - * Updates rc's stack and cursor. - * - * Returns: - * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. - */ -static int -bt_rseqset(t, ep, key, rc, flags) - BTREE *t; - EPG *ep; - DBT *key; - RCURSOR *rc; - int flags; -{ - PAGE *h; - db_pgno_t pg; - int status; - - /* - * Find the first, last or specific key in the tree and point the - * cursor at it. The cursor may not be moved until a new key has - * been found. - */ - switch (flags) { - case R_CURSOR: /* Not implemented. */ - errno = EINVAL; - return RET_ERROR; - case R_FIRST: /* First record. */ - case R_NEXT: - bt_rcclr(rc); - /* Walk down the left-hand side of the tree. */ - for (pg = P_ROOT;;) { - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - - /* Check for an empty tree. */ - if (NEXTINDEX(h) == 0) { - mpool_put(t->bt_mp, h, 0); - return (RET_SPECIAL); - } - - if (h->flags & (P_BLEAF | P_RLEAF)) - break; - pg = GETBINTERNAL(h, 0)->pgno; - status = bt_rcpush(rc, h->pgno, 0); - mpool_put(t->bt_mp, h, 0); - if (status != RET_SUCCESS) - return status; - } - ep->page = h; - ep->index = 0; - break; - case R_LAST: /* Last record. */ - case R_PREV: - bt_rcclr(rc); - /* Walk down the right-hand side of the tree. */ - for (pg = P_ROOT;;) { - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - - /* Check for an empty tree. */ - if (NEXTINDEX(h) == 0) { - mpool_put(t->bt_mp, h, 0); - return (RET_SPECIAL); - } - - if (h->flags & (P_BLEAF | P_RLEAF)) - break; - pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; - status = bt_rcpush(rc, h->pgno, NEXTINDEX(h) - 1); - mpool_put(t->bt_mp, h, 0); - if (status != RET_SUCCESS) - return status; - } - ep->page = h; - ep->index = NEXTINDEX(h) - 1; - break; - } - rc->cursor.pg.pgno = ep->page->pgno; - rc->cursor.pg.index = ep->index; - F_CLR(&rc->cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); - F_SET(&rc->cursor, CURS_INIT); - return (RET_SUCCESS); -} - -/* - * bt_rseqadvance -- - * Advance the sequential scan. - * - * Parameters: - * t: tree - * ep: return page - * rc: recursion cursor - * flags: R_NEXT, R_PREV - * - * Side effects: - * Pins the page the new key/data record is on. - * Updates rc's stack and cursor. - * - * Returns: - * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. - */ -static int -bt_rseqadv(t, ep, rc, flags) - BTREE *t; - EPG *ep; - RCURSOR *rc; - int flags; -{ - CURSOR *c; - PAGE *h; - indx_t idx = 0; - db_pgno_t pg; - int status; - EPGNO *e; - - /* - * There are a couple of states that we can be in. The cursor has - * been initialized by the time we get here, but that's all we know. - */ - c = &rc->cursor; - - /* Get the page referenced by the cursor. */ - if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) - return (RET_ERROR); - - /* - * Find the next/previous record in the tree and point the cursor at - * it. The cursor may not be moved until a new key has been found. - */ - switch (flags) { - case R_NEXT: /* Next record. */ - idx = c->pg.index; - while (++idx == NEXTINDEX(h)) { - /* Crawl up if we hit the right edge. */ - e = bt_rcpop(rc); - mpool_put(t->bt_mp, h, 0); - if (e == NULL) /* Hit the right edge of root. */ - return RET_SPECIAL; - idx = e->index; - pg = e->pgno; - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - } - while (!(h->flags & (P_BLEAF | P_RLEAF))) { - /* Crawl down the left until we hit a leaf. */ - status = bt_rcpush(rc, h->pgno, idx); - pg = GETBINTERNAL(h, idx)->pgno; - mpool_put(t->bt_mp, h, 0); - if (status != RET_SUCCESS) - return status; - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - idx = 0; - } - break; - case R_PREV: /* Previous record. */ - idx = c->pg.index; - while (!idx) { - /* Crawl up if we hit the left edge. */ - e = bt_rcpop(rc); - mpool_put(t->bt_mp, h, 0); - if (e == NULL) /* Hit the left edge of root. */ - return RET_SPECIAL; - idx = e->index; - pg = e->pgno; - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - } - idx--; - while (!(h->flags & (P_BLEAF | P_RLEAF))) { - /* Crawl down the right until we hit a leaf. */ - status = bt_rcpush(rc, h->pgno, idx); - pg = GETBINTERNAL(h, idx)->pgno; - mpool_put(t->bt_mp, h, 0); - if (status != RET_SUCCESS) - return status; - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - idx = NEXTINDEX(h) - 1; - } - break; - } - - ep->page = h; - ep->index = idx; - c->pg.pgno = h->pgno; - c->pg.index = idx; - F_CLR(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); - F_SET(c, CURS_INIT); - return (RET_SUCCESS); -} diff --git a/src/plugins/kdb/db2/libdb2/btree/extern.h b/src/plugins/kdb/db2/libdb2/btree/extern.h index 3aa88417e3..c29b084644 100644 --- a/src/plugins/kdb/db2/libdb2/btree/extern.h +++ b/src/plugins/kdb/db2/libdb2/btree/extern.h @@ -63,15 +63,6 @@ #define __bt_dump __kdb2_bt_dump #define __bt_stat __kdb2_bt_stat -#define bt_rcinit kdb2_bt_rcinit -#define bt_rcdestroy kdb2_bt_rcdestroy -#define bt_rcpush kdb2_bt_rcpush -#define bt_rcpop kdb2_bt_rcpop -#define bt_rcclr kdb2_bt_rcclr -#define bt_rcgrowstk kdb2_bt_rcgrowstk -#define bt_rseqset kdb2_bt_rseqset -#define bt_rseqadv kdb2_bt_rseqadv - int __bt_close __P((DB *)); int __bt_cmp __P((BTREE *, const DBT *, EPG *)); int __bt_crsrdel __P((BTREE *, EPGNO *)); diff --git a/src/plugins/kdb/db2/libdb2/include/db.hin b/src/plugins/kdb/db2/libdb2/include/db.hin index dca0b00677..9ae8f8be5c 100644 --- a/src/plugins/kdb/db2/libdb2/include/db.hin +++ b/src/plugins/kdb/db2/libdb2/include/db.hin @@ -63,6 +63,23 @@ typedef struct { #define R_SETCURSOR 10 /* put (RECNO) */ #define R_RECNOSYNC 11 /* sync (RECNO) */ +/* + * Recursive sequential scan. + * + * This avoids using sibling pointers, permitting (possibly partial) + * recovery from some kinds of btree corruption. Start a sequential + * scan as usual, but use R_RNEXT or R_RPREV to move forward or + * backward. + * + * This probably doesn't work with btrees that allow duplicate keys. + * Database modifications during the scan can also modify the parent + * page stack needed for correct functioning. Intermixing + * non-recursive traversal by using R_NEXT or R_PREV can also make the + * page stack inconsistent with the cursor and cause problems. + */ +#define R_RNEXT 128 /* seq (BTREE, RECNO) */ +#define R_RPREV 129 /* seq (BTREE, RECNO) */ + typedef enum { DB_BTREE, DB_HASH, DB_RECNO } DBTYPE; /* @@ -166,10 +183,8 @@ typedef struct { #endif #define dbopen kdb2_dbopen -#define bt_rseq kdb2_bt_rseq /* XXX kludge */ __BEGIN_DECLS DB *dbopen __P((const char *, int, int, DBTYPE, const void *)); -int bt_rseq(const DB*, DBT *, DBT *, void **, u_int); /* XXX kludge */ __END_DECLS #endif /* !_DB_H_ */ diff --git a/src/plugins/kdb/db2/libdb2/libdb.exports b/src/plugins/kdb/db2/libdb2/libdb.exports index 4d3494704a..7828da1146 100644 --- a/src/plugins/kdb/db2/libdb2/libdb.exports +++ b/src/plugins/kdb/db2/libdb2/libdb.exports @@ -73,7 +73,6 @@ __kdb2_rec_sync __kdb2_rec_vmap __kdb2_rec_vpipe __kdb2_split_page -kdb2_bt_rseq kdb2_dbm_clearerr kdb2_dbm_close kdb2_dbm_delete diff --git a/src/plugins/kdb/db2/libdb2/test/btree.tests/main.c b/src/plugins/kdb/db2/libdb2/test/btree.tests/main.c index 5b9890b8a2..33de7a2ff8 100644 --- a/src/plugins/kdb/db2/libdb2/test/btree.tests/main.c +++ b/src/plugins/kdb/db2/libdb2/test/btree.tests/main.c @@ -94,6 +94,8 @@ void previous __P((DB *, char **)); void show __P((DB *, char **)); #endif void rlist __P((DB *, char **)); +void rnext __P((DB *, char **)); +void rprev __P((DB *, char **)); void usage __P((void)); void user __P((DB *)); @@ -131,6 +133,8 @@ cmd_table commands[] = { "p", 0, 0, previous, "previous", "move cursor back one record", "q", 0, 0, NULL, "quit", "quit", "rli", 1, 1, rlist, "rlist file", "list to a file (recursive)", + "rn", 0, 0, rnext, "rnext", "move cursor forward one record (recursive)", + "rp", 0, 0, rprev, "rprev", "move cursor back one record (recursive)", #ifdef DEBUG "sh", 1, 0, show, "show page", "dump a page", #endif @@ -652,17 +656,15 @@ rlist(db, argv) DBT data, key; FILE *fp; int status; - void *cookie; - cookie = NULL; if ((fp = fopen(argv[1], "w")) == NULL) { (void)fprintf(stderr, "%s: %s\n", argv[1], strerror(errno)); return; } - status = bt_rseq(db, &key, &data, &cookie, R_FIRST); + status = (*db->seq)(db, &key, &data, R_FIRST); while (status == RET_SUCCESS) { (void)fprintf(fp, "%.*s\n", (int)key.size, key.data); - status = bt_rseq(db, &key, &data, &cookie, R_NEXT); + status = (*db->seq)(db, &key, &data, R_RNEXT); } (void)fclose(fp); if (status == RET_ERROR) @@ -773,6 +775,52 @@ previous(db, argv) } } +void +rnext(db, argv) + DB *db; + char **argv; +{ + DBT data, key; + int status; + + status = (*db->seq)(db, &key, &data, R_RNEXT); + + switch (status) { + case RET_ERROR: + perror("rnext/seq"); + break; + case RET_SPECIAL: + (void)printf("no more keys\n"); + break; + case RET_SUCCESS: + keydata(&key, &data); + break; + } +} + +void +rprev(db, argv) + DB *db; + char **argv; +{ + DBT data, key; + int status; + + status = (*db->seq)(db, &key, &data, R_RPREV); + + switch (status) { + case RET_ERROR: + perror("rprev/seq"); + break; + case RET_SPECIAL: + (void)printf("no more keys\n"); + break; + case RET_SUCCESS: + keydata(&key, &data); + break; + } +} + #ifdef DEBUG void show(db, argv)