From: Howard Chu Date: Mon, 15 Aug 2011 22:40:08 +0000 (-0700) Subject: Checkpoint X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ebe954fbdb07eaa1410e084b93817c0b3db9fef2;p=thirdparty%2Fopenldap.git Checkpoint --- diff --git a/servers/slapd/back-mdb/id2entry.c b/servers/slapd/back-mdb/id2entry.c index 951cb0b639..f7d026b342 100644 --- a/servers/slapd/back-mdb/id2entry.c +++ b/servers/slapd/back-mdb/id2entry.c @@ -23,18 +23,16 @@ #include "back-mdb.h" static int mdb_id2entry_put( - BackendDB *be, - DB_TXN *tid, + Operation *op, + MDB_txn *tid, Entry *e, int flag ) { - struct mdb_info *mdb = (struct mdb_info *) be->be_private; - DB *db = mdb->bi_id2entry->bdi_db; - DBT key, data; + struct mdb_info *mdb = (struct mdb_info *) op->o_bd->be_private; + MDB_dbi dbi = mdb->mi_id2entry->mdi_dbi; + MDB_val key, data; struct berval bv; int rc; - ID nid; -#ifdef MDB_HIER struct berval odn, ondn; /* We only store rdns, and they go in the dn2id database. */ @@ -43,28 +41,22 @@ static int mdb_id2entry_put( e->e_name = slap_empty_bv; e->e_nname = slap_empty_bv; -#endif - DBTzero( &key ); - /* Store ID in BigEndian format */ - key.data = &nid; - key.size = sizeof(ID); - MDB_ID2DISK( e->e_id, &nid ); + key.mv_data = &e->e_id; + key.mv_size = sizeof(ID); - rc = entry_encode( e, &bv ); -#ifdef MDB_HIER + rc = mdb_entry_encode( op, e, &bv ); e->e_name = odn; e->e_nname = ondn; -#endif if( rc != LDAP_SUCCESS ) { return -1; } - DBTzero( &data ); - bv2DBT( &bv, &data ); + data.mv_size = bv.bv_len; + data.mv_data = bv.bv_val; - rc = db->put( db, tid, &key, &data, flag ); + rc = mdb_put( tid, dbi, &key, &data, flag ); - free( bv.bv_val ); + op->o_tmpfree( op->o_tmpmemctx, bv.bv_val ); return rc; } @@ -75,129 +67,67 @@ static int mdb_id2entry_put( int mdb_id2entry_add( - BackendDB *be, - DB_TXN *tid, + Operation *op, + MDB_txn *tid, Entry *e ) { - return mdb_id2entry_put(be, tid, e, DB_NOOVERWRITE); + return mdb_id2entry_put(op, tid, e, MDB_NOOVERWRITE); } int 
mdb_id2entry_update( - BackendDB *be, - DB_TXN *tid, + Operation *op, + MDB_txn *tid, Entry *e ) { - return mdb_id2entry_put(be, tid, e, 0); + return mdb_id2entry_put(op, tid, e, 0); } int mdb_id2entry( - BackendDB *be, - DB_TXN *tid, + Operation *op, + MDB_txn *tid, ID id, Entry **e ) { - struct mdb_info *mdb = (struct mdb_info *) be->be_private; - DB *db = mdb->bi_id2entry->bdi_db; - DBT key, data; - DBC *cursor; + struct mdb_info *mdb = (struct mdb_info *) op->o_bd->be_private; + MDB_dbi dbi = mdb->mi_id2entry->mdi_dbi; + MDB_val key, data; EntryHeader eh; char buf[16]; int rc = 0, off; - ID nid; *e = NULL; - DBTzero( &key ); - key.data = &nid; - key.size = sizeof(ID); - MDB_ID2DISK( id, &nid ); - - DBTzero( &data ); - data.flags = DB_DBT_USERMEM | DB_DBT_PARTIAL; + key.mv_data = &id; + key.mv_size = sizeof(ID); /* fetch it */ - rc = db->cursor( db, tid, &cursor, mdb->bi_db_opflags ); + rc = mdb_get( tid, dbi, &key, &data ); if ( rc ) return rc; - /* Get the nattrs / nvals counts first */ - data.ulen = data.dlen = sizeof(buf); - data.data = buf; - rc = cursor->c_get( cursor, &key, &data, DB_SET ); - if ( rc ) goto finish; - - - eh.bv.bv_val = buf; - eh.bv.bv_len = data.size; - rc = entry_header( &eh ); - if ( rc ) goto finish; - - /* Get the size */ - data.flags ^= DB_DBT_PARTIAL; - data.ulen = 0; - rc = cursor->c_get( cursor, &key, &data, DB_CURRENT ); - if ( rc != DB_BUFFER_SMALL ) goto finish; - - /* Allocate a block and retrieve the data */ - off = eh.data - eh.bv.bv_val; - eh.bv.bv_len = eh.nvals * sizeof( struct berval ) + data.size; - eh.bv.bv_val = ch_malloc( eh.bv.bv_len ); - eh.data = eh.bv.bv_val + eh.nvals * sizeof( struct berval ); - data.data = eh.data; - data.ulen = data.size; - - /* skip past already parsed nattr/nvals */ - eh.data += off; - - rc = cursor->c_get( cursor, &key, &data, DB_CURRENT ); - -finish: - cursor->c_close( cursor ); - - if( rc != 0 ) { - return rc; - } - -#ifdef SLAP_ZONE_ALLOC - rc = entry_decode(&eh, e, 
mdb->bi_cache.c_zctx); -#else - rc = entry_decode(&eh, e); -#endif + rc = mdb_entry_decode(&eh, e); if( rc == 0 ) { (*e)->e_id = id; - } else { - /* only free on error. On success, the entry was - * decoded in place. - */ -#ifndef SLAP_ZONE_ALLOC - ch_free(eh.bv.bv_val); -#endif } -#ifdef SLAP_ZONE_ALLOC - ch_free(eh.bv.bv_val); -#endif return rc; } int mdb_id2entry_delete( BackendDB *be, - DB_TXN *tid, + MDB_txn *tid, Entry *e ) { struct mdb_info *mdb = (struct mdb_info *) be->be_private; - DB *db = mdb->bi_id2entry->bdi_db; - DBT key; + MDB_dbi dbi = mdb->mi_id2entry->mdi_dbi; + MDB_val key; int rc; - ID nid; - DBTzero( &key ); - key.data = &nid; - key.size = sizeof(ID); - MDB_ID2DISK( e->e_id, &nid ); + key.mv_data = &e->e_id; + key.mv_size = sizeof(ID); /* delete from database */ - rc = db->del( db, tid, &key, 0 ); + rc = mdb_del( tid, dbi, &key, NULL, 0 ); return rc; } @@ -236,7 +166,7 @@ int mdb_entry_release( int rw ) { struct mdb_info *mdb = (struct mdb_info *) op->o_bd->be_private; - struct mdb_op_info *boi; + struct mdb_op_info *moi; OpExtra *oex; /* slapMode : SLAP_SERVER_MODE, SLAP_TOOL_MODE, @@ -245,24 +175,20 @@ int mdb_entry_release( if ( slapMode == SLAP_SERVER_MODE ) { /* If not in our cache, just free it */ if ( !e->e_private ) { -#ifdef SLAP_ZONE_ALLOC - return mdb_entry_return( mdb, e, -1 ); -#else return mdb_entry_return( e ); -#endif } /* free entry and reader or writer lock */ LDAP_SLIST_FOREACH( oex, &op->o_extra, oe_next ) { if ( oex->oe_key == mdb ) break; } - boi = (struct mdb_op_info *)oex; + moi = (struct mdb_op_info *)oex; /* lock is freed with txn */ - if ( !boi || boi->boi_txn ) { + if ( !moi || moi->moi_txn ) { mdb_unlocked_cache_return_entry_rw( mdb, e, rw ); } else { struct mdb_lock_info *bli, *prev; - for ( prev=(struct mdb_lock_info *)&boi->boi_locks, + for ( prev=(struct mdb_lock_info *)&moi->boi_locks, bli = boi->boi_locks; bli; prev=bli, bli=bli->bli_next ) { if ( bli->bli_id == e->e_id ) { mdb_cache_return_entry_rw( mdb, e, 
rw, &bli->bli_lock ); @@ -282,22 +208,10 @@ int mdb_entry_release( } } } else { -#ifdef SLAP_ZONE_ALLOC - int zseq = -1; - if (e->e_private != NULL) { - BEI(e)->bei_e = NULL; - zseq = BEI(e)->bei_zseq; - } -#else if (e->e_private != NULL) BEI(e)->bei_e = NULL; -#endif e->e_private = NULL; -#ifdef SLAP_ZONE_ALLOC - mdb_entry_return ( mdb, e, zseq ); -#else mdb_entry_return ( e ); -#endif } return 0; @@ -353,18 +267,9 @@ dn2entry_retry: /* can we find entry */ rc = mdb_dn2entry( op, txn, ndn, &ei, 0, &lock ); switch( rc ) { - case DB_NOTFOUND: + case MDB_NOTFOUND: case 0: break; - case DB_LOCK_DEADLOCK: - case DB_LOCK_NOTGRANTED: - /* the txn must abort and retry */ - if ( txn ) { - if ( boi ) boi->boi_err = rc; - return LDAP_BUSY; - } - ldap_pvt_thread_yield(); - goto dn2entry_retry; default: if ( boi ) boi->boi_err = rc; return (rc != LDAP_BUSY) ? LDAP_OTHER : LDAP_BUSY; diff --git a/servers/slapd/back-mdb/mdb.c b/servers/slapd/back-mdb/mdb.c index d7246555ad..95b6fb8f35 100644 --- a/servers/slapd/back-mdb/mdb.c +++ b/servers/slapd/back-mdb/mdb.c @@ -83,44 +83,51 @@ typedef uint16_t indx_t; #define DEFAULT_MAPSIZE 1048576 /* Lock descriptor stuff */ -#define RXBODY \ - ULONG mr_txnid; \ - pid_t mr_pid; \ - pthread_t mr_tid -typedef struct MDB_rxbody { - RXBODY; -} MDB_rxbody; - #ifndef CACHELINE -# ifdef __APPLE__ -# define CACHELINE 128 /* 64 is too small to contain a mutex */ -# else -# define CACHELINE 64 /* most CPUs. Itanium uses 128 */ -# endif +#define CACHELINE 64 /* most CPUs. 
Itanium uses 128 */ #endif +typedef struct MDB_rxbody { + ULONG mrb_txnid; + pid_t mrb_pid; + pthread_t mrb_tid; +} MDB_rxbody; + typedef struct MDB_reader { - RXBODY; - /* cache line alignment */ - char pad[CACHELINE-sizeof(MDB_rxbody)]; + union { + MDB_rxbody mrx; +#define mr_txnid mru.mrx.mrb_txnid +#define mr_pid mru.mrx.mrb_pid +#define mr_tid mru.mrx.mrb_tid + /* cache line alignment */ + char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)]; + } mru; } MDB_reader; -#define TXBODY \ - uint32_t mt_magic; \ - uint32_t mt_version; \ - pthread_mutex_t mt_mutex; \ - ULONG mt_txnid; \ - uint32_t mt_numreaders typedef struct MDB_txbody { - TXBODY; + uint32_t mtb_magic; + uint32_t mtb_version; + pthread_mutex_t mtb_mutex; + ULONG mtb_txnid; + uint32_t mtb_numreaders; } MDB_txbody; typedef struct MDB_txninfo { - TXBODY; - char pad[CACHELINE-sizeof(MDB_txbody)]; - pthread_mutex_t mt_wmutex; - char pad2[CACHELINE-sizeof(pthread_mutex_t)]; - MDB_reader mt_readers[1]; + union { + MDB_txbody mtb; +#define mti_magic mt1.mtb.mtb_magic +#define mti_version mt1.mtb.mtb_version +#define mti_mutex mt1.mtb.mtb_mutex +#define mti_txnid mt1.mtb.mtb_txnid +#define mti_numreaders mt1.mtb.mtb_numreaders + char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; + } mt1; + union { + pthread_mutex_t mt2_wmutex; +#define mti_wmutex mt2.mt2_wmutex + char pad[(sizeof(pthread_mutex_t)+CACHELINE-1) & ~(CACHELINE-1)]; + } mt2; + MDB_reader mti_readers[1]; } MDB_txninfo; /* Common header for all page types. 
Overflow pages @@ -138,6 +145,7 @@ typedef struct MDB_page { /* represents a page of storage */ #define P_OVERFLOW 0x04 /* overflow page */ #define P_META 0x08 /* meta page */ #define P_DIRTY 0x10 /* dirty page */ +#define P_LEAF2 0x20 /* DB with small, fixed size keys and no data */ uint32_t mp_flags; #define mp_lower mp_pb.pb.pb_lower #define mp_upper mp_pb.pb.pb_upper @@ -148,6 +156,10 @@ typedef struct MDB_page { /* represents a page of storage */ indx_t pb_upper; /* upper bound of free space */ } pb; uint32_t pb_pages; /* number of overflow pages */ + struct { + indx_t pb_ksize; /* on a LEAF2 page */ + indx_t pb_numkeys; + } pb2; } mp_pb; indx_t mp_ptrs[1]; /* dynamic size */ } MDB_page; @@ -191,7 +203,6 @@ typedef struct MDB_meta { /* meta (footer) page content */ } MDB_meta; typedef struct MDB_dhead { /* a dirty page */ - STAILQ_ENTRY(MDB_dpage) md_next; /* queue of dirty pages */ MDB_page *md_parent; unsigned md_pi; /* parent index */ int md_num; @@ -202,8 +213,6 @@ typedef struct MDB_dpage { MDB_page p; } MDB_dpage; -STAILQ_HEAD(dirty_queue, MDB_dpage); /* FIXME: use a sorted data structure */ - typedef struct MDB_oldpages { struct MDB_oldpages *mo_next; ULONG mo_txnid; @@ -245,7 +254,6 @@ struct MDB_cursor { struct MDB_xcursor *mc_xcursor; }; -#define METAHASHLEN offsetof(MDB_meta, mm_hash) #define METADATA(p) ((void *)((char *)p + PAGEHDRSZ)) typedef struct MDB_node { @@ -278,15 +286,15 @@ struct MDB_txn { MDB_env *mt_env; pgno_t *mt_free_pgs; /* this is an IDL */ union { - struct dirty_queue *dirty_queue; /* modified pages */ + MIDL2 *dirty_list; /* modified pages */ MDB_reader *reader; } mt_u; MDB_dbx *mt_dbxs; /* array */ MDB_db *mt_dbs; unsigned int mt_numdbs; -#define MDB_TXN_RDONLY 0x01 /* read-only transaction */ -#define MDB_TXN_ERROR 0x02 /* an error has occurred */ +#define MDB_TXN_RDONLY 0x01 /* read-only transaction */ +#define MDB_TXN_ERROR 0x02 /* an error has occurred */ #define MDB_TXN_METOGGLE 0x04 /* used meta page 1 */ unsigned int 
mt_flags; }; @@ -302,7 +310,10 @@ typedef struct MDB_xcursor { struct MDB_env { int me_fd; int me_lfd; - uint32_t me_flags; + int me_mfd; /* just for writing the meta pages */ + uint16_t me_flags; + uint16_t me_db_toggle; + unsigned int me_psize; unsigned int me_maxreaders; unsigned int me_numdbs; unsigned int me_maxdbs; @@ -314,13 +325,12 @@ struct MDB_env { MDB_txn *me_txn; /* current write transaction */ size_t me_mapsize; off_t me_size; /* current file size */ - unsigned int me_psize; - int me_db_toggle; MDB_dbx *me_dbxs; /* array */ MDB_db *me_dbs[2]; MDB_oldpages *me_pghead; pthread_key_t me_txkey; /* thread-key for readers */ pgno_t me_free_pgs[MDB_IDL_UM_SIZE]; + MIDL2 me_dirty_list[MDB_IDL_DB_SIZE]; }; #define NODESIZE offsetof(MDB_node, mn_data) @@ -334,7 +344,6 @@ struct MDB_env { #define NODEDSZ(node) ((node)->mn_dsize) #define MDB_COMMIT_PAGES 64 /* max number of pages to write in one commit */ -#define MDB_MAXCACHE_DEF 1024 /* max number of pages to keep in cache */ static int mdb_search_page_root(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, @@ -472,6 +481,7 @@ mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num) MDB_dpage *dp; pgno_t pgno = P_INVALID; ULONG oldest; + MIDL2 mid; if (txn->mt_txnid > 2) { @@ -524,11 +534,11 @@ mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num) } if (txn->mt_env->me_pghead) { unsigned int i; - for (i=0; i<txn->mt_env->me_txns->mt_numreaders; i++) { - ULONG mr = txn->mt_env->me_txns->mt_readers[i].mr_txnid; + for (i=0; i<txn->mt_env->me_txns->mti_numreaders; i++) { + ULONG mr = txn->mt_env->me_txns->mti_readers[i].mr_txnid; if (!mr) continue; if (mr < oldest) - oldest = txn->mt_env->me_txns->mt_readers[i].mr_txnid; + oldest = txn->mt_env->me_txns->mti_readers[i].mr_txnid; } if (oldest > txn->mt_env->me_pghead->mo_txnid) { MDB_oldpages *mop = txn->mt_env->me_pghead; @@ -563,13 +573,15 @@ mdb_alloc_page(MDB_txn *txn, MDB_page *parent, unsigned int parent_idx, int num) dp->h.md_num = 
num; dp->h.md_parent = parent; dp->h.md_pi = parent_idx; - STAILQ_INSERT_TAIL(txn->mt_u.dirty_queue, dp, h.md_next); if (pgno == P_INVALID) { dp->p.mp_pgno = txn->mt_next_pgno; txn->mt_next_pgno += num; } else { dp->p.mp_pgno = pgno; } + mid.mid = dp->p.mp_pgno; + mid.mptr = dp; + mdb_midl2_insert(txn->mt_u.dirty_list, &mid); return dp; } @@ -609,7 +621,7 @@ mdb_env_sync(MDB_env *env, int force) { int rc = 0; if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { - if (fsync(env->me_fd)) + if (fdatasync(env->me_fd)) rc = errno; } return rc; @@ -629,38 +641,34 @@ mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret) if (rdonly) { txn->mt_flags |= MDB_TXN_RDONLY; } else { - txn->mt_u.dirty_queue = calloc(1, sizeof(*txn->mt_u.dirty_queue)); - if (txn->mt_u.dirty_queue == NULL) { - free(txn); - return ENOMEM; - } - STAILQ_INIT(txn->mt_u.dirty_queue); - - pthread_mutex_lock(&env->me_txns->mt_wmutex); - env->me_txns->mt_txnid++; + txn->mt_u.dirty_list = env->me_dirty_list; + txn->mt_u.dirty_list[0].mid = 0; txn->mt_free_pgs = env->me_free_pgs; txn->mt_free_pgs[0] = 0; + + pthread_mutex_lock(&env->me_txns->mti_wmutex); + env->me_txns->mti_txnid++; } - txn->mt_txnid = env->me_txns->mt_txnid; + txn->mt_txnid = env->me_txns->mti_txnid; if (rdonly) { MDB_reader *r = pthread_getspecific(env->me_txkey); if (!r) { unsigned int i; - pthread_mutex_lock(&env->me_txns->mt_mutex); - for (i=0; i<env->me_txns->mt_numreaders; i++) - if (env->me_txns->mt_readers[i].mr_pid == 0) + pthread_mutex_lock(&env->me_txns->mti_mutex); + for (i=0; i<env->me_txns->mti_numreaders; i++) + if (env->me_txns->mti_readers[i].mr_pid == 0) break; if (i == env->me_maxreaders) { return ENOSPC; } - env->me_txns->mt_readers[i].mr_pid = getpid(); - env->me_txns->mt_readers[i].mr_tid = pthread_self(); - r = &env->me_txns->mt_readers[i]; + env->me_txns->mti_readers[i].mr_pid = getpid(); + env->me_txns->mti_readers[i].mr_tid = pthread_self(); + r = &env->me_txns->mti_readers[i]; pthread_setspecific(env->me_txkey, r); - if (i >= 
env->me_txns->mt_numreaders) - env->me_txns->mt_numreaders = i+1; - pthread_mutex_unlock(&env->me_txns->mt_mutex); + if (i >= env->me_txns->mti_numreaders) + env->me_txns->mti_numreaders = i+1; + pthread_mutex_unlock(&env->me_txns->mti_mutex); } r->mr_txnid = txn->mt_txnid; txn->mt_u.reader = r; @@ -700,7 +708,6 @@ mdb_txn_begin(MDB_env *env, int rdonly, MDB_txn **ret) void mdb_txn_abort(MDB_txn *txn) { - MDB_dpage *dp; MDB_env *env; if (txn == NULL) @@ -719,12 +726,8 @@ mdb_txn_abort(MDB_txn *txn) unsigned int i; /* Discard all dirty pages. */ - while (!STAILQ_EMPTY(txn->mt_u.dirty_queue)) { - dp = STAILQ_FIRST(txn->mt_u.dirty_queue); - STAILQ_REMOVE_HEAD(txn->mt_u.dirty_queue, h.md_next); - free(dp); - } - free(txn->mt_u.dirty_queue); + for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) + free(txn->mt_u.dirty_list[i].mptr); while ((mop = txn->mt_env->me_pghead)) { txn->mt_env->me_pghead = mop->mo_next; @@ -732,10 +735,10 @@ mdb_txn_abort(MDB_txn *txn) } env->me_txn = NULL; - env->me_txns->mt_txnid--; + env->me_txns->mti_txnid--; for (i=2; i<env->me_numdbs; i++) env->me_dbxs[i].md_dirty = 0; - pthread_mutex_unlock(&env->me_txns->mt_wmutex); + pthread_mutex_unlock(&env->me_txns->mti_wmutex); } free(txn); @@ -776,7 +779,7 @@ mdb_txn_commit(MDB_txn *txn) return EINVAL; } - if (STAILQ_EMPTY(txn->mt_u.dirty_queue)) + if (!txn->mt_u.dirty_list[0].mid) goto done; DPRINTF("committing transaction %lu on mdbenv %p, root page %lu", @@ -849,7 +852,8 @@ mdb_txn_commit(MDB_txn *txn) n = 0; done = 1; size = 0; - STAILQ_FOREACH(dp, txn->mt_u.dirty_queue, h.md_next) { + for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) { + dp = txn->mt_u.dirty_list[i].mptr; if (dp->p.mp_pgno != next) { if (n) { DPRINTF("committing %u dirty pages", n); @@ -901,15 +905,11 @@ mdb_txn_commit(MDB_txn *txn) /* Drop the dirty pages. 
*/ - while (!STAILQ_EMPTY(txn->mt_u.dirty_queue)) { - dp = STAILQ_FIRST(txn->mt_u.dirty_queue); - STAILQ_REMOVE_HEAD(txn->mt_u.dirty_queue, h.md_next); - free(dp); - } + for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) + free(txn->mt_u.dirty_list[i].mptr); if ((n = mdb_env_sync(env, 0)) != 0 || - (n = mdb_env_write_meta(txn)) != MDB_SUCCESS || - (n = mdb_env_sync(env, 0)) != 0) { + (n = mdb_env_write_meta(txn)) != MDB_SUCCESS) { mdb_txn_abort(txn); return n; } @@ -936,8 +936,7 @@ mdb_txn_commit(MDB_txn *txn) free(txn->mt_dbs); } - pthread_mutex_unlock(&env->me_txns->mt_wmutex); - free(txn->mt_u.dirty_queue); + pthread_mutex_unlock(&env->me_txns->mti_wmutex); free(txn); txn = NULL; @@ -1063,8 +1062,7 @@ mdb_env_write_meta(MDB_txn *txn) off += env->me_psize; off += PAGEHDRSZ; - lseek(env->me_fd, off, SEEK_SET); - rc = write(env->me_fd, ptr, len); + rc = pwrite(env->me_fd, ptr, len, off); if (rc != len) { DPRINTF("write failed, disk error?"); return errno; @@ -1105,6 +1103,7 @@ mdb_env_create(MDB_env **env) e->me_maxdbs = 2; e->me_fd = -1; e->me_lfd = -1; + e->me_mfd = -1; *env = e; return MDB_SUCCESS; } @@ -1218,7 +1217,7 @@ mdb_env_share_locks(MDB_env *env) { struct flock lock_info; - env->me_txns->mt_txnid = env->me_meta->mm_txnid; + env->me_txns->mti_txnid = env->me_meta->mm_txnid; memset((void *)&lock_info, 0, sizeof(lock_info)); lock_info.l_type = F_RDLCK; @@ -1283,22 +1282,22 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) pthread_mutexattr_init(&mattr); pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); - pthread_mutex_init(&env->me_txns->mt_mutex, &mattr); - pthread_mutex_init(&env->me_txns->mt_wmutex, &mattr); - env->me_txns->mt_version = MDB_VERSION; - env->me_txns->mt_magic = MDB_MAGIC; - env->me_txns->mt_txnid = 0; - env->me_txns->mt_numreaders = 0; + pthread_mutex_init(&env->me_txns->mti_mutex, &mattr); + pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr); + env->me_txns->mti_version = MDB_VERSION; + 
env->me_txns->mti_magic = MDB_MAGIC; + env->me_txns->mti_txnid = 0; + env->me_txns->mti_numreaders = 0; } else { - if (env->me_txns->mt_magic != MDB_MAGIC) { + if (env->me_txns->mti_magic != MDB_MAGIC) { DPRINTF("lock region has invalid magic"); rc = EINVAL; goto fail; } - if (env->me_txns->mt_version != MDB_VERSION) { + if (env->me_txns->mti_version != MDB_VERSION) { DPRINTF("lock region is version %u, expected version %u", - env->me_txns->mt_version, MDB_VERSION); + env->me_txns->mti_version, MDB_VERSION); rc = MDB_VERSION_MISMATCH; goto fail; } @@ -1347,6 +1346,16 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) close(env->me_fd); env->me_fd = -1; } else { + /* synchronous fd for meta writes */ + if (!(flags & (MDB_RDONLY|MDB_NOSYNC))) + oflags |= O_DSYNC; + if ((env->me_mfd = open(dpath, oflags, mode)) == -1) { + rc = errno; + close(env->me_fd); + env->me_fd = -1; + return rc; + } + env->me_path = strdup(path); DPRINTF("opened dbenv %p", (void *) env); pthread_key_create(&env->me_txkey, mdb_env_reader_dest); @@ -1377,6 +1386,7 @@ mdb_env_close(MDB_env *env) if (env->me_map) { munmap(env->me_map, env->me_mapsize); } + close(env->me_mfd); close(env->me_fd); if (env->me_txns) { size_t size = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); @@ -1489,14 +1499,16 @@ mdb_get_page(MDB_txn *txn, pgno_t pgno) MDB_page *p = NULL; int found = 0; - if (!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY) && !STAILQ_EMPTY(txn->mt_u.dirty_queue)) { + if (!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY) && txn->mt_u.dirty_list[0].mid) { MDB_dpage *dp; - STAILQ_FOREACH(dp, txn->mt_u.dirty_queue, h.md_next) { - if (dp->p.mp_pgno == pgno) { - p = &dp->p; - found = 1; - break; - } + MIDL2 id; + unsigned x; + id.mid = pgno; + x = mdb_midl2_search(txn->mt_u.dirty_list, &id); + if (x <= txn->mt_u.dirty_list[0].mid && txn->mt_u.dirty_list[x].mid == pgno) { + dp = txn->mt_u.dirty_list[x].mptr; + p = &dp->p; + found = 1; } } if (!found) { diff --git 
a/servers/slapd/back-mdb/mdb.h b/servers/slapd/back-mdb/mdb.h index f7e98e58b6..7fb2186da1 100644 --- a/servers/slapd/back-mdb/mdb.h +++ b/servers/slapd/back-mdb/mdb.h @@ -110,8 +110,6 @@ typedef struct MDB_stat { } MDB_stat; char *mdb_version(int *major, int *minor, int *patch); -char *mdb_strerror(int rc); - int mdb_env_create(MDB_env **env); int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode); int mdb_env_stat(MDB_env *env, MDB_stat *stat); diff --git a/servers/slapd/back-mdb/midl.c b/servers/slapd/back-mdb/midl.c index 8b39acad65..136798f75f 100644 --- a/servers/slapd/back-mdb/midl.c +++ b/servers/slapd/back-mdb/midl.c @@ -24,6 +24,9 @@ typedef unsigned long pgno_t; /* Sort the IDLs from highest to lowest */ #define IDL_CMP(x,y) ( x > y ? -1 : ( x < y ? 1 : 0 ) ) +/* Sort the IDL2s from lowest to highest */ +#define IDL2_CMP(x,y) ( x < y ? -1 : ( x > y ? 1 : 0 ) ) + unsigned mdb_midl_search( ID *ids, ID id ) { /* @@ -62,7 +65,7 @@ unsigned mdb_midl_search( ID *ids, ID id ) int mdb_midl_insert( ID *ids, ID id ) { - unsigned x; + unsigned x, i; if (MDB_IDL_IS_RANGE( ids )) { /* if already in range, treat as a dup */ @@ -101,9 +104,78 @@ int mdb_midl_insert( ID *ids, ID id ) } else { /* insert id */ - AC_MEMCPY( &ids[x+1], &ids[x], (ids[0]-x) * sizeof(ID) ); + for (i=ids[0]; i>x; i--) + ids[i] = ids[i-1]; ids[x] = id; } return 0; } + +unsigned mdb_midl2_search( MIDL2 *ids, MIDL2 *id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 0; + int val = 0; + unsigned n = ids[0].mid; + + while( 0 < n ) { + int pivot = n >> 1; + cursor = base + pivot; + val = IDL2_CMP( id->mid, ids[cursor + 1].mid ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor + 1; + n -= pivot + 1; + + } else { + return cursor + 1; + } + } + + if( val > 0 ) { + return cursor + 2; + } else { + return cursor + 1; + } 
+} + +int mdb_midl2_insert( MIDL2 *ids, MIDL2 *id ) +{ + unsigned x, i; + + x = mdb_midl2_search( ids, id ); + assert( x > 0 ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0].mid && ids[x].mid == id->mid ) { + /* duplicate */ + return -1; + } + + if ( ids[0].mid >= MDB_IDL_DB_MAX ) { + /* too big */ + return -2; + + } else { + /* insert id */ + ids[0].mid++; + for (i=ids[0].mid; i>x; i--) + ids[i] = ids[i-1]; + ids[x] = *id; + } + + return 0; +} diff --git a/servers/slapd/back-mdb/midl.h b/servers/slapd/back-mdb/midl.h index aeb0af6203..31a4b67bd2 100644 --- a/servers/slapd/back-mdb/midl.h +++ b/servers/slapd/back-mdb/midl.h @@ -17,8 +17,6 @@ #ifndef _MDB_MIDL_H_ #define _MDB_MIDL_H_ -#define AC_MEMCPY(dst,src,size) memcpy(dst,src,size) - #define ID unsigned long #define NOID ((ID)~0) @@ -61,7 +59,7 @@ #define MDB_IDL_IS_ALL( range, ids ) ( (ids)[0] == NOID \ && (ids)[1] <= (range)[1] && (range)[2] <= (ids)[2] ) -#define MDB_IDL_CPY( dst, src ) (AC_MEMCPY( dst, src, MDB_IDL_SIZEOF( src ) )) +#define MDB_IDL_CPY( dst, src ) (memcpy( dst, src, MDB_IDL_SIZEOF( src ) )) #define MDB_IDL_ID( bdb, ids, id ) MDB_IDL_RANGE( ids, id, ((bdb)->bi_lastid) ) #define MDB_IDL_ALL( bdb, ids ) MDB_IDL_RANGE( ids, 1, ((bdb)->bi_lastid) ) @@ -75,4 +73,12 @@ int mdb_midl_insert( ID *ids, ID id ); +typedef struct MIDL2 { + ID mid; + void *mptr; +} MIDL2; + +unsigned mdb_midl2_search( MIDL2 *ids, MIDL2 *id ); +int mdb_midl2_insert( MIDL2 *ids, MIDL2 *id ); + #endif /* _MDB_MIDL_H_ */ diff --git a/servers/slapd/back-mdb/proto-mdb.h b/servers/slapd/back-mdb/proto-mdb.h index f117ff832d..fab2f63f54 100644 --- a/servers/slapd/back-mdb/proto-mdb.h +++ b/servers/slapd/back-mdb/proto-mdb.h @@ -126,12 +126,12 @@ int mdb_filter_candidates( */ int mdb_id2entry_add( - BackendDB *be, + Operation *op, MDB_txn *tid, Entry *e ); int mdb_id2entry_update( - BackendDB *be, + Operation *op, MDB_txn *tid, Entry *e ); @@ -141,7 +141,7 @@ int mdb_id2entry_delete( Entry *e); int 
mdb_id2entry( - BackendDB *be, + Operation *op, MDB_txn *tid, ID id, Entry **e);