From: Kris Zyp Date: Wed, 19 Feb 2020 05:49:03 +0000 (-0700) Subject: ITS#9017 LMDB: improve Windows sync commit perf X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=dfb3bbed656132456001c5aaca246fd4430e5ef5;p=thirdparty%2Fopenldap.git ITS#9017 LMDB: improve Windows sync commit perf --- diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 603267b9b1..e0b61e1c4f 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1565,9 +1565,12 @@ struct MDB_env { HANDLE me_fd; /**< The main data file */ HANDLE me_lfd; /**< The lock file */ HANDLE me_mfd; /**< For writing and syncing the meta pages */ -#if MDB_RPAGE_CACHE && defined(_WIN32) +#ifdef _WIN32 +#ifdef MDB_RPAGE_CACHE HANDLE me_fmh; /**< File Mapping handle */ #endif + HANDLE me_ovfd; /**< Overlapped/async with write-through file handle */ +#endif /* _WIN32 */ /** Failed to update the meta page. Probably an I/O error. */ #define MDB_FATAL_ERROR 0x80000000U /** Some fields are initialized. */ @@ -1620,6 +1623,8 @@ struct MDB_env { int me_live_reader; /**< have liveness lock in reader table */ #ifdef _WIN32 int me_pidquery; /**< Used in OpenProcess */ + OVERLAPPED *ov; /**< Used for for overlapping I/O requests */ + int ovs; /**< Count of OVERLAPPEDs */ #endif #ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */ # define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */ @@ -2977,7 +2982,11 @@ mdb_env_sync0(MDB_env *env, int force, pgno_t numpgs) int rc = 0; if (env->me_flags & MDB_RDONLY) return EACCES; - if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { + if (force +#ifndef _WIN32 /* Sync is normally achieved in Windows by doing WRITE_THROUGH writes */ + || !(env->me_flags & MDB_NOSYNC) +#endif + ) { if (env->me_flags & MDB_WRITEMAP) { int flags = ((env->me_flags & MDB_MAPASYNC) && !force) ? MS_ASYNC : MS_SYNC; @@ -3907,19 +3916,50 @@ mdb_page_flush(MDB_txn *txn, int keep) #endif #ifdef _WIN32 OVERLAPPED ov; + MDB_page *wdp; + int async_i = 0; + HANDLE fd = (env->me_flags & MDB_NOSYNC) ? env->me_fd : env->me_ovfd; #else struct iovec iov[MDB_COMMIT_PAGES]; + HANDLE fd = env->me_fd; +#endif ssize_t wsize = 0, wres; off_t wpos = 0, next_pos = 1; /* impossible pos, so pos != next_pos */ int n = 0; -#endif j = i = keep; - if (env->me_flags & MDB_WRITEMAP) { + if (env->me_flags & MDB_WRITEMAP +#ifdef _WIN32 + /* In windows, we still do writes to the file (with write-through enabled in sync mode), + * as this is faster than FlushViewOfFile/FlushFileBuffers */ + && (env->me_flags & MDB_NOSYNC) +#endif + ) { goto done; } +#ifdef _WIN32 + if (pagecount - keep >= env->ovs) { + /* ran out of room in ov array, and re-malloc, copy handles and free previous */ + int ovs = (pagecount - keep) * 1.5; /* provide extra padding to reduce number of re-allocations */ + int new_size = ovs * sizeof(OVERLAPPED); + ov = malloc(new_size); + if (ov == NULL) + return ENOMEM; + int previous_size = env->ovs * sizeof(OVERLAPPED); + memcpy(ov, env->ov, previous_size); /* Copy previous OVERLAPPED data to retain event handles */ + /* And clear rest of memory */ + memset(&ov[env->ovs], 0, new_size - previous_size); + if (env->ovs > 0) { + free(env->ov); /* release previous allocation */ + } + + env->ov = ov; + env->ovs = ovs; + } +#endif + /* Write the pages */ for (;;) { if (++i <= pagecount) { @@ -3945,60 +3985,58 @@ mdb_page_flush(MDB_txn *txn, int keep) if (IS_OVERFLOW(dp)) size *= dp->mp_pages; #endif } + /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ + if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE #ifdef _WIN32 - else break; - -#if MDB_RPAGE_CACHE - if (env->me_encfunc) { - MDB_val in, out; - encp = mdb_page_malloc(txn, nump. 0); - if (!encp) - return ENOMEM; - in.mv_size = size; - in.mv_data = dp; - out.mv_size = size; - out.mv_data = encp; - env->me_encfunc(&in, &out, env->me_enckey, 1); - dp = encp; - } -#endif - /* Windows actually supports scatter/gather I/O, but only on - * unbuffered file handles. Since we're relying on the OS page - * cache for all our data, that's self-defeating. So we just - * write pages one at a time. We use the ov structure to set - * the write offset, to at least save the overhead of a Seek - * system call. - */ - DPRINTF(("committing page %"Yu, pgno)); - memset(&ov, 0, sizeof(ov)); - ov.Offset = pos & 0xffffffff; - ov.OffsetHigh = pos >> 16 >> 16; - rc = 0; - if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) - rc = ErrCode(); -#if MDB_RPAGE_CACHE - if (env->me_encfunc) - mdb_dpage_free_n(env, dp, nump); + /* Windows actually supports scatter/gather I/O, but only on + * unbuffered file handles. Since we're relying on the OS page + * cache for all our data, that's self-defeating. So we just + * write pages one at a time. We use the ov structure to set + * the write offset, to at least save the overhead of a Seek + * system call. + * If writemap is enabled, consecutive page positions infer + * contiguous (mapped) memory. + * Otherwise force write pages one at a time. + */ + || !(env->me_flags & MDB_WRITEMAP) #endif - if (rc) { - DPRINTF(("WriteFile: %d", rc)); - return rc; - } -#else - /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ - if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { + ) { if (n) { retry_write: rc = 0; /* Write previous page(s) */ +#ifdef _WIN32 + OVERLAPPED *this_ov = &ov[async_i]; + /* Clear status, and keep hEvent, we reuse that */ + this_ov->Internal = 0; + this_ov->Offset = wpos & 0xffffffff; + this_ov->OffsetHigh = wpos >> 16 >> 16; + if (!F_ISSET(env->me_flags, MDB_NOSYNC) && !this_ov->hEvent) { + HANDLE event = CreateEvent(NULL, FALSE, FALSE, NULL); + if (!event) { + rc = ErrCode(); + DPRINTF(("CreateEvent: %s", strerror(rc))); + return rc; + } + this_ov->hEvent = event; + } + if (!WriteFile(fd, wdp, wsize, NULL, this_ov)) { + rc = ErrCode(); + if (rc != ERROR_IO_PENDING) { + DPRINTF(("WriteFile: %d", rc)); + return rc; + } + } + async_i++; +#else /* _WIN32 */ #ifdef MDB_USE_PWRITEV - wres = pwritev(env->me_fd, iov, n, wpos); + wres = pwritev(fd, iov, n, wpos); #else if (n == 1) { - wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); + wres = pwrite(fd, iov[0].iov_base, wsize, wpos); } else { retry_seek: - if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { + if (lseek(fd, wpos, SEEK_SET) == -1) { rc = ErrCode(); if (rc == EINTR) goto retry_seek; @@ -4006,7 +4044,7 @@ retry_seek: wres = wsize; } else { rc = 0; - wres = writev(env->me_fd, iov, n); + wres = writev(fd, iov, n); } } #endif @@ -4021,6 +4059,7 @@ retry_seek: DPUTS("short write, filesystem full?"); } } +#endif /* _WIN32 */ #if MDB_RPAGE_CACHE if (env->me_encfunc) { int j, num1; @@ -4053,13 +4092,16 @@ retry_seek: dp = encp; } #endif - DPRINTF(("committing page %"Yu, pgno)); - next_pos = pos + size; +#ifdef _WIN32 + wdp = dp; +#else iov[n].iov_len = size; iov[n].iov_base = (char *)dp; +#endif + DPRINTF(("committing page %"Yu, pgno)); + next_pos = pos + size; wsize += size; n++; -#endif /* _WIN32 */ } #if MDB_RPAGE_CACHE if (MDB_REMAPPING(env->me_flags) && pgno > txn->mt_last_pgno) @@ -4072,21 +4114,49 @@ retry_seek: */ CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); - for (i = keep; ++i <= pagecount; ) { - dp = dl[i].mptr; - /* This is a page we skipped above */ - if (!dl[i].mid) { - dl[++j] = dl[i]; - dl[j].mid = dp->mp_pgno; - continue; +#ifdef _WIN32 + if (!F_ISSET(env->me_flags, MDB_NOSYNC)) { + /* Now wait for all the asynchronous/overlapped sync/write-through writes to complete. + * We start with the last one so that all the others should already be complete and + * we reduce thread suspend/resuming (in practice, typically about 99.5% of writes are + * done after the last write is done) */ + rc = 0; + while (--async_i >= 0) { + if (ov[async_i].hEvent) { + if (!GetOverlappedResult(fd, &ov[async_i], &wres, TRUE)) { + rc = ErrCode(); /* Continue on so that all the event signals are reset */ + } + } + } + if (rc) { /* any error on GetOverlappedResult, exit now */ + return rc; + } + } +#endif /* _WIN32 */ + + if (!(env->me_flags & MDB_WRITEMAP)) { + /* Don't free pages when using writemap (can only get here in NOSYNC mode in Windows) + * MIPS has cache coherency issues, this is a no-op everywhere else + * Note: for any size >= on-chip cache size, entire on-chip cache is + * flushed. + */ + for (i = keep; ++i <= pagecount; ) { + dp = dl[i].mptr; + /* This is a page we skipped above */ + if (!dl[i].mid) { + dl[++j] = dl[i]; + dl[j].mid = dp->mp_pgno; + continue; + } + mdb_dpage_free(env, dp); } - mdb_dpage_free(env, dp); } + +done: i--; txn->mt_dirty_room += i - j; dl[0].mid = j; -done: return MDB_SUCCESS; } @@ -4443,7 +4513,6 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) if (len == -1 && ErrCode() == EINTR) continue; \ rc = (len >= 0); break; } while(1) #endif - DPUTS("writing new meta page"); psize = env->me_psize; @@ -4513,6 +4582,7 @@ mdb_env_write_meta(MDB_txn *txn) if (mapsize < env->me_mapsize) mapsize = env->me_mapsize; +#ifndef _WIN32 /* We don't want to ever use MSYNC/FlushViewOfFile in Windows */ if (flags & MDB_WRITEMAP) { mp->mm_mapsize = mapsize; mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; @@ -4528,11 +4598,10 @@ mdb_env_write_meta(MDB_txn *txn) unsigned meta_size = env->me_psize; rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; ptr = (char *)mp - PAGEHDRSZ; -#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ + /* POSIX msync() requires ptr = start of OS page */ r2 = (ptr - env->me_map) & (env->me_os_psize - 1); ptr -= r2; meta_size += r2; -#endif if (MDB_MSYNC(ptr, meta_size, rc)) { rc = ErrCode(); goto fail; @@ -4540,6 +4609,7 @@ mdb_env_write_meta(MDB_txn *txn) } goto done; } +#endif metab.mm_txnid = mp->mm_txnid; metab.mm_last_pg = mp->mm_last_pg; @@ -4921,7 +4991,7 @@ mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) /** File type, access mode etc. for #mdb_fopen() */ enum mdb_fopen_type { #ifdef _WIN32 - MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS + MDB_O_RDONLY, MDB_O_RDWR, MDB_O_OVERLAPPED, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS #else /* A comment in mdb_fopen() explains some O_* flag choices. */ MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ @@ -4982,6 +5052,11 @@ mdb_fopen(const MDB_env *env, MDB_name *fname, disp = OPEN_ALWAYS; attrs = FILE_ATTRIBUTE_NORMAL; switch (which) { + case MDB_O_OVERLAPPED: /* for unbuffered asynchronous writes (write-through mode)*/ + acc = GENERIC_WRITE; + disp = OPEN_EXISTING; + attrs = FILE_FLAG_OVERLAPPED|FILE_FLAG_WRITE_THROUGH; + break; case MDB_O_RDONLY: /* read-only datafile */ acc = GENERIC_READ; disp = OPEN_EXISTING; @@ -5071,6 +5146,7 @@ mdb_env_open2(MDB_env *env, int prev) if (!NtCreateSection) return MDB_PROBLEM; } + env->ovs = 0; #endif /* _WIN32 */ #ifdef BROKEN_FDATASYNC @@ -5909,6 +5985,11 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode mode, &env->me_fd); if (rc) goto leave; +#ifdef _WIN32 + rc = mdb_fopen(env, &fname, MDB_O_OVERLAPPED, mode, &env->me_ovfd); + if (rc) + goto leave; +#endif if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { rc = mdb_env_setup_locks(env, &fname, mode, &excl); @@ -5917,14 +5998,12 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode } if ((rc = mdb_env_open2(env, flags & MDB_PREVSNAPSHOT)) == MDB_SUCCESS) { - if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { - /* Synchronous fd for meta writes. Needed even with - * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. - */ - rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); - if (rc) - goto leave; - } + /* Synchronous fd for meta writes. Needed even with + * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. + */ + rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); + if (rc) + goto leave; DPRINTF(("opened dbenv %p", (void *) env)); if (excl > 0 && !(flags & MDB_PREVSNAPSHOT)) { rc = mdb_env_share_locks(env, &excl); @@ -6030,6 +6109,16 @@ mdb_env_close_active(MDB_env *env, int excl) } if (env->me_mfd != INVALID_HANDLE_VALUE) (void) close(env->me_mfd); +#ifdef _WIN32 + if (env->ovs > 0) { + for (i = 0; i < env->ovs; i++) { + CloseHandle(env->ov[i].hEvent); + } + free(env->ov); + } + if (env->me_ovfd != INVALID_HANDLE_VALUE) + (void) close(env->me_ovfd); +#endif if (env->me_fd != INVALID_HANDLE_VALUE) (void) close(env->me_fd); if (env->me_txns) {