/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997
 *	Sleepycat Software.  All rights reserved.
 */
/* SCCS identification string, retrievable from the binary with what(1). */
static const char sccsid[] = "@(#)mp_fget.c 10.33 (Sleepycat) 12/2/97";
13 #ifndef NO_SYSTEM_INCLUDES
14 #include <sys/types.h>
26 #include "common_ext.h"
/*
 * XXX: Thread-debugging knob.  When set non-zero, memp_fget() attempts a
 * __db_yield() on every page get so thread switches happen as often as
 * possible.  Zero (the default) disables the extra yields.
 */
int __sleep_on_every_page_get;
32 * Get a page from the file.
35 memp_fget(dbmfp
, pgnoaddr
, flags
, addrp
)
45 size_t bucket
, mf_offset
;
47 int b_incr
, b_inserted
, readonly_alloc
, ret
;
56 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
57 * files here, and create non-existent pages in readonly files if the
58 * flags are set, later. The reason is that the hash access method
59 * wants to get empty pages that don't really exist in readonly files.
60 * The only alternative is for hash to write the last "bucket" all the
61 * time, which we don't want to do because one of our big goals in life
62 * is to keep database files small. It's sleazy as hell, but we catch
63 * any attempt to actually write the file in memp_fput().
65 #define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
68 __db_fchk(dbmp
->dbenv
, "memp_fget", flags
, OKFLAGS
)) != 0)
78 return (__db_ferr(dbmp
->dbenv
, "memp_fget", 1));
85 * We want to switch threads as often as possible. Sleep every time
86 * we get a new page to make it more likely.
88 if (__sleep_on_every_page_get
&&
89 (__db_yield
== NULL
|| __db_yield() != 0))
95 mf_offset
= R_OFFSET(dbmp
, mfp
);
98 b_incr
= b_inserted
= ret
= 0;
103 * If mmap'ing the file, just return a pointer. However, if another
104 * process has opened the file for writing since we mmap'd it, start
105 * playing the game by their rules, i.e. everything goes through the
106 * cache. All pages previously returned should be safe, as long as
107 * a locking protocol was observed.
110 * We don't discard the map because we don't know when all of the
111 * pages will have been discarded from the process' address space.
112 * It would be possible to do so by reference counting the open
113 * pages from the mmap, but it's unclear to me that it's worth it.
115 if (dbmfp
->addr
!= NULL
&& F_ISSET(dbmfp
->mfp
, MP_CAN_MMAP
)) {
117 if (LF_ISSET(DB_MPOOL_LAST
))
118 *pgnoaddr
= mfp
->last_pgno
;
122 * Allocate a page that can never really exist. See
123 * the comment above about non-existent pages and the
124 * hash access method.
126 if (LF_ISSET(DB_MPOOL_CREATE
| DB_MPOOL_NEW
))
128 else if (*pgnoaddr
> mfp
->last_pgno
) {
129 __db_err(dbmp
->dbenv
,
130 "%s: page %lu doesn't exist",
131 __memp_fn(dbmfp
), (u_long
)*pgnoaddr
);
136 if (!readonly_alloc
) {
137 addr
= R_ADDR(dbmfp
, *pgnoaddr
* mfp
->stat
.st_pagesize
);
146 /* Check if requesting the last page or a new page. */
147 if (LF_ISSET(DB_MPOOL_LAST
))
148 *pgnoaddr
= mfp
->last_pgno
;
150 if (LF_ISSET(DB_MPOOL_NEW
)) {
151 *pgnoaddr
= mfp
->last_pgno
+ 1;
155 /* Check the BH hash bucket queue. */
156 bucket
= BUCKET(mp
, mf_offset
, *pgnoaddr
);
158 bhp
= SH_TAILQ_FIRST(&dbmp
->htab
[bucket
], __bh
);
159 bhp
!= NULL
; bhp
= SH_TAILQ_NEXT(bhp
, hq
, __bh
)) {
161 if (bhp
->pgno
== *pgnoaddr
&& bhp
->mf_offset
== mf_offset
) {
163 ++mp
->stat
.st_hash_searches
;
164 if (cnt
> mp
->stat
.st_hash_longest
)
165 mp
->stat
.st_hash_longest
= cnt
;
166 mp
->stat
.st_hash_examined
+= cnt
;
171 ++mp
->stat
.st_hash_searches
;
172 if (cnt
> mp
->stat
.st_hash_longest
)
173 mp
->stat
.st_hash_longest
= cnt
;
174 mp
->stat
.st_hash_examined
+= cnt
;
178 * Allocate a new buffer header and data space, and mark the contents
181 if ((ret
= __memp_ralloc(dbmp
, sizeof(BH
) -
182 sizeof(u_int8_t
) + mfp
->stat
.st_pagesize
, NULL
, &bhp
)) != 0)
186 if ((ALIGNTYPE
)addr
& (sizeof(size_t) - 1)) {
187 __db_err(dbmp
->dbenv
,
188 "Internal error: BH data NOT size_t aligned.");
192 memset(bhp
, 0, sizeof(BH
));
193 LOCKINIT(dbmp
, &bhp
->mutex
);
196 * Prepend the bucket header to the head of the appropriate MPOOL
197 * bucket hash list. Append the bucket header to the tail of the
200 * We have to do this before we read in the page so we can discard
201 * our region lock without screwing up the world.
203 bucket
= BUCKET(mp
, mf_offset
, *pgnoaddr
);
204 SH_TAILQ_INSERT_HEAD(&dbmp
->htab
[bucket
], bhp
, hq
, __bh
);
205 SH_TAILQ_INSERT_TAIL(&mp
->bhq
, bhp
, q
);
206 ++mp
->stat
.st_page_clean
;
209 /* Set the page number, and associated MPOOLFILE. */
210 bhp
->mf_offset
= mf_offset
;
211 bhp
->pgno
= *pgnoaddr
;
214 * If we know we created the page, zero it out and continue.
217 * Note: DB_MPOOL_NEW deliberately doesn't call the pgin function.
218 * If DB_MPOOL_CREATE is used, then the application's pgin function
219 * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW,
220 * it can detect all of its page creates, and not bother.
222 * Otherwise, read the page into memory, optionally creating it if
223 * DB_MPOOL_CREATE is set.
225 * Increment the reference count for created buffers, but importantly,
226 * increment the reference count for buffers we're about to read so
227 * that the buffer can't move.
232 if (LF_ISSET(DB_MPOOL_NEW
))
233 memset(addr
, 0, mfp
->stat
.st_pagesize
);
236 * It's possible for the read function to fail, which means
237 * that we fail as well.
239 reread
: if ((ret
= __memp_pgread(dbmfp
,
240 bhp
, LF_ISSET(DB_MPOOL_CREATE
| DB_MPOOL_NEW
))) != 0)
245 * The __memp_pgread call discarded and reacquired the region
246 * lock. Because the buffer reference count was incremented
247 * before the region lock was discarded the buffer can't move
248 * and its contents can't change.
250 ++mp
->stat
.st_cache_miss
;
251 ++mfp
->stat
.st_cache_miss
;
255 found
: /* Increment the reference count. */
256 if (bhp
->ref
== UINT16_T_MAX
) {
257 __db_err(dbmp
->dbenv
,
258 "%s: too many references to page %lu",
259 __memp_fn(dbmfp
), bhp
->pgno
);
267 * Any found buffer might be trouble.
270 * I/O in progress, wait for it to finish. Because the buffer
271 * reference count was incremented before the region lock was
272 * discarded we know the buffer can't move and its contents
275 for (cnt
= 0; F_ISSET(bhp
, BH_LOCKED
); ++cnt
) {
279 * Sleep so that we don't simply spin, switching locks.
280 * (See the comment in include/mp.h.)
283 (__db_yield
== NULL
|| __db_yield() != 0))
286 LOCKBUFFER(dbmp
, bhp
);
287 /* Waiting for I/O to finish... */
288 UNLOCKBUFFER(dbmp
, bhp
);
294 * The buffer is garbage.
296 if (F_ISSET(bhp
, BH_TRASH
))
301 * The buffer was written, and the contents need to be
304 if (F_ISSET(bhp
, BH_CALLPGIN
)) {
305 if ((ret
= __memp_pg(dbmfp
, bhp
, 1)) != 0)
307 F_CLR(bhp
, BH_CALLPGIN
);
310 ++mp
->stat
.st_cache_hit
;
311 ++mfp
->stat
.st_cache_hit
;
315 * If we're returning a page after our current notion of the last-page,
316 * update our information. Note, there's no way to un-instantiate this
317 * page, it's going to exist whether it's returned to us dirty or not.
319 if (bhp
->pgno
> mfp
->last_pgno
)
320 mfp
->last_pgno
= bhp
->pgno
;
322 mapret
: LOCKHANDLE(dbmp
, dbmfp
->mutexp
);
324 UNLOCKHANDLE(dbmp
, dbmfp
->mutexp
);
328 * If no other process is already waiting on a created buffer,
329 * go ahead and discard it, it's not useful.
333 if (b_inserted
&& bhp
->ref
== 0)
334 __memp_bhfree(dbmp
, mfp
, bhp
, 1);
339 *(void **)addrp
= addr
;