]>
git.ipfire.org Git - thirdparty/squid.git/blob - src/fs/rock/RockRebuild.cc
2 * DEBUG: section 79 Disk IO Routines
7 #include "fs/rock/RockDbCell.h"
8 #include "fs/rock/RockRebuild.h"
9 #include "fs/rock/RockSwapDir.h"
11 #include "ipc/StoreMap.h"
13 #include "SquidTime.h"
14 #include "store_rebuild.h"
22 CBDATA_NAMESPACED_CLASS_INIT(Rock
, Rebuild
);
25 \defgroup RockFsRebuild Rock Store Rebuild
28 \section Overview Overview
29 * Several layers of information are manipulated during the rebuild:
31 * Store Entry: Response message plus all the metainformation associated with
32 * it. Identified by store key. At any given time, from Squid's point
33 * of view, there is only one entry with a given key, but several
34 * different entries with the same key can be observed in any historical
35 * archive (such as an access log or a store database).
37 * Slot chain: A sequence of db slots representing a Store Entry state at
38 * some point in time. Identified by key+version combination. Due to
39 * transaction aborts, crashes, and idle periods, some chains may contain
40 * incomplete or stale information. We assume that no two different chains
41 * have the same key and version. If that assumption fails, we may serve a
42 * hodgepodge entry during rebuild, until "extra" slots are loaded/noticed.
44 * Db slot: A db record containing a piece of a single store entry and linked
45 * to other slots with the same key and version fields, forming a chain.
46 * Slots are identified by their absolute position in the database file,
47 * which is naturally unique.
49 * Except for the "mapped", "freed", and "more" fields, LoadingEntry info is
50 * entry-level and is stored at fileno position. In other words, the array of
51 * LoadingEntries should be interpreted as two arrays, one that maps slot ID
52 * to the LoadingEntry::mapped/freed/more members, and the second one that maps
53 * fileno to all other LoadingEntry members. StoreMap maps slot key to fileno.
55 * When information from the newly loaded db slot contradicts the entry-level
56 * information collected so far (e.g., the versions do not match or the total
57 * chain size after the slot contribution exceeds the expected number), the
58 * whole entry (and not just the chain or the slot!) is declared corrupted.
60 * Why invalidate the whole entry? Rock Store is written for high-load
61 * environments with large caches, where there are usually very few idle slots
62 * in the database. A space occupied by a purged entry is usually immediately
63 * reclaimed. A Squid crash or a transaction abort is rather unlikely to
64 * leave a relatively large number of stale slots in the database. Thus, the
65 * number of potentially corrupted entries is relatively small. On the other
66 * hand, the damage from serving a single hodgepodge entry may be significant
67 * to the user. In such an environment, invalidating the whole entry has
68 * negligible performance impact but saves us from high-damage bugs.
74 /// maintains information about the store entry being loaded from disk
75 /// used for identifying partially stored/loaded entries
79 LoadingEntry(): size(0), version(0), state(leEmpty
), anchored(0),
80 mapped(0), freed(0), more(-1) {}
82 /* store entry-level information indexed by sfileno */
83 uint64_t size
; ///< payload seen so far
84 uint32_t version
; ///< DbCellHeader::version to distinguish same-URL chains
85 uint8_t state
:3; ///< current entry state (one of the State values)
86 uint8_t anchored
:1; ///< whether we loaded the inode slot for this entry
88 /* db slot-level information indexed by slotId, starting with firstSlot */
89 uint8_t mapped
:1; ///< whether this slot was added to a mapped entry
90 uint8_t freed
:1; ///< whether this slot was marked as free
91 Ipc::StoreMapSliceId more
; ///< another slot in some entry chain (unordered)
92 bool used() const { return freed
|| mapped
|| more
!= -1; }
94 /// possible entry states
95 typedef enum { leEmpty
= 0, leLoading
, leLoaded
, leCorrupted
, leIgnored
} State
;
98 } /* namespace Rock */
100 Rock::Rebuild::Rebuild(SwapDir
*dir
): AsyncJob("Rock::Rebuild"),
113 memset(&counts
, 0, sizeof(counts
));
114 dbSize
= sd
->diskOffsetLimit(); // we do not care about the trailer waste
115 dbSlotSize
= sd
->slotSize
;
116 dbEntryLimit
= sd
->entryLimitActual();
117 dbSlotLimit
= sd
->slotLimitActual();
118 assert(dbEntryLimit
<= dbSlotLimit
);
121 Rock::Rebuild::~Rebuild()
128 /// prepares and initiates entry loading sequence
130 Rock::Rebuild::start()
132 // in SMP mode, only the disker is responsible for populating the map
133 if (UsingSmp() && !IamDiskProcess()) {
134 debugs(47, 2, "Non-disker skips rebuilding of cache_dir #" <<
135 sd
->index
<< " from " << sd
->filePath
);
136 mustStop("non-disker");
140 debugs(47, DBG_IMPORTANT
, "Loading cache_dir #" << sd
->index
<<
141 " from " << sd
->filePath
);
143 fd
= file_open(sd
->filePath
, O_RDONLY
| O_BINARY
);
145 failure("cannot open db", errno
);
147 char hdrBuf
[SwapDir::HeaderSize
];
148 if (read(fd
, hdrBuf
, sizeof(hdrBuf
)) != SwapDir::HeaderSize
)
149 failure("cannot read db header", errno
);
151 // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours
152 assert(sizeof(DbCellHeader
) < SM_PAGE_SIZE
);
153 buf
.init(SM_PAGE_SIZE
, SM_PAGE_SIZE
);
155 dbOffset
= SwapDir::HeaderSize
;
157 entries
= new LoadingEntry
[dbSlotLimit
];
162 /// continues after a pause if not done
164 Rock::Rebuild::checkpoint()
167 eventAdd("Rock::Rebuild", Rock::Rebuild::Steps
, this, 0.01, 1, true);
171 Rock::Rebuild::doneAll() const
173 return loadingPos
>= dbSlotLimit
&& validationPos
>= dbSlotLimit
&&
178 Rock::Rebuild::Steps(void *data
)
180 // use async call to enable job call protection that time events lack
181 CallJobHere(47, 5, static_cast<Rebuild
*>(data
), Rock::Rebuild
, steps
);
185 Rock::Rebuild::steps()
187 if (loadingPos
< dbSlotLimit
)
196 Rock::Rebuild::loadingSteps()
198 debugs(47,5, sd
->index
<< " slot " << loadingPos
<< " at " <<
199 dbOffset
<< " <= " << dbSize
);
201 // Balance our desire to maximize the number of entries processed at once
202 // (and, hence, minimize overheads and total rebuild time) with a
203 // requirement to also process Coordinator events, disk I/Os, etc.
204 const int maxSpentMsec
= 50; // keep small: most RAM I/Os are under 1ms
205 const timeval loopStart
= current_time
;
208 while (loadingPos
< dbSlotLimit
) {
210 dbOffset
+= dbSlotSize
;
214 if (counts
.scancount
% 1000 == 0)
215 storeRebuildProgress(sd
->index
, dbSlotLimit
, counts
.scancount
);
217 if (opt_foreground_rebuild
)
218 continue; // skip "few entries at a time" check below
221 const double elapsedMsec
= tvSubMsec(loopStart
, current_time
);
222 if (elapsedMsec
> maxSpentMsec
|| elapsedMsec
< 0) {
223 debugs(47, 5, HERE
<< "pausing after " << loaded
<< " entries in " <<
224 elapsedMsec
<< "ms; " << (elapsedMsec
/loaded
) << "ms per entry");
231 Rock::Rebuild::loadOneSlot()
233 debugs(47,5, sd
->index
<< " slot " << loadingPos
<< " at " <<
234 dbOffset
<< " <= " << dbSize
);
238 if (lseek(fd
, dbOffset
, SEEK_SET
) < 0)
239 failure("cannot seek to db entry", errno
);
243 if (!storeRebuildLoadEntry(fd
, sd
->index
, buf
, counts
))
246 const SlotId slotId
= loadingPos
;
250 if (buf
.contentSize() < static_cast<mb_size_t
>(sizeof(header
))) {
251 debugs(47, DBG_IMPORTANT
, "WARNING: cache_dir[" << sd
->index
<< "]: " <<
252 "Ignoring truncated " << buf
.contentSize() << "-byte " <<
253 "cache entry meta data at " << dbOffset
);
254 freeSlotIfIdle(slotId
, true);
257 memcpy(&header
, buf
.content(), sizeof(header
));
258 if (header
.empty()) {
259 freeSlotIfIdle(slotId
, false);
262 if (!header
.sane(dbSlotSize
, dbSlotLimit
)) {
263 debugs(47, DBG_IMPORTANT
, "WARNING: cache_dir[" << sd
->index
<< "]: " <<
264 "Ignoring malformed cache entry meta data at " << dbOffset
);
265 freeSlotIfIdle(slotId
, true);
268 buf
.consume(sizeof(header
)); // optimize to avoid memmove()
270 useNewSlot(slotId
, header
);
273 /// parse StoreEntry basics and add them to the map, returning true on success
275 Rock::Rebuild::importEntry(Ipc::StoreMapAnchor
&anchor
, const sfileno fileno
, const DbCellHeader
&header
)
277 cache_key key
[SQUID_MD5_DIGEST_LENGTH
];
279 const uint64_t knownSize
= header
.entrySize
> 0 ?
280 header
.entrySize
: anchor
.basics
.swap_file_sz
.get();
281 if (!storeRebuildParseEntry(buf
, loadedE
, key
, counts
, knownSize
))
284 // the entry size may still be unknown at this time
286 debugs(47, 8, "importing basics for entry " << fileno
<<
287 " swap_file_sz: " << loadedE
.swap_file_sz
);
290 // we have not validated whether all db cells for this entry were loaded
291 EBIT_CLR(anchor
.basics
.flags
, ENTRY_VALIDATED
);
299 Rock::Rebuild::validationSteps()
301 debugs(47, 5, sd
->index
<< " validating from " << validationPos
);
303 // see loadingSteps() for the rationale; TODO: avoid duplication
304 const int maxSpentMsec
= 50; // keep small: validation does not do I/O
305 const timeval loopStart
= current_time
;
308 while (validationPos
< dbSlotLimit
) {
313 if (validationPos
% 1000 == 0)
314 debugs(20, 2, "validated: " << validationPos
);
316 if (opt_foreground_rebuild
)
317 continue; // skip "few entries at a time" check below
320 const double elapsedMsec
= tvSubMsec(loopStart
, current_time
);
321 if (elapsedMsec
> maxSpentMsec
|| elapsedMsec
< 0) {
322 debugs(47, 5, "pausing after " << validated
<< " entries in " <<
323 elapsedMsec
<< "ms; " << (elapsedMsec
/validated
) << "ms per entry");
330 Rock::Rebuild::validateOneEntry()
332 LoadingEntry
&e
= entries
[validationPos
];
335 case LoadingEntry::leEmpty
:
336 break; // no entry hashed to this position
338 case LoadingEntry::leLoading
:
339 freeBadEntry(validationPos
, "partially stored");
342 case LoadingEntry::leLoaded
:
343 break; // we have already unlocked this entry
345 case LoadingEntry::leCorrupted
:
346 break; // we have already removed this entry
350 /// Marks remaining bad entry slots as free and unlocks the entry. The map
351 /// cannot do this because Loading entries may have holes in the slots chain.
353 Rock::Rebuild::freeBadEntry(const sfileno fileno
, const char *eDescription
)
355 debugs(47, 2, "cache_dir #" << sd
->index
<< ' ' << eDescription
<<
356 " entry " << fileno
<< " is ignored during rebuild");
358 Ipc::StoreMapAnchor
&anchor
= sd
->map
->writeableEntry(fileno
);
360 bool freedSome
= false;
361 // free all loaded non-anchor slots
362 SlotId slotId
= entries
[anchor
.start
].more
;
363 while (slotId
>= 0) {
364 const SlotId next
= entries
[slotId
].more
;
365 freeSlot(slotId
, false);
369 // free anchor slot if it was loaded
370 if (entries
[fileno
].anchored
) {
371 freeSlot(anchor
.start
, false);
376 sd
->map
->forgetWritingEntry(fileno
);
381 Rock::Rebuild::swanSong()
383 debugs(47,3, HERE
<< "cache_dir #" << sd
->index
<< " rebuild level: " <<
384 StoreController::store_dirs_rebuilding
);
385 --StoreController::store_dirs_rebuilding
;
386 storeRebuildComplete(&counts
);
390 Rock::Rebuild::failure(const char *msg
, int errNo
)
392 debugs(47,5, sd
->index
<< " slot " << loadingPos
<< " at " <<
393 dbOffset
<< " <= " << dbSize
);
396 debugs(47, DBG_CRITICAL
, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo
));
397 debugs(47, DBG_CRITICAL
, "Do you need to run 'squid -z' to initialize storage?");
400 fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.",
401 sd
->index
, sd
->filePath
, msg
);
404 /// adds slot to the free slot index
406 Rock::Rebuild::freeSlot(const SlotId slotId
, const bool invalid
)
408 debugs(47,5, sd
->index
<< " frees slot " << slotId
);
409 LoadingEntry
&le
= entries
[slotId
];
415 //sd->unlink(fileno); leave garbage on disk, it should not hurt
418 Ipc::Mem::PageId pageId
;
419 pageId
.pool
= sd
->index
+1;
420 pageId
.number
= slotId
+1;
421 sd
->freeSlots
->push(pageId
);
424 /// adds slot to the free slot index but only if the slot is unused
426 Rock::Rebuild::freeSlotIfIdle(const SlotId slotId
, const bool invalid
)
428 const LoadingEntry
&le
= entries
[slotId
];
430 // mapped slots must be freed via freeBadEntry() to keep the map in sync
434 freeSlot(slotId
, invalid
);
437 /// adds slot to the entry chain in the map
439 Rock::Rebuild::mapSlot(const SlotId slotId
, const DbCellHeader
&header
)
441 LoadingEntry
&le
= entries
[slotId
];
446 Ipc::StoreMapSlice slice
;
447 slice
.next
= header
.nextSlot
;
448 slice
.size
= header
.payloadSize
;
449 sd
->map
->importSlice(slotId
, slice
);
452 /// adds slot to an existing entry chain; caller must check that the slot
453 /// belongs to the chain it is being added to
455 Rock::Rebuild::addSlotToEntry(const sfileno fileno
, const SlotId slotId
, const DbCellHeader
&header
)
457 LoadingEntry
&le
= entries
[fileno
];
458 Ipc::StoreMapAnchor
&anchor
= sd
->map
->writeableEntry(fileno
);
460 assert(le
.version
== header
.version
);
462 // mark anchor as loaded or add the secondary slot to the chain
463 LoadingEntry
&inode
= entries
[header
.firstSlot
];
464 if (header
.firstSlot
== slotId
) {
465 debugs(47,5, "adding inode");
466 assert(!inode
.freed
);
469 debugs(47,9, "linking " << slotId
<< " to " << inode
.more
);
470 // we do not need to preserve the order
471 LoadingEntry
&slice
= entries
[slotId
];
472 assert(!slice
.freed
);
473 assert(slice
.more
< 0);
474 slice
.more
= inode
.more
;
478 if (header
.firstSlot
== slotId
&& !importEntry(anchor
, fileno
, header
)) {
479 le
.state
= LoadingEntry::leCorrupted
;
480 freeBadEntry(fileno
, "corrupted metainfo");
484 // set total entry size and/or check it for consistency
485 debugs(47, 8, "header.entrySize: " << header
.entrySize
<< " swap_file_sz: " << anchor
.basics
.swap_file_sz
);
486 uint64_t totalSize
= header
.entrySize
;
487 assert(totalSize
!= static_cast<uint64_t>(-1));
488 if (!totalSize
&& anchor
.basics
.swap_file_sz
) {
489 assert(anchor
.basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
490 // perhaps we loaded a later slot (with entrySize) earlier
491 totalSize
= anchor
.basics
.swap_file_sz
;
492 } else if (totalSize
&& !anchor
.basics
.swap_file_sz
) {
493 anchor
.basics
.swap_file_sz
= totalSize
;
494 assert(anchor
.basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
495 } else if (totalSize
!= anchor
.basics
.swap_file_sz
) {
496 le
.state
= LoadingEntry::leCorrupted
;
497 freeBadEntry(fileno
, "size mismatch");
501 le
.size
+= header
.payloadSize
;
503 if (totalSize
> 0 && le
.size
> totalSize
) { // overflow
504 debugs(47, 8, "overflow: " << le
.size
<< " > " << totalSize
);
505 le
.state
= LoadingEntry::leCorrupted
;
506 freeBadEntry(fileno
, "overflowing");
510 mapSlot(slotId
, header
);
511 if (totalSize
> 0 && le
.size
== totalSize
) {
512 // entry fully loaded, unlock it
513 // we have validated that all db cells for this entry were loaded
514 EBIT_SET(anchor
.basics
.flags
, ENTRY_VALIDATED
);
515 le
.state
= LoadingEntry::leLoaded
;
516 sd
->map
->closeForWriting(fileno
, false);
521 /// initialize housekeeping information for a newly accepted entry
523 Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor
&anchor
, const sfileno fileno
, const DbCellHeader
&header
)
525 anchor
.setKey(reinterpret_cast<const cache_key
*>(header
.key
));
526 assert(header
.firstSlot
>= 0);
527 anchor
.start
= header
.firstSlot
;
529 assert(anchor
.basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
531 LoadingEntry
&le
= entries
[fileno
];
532 le
.state
= LoadingEntry::leLoading
;
533 le
.version
= header
.version
;
537 /// handle a slot from an entry that we have not seen before
539 Rock::Rebuild::startNewEntry(const sfileno fileno
, const SlotId slotId
, const DbCellHeader
&header
)
541 // If some other from-disk entry is/was using this slot as its inode OR
542 // if some other from-disk entry is/was using our inode slot, then the
543 // entries are conflicting. We cannot identify other entries, so we just
544 // remove ours and hope that the others were/will be handled correctly.
545 const LoadingEntry
&slice
= entries
[slotId
];
546 const LoadingEntry
&inode
= entries
[header
.firstSlot
];
547 if (slice
.used() || inode
.used()) {
548 debugs(47,8, "slice/inode used: " << slice
.used() << inode
.used());
549 LoadingEntry
&le
= entries
[fileno
];
550 le
.state
= LoadingEntry::leCorrupted
;
551 freeSlotIfIdle(slotId
, slotId
== header
.firstSlot
);
552 // if not idle, the other entry will handle its slice
557 // A miss may have been stored at our fileno while we were loading other
558 // slots from disk. We ought to preserve that entry because it is fresher.
559 const bool overwriteExisting
= false;
560 if (Ipc::StoreMap::Anchor
*anchor
= sd
->map
->openForWritingAt(fileno
, overwriteExisting
)) {
561 primeNewEntry(*anchor
, fileno
, header
);
562 addSlotToEntry(fileno
, slotId
, header
); // may fail
563 assert(anchor
->basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
565 // A new from-network entry is occupying our map slot; let it be, but
566 // save us from the trouble of going through the above motions again.
567 LoadingEntry
&le
= entries
[fileno
];
568 le
.state
= LoadingEntry::leIgnored
;
569 freeSlotIfIdle(slotId
, false);
573 /// does the header belong to the fileno entry being loaded?
575 Rock::Rebuild::sameEntry(const sfileno fileno
, const DbCellHeader
&header
) const
577 const Ipc::StoreMap::Anchor
&anchor
= sd
->map
->writeableEntry(fileno
);
578 const LoadingEntry
&le
= entries
[fileno
];
579 // any order will work, but do fast comparisons first:
580 return le
.version
== header
.version
&&
581 anchor
.start
== static_cast<Ipc::StoreMapSliceId
>(header
.firstSlot
) &&
582 anchor
.sameKey(reinterpret_cast<const cache_key
*>(header
.key
));
585 /// is the new header consistent with information already loaded?
587 Rock::Rebuild::canAdd(const sfileno fileno
, const SlotId slotId
, const DbCellHeader
&header
) const
589 if (!sameEntry(fileno
, header
)) {
590 debugs(79, 7, "cannot add; wrong entry");
594 const LoadingEntry
&le
= entries
[slotId
];
595 // We cannot add a slot that was already declared free or mapped.
596 if (le
.freed
|| le
.mapped
) {
597 debugs(79, 7, "cannot add; freed/mapped: " << le
.freed
<< le
.mapped
);
601 if (slotId
== header
.firstSlot
) {
602 // If we are the inode, the anchored flag cannot be set yet.
603 if (entries
[fileno
].anchored
) {
604 debugs(79, 7, "cannot add; extra anchor");
608 // And there should have been some other slot for this entry to exist.
610 debugs(79, 7, "cannot add; missing slots");
617 // We are the continuation slice so the more field is reserved for us.
619 debugs(79, 7, "cannot add; foreign slot");
626 /// handle freshly loaded (and validated) db slot header
628 Rock::Rebuild::useNewSlot(const SlotId slotId
, const DbCellHeader
&header
)
630 LoadingEntry
&slice
= entries
[slotId
];
631 assert(!slice
.freed
); // we cannot free what was not loaded
633 const cache_key
*const key
=
634 reinterpret_cast<const cache_key
*>(header
.key
);
635 const sfileno fileno
= sd
->map
->anchorIndexByKey(key
);
636 assert(0 <= fileno
&& fileno
< dbEntryLimit
);
638 LoadingEntry
&le
= entries
[fileno
];
639 debugs(47,9, "entry " << fileno
<< " state: " << le
.state
<< ", inode: " <<
640 header
.firstSlot
<< ", size: " << header
.payloadSize
);
644 case LoadingEntry::leEmpty
: {
645 startNewEntry(fileno
, slotId
, header
);
649 case LoadingEntry::leLoading
: {
650 if (canAdd(fileno
, slotId
, header
)) {
651 addSlotToEntry(fileno
, slotId
, header
);
653 // either the loading chain or this slot is stale;
654 // be conservative and ignore both (and any future ones)
655 le
.state
= LoadingEntry::leCorrupted
;
656 freeBadEntry(fileno
, "duplicated");
657 freeSlotIfIdle(slotId
, slotId
== header
.firstSlot
);
663 case LoadingEntry::leLoaded
: {
664 // either the previously loaded chain or this slot is stale;
665 // be conservative and ignore both (and any future ones)
666 le
.state
= LoadingEntry::leCorrupted
;
667 sd
->map
->freeEntry(fileno
); // may not be immediately successful
668 freeSlotIfIdle(slotId
, slotId
== header
.firstSlot
);
673 case LoadingEntry::leCorrupted
: {
674 // previously seen slots messed things up so we must ignore this one
675 freeSlotIfIdle(slotId
, false);
679 case LoadingEntry::leIgnored
: {
680 // already replaced by a fresher or colliding from-network entry
681 freeSlotIfIdle(slotId
, false);