]>
git.ipfire.org Git - thirdparty/squid.git/blob - src/fs/rock/RockRebuild.cc
2 * DEBUG: section 79 Disk IO Routines
7 #include "fs/rock/RockDbCell.h"
8 #include "fs/rock/RockRebuild.h"
9 #include "fs/rock/RockSwapDir.h"
11 #include "ipc/StoreMap.h"
13 #include "SquidTime.h"
14 #include "store_rebuild.h"
20 CBDATA_NAMESPACED_CLASS_INIT(Rock
, Rebuild
);
23 \defgroup RockFsRebuild Rock Store Rebuild
26 \section Overview Overview
27 * Several layers of information are manipulated during the rebuild:
29 * Store Entry: Response message plus all the metainformation associated with
30 * it. Identified by store key. At any given time, from Squid point
31 * of view, there is only one entry with a given key, but several
32 * different entries with the same key can be observed in any historical
33 * archive (such as an access log or a store database).
35 * Slot chain: A sequence of db slots representing a Store Entry state at
36 * some point in time. Identified by key+version combination. Due to
37 * transaction aborts, crashes, and idle periods, some chains may contain
38 * incomplete or stale information. We assume that no two different chains
39 * have the same key and version. If that assumption fails, we may serve a
40 * hodgepodge entry during rebuild, until "extra" slots are loaded/noticed.
42 * Db slot: A db record containing a piece of a single store entry and linked
43 * to other slots with the same key and version fields, forming a chain.
44 * Slots are identified by their absolute position in the database file,
45 * which is naturally unique.
47 * Except for the "mapped", "freed", and "more" fields, LoadingEntry info is
48 * entry-level and is stored at fileno position. In other words, the array of
49 * LoadingEntries should be interpreted as two arrays, one that maps slot ID
50 * to the LoadingEntry::mapped/freed/more members, and the second one that maps
51 * fileno to all other LoadingEntry members. StoreMap maps slot key to fileno.
53 * When information from the newly loaded db slot contradicts the entry-level
54 * information collected so far (e.g., the versions do not match or the total
55 * chain size after the slot contribution exceeds the expected number), the
56 * whole entry (and not just the chain or the slot!) is declared corrupted.
58 * Why invalidate the whole entry? Rock Store is written for high-load
59 * environments with large caches, where there are usually very few idle slots
60 * in the database. A space occupied by a purged entry is usually immediately
61 * reclaimed. A Squid crash or a transaction abort is rather unlikely to
62 * leave a relatively large number of stale slots in the database. Thus, the
63 * number of potentially corrupted entries is relatively small. On the other
64 * hand, the damage from serving a single hodgepodge entry may be significant
65 * to the user. In such an environment, invalidating the whole entry has
66 * negligible performance impact but saves us from high-damage bugs.
72 /// maintains information about the store entry being loaded from disk
73 /// used for identifying partially stored/loaded entries
77 LoadingEntry(): size(0), version(0), state(leEmpty
), anchored(0),
78 mapped(0), freed(0), more(-1) {}
80 /* store entry-level information indexed by sfileno */
81 uint64_t size
; ///< payload seen so far
82 uint32_t version
; ///< DbCellHeader::version to distinguish same-URL chains
83 uint32_t state
:3; ///< current entry state (one of the State values)
84 uint32_t anchored
:1; ///< whether we loaded the inode slot for this entry
86 /* db slot-level information indexed by slotId, starting with firstSlot */
87 uint32_t mapped
:1; ///< whether this slot was added to a mapped entry
88 uint32_t freed
:1; ///< whether this slot was marked as free
89 sfileno more
:25; ///< another slot in some entry chain (unordered)
90 bool used() const { return freed
|| mapped
|| more
!= -1; }
92 /// possible entry states
93 typedef enum { leEmpty
= 0, leLoading
, leLoaded
, leCorrupted
, leIgnored
} State
;
96 } /* namespace Rock */
98 Rock::Rebuild::Rebuild(SwapDir
*dir
): AsyncJob("Rock::Rebuild"),
110 memset(&counts
, 0, sizeof(counts
));
111 dbSize
= sd
->diskOffsetLimit(); // we do not care about the trailer waste
112 dbEntrySize
= sd
->slotSize
;
113 dbEntryLimit
= sd
->entryLimit();
116 Rock::Rebuild::~Rebuild()
123 /// prepares and initiates entry loading sequence
125 Rock::Rebuild::start()
127 // in SMP mode, only the disker is responsible for populating the map
128 if (UsingSmp() && !IamDiskProcess()) {
129 debugs(47, 2, "Non-disker skips rebuilding of cache_dir #" <<
130 sd
->index
<< " from " << sd
->filePath
);
131 mustStop("non-disker");
135 debugs(47, DBG_IMPORTANT
, "Loading cache_dir #" << sd
->index
<<
136 " from " << sd
->filePath
);
138 fd
= file_open(sd
->filePath
, O_RDONLY
| O_BINARY
);
140 failure("cannot open db", errno
);
142 char hdrBuf
[SwapDir::HeaderSize
];
143 if (read(fd
, hdrBuf
, sizeof(hdrBuf
)) != SwapDir::HeaderSize
)
144 failure("cannot read db header", errno
);
146 // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours
147 assert(sizeof(DbCellHeader
) < SM_PAGE_SIZE
);
148 buf
.init(SM_PAGE_SIZE
, SM_PAGE_SIZE
);
150 dbOffset
= SwapDir::HeaderSize
;
153 entries
= new LoadingEntry
[dbEntryLimit
];
158 /// continues after a pause if not done
160 Rock::Rebuild::checkpoint()
163 eventAdd("Rock::Rebuild", Rock::Rebuild::Steps
, this, 0.01, 1, true);
167 Rock::Rebuild::doneAll() const
169 return dbOffset
>= dbSize
&& validationPos
>= dbEntryLimit
&&
174 Rock::Rebuild::Steps(void *data
)
176 // use async call to enable job call protection that time events lack
177 CallJobHere(47, 5, static_cast<Rebuild
*>(data
), Rock::Rebuild
, steps
);
181 Rock::Rebuild::steps()
183 if (dbOffset
< dbSize
)
192 Rock::Rebuild::loadingSteps()
194 debugs(47,5, sd
->index
<< " slot " << loadingPos
<< " at " <<
195 dbOffset
<< " <= " << dbSize
);
197 // Balance our desire to maximize the number of entries processed at once
198 // (and, hence, minimize overheads and total rebuild time) with a
199 // requirement to also process Coordinator events, disk I/Os, etc.
200 const int maxSpentMsec
= 50; // keep small: most RAM I/Os are under 1ms
201 const timeval loopStart
= current_time
;
204 while (loaded
< dbEntryLimit
&& dbOffset
< dbSize
) {
206 dbOffset
+= dbEntrySize
;
210 if (counts
.scancount
% 1000 == 0)
211 storeRebuildProgress(sd
->index
, dbEntryLimit
, counts
.scancount
);
213 if (opt_foreground_rebuild
)
214 continue; // skip "few entries at a time" check below
217 const double elapsedMsec
= tvSubMsec(loopStart
, current_time
);
218 if (elapsedMsec
> maxSpentMsec
|| elapsedMsec
< 0) {
219 debugs(47, 5, HERE
<< "pausing after " << loaded
<< " entries in " <<
220 elapsedMsec
<< "ms; " << (elapsedMsec
/loaded
) << "ms per entry");
227 Rock::Rebuild::loadOneSlot()
229 debugs(47,5, sd
->index
<< " slot " << loadingPos
<< " at " <<
230 dbOffset
<< " <= " << dbSize
);
234 if (lseek(fd
, dbOffset
, SEEK_SET
) < 0)
235 failure("cannot seek to db entry", errno
);
239 if (!storeRebuildLoadEntry(fd
, sd
->index
, buf
, counts
))
242 const SlotId slotId
= loadingPos
;
246 if (buf
.contentSize() < static_cast<mb_size_t
>(sizeof(header
))) {
247 debugs(47, DBG_IMPORTANT
, "WARNING: cache_dir[" << sd
->index
<< "]: " <<
248 "Ignoring truncated " << buf
.contentSize() << "-byte " <<
249 "cache entry meta data at " << dbOffset
);
250 freeSlotIfIdle(slotId
, true);
253 memcpy(&header
, buf
.content(), sizeof(header
));
254 if (header
.empty()) {
255 freeSlotIfIdle(slotId
, false);
258 if (!header
.sane(dbEntrySize
, dbEntryLimit
)) {
259 debugs(47, DBG_IMPORTANT
, "WARNING: cache_dir[" << sd
->index
<< "]: " <<
260 "Ignoring malformed cache entry meta data at " << dbOffset
);
261 freeSlotIfIdle(slotId
, true);
264 buf
.consume(sizeof(header
)); // optimize to avoid memmove()
266 useNewSlot(slotId
, header
);
269 /// parse StoreEntry basics and add them to the map, returning true on success
271 Rock::Rebuild::importEntry(Ipc::StoreMapAnchor
&anchor
, const sfileno fileno
, const DbCellHeader
&header
)
273 cache_key key
[SQUID_MD5_DIGEST_LENGTH
];
275 const uint64_t knownSize
= header
.entrySize
> 0 ?
276 header
.entrySize
: anchor
.basics
.swap_file_sz
.get();
277 if (!storeRebuildParseEntry(buf
, loadedE
, key
, counts
, knownSize
))
280 // the entry size may still be unknown at this time
282 debugs(47, 8, "importing basics for entry " << fileno
<<
283 " swap_file_sz: " << loadedE
.swap_file_sz
);
286 // we have not validated whether all db cells for this entry were loaded
287 EBIT_CLR(anchor
.basics
.flags
, ENTRY_VALIDATED
);
295 Rock::Rebuild::validationSteps()
297 debugs(47, 5, sd
->index
<< " validating from " << validationPos
);
299 // see loadingSteps() for the rationale; TODO: avoid duplication
300 const int maxSpentMsec
= 50; // keep small: validation does not do I/O
301 const timeval loopStart
= current_time
;
304 while (validationPos
< dbEntryLimit
) {
309 if (validationPos
% 1000 == 0)
310 debugs(20, 2, "validated: " << validationPos
);
312 if (opt_foreground_rebuild
)
313 continue; // skip "few entries at a time" check below
316 const double elapsedMsec
= tvSubMsec(loopStart
, current_time
);
317 if (elapsedMsec
> maxSpentMsec
|| elapsedMsec
< 0) {
318 debugs(47, 5, "pausing after " << validated
<< " entries in " <<
319 elapsedMsec
<< "ms; " << (elapsedMsec
/validated
) << "ms per entry");
326 Rock::Rebuild::validateOneEntry()
328 LoadingEntry
&e
= entries
[validationPos
];
331 case LoadingEntry::leEmpty
:
332 break; // no entry hashed to this position
334 case LoadingEntry::leLoading
:
335 freeBadEntry(validationPos
, "partially stored");
338 case LoadingEntry::leLoaded
:
339 break; // we have already unlocked this entry
341 case LoadingEntry::leCorrupted
:
342 break; // we have already removed this entry
346 /// Marks remaining bad entry slots as free and unlocks the entry. The map
347 /// cannot do this because Loading entries may have holes in the slots chain.
349 Rock::Rebuild::freeBadEntry(const sfileno fileno
, const char *eDescription
)
351 debugs(47, 2, "cache_dir #" << sd
->index
<< ' ' << eDescription
<<
352 " entry " << fileno
<< " is ignored during rebuild");
354 Ipc::StoreMapAnchor
&anchor
= sd
->map
->writeableEntry(fileno
);
356 bool freedSome
= false;
357 // free all loaded non-anchor slots
358 SlotId slotId
= entries
[anchor
.start
].more
;
359 while (slotId
>= 0) {
360 const SlotId next
= entries
[slotId
].more
;
361 freeSlot(slotId
, false);
365 // free anchor slot if it was loaded
366 if (entries
[fileno
].anchored
) {
367 freeSlot(anchor
.start
, false);
372 sd
->map
->forgetWritingEntry(fileno
);
377 Rock::Rebuild::swanSong()
379 debugs(47,3, HERE
<< "cache_dir #" << sd
->index
<< " rebuild level: " <<
380 StoreController::store_dirs_rebuilding
);
381 --StoreController::store_dirs_rebuilding
;
382 storeRebuildComplete(&counts
);
386 Rock::Rebuild::failure(const char *msg
, int errNo
)
388 debugs(47,5, sd
->index
<< " slot " << loadingPos
<< " at " <<
389 dbOffset
<< " <= " << dbSize
);
392 debugs(47, DBG_CRITICAL
, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo
));
393 debugs(47, DBG_CRITICAL
, "Do you need to run 'squid -z' to initialize storage?");
396 fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.",
397 sd
->index
, sd
->filePath
, msg
);
400 /// adds slot to the free slot index
402 Rock::Rebuild::freeSlot(const SlotId slotId
, const bool invalid
)
404 debugs(47,5, sd
->index
<< " frees slot " << slotId
);
405 LoadingEntry
&le
= entries
[slotId
];
411 //sd->unlink(fileno); leave garbage on disk, it should not hurt
414 Ipc::Mem::PageId pageId
;
415 pageId
.pool
= sd
->index
+1;
416 pageId
.number
= slotId
+1;
417 sd
->freeSlots
->push(pageId
);
420 /// adds slot to the free slot index but only if the slot is unused
422 Rock::Rebuild::freeSlotIfIdle(const SlotId slotId
, const bool invalid
)
424 const LoadingEntry
&le
= entries
[slotId
];
426 // mapped slots must be freed via freeBadEntry() to keep the map in sync
430 freeSlot(slotId
, invalid
);
433 /// adds slot to the entry chain in the map
435 Rock::Rebuild::mapSlot(const SlotId slotId
, const DbCellHeader
&header
)
437 LoadingEntry
&le
= entries
[slotId
];
442 Ipc::StoreMapSlice slice
;
443 slice
.next
= header
.nextSlot
;
444 slice
.size
= header
.payloadSize
;
445 sd
->map
->importSlice(slotId
, slice
);
448 /// adds slot to an existing entry chain; caller must check that the slot
449 /// belongs to the chain it is being added to
451 Rock::Rebuild::addSlotToEntry(const sfileno fileno
, const SlotId slotId
, const DbCellHeader
&header
)
453 LoadingEntry
&le
= entries
[fileno
];
454 Ipc::StoreMapAnchor
&anchor
= sd
->map
->writeableEntry(fileno
);
456 assert(le
.version
== header
.version
);
458 // mark anchor as loaded or add the secondary slot to the chain
459 LoadingEntry
&inode
= entries
[header
.firstSlot
];
460 if (header
.firstSlot
== slotId
) {
461 debugs(47,5, "adding inode");
462 assert(!inode
.freed
);
465 debugs(47,9, "linking " << slotId
<< " to " << inode
.more
);
466 // we do not need to preserve the order
467 LoadingEntry
&slice
= entries
[slotId
];
468 assert(!slice
.freed
);
469 assert(slice
.more
< 0);
470 slice
.more
= inode
.more
;
474 if (header
.firstSlot
== slotId
&& !importEntry(anchor
, fileno
, header
)) {
475 le
.state
= LoadingEntry::leCorrupted
;
476 freeBadEntry(fileno
, "corrupted metainfo");
480 // set total entry size and/or check it for consistency
481 debugs(47, 8, "header.entrySize: " << header
.entrySize
<< " swap_file_sz: " << anchor
.basics
.swap_file_sz
);
482 uint64_t totalSize
= header
.entrySize
;
483 assert(totalSize
!= static_cast<uint64_t>(-1));
484 if (!totalSize
&& anchor
.basics
.swap_file_sz
) {
485 assert(anchor
.basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
486 // perhaps we loaded a later slot (with entrySize) earlier
487 totalSize
= anchor
.basics
.swap_file_sz
;
488 } else if (totalSize
&& !anchor
.basics
.swap_file_sz
) {
489 anchor
.basics
.swap_file_sz
= totalSize
;
490 assert(anchor
.basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
491 } else if (totalSize
!= anchor
.basics
.swap_file_sz
) {
492 le
.state
= LoadingEntry::leCorrupted
;
493 freeBadEntry(fileno
, "size mismatch");
497 le
.size
+= header
.payloadSize
;
499 if (totalSize
> 0 && le
.size
> totalSize
) { // overflow
500 debugs(47, 8, "overflow: " << le
.size
<< " > " << totalSize
);
501 le
.state
= LoadingEntry::leCorrupted
;
502 freeBadEntry(fileno
, "overflowing");
506 mapSlot(slotId
, header
);
507 if (totalSize
> 0 && le
.size
== totalSize
) {
508 // entry fully loaded, unlock it
509 // we have validated that all db cells for this entry were loaded
510 EBIT_SET(anchor
.basics
.flags
, ENTRY_VALIDATED
);
511 le
.state
= LoadingEntry::leLoaded
;
512 sd
->map
->closeForWriting(fileno
, false);
517 /// initialize housekeeping information for a newly accepted entry
519 Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor
&anchor
, const sfileno fileno
, const DbCellHeader
&header
)
521 anchor
.setKey(reinterpret_cast<const cache_key
*>(header
.key
));
522 assert(header
.firstSlot
>= 0);
523 anchor
.start
= header
.firstSlot
;
525 assert(anchor
.basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
527 LoadingEntry
&le
= entries
[fileno
];
528 le
.state
= LoadingEntry::leLoading
;
529 le
.version
= header
.version
;
533 /// handle a slot from an entry that we have not seen before
535 Rock::Rebuild::startNewEntry(const sfileno fileno
, const SlotId slotId
, const DbCellHeader
&header
)
537 // If some other from-disk entry is/was using this slot as its inode OR
538 // if some other from-disk entry is/was using our inode slot, then the
539 // entries are conflicting. We cannot identify other entries, so we just
540 // remove ours and hope that the others were/will be handled correctly.
541 const LoadingEntry
&slice
= entries
[slotId
];
542 const LoadingEntry
&inode
= entries
[header
.firstSlot
];
543 if (slice
.used() || inode
.used()) {
544 debugs(47,8, "slice/inode used: " << slice
.used() << inode
.used());
545 LoadingEntry
&le
= entries
[fileno
];
546 le
.state
= LoadingEntry::leCorrupted
;
547 freeSlotIfIdle(slotId
, slotId
== header
.firstSlot
);
548 // if not idle, the other entry will handle its slice
553 // A miss may have been stored at our fileno while we were loading other
554 // slots from disk. We ought to preserve that entry because it is fresher.
555 const bool overwriteExisting
= false;
556 if (Ipc::StoreMap::Anchor
*anchor
= sd
->map
->openForWritingAt(fileno
, overwriteExisting
)) {
557 primeNewEntry(*anchor
, fileno
, header
);
558 addSlotToEntry(fileno
, slotId
, header
); // may fail
559 assert(anchor
->basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
561 // A new from-network entry is occupying our map slot; let it be, but
562 // save us from the trouble of going through the above motions again.
563 LoadingEntry
&le
= entries
[fileno
];
564 le
.state
= LoadingEntry::leIgnored
;
565 freeSlotIfIdle(slotId
, false);
569 /// does the header belong to the fileno entry being loaded?
571 Rock::Rebuild::sameEntry(const sfileno fileno
, const DbCellHeader
&header
) const
573 const Ipc::StoreMap::Anchor
&anchor
= sd
->map
->writeableEntry(fileno
);
574 const LoadingEntry
&le
= entries
[fileno
];
575 // any order will work, but do fast comparisons first:
576 return le
.version
== header
.version
&&
577 anchor
.start
== static_cast<Ipc::StoreMapSliceId
>(header
.firstSlot
) &&
578 anchor
.sameKey(reinterpret_cast<const cache_key
*>(header
.key
));
581 /// is the new header consistent with information already loaded?
583 Rock::Rebuild::canAdd(const sfileno fileno
, const SlotId slotId
, const DbCellHeader
&header
) const
585 if (!sameEntry(fileno
, header
)) {
586 debugs(79, 7, "cannot add; wrong entry");
590 const LoadingEntry
&le
= entries
[slotId
];
591 // We cannot add a slot that was already declared free or mapped.
592 if (le
.freed
|| le
.mapped
) {
593 debugs(79, 7, "cannot add; freed/mapped: " << le
.freed
<< le
.mapped
);
597 if (slotId
== header
.firstSlot
) {
598 // If we are the inode, the anchored flag cannot be set yet.
599 if (entries
[fileno
].anchored
) {
600 debugs(79, 7, "cannot add; extra anchor");
604 // And there should have been some other slot for this entry to exist.
606 debugs(79, 7, "cannot add; missing slots");
613 // We are the continuation slice so the more field is reserved for us.
615 debugs(79, 7, "cannot add; foreign slot");
622 /// handle freshly loaded (and validated) db slot header
624 Rock::Rebuild::useNewSlot(const SlotId slotId
, const DbCellHeader
&header
)
626 LoadingEntry
&slice
= entries
[slotId
];
627 assert(!slice
.freed
); // we cannot free what was not loaded
629 const cache_key
*const key
=
630 reinterpret_cast<const cache_key
*>(header
.key
);
631 const sfileno fileno
= sd
->map
->anchorIndexByKey(key
);
632 assert(0 <= fileno
&& fileno
< dbEntryLimit
);
634 LoadingEntry
&le
= entries
[fileno
];
635 debugs(47,9, "entry " << fileno
<< " state: " << le
.state
<< ", inode: " <<
636 header
.firstSlot
<< ", size: " << header
.payloadSize
);
640 case LoadingEntry::leEmpty
: {
641 startNewEntry(fileno
, slotId
, header
);
645 case LoadingEntry::leLoading
: {
646 if (canAdd(fileno
, slotId
, header
)) {
647 addSlotToEntry(fileno
, slotId
, header
);
649 // either the loading chain or this slot is stale;
650 // be conservative and ignore both (and any future ones)
651 le
.state
= LoadingEntry::leCorrupted
;
652 freeBadEntry(fileno
, "duplicated");
653 freeSlotIfIdle(slotId
, slotId
== header
.firstSlot
);
659 case LoadingEntry::leLoaded
: {
660 // either the previously loaded chain or this slot is stale;
661 // be conservative and ignore both (and any future ones)
662 le
.state
= LoadingEntry::leCorrupted
;
663 sd
->map
->freeEntry(fileno
); // may not be immediately successful
664 freeSlotIfIdle(slotId
, slotId
== header
.firstSlot
);
669 case LoadingEntry::leCorrupted
: {
670 // previously seen slots messed things up so we must ignore this one
671 freeSlotIfIdle(slotId
, false);
675 case LoadingEntry::leIgnored
: {
676 // already replaced by a fresher or colliding from-network entry
677 freeSlotIfIdle(slotId
, false);