]>
git.ipfire.org Git - thirdparty/squid.git/blob - src/fs/rock/RockRebuild.cc
2 * DEBUG: section 79 Disk IO Routines
7 #include "fs/rock/RockDbCell.h"
8 #include "fs/rock/RockRebuild.h"
9 #include "fs/rock/RockSwapDir.h"
11 #include "ipc/StoreMap.h"
13 #include "SquidTime.h"
14 #include "store_rebuild.h"
22 CBDATA_NAMESPACED_CLASS_INIT(Rock
, Rebuild
);
25 \defgroup RockFsRebuild Rock Store Rebuild
28 \section Overview Overview
29 * Several layers of information are manipulated during the rebuild:
31 * Store Entry: Response message plus all the metainformation associated with
32 * it. Identified by store key. At any given time, from Squid's point
33 * of view, there is only one entry with a given key, but several
34 * different entries with the same key can be observed in any historical
35 * archive (such as an access log or a store database).
37 * Slot chain: A sequence of db slots representing a Store Entry state at
38 * some point in time. Identified by key+version combination. Due to
39 * transaction aborts, crashes, and idle periods, some chains may contain
40 * incomplete or stale information. We assume that no two different chains
41 * have the same key and version. If that assumption fails, we may serve a
42 * hodgepodge entry during rebuild, until "extra" slots are loaded/noticed.
44 * Db slot: A db record containing a piece of a single store entry and linked
45 * to other slots with the same key and version fields, forming a chain.
46 * Slots are identified by their absolute position in the database file,
47 * which is naturally unique.
49 * Except for the "mapped", "freed", and "more" fields, LoadingEntry info is
50 * entry-level and is stored at fileno position. In other words, the array of
51 * LoadingEntries should be interpreted as two arrays, one that maps slot ID
52 * to the LoadingEntry::mapped/freed/more members, and the second one that maps
53 * fileno to all other LoadingEntry members. StoreMap maps slot key to fileno.
55 * When information from the newly loaded db slot contradicts the entry-level
56 * information collected so far (e.g., the versions do not match or the total
57 * chain size after the slot contribution exceeds the expected number), the
58 * whole entry (and not just the chain or the slot!) is declared corrupted.
60 * Why invalidate the whole entry? Rock Store is written for high-load
61 * environments with large caches, where there are usually very few idle slots
62 * in the database. A space occupied by a purged entry is usually immediately
63 * reclaimed. A Squid crash or a transaction abort is rather unlikely to
64 * leave a relatively large number of stale slots in the database. Thus, the
65 * number of potentially corrupted entries is relatively small. On the other
66 * hand, the damage from serving a single hodgepodge entry may be significant
67 * to the user. In such an environment, invalidating the whole entry has
68 * negligible performance impact but saves us from high-damage bugs.
74 /// maintains information about the store entry being loaded from disk
75 /// used for identifying partially stored/loaded entries
79 LoadingEntry(): size(0), version(0), state(leEmpty
), anchored(0),
80 mapped(0), freed(0), more(-1) {}
82 /* store entry-level information indexed by sfileno */
83 uint64_t size
; ///< payload seen so far
84 uint32_t version
; ///< DbCellHeader::version to distinguish same-URL chains
85 uint32_t state
:3; ///< current entry state (one of the State values)
86 uint32_t anchored
:1; ///< whether we loaded the inode slot for this entry
88 /* db slot-level information indexed by slotId, starting with firstSlot */
89 uint32_t mapped
:1; ///< whether this slot was added to a mapped entry
90 uint32_t freed
:1; ///< whether this slot was marked as free
91 sfileno more
:25; ///< another slot in some entry chain (unordered)
92 bool used() const { return freed
|| mapped
|| more
!= -1; }
94 /// possible entry states
95 typedef enum { leEmpty
= 0, leLoading
, leLoaded
, leCorrupted
, leIgnored
} State
;
98 } /* namespace Rock */
100 Rock::Rebuild::Rebuild(SwapDir
*dir
): AsyncJob("Rock::Rebuild"),
112 memset(&counts
, 0, sizeof(counts
));
113 dbSize
= sd
->diskOffsetLimit(); // we do not care about the trailer waste
114 dbEntrySize
= sd
->slotSize
;
115 dbEntryLimit
= sd
->entryLimit();
118 Rock::Rebuild::~Rebuild()
125 /// prepares and initiates entry loading sequence
127 Rock::Rebuild::start()
129 // in SMP mode, only the disker is responsible for populating the map
130 if (UsingSmp() && !IamDiskProcess()) {
131 debugs(47, 2, "Non-disker skips rebuilding of cache_dir #" <<
132 sd
->index
<< " from " << sd
->filePath
);
133 mustStop("non-disker");
137 debugs(47, DBG_IMPORTANT
, "Loading cache_dir #" << sd
->index
<<
138 " from " << sd
->filePath
);
140 fd
= file_open(sd
->filePath
, O_RDONLY
| O_BINARY
);
142 failure("cannot open db", errno
);
144 char hdrBuf
[SwapDir::HeaderSize
];
145 if (read(fd
, hdrBuf
, sizeof(hdrBuf
)) != SwapDir::HeaderSize
)
146 failure("cannot read db header", errno
);
148 // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours
149 assert(sizeof(DbCellHeader
) < SM_PAGE_SIZE
);
150 buf
.init(SM_PAGE_SIZE
, SM_PAGE_SIZE
);
152 dbOffset
= SwapDir::HeaderSize
;
155 entries
= new LoadingEntry
[dbEntryLimit
];
160 /// continues after a pause if not done
162 Rock::Rebuild::checkpoint()
165 eventAdd("Rock::Rebuild", Rock::Rebuild::Steps
, this, 0.01, 1, true);
169 Rock::Rebuild::doneAll() const
171 return dbOffset
>= dbSize
&& validationPos
>= dbEntryLimit
&&
176 Rock::Rebuild::Steps(void *data
)
178 // use async call to enable job call protection that time events lack
179 CallJobHere(47, 5, static_cast<Rebuild
*>(data
), Rock::Rebuild
, steps
);
183 Rock::Rebuild::steps()
185 if (dbOffset
< dbSize
)
194 Rock::Rebuild::loadingSteps()
196 debugs(47,5, sd
->index
<< " slot " << loadingPos
<< " at " <<
197 dbOffset
<< " <= " << dbSize
);
199 // Balance our desire to maximize the number of entries processed at once
200 // (and, hence, minimize overheads and total rebuild time) with a
201 // requirement to also process Coordinator events, disk I/Os, etc.
202 const int maxSpentMsec
= 50; // keep small: most RAM I/Os are under 1ms
203 const timeval loopStart
= current_time
;
206 while (loaded
< dbEntryLimit
&& dbOffset
< dbSize
) {
208 dbOffset
+= dbEntrySize
;
212 if (counts
.scancount
% 1000 == 0)
213 storeRebuildProgress(sd
->index
, dbEntryLimit
, counts
.scancount
);
215 if (opt_foreground_rebuild
)
216 continue; // skip "few entries at a time" check below
219 const double elapsedMsec
= tvSubMsec(loopStart
, current_time
);
220 if (elapsedMsec
> maxSpentMsec
|| elapsedMsec
< 0) {
221 debugs(47, 5, HERE
<< "pausing after " << loaded
<< " entries in " <<
222 elapsedMsec
<< "ms; " << (elapsedMsec
/loaded
) << "ms per entry");
229 Rock::Rebuild::loadOneSlot()
231 debugs(47,5, sd
->index
<< " slot " << loadingPos
<< " at " <<
232 dbOffset
<< " <= " << dbSize
);
236 if (lseek(fd
, dbOffset
, SEEK_SET
) < 0)
237 failure("cannot seek to db entry", errno
);
241 if (!storeRebuildLoadEntry(fd
, sd
->index
, buf
, counts
))
244 const SlotId slotId
= loadingPos
;
248 if (buf
.contentSize() < static_cast<mb_size_t
>(sizeof(header
))) {
249 debugs(47, DBG_IMPORTANT
, "WARNING: cache_dir[" << sd
->index
<< "]: " <<
250 "Ignoring truncated " << buf
.contentSize() << "-byte " <<
251 "cache entry meta data at " << dbOffset
);
252 freeSlotIfIdle(slotId
, true);
255 memcpy(&header
, buf
.content(), sizeof(header
));
256 if (header
.empty()) {
257 freeSlotIfIdle(slotId
, false);
260 if (!header
.sane(dbEntrySize
, dbEntryLimit
)) {
261 debugs(47, DBG_IMPORTANT
, "WARNING: cache_dir[" << sd
->index
<< "]: " <<
262 "Ignoring malformed cache entry meta data at " << dbOffset
);
263 freeSlotIfIdle(slotId
, true);
266 buf
.consume(sizeof(header
)); // optimize to avoid memmove()
268 useNewSlot(slotId
, header
);
271 /// parse StoreEntry basics and add them to the map, returning true on success
273 Rock::Rebuild::importEntry(Ipc::StoreMapAnchor
&anchor
, const sfileno fileno
, const DbCellHeader
&header
)
275 cache_key key
[SQUID_MD5_DIGEST_LENGTH
];
277 const uint64_t knownSize
= header
.entrySize
> 0 ?
278 header
.entrySize
: anchor
.basics
.swap_file_sz
.get();
279 if (!storeRebuildParseEntry(buf
, loadedE
, key
, counts
, knownSize
))
282 // the entry size may still be unknown at this time
284 debugs(47, 8, "importing basics for entry " << fileno
<<
285 " swap_file_sz: " << loadedE
.swap_file_sz
);
288 // we have not validated whether all db cells for this entry were loaded
289 EBIT_CLR(anchor
.basics
.flags
, ENTRY_VALIDATED
);
297 Rock::Rebuild::validationSteps()
299 debugs(47, 5, sd
->index
<< " validating from " << validationPos
);
301 // see loadingSteps() for the rationale; TODO: avoid duplication
302 const int maxSpentMsec
= 50; // keep small: validation does not do I/O
303 const timeval loopStart
= current_time
;
306 while (validationPos
< dbEntryLimit
) {
311 if (validationPos
% 1000 == 0)
312 debugs(20, 2, "validated: " << validationPos
);
314 if (opt_foreground_rebuild
)
315 continue; // skip "few entries at a time" check below
318 const double elapsedMsec
= tvSubMsec(loopStart
, current_time
);
319 if (elapsedMsec
> maxSpentMsec
|| elapsedMsec
< 0) {
320 debugs(47, 5, "pausing after " << validated
<< " entries in " <<
321 elapsedMsec
<< "ms; " << (elapsedMsec
/validated
) << "ms per entry");
328 Rock::Rebuild::validateOneEntry()
330 LoadingEntry
&e
= entries
[validationPos
];
333 case LoadingEntry::leEmpty
:
334 break; // no entry hashed to this position
336 case LoadingEntry::leLoading
:
337 freeBadEntry(validationPos
, "partially stored");
340 case LoadingEntry::leLoaded
:
341 break; // we have already unlocked this entry
343 case LoadingEntry::leCorrupted
:
344 break; // we have already removed this entry
348 /// Marks remaining bad entry slots as free and unlocks the entry. The map
349 /// cannot do this because Loading entries may have holes in the slots chain.
351 Rock::Rebuild::freeBadEntry(const sfileno fileno
, const char *eDescription
)
353 debugs(47, 2, "cache_dir #" << sd
->index
<< ' ' << eDescription
<<
354 " entry " << fileno
<< " is ignored during rebuild");
356 Ipc::StoreMapAnchor
&anchor
= sd
->map
->writeableEntry(fileno
);
358 bool freedSome
= false;
359 // free all loaded non-anchor slots
360 SlotId slotId
= entries
[anchor
.start
].more
;
361 while (slotId
>= 0) {
362 const SlotId next
= entries
[slotId
].more
;
363 freeSlot(slotId
, false);
367 // free anchor slot if it was loaded
368 if (entries
[fileno
].anchored
) {
369 freeSlot(anchor
.start
, false);
374 sd
->map
->forgetWritingEntry(fileno
);
379 Rock::Rebuild::swanSong()
381 debugs(47,3, HERE
<< "cache_dir #" << sd
->index
<< " rebuild level: " <<
382 StoreController::store_dirs_rebuilding
);
383 --StoreController::store_dirs_rebuilding
;
384 storeRebuildComplete(&counts
);
388 Rock::Rebuild::failure(const char *msg
, int errNo
)
390 debugs(47,5, sd
->index
<< " slot " << loadingPos
<< " at " <<
391 dbOffset
<< " <= " << dbSize
);
394 debugs(47, DBG_CRITICAL
, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo
));
395 debugs(47, DBG_CRITICAL
, "Do you need to run 'squid -z' to initialize storage?");
398 fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.",
399 sd
->index
, sd
->filePath
, msg
);
402 /// adds slot to the free slot index
404 Rock::Rebuild::freeSlot(const SlotId slotId
, const bool invalid
)
406 debugs(47,5, sd
->index
<< " frees slot " << slotId
);
407 LoadingEntry
&le
= entries
[slotId
];
413 //sd->unlink(fileno); leave garbage on disk, it should not hurt
416 Ipc::Mem::PageId pageId
;
417 pageId
.pool
= sd
->index
+1;
418 pageId
.number
= slotId
+1;
419 sd
->freeSlots
->push(pageId
);
422 /// adds slot to the free slot index but only if the slot is unused
424 Rock::Rebuild::freeSlotIfIdle(const SlotId slotId
, const bool invalid
)
426 const LoadingEntry
&le
= entries
[slotId
];
428 // mapped slots must be freed via freeBadEntry() to keep the map in sync
432 freeSlot(slotId
, invalid
);
435 /// adds slot to the entry chain in the map
437 Rock::Rebuild::mapSlot(const SlotId slotId
, const DbCellHeader
&header
)
439 LoadingEntry
&le
= entries
[slotId
];
444 Ipc::StoreMapSlice slice
;
445 slice
.next
= header
.nextSlot
;
446 slice
.size
= header
.payloadSize
;
447 sd
->map
->importSlice(slotId
, slice
);
450 /// adds slot to an existing entry chain; caller must check that the slot
451 /// belongs to the chain it is being added to
453 Rock::Rebuild::addSlotToEntry(const sfileno fileno
, const SlotId slotId
, const DbCellHeader
&header
)
455 LoadingEntry
&le
= entries
[fileno
];
456 Ipc::StoreMapAnchor
&anchor
= sd
->map
->writeableEntry(fileno
);
458 assert(le
.version
== header
.version
);
460 // mark anchor as loaded or add the secondary slot to the chain
461 LoadingEntry
&inode
= entries
[header
.firstSlot
];
462 if (header
.firstSlot
== slotId
) {
463 debugs(47,5, "adding inode");
464 assert(!inode
.freed
);
467 debugs(47,9, "linking " << slotId
<< " to " << inode
.more
);
468 // we do not need to preserve the order
469 LoadingEntry
&slice
= entries
[slotId
];
470 assert(!slice
.freed
);
471 assert(slice
.more
< 0);
472 slice
.more
= inode
.more
;
476 if (header
.firstSlot
== slotId
&& !importEntry(anchor
, fileno
, header
)) {
477 le
.state
= LoadingEntry::leCorrupted
;
478 freeBadEntry(fileno
, "corrupted metainfo");
482 // set total entry size and/or check it for consistency
483 debugs(47, 8, "header.entrySize: " << header
.entrySize
<< " swap_file_sz: " << anchor
.basics
.swap_file_sz
);
484 uint64_t totalSize
= header
.entrySize
;
485 assert(totalSize
!= static_cast<uint64_t>(-1));
486 if (!totalSize
&& anchor
.basics
.swap_file_sz
) {
487 assert(anchor
.basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
488 // perhaps we loaded a later slot (with entrySize) earlier
489 totalSize
= anchor
.basics
.swap_file_sz
;
490 } else if (totalSize
&& !anchor
.basics
.swap_file_sz
) {
491 anchor
.basics
.swap_file_sz
= totalSize
;
492 assert(anchor
.basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
493 } else if (totalSize
!= anchor
.basics
.swap_file_sz
) {
494 le
.state
= LoadingEntry::leCorrupted
;
495 freeBadEntry(fileno
, "size mismatch");
499 le
.size
+= header
.payloadSize
;
501 if (totalSize
> 0 && le
.size
> totalSize
) { // overflow
502 debugs(47, 8, "overflow: " << le
.size
<< " > " << totalSize
);
503 le
.state
= LoadingEntry::leCorrupted
;
504 freeBadEntry(fileno
, "overflowing");
508 mapSlot(slotId
, header
);
509 if (totalSize
> 0 && le
.size
== totalSize
) {
510 // entry fully loaded, unlock it
511 // we have validated that all db cells for this entry were loaded
512 EBIT_SET(anchor
.basics
.flags
, ENTRY_VALIDATED
);
513 le
.state
= LoadingEntry::leLoaded
;
514 sd
->map
->closeForWriting(fileno
, false);
519 /// initialize housekeeping information for a newly accepted entry
521 Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor
&anchor
, const sfileno fileno
, const DbCellHeader
&header
)
523 anchor
.setKey(reinterpret_cast<const cache_key
*>(header
.key
));
524 assert(header
.firstSlot
>= 0);
525 anchor
.start
= header
.firstSlot
;
527 assert(anchor
.basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
529 LoadingEntry
&le
= entries
[fileno
];
530 le
.state
= LoadingEntry::leLoading
;
531 le
.version
= header
.version
;
535 /// handle a slot from an entry that we have not seen before
537 Rock::Rebuild::startNewEntry(const sfileno fileno
, const SlotId slotId
, const DbCellHeader
&header
)
539 // If some other from-disk entry is/was using this slot as its inode OR
540 // if some other from-disk entry is/was using our inode slot, then the
541 // entries are conflicting. We cannot identify other entries, so we just
542 // remove ours and hope that the others were/will be handled correctly.
543 const LoadingEntry
&slice
= entries
[slotId
];
544 const LoadingEntry
&inode
= entries
[header
.firstSlot
];
545 if (slice
.used() || inode
.used()) {
546 debugs(47,8, "slice/inode used: " << slice
.used() << inode
.used());
547 LoadingEntry
&le
= entries
[fileno
];
548 le
.state
= LoadingEntry::leCorrupted
;
549 freeSlotIfIdle(slotId
, slotId
== header
.firstSlot
);
550 // if not idle, the other entry will handle its slice
555 // A miss may have been stored at our fileno while we were loading other
556 // slots from disk. We ought to preserve that entry because it is fresher.
557 const bool overwriteExisting
= false;
558 if (Ipc::StoreMap::Anchor
*anchor
= sd
->map
->openForWritingAt(fileno
, overwriteExisting
)) {
559 primeNewEntry(*anchor
, fileno
, header
);
560 addSlotToEntry(fileno
, slotId
, header
); // may fail
561 assert(anchor
->basics
.swap_file_sz
!= static_cast<uint64_t>(-1));
563 // A new from-network entry is occupying our map slot; let it be, but
564 // save us from the trouble of going through the above motions again.
565 LoadingEntry
&le
= entries
[fileno
];
566 le
.state
= LoadingEntry::leIgnored
;
567 freeSlotIfIdle(slotId
, false);
571 /// does the header belong to the fileno entry being loaded?
573 Rock::Rebuild::sameEntry(const sfileno fileno
, const DbCellHeader
&header
) const
575 const Ipc::StoreMap::Anchor
&anchor
= sd
->map
->writeableEntry(fileno
);
576 const LoadingEntry
&le
= entries
[fileno
];
577 // any order will work, but do fast comparisons first:
578 return le
.version
== header
.version
&&
579 anchor
.start
== static_cast<Ipc::StoreMapSliceId
>(header
.firstSlot
) &&
580 anchor
.sameKey(reinterpret_cast<const cache_key
*>(header
.key
));
583 /// is the new header consistent with information already loaded?
585 Rock::Rebuild::canAdd(const sfileno fileno
, const SlotId slotId
, const DbCellHeader
&header
) const
587 if (!sameEntry(fileno
, header
)) {
588 debugs(79, 7, "cannot add; wrong entry");
592 const LoadingEntry
&le
= entries
[slotId
];
593 // We cannot add a slot that was already declared free or mapped.
594 if (le
.freed
|| le
.mapped
) {
595 debugs(79, 7, "cannot add; freed/mapped: " << le
.freed
<< le
.mapped
);
599 if (slotId
== header
.firstSlot
) {
600 // If we are the inode, the anchored flag cannot be set yet.
601 if (entries
[fileno
].anchored
) {
602 debugs(79, 7, "cannot add; extra anchor");
606 // And there should have been some other slot for this entry to exist.
608 debugs(79, 7, "cannot add; missing slots");
615 // We are the continuation slice so the more field is reserved for us.
617 debugs(79, 7, "cannot add; foreign slot");
624 /// handle freshly loaded (and validated) db slot header
626 Rock::Rebuild::useNewSlot(const SlotId slotId
, const DbCellHeader
&header
)
628 LoadingEntry
&slice
= entries
[slotId
];
629 assert(!slice
.freed
); // we cannot free what was not loaded
631 const cache_key
*const key
=
632 reinterpret_cast<const cache_key
*>(header
.key
);
633 const sfileno fileno
= sd
->map
->anchorIndexByKey(key
);
634 assert(0 <= fileno
&& fileno
< dbEntryLimit
);
636 LoadingEntry
&le
= entries
[fileno
];
637 debugs(47,9, "entry " << fileno
<< " state: " << le
.state
<< ", inode: " <<
638 header
.firstSlot
<< ", size: " << header
.payloadSize
);
642 case LoadingEntry::leEmpty
: {
643 startNewEntry(fileno
, slotId
, header
);
647 case LoadingEntry::leLoading
: {
648 if (canAdd(fileno
, slotId
, header
)) {
649 addSlotToEntry(fileno
, slotId
, header
);
651 // either the loading chain or this slot is stale;
652 // be conservative and ignore both (and any future ones)
653 le
.state
= LoadingEntry::leCorrupted
;
654 freeBadEntry(fileno
, "duplicated");
655 freeSlotIfIdle(slotId
, slotId
== header
.firstSlot
);
661 case LoadingEntry::leLoaded
: {
662 // either the previously loaded chain or this slot is stale;
663 // be conservative and ignore both (and any future ones)
664 le
.state
= LoadingEntry::leCorrupted
;
665 sd
->map
->freeEntry(fileno
); // may not be immediately successful
666 freeSlotIfIdle(slotId
, slotId
== header
.firstSlot
);
671 case LoadingEntry::leCorrupted
: {
672 // previously seen slots messed things up so we must ignore this one
673 freeSlotIfIdle(slotId
, false);
677 case LoadingEntry::leIgnored
: {
678 // already replaced by a fresher or colliding from-network entry
679 freeSlotIfIdle(slotId
, false);