/*
 * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

/* DEBUG: section 79    Disk IO Routines */

#include "squid.h"
#include "base/AsyncJobCalls.h"
#include "fs/rock/RockDbCell.h"
#include "fs/rock/RockRebuild.h"
#include "fs/rock/RockSwapDir.h"
#include "globals.h"
#include "ipc/StoreMap.h"
#include "md5.h"
#include "SquidTime.h"
#include "store_rebuild.h"
#include "tools.h"

#include <cerrno>

CBDATA_NAMESPACED_CLASS_INIT(Rock, Rebuild);

/**
 \defgroup RockFsRebuild Rock Store Rebuild
 \ingroup Filesystems
 *
 \section Overview Overview
 *
 * Several layers of information are manipulated during the rebuild:
 \par
 * Store Entry: Response message plus all the metainformation associated with
 * it. Identified by store key. At any given time, from the Squid point
 * of view, there is only one entry with a given key, but several
 * different entries with the same key can be observed in any historical
 * archive (such as an access log or a store database).
 \par
 * Slot chain: A sequence of db slots representing a Store Entry state at
 * some point in time. Identified by key+version combination. Due to
 * transaction aborts, crashes, and idle periods, some chains may contain
 * incomplete or stale information. We assume that no two different chains
 * have the same key and version. If that assumption fails, we may serve a
 * hodgepodge entry during rebuild, until "extra" slots are loaded/noticed.
 \par
 * Db slot: A db record containing a piece of a single store entry and linked
 * to other slots with the same key and version fields, forming a chain.
 * Slots are identified by their absolute position in the database file,
 * which is naturally unique.
 \par
 * Except for the "mapped", "freed", and "more" fields, LoadingEntry info is
 * entry-level and is stored at fileno position. In other words, the array of
 * LoadingEntries should be interpreted as two arrays, one that maps slot ID
 * to the LoadingEntry::mapped/freed/more members, and the second one that
 * maps fileno to all other LoadingEntry members. StoreMap maps store key to
 * fileno.
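 *
 * An illustrative sketch (added here; not part of the original text): if the
 * entry hashed to fileno 7 starts at slot 3 and continues at slot 5, then
 * entries[7].size/version/state/anchored describe the entry as a whole,
 * while entries[3].more == 5 and the mapped/freed bits of entries[3] and
 * entries[5] describe the individual slots of its chain.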
 \par
 * When information from the newly loaded db slot contradicts the entry-level
 * information collected so far (e.g., the versions do not match or the total
 * chain size after the slot contribution exceeds the expected number), the
 * whole entry (and not just the chain or the slot!) is declared corrupted.
 \par
 * Why invalidate the whole entry? Rock Store is written for high-load
 * environments with large caches, where there are usually very few idle
 * slots in the database. The space occupied by a purged entry is usually
 * immediately reclaimed. A Squid crash or a transaction abort is rather
 * unlikely to leave a relatively large number of stale slots in the
 * database. Thus, the number of potentially corrupted entries is relatively
 * small. On the other hand, the damage from serving a single hodgepodge
 * entry may be significant to the user. In such an environment, invalidating
 * the whole entry has negligible performance impact but saves us from
 * high-damage bugs.
 */

namespace Rock
{

/// maintains information about the store entry being loaded from disk
/// used for identifying partially stored/loaded entries
class LoadingEntry
{
public:
    LoadingEntry(): size(0), version(0), state(leEmpty), anchored(0),
        mapped(0), freed(0), more(-1) {}

    /* store entry-level information indexed by sfileno */
    uint64_t size; ///< payload seen so far
    uint32_t version; ///< DbCellHeader::version to distinguish same-URL chains
    uint8_t state:3; ///< current entry state (one of the State values)
    uint8_t anchored:1; ///< whether we loaded the inode slot for this entry

    /* db slot-level information indexed by slotId, starting with firstSlot */
    uint8_t mapped:1; ///< whether this slot was added to a mapped entry
    uint8_t freed:1; ///< whether this slot was marked as free
    Ipc::StoreMapSliceId more; ///< another slot in some entry chain (unordered)

    bool used() const { return freed || mapped || more != -1; }

    /// possible entry states
    typedef enum { leEmpty = 0, leLoading, leLoaded, leCorrupted, leIgnored } State;
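
    // A hedged summary of state transitions, inferred from the loading code
    // below rather than stated in the original comments: leEmpty becomes
    // leLoading (or leIgnored) in startNewEntry(); leLoading becomes leLoaded
    // when all expected bytes are mapped, or leCorrupted on any conflict;
    // even leLoaded may degrade to leCorrupted if a stale duplicate appears.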
};

} /* namespace Rock */

Rock::Rebuild::Rebuild(SwapDir *dir): AsyncJob("Rock::Rebuild"),
    sd(dir),
    entries(NULL),
    dbSize(0),
    dbSlotSize(0),
    dbSlotLimit(0),
    dbEntryLimit(0),
    fd(-1),
    dbOffset(0),
    loadingPos(0),
    validationPos(0)
{
    assert(sd);
    memset(&counts, 0, sizeof(counts));
    dbSize = sd->diskOffsetLimit(); // we do not care about the trailer waste
    dbSlotSize = sd->slotSize;
    dbEntryLimit = sd->entryLimitActual();
    dbSlotLimit = sd->slotLimitActual();
    assert(dbEntryLimit <= dbSlotLimit);
}

Rock::Rebuild::~Rebuild()
{
    if (fd >= 0)
        file_close(fd);
    delete[] entries;
}

/// prepares and initiates entry loading sequence
void
Rock::Rebuild::start()
{
    // in SMP mode, only the disker is responsible for populating the map
    if (UsingSmp() && !IamDiskProcess()) {
        debugs(47, 2, "Non-disker skips rebuilding of cache_dir #" <<
               sd->index << " from " << sd->filePath);
        mustStop("non-disker");
        return;
    }

    debugs(47, DBG_IMPORTANT, "Loading cache_dir #" << sd->index <<
           " from " << sd->filePath);

    fd = file_open(sd->filePath, O_RDONLY | O_BINARY);
    if (fd < 0)
        failure("cannot open db", errno);

    char hdrBuf[SwapDir::HeaderSize];
    if (read(fd, hdrBuf, sizeof(hdrBuf)) != SwapDir::HeaderSize)
        failure("cannot read db header", errno);

    // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours
    assert(sizeof(DbCellHeader) < SM_PAGE_SIZE);
    buf.init(SM_PAGE_SIZE, SM_PAGE_SIZE);

    dbOffset = SwapDir::HeaderSize;

    entries = new LoadingEntry[dbSlotLimit];

    checkpoint();
}

/// continues after a pause if not done
void
Rock::Rebuild::checkpoint()
{
    if (!doneAll())
        eventAdd("Rock::Rebuild", Rock::Rebuild::Steps, this, 0.01, 1, true);
}
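
// Added note (inferred from checkpoint() above and Steps() below): the
// rebuild is deliberately incremental. Each burst of work ends by scheduling
// the next one on the event queue with a 0.01s delay, so the process stays
// responsive to other events while the database is being scanned.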

bool
Rock::Rebuild::doneAll() const
{
    return loadingPos >= dbSlotLimit && validationPos >= dbSlotLimit &&
           AsyncJob::doneAll();
}

void
Rock::Rebuild::Steps(void *data)
{
    // use async call to enable job call protection that time events lack
    CallJobHere(47, 5, static_cast<Rebuild*>(data), Rock::Rebuild, steps);
}

void
Rock::Rebuild::steps()
{
    if (loadingPos < dbSlotLimit)
        loadingSteps();
    else
        validationSteps();

    checkpoint();
}
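
// Added note: as the two branches above suggest, the rebuild runs in two
// sequential phases. First, every db slot is loaded and chained into the map
// (loadingSteps), and only then is every entry checked for completeness
// (validationSteps); doneAll() requires both positions to reach dbSlotLimit.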

void
Rock::Rebuild::loadingSteps()
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    // Balance our desire to maximize the number of entries processed at once
    // (and, hence, minimize overheads and total rebuild time) with a
    // requirement to also process Coordinator events, disk I/Os, etc.
    const int maxSpentMsec = 50; // keep small: most RAM I/Os are under 1ms
    const timeval loopStart = current_time;

    int loaded = 0;
    while (loadingPos < dbSlotLimit) {
        loadOneSlot();
        dbOffset += dbSlotSize;
        ++loadingPos;
        ++loaded;

        if (counts.scancount % 1000 == 0)
            storeRebuildProgress(sd->index, dbSlotLimit, counts.scancount);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            debugs(47, 5, HERE << "pausing after " << loaded << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/loaded) << "ms per entry");
            break;
        }
    }
}

void
Rock::Rebuild::loadOneSlot()
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    ++counts.scancount;

    if (lseek(fd, dbOffset, SEEK_SET) < 0)
        failure("cannot seek to db entry", errno);

    buf.reset();

    if (!storeRebuildLoadEntry(fd, sd->index, buf, counts))
        return;

    const SlotId slotId = loadingPos;

    // get our header
    DbCellHeader header;
    if (buf.contentSize() < static_cast<mb_size_t>(sizeof(header))) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring truncated " << buf.contentSize() << "-byte " <<
               "cache entry meta data at " << dbOffset);
        freeSlotIfIdle(slotId, true);
        return;
    }
    memcpy(&header, buf.content(), sizeof(header));
    if (header.empty()) {
        freeSlotIfIdle(slotId, false);
        return;
    }
    if (!header.sane(dbSlotSize, dbSlotLimit)) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring malformed cache entry meta data at " << dbOffset);
        freeSlotIfIdle(slotId, true);
        return;
    }
    buf.consume(sizeof(header)); // optimize to avoid memmove()

    useNewSlot(slotId, header);
}
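
// Added summary of the triage above: a slot too short to hold a DbCellHeader
// or failing header.sane() is freed as invalid, an empty header is freed as
// plain idle space, and only a sane, non-empty slot reaches useNewSlot() for
// chaining into an entry.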

/// parse StoreEntry basics and add them to the map, returning true on success
bool
Rock::Rebuild::importEntry(Ipc::StoreMapAnchor &anchor, const sfileno fileno, const DbCellHeader &header)
{
    cache_key key[SQUID_MD5_DIGEST_LENGTH];
    StoreEntry loadedE;
    const uint64_t knownSize = header.entrySize > 0 ?
                               header.entrySize : anchor.basics.swap_file_sz.load();
    if (!storeRebuildParseEntry(buf, loadedE, key, counts, knownSize))
        return false;

    // the entry size may still be unknown at this time

    debugs(47, 8, "importing basics for entry " << fileno <<
           " swap_file_sz: " << loadedE.swap_file_sz);
    anchor.set(loadedE);

    // we have not validated whether all db cells for this entry were loaded
    EBIT_CLR(anchor.basics.flags, ENTRY_VALIDATED);

    return true;
}
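
// Added note: ENTRY_VALIDATED is cleared here on purpose and, as far as the
// code below shows, set again only in addSlotToEntry() once the accumulated
// slot payloads match the expected entry size, so partially loaded entries
// are never served as validated.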

void
Rock::Rebuild::validationSteps()
{
    debugs(47, 5, sd->index << " validating from " << validationPos);

    // see loadingSteps() for the rationale; TODO: avoid duplication
    const int maxSpentMsec = 50; // keep small: validation does not do I/O
    const timeval loopStart = current_time;

    int validated = 0;
    while (validationPos < dbSlotLimit) {
        validateOneEntry();
        ++validationPos;
        ++validated;

        if (validationPos % 1000 == 0)
            debugs(20, 2, "validated: " << validationPos);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            debugs(47, 5, "pausing after " << validated << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/validated) << "ms per entry");
            break;
        }
    }
}

void
Rock::Rebuild::validateOneEntry()
{
    LoadingEntry &e = entries[validationPos];
    switch (e.state) {

    case LoadingEntry::leEmpty:
        break; // no entry hashed to this position

    case LoadingEntry::leLoading:
        freeBadEntry(validationPos, "partially stored");
        break;

    case LoadingEntry::leLoaded:
        break; // we have already unlocked this entry

    case LoadingEntry::leCorrupted:
        break; // we have already removed this entry
    }
}
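
// Added note: entries still in the leLoading state at validation time never
// received their full chain (loading has finished by now), which is why they
// are discarded as "partially stored". An leIgnored entry needs no case here:
// its slots were already freed when the fresher map entry was detected.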

/// Marks remaining bad entry slots as free and unlocks the entry. The map
/// cannot do this because Loading entries may have holes in the slots chain.
void
Rock::Rebuild::freeBadEntry(const sfileno fileno, const char *eDescription)
{
    debugs(47, 2, "cache_dir #" << sd->index << ' ' << eDescription <<
           " entry " << fileno << " is ignored during rebuild");

    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    bool freedSome = false;
    // free all loaded non-anchor slots
    SlotId slotId = entries[anchor.start].more;
    while (slotId >= 0) {
        const SlotId next = entries[slotId].more;
        freeSlot(slotId, false);
        slotId = next;
        freedSome = true;
    }

    // free anchor slot if it was loaded
    if (entries[fileno].anchored) {
        freeSlot(anchor.start, false);
        freedSome = true;
    }
    assert(freedSome);

    sd->map->forgetWritingEntry(fileno);
}

void
Rock::Rebuild::swanSong()
{
    debugs(47,3, HERE << "cache_dir #" << sd->index << " rebuild level: " <<
           StoreController::store_dirs_rebuilding);
    --StoreController::store_dirs_rebuilding;
    storeRebuildComplete(&counts);
}

void
Rock::Rebuild::failure(const char *msg, int errNo)
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    if (errNo)
        debugs(47, DBG_CRITICAL, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo));
    debugs(47, DBG_CRITICAL, "Do you need to run 'squid -z' to initialize storage?");

    assert(sd);
    fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.",
           sd->index, sd->filePath, msg);
}

/// adds slot to the free slot index
void
Rock::Rebuild::freeSlot(const SlotId slotId, const bool invalid)
{
    debugs(47,5, sd->index << " frees slot " << slotId);
    LoadingEntry &le = entries[slotId];
    assert(!le.freed);
    le.freed = 1;

    if (invalid) {
        ++counts.invalid;
        //sd->unlink(fileno); leave garbage on disk, it should not hurt
    }

    Ipc::Mem::PageId pageId;
    pageId.pool = sd->index+1;
    pageId.number = slotId+1;
    sd->freeSlots->push(pageId);
}
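
// Added note on the +1 offsets above: Ipc::Mem::PageId appears to treat zero
// as "no page", so pool and page numbers are 1-based while cache_dir indexes
// and slot IDs are 0-based; hence the conversions when pushing to freeSlots.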

/// adds slot to the free slot index but only if the slot is unused
void
Rock::Rebuild::freeSlotIfIdle(const SlotId slotId, const bool invalid)
{
    const LoadingEntry &le = entries[slotId];

    // mapped slots must be freed via freeBadEntry() to keep the map in sync
    assert(!le.mapped);

    if (!le.used())
        freeSlot(slotId, invalid);
}

/// adds slot to the entry chain in the map
void
Rock::Rebuild::mapSlot(const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry &le = entries[slotId];
    assert(!le.mapped);
    assert(!le.freed);
    le.mapped = 1;

    Ipc::StoreMapSlice slice;
    slice.next = header.nextSlot;
    slice.size = header.payloadSize;
    sd->map->importSlice(slotId, slice);
}

/// adds slot to an existing entry chain; caller must check that the slot
/// belongs to the chain it is being added to
void
Rock::Rebuild::addSlotToEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry &le = entries[fileno];
    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    assert(le.version == header.version);

    // mark anchor as loaded or add the secondary slot to the chain
    LoadingEntry &inode = entries[header.firstSlot];
    if (header.firstSlot == slotId) {
        debugs(47,5, "adding inode");
        assert(!inode.freed);
        le.anchored = 1;
    } else {
        debugs(47,9, "linking " << slotId << " to " << inode.more);
        // we do not need to preserve the order
        LoadingEntry &slice = entries[slotId];
        assert(!slice.freed);
        assert(slice.more < 0);
        slice.more = inode.more;
        inode.more = slotId;
    }

    if (header.firstSlot == slotId && !importEntry(anchor, fileno, header)) {
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "corrupted metainfo");
        return;
    }

    // set total entry size and/or check it for consistency
    debugs(47, 8, "header.entrySize: " << header.entrySize <<
           " swap_file_sz: " << anchor.basics.swap_file_sz);
    uint64_t totalSize = header.entrySize;
    assert(totalSize != static_cast<uint64_t>(-1));
    if (!totalSize && anchor.basics.swap_file_sz) {
        assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
        // perhaps we loaded a later slot (with entrySize) earlier
        totalSize = anchor.basics.swap_file_sz;
    } else if (totalSize && !anchor.basics.swap_file_sz) {
        anchor.basics.swap_file_sz = totalSize;
        assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
    } else if (totalSize != anchor.basics.swap_file_sz) {
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "size mismatch");
        return;
    }

    le.size += header.payloadSize;

    if (totalSize > 0 && le.size > totalSize) { // overflow
        debugs(47, 8, "overflow: " << le.size << " > " << totalSize);
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "overflowing");
        return;
    }

    mapSlot(slotId, header);
    if (totalSize > 0 && le.size == totalSize) {
        // entry fully loaded, unlock it
        // we have validated that all db cells for this entry were loaded
        EBIT_SET(anchor.basics.flags, ENTRY_VALIDATED);
        le.state = LoadingEntry::leLoaded;
        sd->map->closeForWriting(fileno, false);
    }
}
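
// Added note: completion is detected purely by byte accounting. Because slots
// may arrive in any order, le.size can only be compared against totalSize
// once some slot has supplied a non-zero entrySize; until then the entry
// simply stays in the leLoading state.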

/// initialize housekeeping information for a newly accepted entry
void
Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor &anchor, const sfileno fileno, const DbCellHeader &header)
{
    anchor.setKey(reinterpret_cast<const cache_key*>(header.key));
    assert(header.firstSlot >= 0);
    anchor.start = header.firstSlot;

    assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));

    LoadingEntry &le = entries[fileno];
    le.state = LoadingEntry::leLoading;
    le.version = header.version;
    le.size = 0;
}

/// handle a slot from an entry that we have not seen before
void
Rock::Rebuild::startNewEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
{
    // If some other from-disk entry is/was using this slot as its inode OR
    // if some other from-disk entry is/was using our inode slot, then the
    // entries are conflicting. We cannot identify other entries, so we just
    // remove ours and hope that the others were/will be handled correctly.
    const LoadingEntry &slice = entries[slotId];
    const LoadingEntry &inode = entries[header.firstSlot];
    if (slice.used() || inode.used()) {
        debugs(47,8, "slice/inode used: " << slice.used() << inode.used());
        LoadingEntry &le = entries[fileno];
        le.state = LoadingEntry::leCorrupted;
        freeSlotIfIdle(slotId, slotId == header.firstSlot);
        // if not idle, the other entry will handle its slice
        return;
    }

    // A miss may have been stored at our fileno while we were loading other
    // slots from disk. We ought to preserve that entry because it is fresher.
    const bool overwriteExisting = false;
    if (Ipc::StoreMap::Anchor *anchor = sd->map->openForWritingAt(fileno, overwriteExisting)) {
        primeNewEntry(*anchor, fileno, header);
        addSlotToEntry(fileno, slotId, header); // may fail
        assert(anchor->basics.swap_file_sz != static_cast<uint64_t>(-1));
    } else {
        // A new from-network entry is occupying our map slot; let it be, but
        // save us from the trouble of going through the above motions again.
        LoadingEntry &le = entries[fileno];
        le.state = LoadingEntry::leIgnored;
        freeSlotIfIdle(slotId, false);
    }
}

/// does the header belong to the fileno entry being loaded?
bool
Rock::Rebuild::sameEntry(const sfileno fileno, const DbCellHeader &header) const
{
    const Ipc::StoreMap::Anchor &anchor = sd->map->writeableEntry(fileno);
    const LoadingEntry &le = entries[fileno];
    // any order will work, but do fast comparisons first:
    return le.version == header.version &&
           anchor.start == static_cast<Ipc::StoreMapSliceId>(header.firstSlot) &&
           anchor.sameKey(reinterpret_cast<const cache_key*>(header.key));
}
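
// Added note: this triple check mirrors the chain identity described in the
// Overview: a chain is identified by its key and version, while the firstSlot
// comparison additionally rejects slots from a chain that reused the same
// key+version but was anchored at a different inode.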

/// is the new header consistent with information already loaded?
bool
Rock::Rebuild::canAdd(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) const
{
    if (!sameEntry(fileno, header)) {
        debugs(79, 7, "cannot add; wrong entry");
        return false;
    }

    const LoadingEntry &le = entries[slotId];

    // We cannot add a slot that was already declared free or mapped.
    if (le.freed || le.mapped) {
        debugs(79, 7, "cannot add; freed/mapped: " << le.freed << le.mapped);
        return false;
    }

    if (slotId == header.firstSlot) {
        // If we are the inode, the anchored flag cannot be set yet.
        if (entries[fileno].anchored) {
            debugs(79, 7, "cannot add; extra anchor");
            return false;
        }

        // And there should have been some other slot for this entry to exist.
        if (le.more < 0) {
            debugs(79, 7, "cannot add; missing slots");
            return false;
        }

        return true;
    }

    // We are the continuation slice so the more field is reserved for us.
    if (le.more >= 0) {
        debugs(79, 7, "cannot add; foreign slot");
        return false;
    }

    return true;
}

/// handle freshly loaded (and validated) db slot header
void
Rock::Rebuild::useNewSlot(const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry &slice = entries[slotId];
    assert(!slice.freed); // we cannot free what was not loaded

    const cache_key *const key =
        reinterpret_cast<const cache_key*>(header.key);
    const sfileno fileno = sd->map->anchorIndexByKey(key);
    assert(0 <= fileno && fileno < dbEntryLimit);

    LoadingEntry &le = entries[fileno];
    debugs(47,9, "entry " << fileno << " state: " << le.state << ", inode: " <<
           header.firstSlot << ", size: " << header.payloadSize);

    switch (le.state) {

    case LoadingEntry::leEmpty: {
        startNewEntry(fileno, slotId, header);
        break;
    }

    case LoadingEntry::leLoading: {
        if (canAdd(fileno, slotId, header)) {
            addSlotToEntry(fileno, slotId, header);
        } else {
            // either the loading chain or this slot is stale;
            // be conservative and ignore both (and any future ones)
            le.state = LoadingEntry::leCorrupted;
            freeBadEntry(fileno, "duplicated");
            freeSlotIfIdle(slotId, slotId == header.firstSlot);
        }
        break;
    }

    case LoadingEntry::leLoaded: {
        // either the previously loaded chain or this slot is stale;
        // be conservative and ignore both (and any future ones)
        le.state = LoadingEntry::leCorrupted;
        sd->map->freeEntry(fileno); // may not be immediately successful
        freeSlotIfIdle(slotId, slotId == header.firstSlot);
        break;
    }

    case LoadingEntry::leCorrupted: {
        // previously seen slots messed things up so we must ignore this one
        freeSlotIfIdle(slotId, false);
        break;
    }

    case LoadingEntry::leIgnored: {
        // already replaced by a fresher or colliding from-network entry
        freeSlotIfIdle(slotId, false);
        break;
    }
    }
}