/*
 * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

/* DEBUG: section 79    Disk IO Routines */

#include "squid.h"
#include "fs/rock/RockDbCell.h"
#include "fs/rock/RockRebuild.h"
#include "fs/rock/RockSwapDir.h"
#include "ipc/StoreMap.h"
#include "SquidTime.h"
#include "store_rebuild.h"

CBDATA_NAMESPACED_CLASS_INIT(Rock, Rebuild);
/**
 \defgroup RockFsRebuild Rock Store Rebuild
 \ingroup Filesystems
 *
 \section Overview Overview
 *  Several layers of information are manipulated during the rebuild:
 \par
 *  Store Entry: Response message plus all the metainformation associated with
 *  it. Identified by store key. At any given time, from the Squid point
 *  of view, there is only one entry with a given key, but several
 *  different entries with the same key can be observed in any historical
 *  archive (such as an access log or a store database).
 \par
 *  Slot chain: A sequence of db slots representing a Store Entry state at
 *  some point in time. Identified by key+version combination. Due to
 *  transaction aborts, crashes, and idle periods, some chains may contain
 *  incomplete or stale information. We assume that no two different chains
 *  have the same key and version. If that assumption fails, we may serve a
 *  hodgepodge entry during rebuild, until "extra" slots are loaded/noticed.
 \par
 *  Db slot: A db record containing a piece of a single store entry and linked
 *  to other slots with the same key and version fields, forming a chain.
 *  Slots are identified by their absolute position in the database file,
 *  which is naturally unique.
 \par
 *  Except for the "mapped", "freed", and "more" fields, LoadingEntry info is
 *  entry-level and is stored at fileno position. In other words, the array of
 *  LoadingEntries should be interpreted as two arrays, one that maps slot ID
 *  to the LoadingEntry::mapped/freed/more members, and the second one that
 *  maps fileno to all other LoadingEntry members. StoreMap maps entry key to
 *  fileno.
 \par
 *  When information from the newly loaded db slot contradicts the entry-level
 *  information collected so far (e.g., the versions do not match or the total
 *  chain size after the slot contribution exceeds the expected number), the
 *  whole entry (and not just the chain or the slot!) is declared corrupted.
 \par
 *  Why invalidate the whole entry? Rock Store is written for high-load
 *  environments with large caches, where there are usually very few idle
 *  slots in the database. A space occupied by a purged entry is usually
 *  immediately reclaimed. A Squid crash or a transaction abort is rather
 *  unlikely to leave a relatively large number of stale slots in the
 *  database. Thus, the number of potentially corrupted entries is relatively
 *  small. On the other hand, the damage from serving a single hodgepodge
 *  entry may be significant to the user. In such an environment, invalidating
 *  the whole entry has negligible performance impact but saves us from
 *  high-damage bugs.
 */
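/* An illustrative sketch of the dual indexing described above (not code
 * from this file; the field groupings are taken from LoadingEntry below):
 *
 *   entries[fileno].size, .version, .state, .anchored  // entry-level info
 *   entries[slotId].mapped, .freed, .more              // slot-level info
 *
 * A single LoadingEntry object may thus carry entry-level fields for one
 * entry and slot-level fields for an unrelated slot that happens to share
 * the same array index.
 */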
namespace Rock
{

/// maintains information about the store entry being loaded from disk
/// used for identifying partially stored/loaded entries
class LoadingEntry
{
public:
    LoadingEntry(): size(0), version(0), state(leEmpty), anchored(0),
        mapped(0), freed(0), more(-1) {}

    /* store entry-level information indexed by sfileno */
    uint64_t size; ///< payload seen so far
    uint32_t version; ///< DbCellHeader::version to distinguish same-URL chains
    uint8_t state:3; ///< current entry state (one of the State values)
    uint8_t anchored:1; ///< whether we loaded the inode slot for this entry

    /* db slot-level information indexed by slotId, starting with firstSlot */
    uint8_t mapped:1; ///< whether this slot was added to a mapped entry
    uint8_t freed:1; ///< whether this slot was marked as free
    Ipc::StoreMapSliceId more; ///< another slot in some entry chain (unordered)
    bool used() const { return freed || mapped || more != -1; }

    /// possible entry states
    typedef enum { leEmpty = 0, leLoading, leLoaded, leCorrupted, leIgnored } State;
};

} /* namespace Rock */
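// For example (illustrative values only), a three-slot chain anchored at
// slot 7 could be recorded as:
//   entries[7].more == 12, entries[12].more == 3, entries[3].more == -1
// The "more" links do not preserve on-disk slot order; see addSlotToEntry().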
Rock::Rebuild::Rebuild(SwapDir *dir): AsyncJob("Rock::Rebuild"),
    sd(dir),
    entries(NULL),
    dbSize(0),
    dbSlotSize(0),
    dbSlotLimit(0),
    dbEntryLimit(0),
    fd(-1),
    dbOffset(0),
    loadingPos(0),
    validationPos(0)
{
    assert(sd);
    memset(&counts, 0, sizeof(counts));
    dbSize = sd->diskOffsetLimit(); // we do not care about the trailer waste
    dbSlotSize = sd->slotSize;
    dbEntryLimit = sd->entryLimitActual();
    dbSlotLimit = sd->slotLimitActual();
    assert(dbEntryLimit <= dbSlotLimit);
}
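// For a sense of scale (illustrative numbers, not defaults): a 100 MB db
// with 16 KB slots yields a dbSlotLimit of 6400 slots, and dbEntryLimit can
// never exceed that because every entry needs at least one slot.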
Rock::Rebuild::~Rebuild()
{
    if (fd >= 0)
        file_close(fd);
    delete[] entries;
}
/// prepares and initiates entry loading sequence
void
Rock::Rebuild::start()
{
    // in SMP mode, only the disker is responsible for populating the map
    if (UsingSmp() && !IamDiskProcess()) {
        debugs(47, 2, "Non-disker skips rebuilding of cache_dir #" <<
               sd->index << " from " << sd->filePath);
        mustStop("non-disker");
        return;
    }

    debugs(47, DBG_IMPORTANT, "Loading cache_dir #" << sd->index <<
           " from " << sd->filePath);

    fd = file_open(sd->filePath, O_RDONLY | O_BINARY);
    if (fd < 0)
        failure("cannot open db", errno);

    char hdrBuf[SwapDir::HeaderSize];
    if (read(fd, hdrBuf, sizeof(hdrBuf)) != SwapDir::HeaderSize)
        failure("cannot read db header", errno);

    // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours
    assert(sizeof(DbCellHeader) < SM_PAGE_SIZE);
    buf.init(SM_PAGE_SIZE, SM_PAGE_SIZE);

    dbOffset = SwapDir::HeaderSize;

    entries = new LoadingEntry[dbSlotLimit];

    checkpoint();
}
/// continues after a pause if not done
void
Rock::Rebuild::checkpoint()
{
    if (!doneAll())
        eventAdd("Rock::Rebuild", Rock::Rebuild::Steps, this, 0.01, 1, true);
}
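// The 0.01-second eventAdd() delay above yields control to the main event
// loop between rebuild steps so that disk I/O, IPC, and other events keep
// being serviced while the rebuild makes steady progress.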
bool
Rock::Rebuild::doneAll() const
{
    return loadingPos >= dbSlotLimit && validationPos >= dbSlotLimit &&
           AsyncJob::doneAll();
}
void
Rock::Rebuild::Steps(void *data)
{
    // use async call to enable job call protection that time events lack
    CallJobHere(47, 5, static_cast<Rebuild*>(data), Rock::Rebuild, steps);
}
void
Rock::Rebuild::steps()
{
    if (loadingPos < dbSlotLimit)
        loadingSteps();
    else
        validationSteps();

    checkpoint();
}
void
Rock::Rebuild::loadingSteps()
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    // Balance our desire to maximize the number of entries processed at once
    // (and, hence, minimize overheads and total rebuild time) with a
    // requirement to also process Coordinator events, disk I/Os, etc.
    const int maxSpentMsec = 50; // keep small: most RAM I/Os are under 1ms
    const timeval loopStart = current_time;

    int loaded = 0;
    while (loadingPos < dbSlotLimit) {
        loadOneSlot();
        dbOffset += dbSlotSize;
        ++loadingPos;
        ++loaded;

        if (counts.scancount % 1000 == 0)
            storeRebuildProgress(sd->index, dbSlotLimit, counts.scancount);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            debugs(47, 5, HERE << "pausing after " << loaded << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/loaded) << "ms per entry");
            break;
        }
    }
}
void
Rock::Rebuild::loadOneSlot()
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    ++counts.scancount;

    if (lseek(fd, dbOffset, SEEK_SET) < 0)
        failure("cannot seek to db entry", errno);

    buf.reset();

    if (!storeRebuildLoadEntry(fd, sd->index, buf, counts))
        return;

    const SlotId slotId = loadingPos;

    // get our header
    DbCellHeader header;
    if (buf.contentSize() < static_cast<mb_size_t>(sizeof(header))) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring truncated " << buf.contentSize() << "-byte " <<
               "cache entry meta data at " << dbOffset);
        freeSlotIfIdle(slotId, true);
        return;
    }
    memcpy(&header, buf.content(), sizeof(header));
    if (header.empty()) {
        freeSlotIfIdle(slotId, false);
        return;
    }
    if (!header.sane(dbSlotSize, dbSlotLimit)) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring malformed cache entry meta data at " << dbOffset);
        freeSlotIfIdle(slotId, true);
        return;
    }
    buf.consume(sizeof(header)); // optimize to avoid memmove()

    useNewSlot(slotId, header);
}
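// To summarize the checks above: a slot survives loading only if it is big
// enough to hold a DbCellHeader, is not empty(), and passes sane(); any
// rejected slot is freed via freeSlotIfIdle() so that its space can be
// reused.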
/// parse StoreEntry basics and add them to the map, returning true on success
bool
Rock::Rebuild::importEntry(Ipc::StoreMapAnchor &anchor, const sfileno fileno, const DbCellHeader &header)
{
    cache_key key[SQUID_MD5_DIGEST_LENGTH];
    StoreEntry loadedE;
    const uint64_t knownSize = header.entrySize > 0 ?
                               header.entrySize : anchor.basics.swap_file_sz.load();
    if (!storeRebuildParseEntry(buf, loadedE, key, counts, knownSize))
        return false;

    // the entry size may still be unknown at this time

    debugs(47, 8, "importing basics for entry " << fileno <<
           " swap_file_sz: " << loadedE.swap_file_sz);
    anchor.set(loadedE);

    // we have not validated whether all db cells for this entry were loaded
    EBIT_CLR(anchor.basics.flags, ENTRY_VALIDATED);

    return true;
}
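// knownSize above falls back to anchor.basics.swap_file_sz because a
// continuation slot carrying entrySize may have been loaded before this
// inode slot; either source may supply the total size first (see the size
// reconciliation in addSlotToEntry()).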
void
Rock::Rebuild::validationSteps()
{
    debugs(47, 5, sd->index << " validating from " << validationPos);

    // see loadingSteps() for the rationale; TODO: avoid duplication
    const int maxSpentMsec = 50; // keep small: validation does not do I/O
    const timeval loopStart = current_time;

    int validated = 0;
    while (validationPos < dbSlotLimit) {
        validateOneEntry();
        ++validationPos;
        ++validated;

        if (validationPos % 1000 == 0)
            debugs(20, 2, "validated: " << validationPos);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            debugs(47, 5, "pausing after " << validated << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/validated) << "ms per entry");
            break;
        }
    }
}
void
Rock::Rebuild::validateOneEntry()
{
    LoadingEntry &e = entries[validationPos];
    switch (e.state) {

    case LoadingEntry::leEmpty:
        break; // no entry hashed to this position

    case LoadingEntry::leLoading:
        freeBadEntry(validationPos, "partially stored");
        break;

    case LoadingEntry::leLoaded:
        break; // we have already unlocked this entry

    case LoadingEntry::leCorrupted:
        break; // we have already removed this entry
    }
}
/// Marks remaining bad entry slots as free and unlocks the entry. The map
/// cannot do this because Loading entries may have holes in the slots chain.
void
Rock::Rebuild::freeBadEntry(const sfileno fileno, const char *eDescription)
{
    debugs(47, 2, "cache_dir #" << sd->index << ' ' << eDescription <<
           " entry " << fileno << " is ignored during rebuild");

    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    bool freedSome = false;
    // free all loaded non-anchor slots
    SlotId slotId = entries[anchor.start].more;
    while (slotId >= 0) {
        const SlotId next = entries[slotId].more;
        freeSlot(slotId, false);
        slotId = next;
        freedSome = true;
    }

    // free anchor slot if it was loaded
    if (entries[fileno].anchored) {
        freeSlot(anchor.start, false);
        freedSome = true;
    }

    assert(freedSome);

    sd->map->forgetWritingEntry(fileno);
}
void
Rock::Rebuild::swanSong()
{
    debugs(47,3, HERE << "cache_dir #" << sd->index << " rebuild level: " <<
           StoreController::store_dirs_rebuilding);
    --StoreController::store_dirs_rebuilding;
    storeRebuildComplete(&counts);
}
void
Rock::Rebuild::failure(const char *msg, int errNo)
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    if (errNo)
        debugs(47, DBG_CRITICAL, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo));
    debugs(47, DBG_CRITICAL, "Do you need to run 'squid -z' to initialize storage?");

    assert(sd);
    fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.",
           sd->index, sd->filePath, msg);
}
/// adds slot to the free slot index
void
Rock::Rebuild::freeSlot(const SlotId slotId, const bool invalid)
{
    debugs(47,5, sd->index << " frees slot " << slotId);
    LoadingEntry &le = entries[slotId];
    assert(!le.freed);
    le.freed = 1;

    if (invalid) {
        ++counts.invalid;
        //sd->unlink(fileno); leave garbage on disk, it should not hurt
    }

    Ipc::Mem::PageId pageId;
    pageId.pool = sd->index+1;
    pageId.number = slotId+1;
    sd->freeSlots->push(pageId);
}
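// The +1 offsets above convert the zero-based cache_dir index and slot ID
// into one-based PageId pool and page numbers, apparently because a zero
// PageId field denotes "unset" in the Ipc::Mem page pool code.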
/// adds slot to the free slot index but only if the slot is unused
void
Rock::Rebuild::freeSlotIfIdle(const SlotId slotId, const bool invalid)
{
    const LoadingEntry &le = entries[slotId];

    // mapped slots must be freed via freeBadEntry() to keep the map in sync
    assert(!le.mapped);

    if (!le.used())
        freeSlot(slotId, invalid);
}
/// adds slot to the entry chain in the map
void
Rock::Rebuild::mapSlot(const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry &le = entries[slotId];
    assert(!le.mapped);
    assert(!le.freed);
    le.mapped = 1;

    Ipc::StoreMapSlice slice;
    slice.next = header.nextSlot;
    slice.size = header.payloadSize;
    sd->map->importSlice(slotId, slice);
}
/// adds slot to an existing entry chain; caller must check that the slot
/// belongs to the chain it is being added to
void
Rock::Rebuild::addSlotToEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry &le = entries[fileno];
    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    assert(le.version == header.version);

    // mark anchor as loaded or add the secondary slot to the chain
    LoadingEntry &inode = entries[header.firstSlot];
    if (header.firstSlot == slotId) {
        debugs(47,5, "adding inode");
        assert(!inode.freed);
        le.anchored = 1;
    } else {
        debugs(47,9, "linking " << slotId << " to " << inode.more);
        // we do not need to preserve the order
        LoadingEntry &slice = entries[slotId];
        assert(!slice.freed);
        assert(slice.more < 0);
        slice.more = inode.more;
        inode.more = slotId;
    }

    if (header.firstSlot == slotId && !importEntry(anchor, fileno, header)) {
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "corrupted metainfo");
        return;
    }

    // set total entry size and/or check it for consistency
    debugs(47, 8, "header.entrySize: " << header.entrySize << " swap_file_sz: " << anchor.basics.swap_file_sz);
    uint64_t totalSize = header.entrySize;
    assert(totalSize != static_cast<uint64_t>(-1));
    if (!totalSize && anchor.basics.swap_file_sz) {
        assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
        // perhaps we loaded a later slot (with entrySize) earlier
        totalSize = anchor.basics.swap_file_sz;
    } else if (totalSize && !anchor.basics.swap_file_sz) {
        anchor.basics.swap_file_sz = totalSize;
        assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
    } else if (totalSize != anchor.basics.swap_file_sz) {
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "size mismatch");
        return;
    }

    le.size += header.payloadSize;

    if (totalSize > 0 && le.size > totalSize) { // overflow
        debugs(47, 8, "overflow: " << le.size << " > " << totalSize);
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "overflowing");
        return;
    }

    mapSlot(slotId, header);
    if (totalSize > 0 && le.size == totalSize) {
        // entry fully loaded, unlock it
        // we have validated that all db cells for this entry were loaded
        EBIT_SET(anchor.basics.flags, ENTRY_VALIDATED);
        le.state = LoadingEntry::leLoaded;
        sd->map->closeForWriting(fileno, false);
    }
}
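// The size reconciliation above, in brief (h = header.entrySize,
// a = anchor.basics.swap_file_sz):
//   h == 0, a > 0: trust a (a later slot supplied the size earlier);
//   h > 0, a == 0: record h in the anchor;
//   h != a, both known: declare the entry corrupted ("size mismatch");
//   h == a (including both zero): nothing to reconcile yet.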
/// initialize housekeeping information for a newly accepted entry
void
Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor &anchor, const sfileno fileno, const DbCellHeader &header)
{
    anchor.setKey(reinterpret_cast<const cache_key*>(header.key));
    assert(header.firstSlot >= 0);
    anchor.start = header.firstSlot;

    assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));

    LoadingEntry &le = entries[fileno];
    le.state = LoadingEntry::leLoading;
    le.version = header.version;
    le.size = 0;
}
/// handle a slot from an entry that we have not seen before
void
Rock::Rebuild::startNewEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
{
    // If some other from-disk entry is/was using this slot as its inode OR
    // if some other from-disk entry is/was using our inode slot, then the
    // entries are conflicting. We cannot identify other entries, so we just
    // remove ours and hope that the others were/will be handled correctly.
    const LoadingEntry &slice = entries[slotId];
    const LoadingEntry &inode = entries[header.firstSlot];
    if (slice.used() || inode.used()) {
        debugs(47,8, "slice/inode used: " << slice.used() << inode.used());
        LoadingEntry &le = entries[fileno];
        le.state = LoadingEntry::leCorrupted;
        freeSlotIfIdle(slotId, slotId == header.firstSlot);
        // if not idle, the other entry will handle its slice
        return;
    }

    // A miss may have been stored at our fileno while we were loading other
    // slots from disk. We ought to preserve that entry because it is fresher.
    const bool overwriteExisting = false;
    if (Ipc::StoreMap::Anchor *anchor = sd->map->openForWritingAt(fileno, overwriteExisting)) {
        primeNewEntry(*anchor, fileno, header);
        addSlotToEntry(fileno, slotId, header); // may fail
        assert(anchor->basics.swap_file_sz != static_cast<uint64_t>(-1));
    } else {
        // A new from-network entry is occupying our map slot; let it be, but
        // save us from the trouble of going through the above motions again.
        LoadingEntry &le = entries[fileno];
        le.state = LoadingEntry::leIgnored;
        freeSlotIfIdle(slotId, false);
    }
}
/// does the header belong to the fileno entry being loaded?
bool
Rock::Rebuild::sameEntry(const sfileno fileno, const DbCellHeader &header) const
{
    const Ipc::StoreMap::Anchor &anchor = sd->map->writeableEntry(fileno);
    const LoadingEntry &le = entries[fileno];
    // any order will work, but do fast comparisons first:
    return le.version == header.version &&
           anchor.start == static_cast<Ipc::StoreMapSliceId>(header.firstSlot) &&
           anchor.sameKey(reinterpret_cast<const cache_key*>(header.key));
}
/// is the new header consistent with information already loaded?
bool
Rock::Rebuild::canAdd(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) const
{
    if (!sameEntry(fileno, header)) {
        debugs(79, 7, "cannot add; wrong entry");
        return false;
    }

    const LoadingEntry &le = entries[slotId];
    // We cannot add a slot that was already declared free or mapped.
    if (le.freed || le.mapped) {
        debugs(79, 7, "cannot add; freed/mapped: " << le.freed << le.mapped);
        return false;
    }

    if (slotId == header.firstSlot) {
        // If we are the inode, the anchored flag cannot be set yet.
        if (entries[fileno].anchored) {
            debugs(79, 7, "cannot add; extra anchor");
            return false;
        }

        // And there should have been some other slot for this entry to exist.
        if (le.more < 0) {
            debugs(79, 7, "cannot add; missing slots");
            return false;
        }

        return true;
    }

    // We are the continuation slice so the more field is reserved for us.
    if (le.more >= 0) {
        debugs(79, 7, "cannot add; foreign slot");
        return false;
    }

    return true;
}
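// In other words: an inode slot may join only a chain that already has
// continuation slots and no anchor yet, while a continuation slot may join
// only if no other chain has already claimed its "more" link.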
/// handle freshly loaded (and validated) db slot header
void
Rock::Rebuild::useNewSlot(const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry &slice = entries[slotId];
    assert(!slice.freed); // we cannot free what was not loaded

    const cache_key *const key =
        reinterpret_cast<const cache_key*>(header.key);
    const sfileno fileno = sd->map->anchorIndexByKey(key);
    assert(0 <= fileno && fileno < dbEntryLimit);

    LoadingEntry &le = entries[fileno];
    debugs(47,9, "entry " << fileno << " state: " << le.state << ", inode: " <<
           header.firstSlot << ", size: " << header.payloadSize);

    switch (le.state) {

    case LoadingEntry::leEmpty: {
        startNewEntry(fileno, slotId, header);
        break;
    }

    case LoadingEntry::leLoading: {
        if (canAdd(fileno, slotId, header)) {
            addSlotToEntry(fileno, slotId, header);
        } else {
            // either the loading chain or this slot is stale;
            // be conservative and ignore both (and any future ones)
            le.state = LoadingEntry::leCorrupted;
            freeBadEntry(fileno, "duplicated");
            freeSlotIfIdle(slotId, slotId == header.firstSlot);
        }
        break;
    }

    case LoadingEntry::leLoaded: {
        // either the previously loaded chain or this slot is stale;
        // be conservative and ignore both (and any future ones)
        le.state = LoadingEntry::leCorrupted;
        sd->map->freeEntry(fileno); // may not be immediately successful
        freeSlotIfIdle(slotId, slotId == header.firstSlot);
        break;
    }

    case LoadingEntry::leCorrupted: {
        // previously seen slots messed things up so we must ignore this one
        freeSlotIfIdle(slotId, false);
        break;
    }

    case LoadingEntry::leIgnored: {
        // already replaced by a fresher or colliding from-network entry
        freeSlotIfIdle(slotId, false);
        break;
    }
    }
}