/*
 * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

/* DEBUG: section 79    Disk IO Routines */

#include "fs/forward.h"
#include "fs/rock/RockDbCell.h"
#include "fs/rock/RockRebuild.h"
#include "fs/rock/RockSwapDir.h"
#include "ipc/StoreMap.h"
#include "SquidTime.h"
#include "store_rebuild.h"

CBDATA_NAMESPACED_CLASS_INIT(Rock, Rebuild);

/**
 \defgroup RockFsRebuild Rock Store Rebuild
 *
 \section Overview Overview
 *  Several layers of information are manipulated during the rebuild:
 \par
 *  Store Entry: Response message plus all the metainformation associated with
 *  it. Identified by store key. At any given time, from Squid's point
 *  of view, there is only one entry with a given key, but several
 *  different entries with the same key can be observed in any historical
 *  archive (such as an access log or a store database).
 \par
 *  Slot chain: A sequence of db slots representing a Store Entry state at
 *  some point in time. Identified by key+version combination. Due to
 *  transaction aborts, crashes, and idle periods, some chains may contain
 *  incomplete or stale information. We assume that no two different chains
 *  have the same key and version. If that assumption fails, we may serve a
 *  hodgepodge entry during rebuild, until "extra" slots are loaded/noticed.
 \par
 *  Db slot: A db record containing a piece of a single store entry and linked
 *  to other slots with the same key and version fields, forming a chain.
 *  Slots are identified by their absolute position in the database file,
 *  which is naturally unique.
 \par
 *  Except for the "mapped", "freed", and "more" fields, LoadingEntry info is
 *  entry-level and is stored at the fileno position. In other words, the array
 *  of LoadingEntries should be interpreted as two arrays: one that maps slot ID
 *  to the LoadingEntry::mapped/freed/more members, and a second one that maps
 *  fileno to all other LoadingEntry members. StoreMap maps a slot key to a fileno.
 \par
 *  When information from a newly loaded db slot contradicts the entry-level
 *  information collected so far (e.g., the versions do not match or the total
 *  chain size after the slot contribution exceeds the expected number), the
 *  whole entry (and not just the chain or the slot!) is declared corrupted.
 \par
 *  Why invalidate the whole entry? Rock Store is written for high-load
 *  environments with large caches, where there are usually very few idle slots
 *  in the database. The space occupied by a purged entry is usually immediately
 *  reclaimed. A Squid crash or a transaction abort is rather unlikely to
 *  leave a relatively large number of stale slots in the database. Thus, the
 *  number of potentially corrupted entries is relatively small. On the other
 *  hand, the damage from serving a single hodgepodge entry may be significant
 *  to the user. In such an environment, invalidating the whole entry has
 *  negligible performance impact but saves us from high-damage bugs.
 */
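
/* An illustration of the two-array interpretation above (a sketch, not code):
 * if slot 5 holds a middle piece of the chain anchored at fileno 2, then
 * entries[5].mapped, .freed, and .more describe slot 5 itself, while
 * entries[2].size, .version, .state, and .anchored accumulate information
 * about the whole entry being rebuilt at fileno 2. */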

namespace Rock
{

/// maintains information about the store entry being loaded from disk
/// used for identifying partially stored/loaded entries
class LoadingEntry
{
public:
    LoadingEntry(): size(0), version(0), state(leEmpty), anchored(0),
        mapped(0), freed(0), more(-1) {}

    /* store entry-level information indexed by sfileno */
    uint64_t size; ///< payload seen so far
    uint32_t version; ///< DbCellHeader::version to distinguish same-URL chains
    uint8_t state:3; ///< current entry state (one of the State values)
    uint8_t anchored:1; ///< whether we loaded the inode slot for this entry

    /* db slot-level information indexed by slotId, starting with firstSlot */
    uint8_t mapped:1; ///< whether this slot was added to a mapped entry
    uint8_t freed:1; ///< whether this slot was marked as free
    Ipc::StoreMapSliceId more; ///< another slot in some entry chain (unordered)
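    /// whether this slot has already been accounted for: freed, mapped, or linked into an entry chain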
    bool used() const { return freed || mapped || more != -1; }

    /// possible entry states
    typedef enum { leEmpty = 0, leLoading, leLoaded, leCorrupted, leIgnored } State;
};

} /* namespace Rock */

Rock::Rebuild::Rebuild(SwapDir *dir): AsyncJob("Rock::Rebuild"),
    memset(&counts, 0, sizeof(counts));
    dbSize = sd->diskOffsetLimit(); // we do not care about the trailer waste
    dbSlotSize = sd->slotSize;
    dbEntryLimit = sd->entryLimitActual();
    dbSlotLimit = sd->slotLimitActual();
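    // each cached entry occupies at least one db slot, so the entry limit cannot exceed the slot limit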
    assert(dbEntryLimit <= dbSlotLimit);

Rock::Rebuild::~Rebuild()

/// prepares and initiates entry loading sequence
Rock::Rebuild::start()
    // in SMP mode, only the disker is responsible for populating the map
    if (UsingSmp() && !IamDiskProcess()) {
        debugs(47, 2, "Non-disker skips rebuilding of cache_dir #" <<
               sd->index << " from " << sd->filePath);
        mustStop("non-disker");

    debugs(47, DBG_IMPORTANT, "Loading cache_dir #" << sd->index <<
           " from " << sd->filePath);

    fd = file_open(sd->filePath, O_RDONLY | O_BINARY);
    if (fd < 0)
        failure("cannot open db", errno);

    char hdrBuf[SwapDir::HeaderSize];
    if (read(fd, hdrBuf, sizeof(hdrBuf)) != SwapDir::HeaderSize)
        failure("cannot read db header", errno);

    // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours
    assert(sizeof(DbCellHeader) < SM_PAGE_SIZE);
    buf.init(SM_PAGE_SIZE, SM_PAGE_SIZE);

    dbOffset = SwapDir::HeaderSize;
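
    // one LoadingEntry per db slot; the same array is also indexed by fileno
    // for entry-level state (see the Overview comment above)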
    entries = new LoadingEntry[dbSlotLimit];

/// continues after a pause if not done
Rock::Rebuild::checkpoint()
    eventAdd("Rock::Rebuild", Rock::Rebuild::Steps, this, 0.01, 1, true);

Rock::Rebuild::doneAll() const
    return loadingPos >= dbSlotLimit && validationPos >= dbSlotLimit &&

Rock::Rebuild::Steps(void *data)
    // use async call to enable job call protection that time events lack
    CallJobHere(47, 5, static_cast<Rebuild*>(data), Rock::Rebuild, steps);

Rock::Rebuild::steps()
    if (loadingPos < dbSlotLimit)

Rock::Rebuild::loadingSteps()
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    // Balance our desire to maximize the number of entries processed at once
    // (and, hence, minimize overheads and total rebuild time) with a
    // requirement to also process Coordinator events, disk I/Os, etc.
    const int maxSpentMsec = 50; // keep small: most RAM I/Os are under 1ms
    const timeval loopStart = current_time;
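
    // process one db slot per loop iteration, pausing when the time budget above is exhausted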
    while (loadingPos < dbSlotLimit) {
        dbOffset += dbSlotSize;

        if (counts.scancount % 1000 == 0)
            storeRebuildProgress(sd->index, dbSlotLimit, counts.scancount);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            debugs(47, 5, HERE << "pausing after " << loaded << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/loaded) << "ms per entry");

Rock::Rebuild::loadOneSlot()
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    if (lseek(fd, dbOffset, SEEK_SET) < 0)
        failure("cannot seek to db entry", errno);
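
    // read the slot at dbOffset (its DbCellHeader prefix plus entry data) into buf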
    if (!storeRebuildLoadEntry(fd, sd->index, buf, counts))

    const SlotId slotId = loadingPos;

    if (buf.contentSize() < static_cast<mb_size_t>(sizeof(header))) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring truncated " << buf.contentSize() << "-byte " <<
               "cache entry meta data at " << dbOffset);
        freeSlotIfIdle(slotId, true);

    memcpy(&header, buf.content(), sizeof(header));
    if (header.empty()) {
        freeSlotIfIdle(slotId, false);

    if (!header.sane(dbSlotSize, dbSlotLimit)) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring malformed cache entry meta data at " << dbOffset);
        freeSlotIfIdle(slotId, true);

    buf.consume(sizeof(header)); // optimize to avoid memmove()

    useNewSlot(slotId, header);

/// parse StoreEntry basics and add them to the map, returning true on success
Rock::Rebuild::importEntry(Ipc::StoreMapAnchor &anchor, const sfileno fileno, const DbCellHeader &header)
    cache_key key[SQUID_MD5_DIGEST_LENGTH];
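
    // prefer the size recorded in this slot header; otherwise fall back to
    // whatever total size the map anchor may already know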
    const uint64_t knownSize = header.entrySize > 0 ?
                               header.entrySize : anchor.basics.swap_file_sz.load();
    if (!storeRebuildParseEntry(buf, loadedE, key, counts, knownSize))

    // the entry size may still be unknown at this time

    debugs(47, 8, "importing basics for entry " << fileno <<
           " swap_file_sz: " << loadedE.swap_file_sz);

    // we have not validated whether all db cells for this entry were loaded
    EBIT_CLR(anchor.basics.flags, ENTRY_VALIDATED);

Rock::Rebuild::validationSteps()
    debugs(47, 5, sd->index << " validating from " << validationPos);

    // see loadingSteps() for the rationale; TODO: avoid duplication
    const int maxSpentMsec = 50; // keep small: validation does not do I/O
    const timeval loopStart = current_time;

    while (validationPos < dbSlotLimit) {

        if (validationPos % 1000 == 0)
            debugs(20, 2, "validated: " << validationPos);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            debugs(47, 5, "pausing after " << validated << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/validated) << "ms per entry");

Rock::Rebuild::validateOneEntry()
    LoadingEntry &e = entries[validationPos];

    switch (e.state) {

    case LoadingEntry::leEmpty:
        break; // no entry hashed to this position

    case LoadingEntry::leLoading:
        freeBadEntry(validationPos, "partially stored");
        break;

    case LoadingEntry::leLoaded:
        break; // we have already unlocked this entry

    case LoadingEntry::leCorrupted:
        break; // we have already removed this entry

/// Marks remaining bad entry slots as free and unlocks the entry. The map
/// cannot do this because Loading entries may have holes in the slots chain.
Rock::Rebuild::freeBadEntry(const sfileno fileno, const char *eDescription)
    debugs(47, 2, "cache_dir #" << sd->index << ' ' << eDescription <<
           " entry " << fileno << " is ignored during rebuild");

    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    bool freedSome = false;
    // free all loaded non-anchor slots
    SlotId slotId = entries[anchor.start].more;
    while (slotId >= 0) {
        const SlotId next = entries[slotId].more;
        freeSlot(slotId, false);

    // free anchor slot if it was loaded
    if (entries[fileno].anchored) {
        freeSlot(anchor.start, false);

    sd->map->forgetWritingEntry(fileno);

Rock::Rebuild::swanSong()
    debugs(47,3, HERE << "cache_dir #" << sd->index << " rebuild level: " <<
           StoreController::store_dirs_rebuilding);
    --StoreController::store_dirs_rebuilding;
    storeRebuildComplete(&counts);

Rock::Rebuild::failure(const char *msg, int errNo)
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    debugs(47, DBG_CRITICAL, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo));
    debugs(47, DBG_CRITICAL, "Do you need to run 'squid -z' to initialize storage?");

    fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.",
           sd->index, sd->filePath, msg);

/// adds slot to the free slot index
Rock::Rebuild::freeSlot(const SlotId slotId, const bool invalid)
    debugs(47,5, sd->index << " frees slot " << slotId);
    LoadingEntry &le = entries[slotId];

        //sd->unlink(fileno); leave garbage on disk, it should not hurt
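
    // return the slot to the shared free-slot pool; the +1 offsets below are
    // presumably because a zero pool or page number denotes an unset PageId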
    Ipc::Mem::PageId pageId;
    pageId.pool = sd->index+1;
    pageId.number = slotId+1;
    sd->freeSlots->push(pageId);

/// adds slot to the free slot index but only if the slot is unused
Rock::Rebuild::freeSlotIfIdle(const SlotId slotId, const bool invalid)
    const LoadingEntry &le = entries[slotId];

    // mapped slots must be freed via freeBadEntry() to keep the map in sync

    if (!le.used())
        freeSlot(slotId, invalid);

/// adds slot to the entry chain in the map
Rock::Rebuild::mapSlot(const SlotId slotId, const DbCellHeader &header)
    LoadingEntry &le = entries[slotId];
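
    // mirror the on-disk chain link and payload size into the shared StoreMap slice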
    Ipc::StoreMapSlice slice;
    slice.next = header.nextSlot;
    slice.size = header.payloadSize;
    sd->map->importSlice(slotId, slice);

/// adds slot to an existing entry chain; caller must check that the slot
/// belongs to the chain it is being added to
Rock::Rebuild::addSlotToEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
    LoadingEntry &le = entries[fileno];
    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    assert(le.version == header.version);

    // mark anchor as loaded or add the secondary slot to the chain
    LoadingEntry &inode = entries[header.firstSlot];

    if (header.firstSlot == slotId) {
        debugs(47,5, "adding inode");
        assert(!inode.freed);
    } else {
        debugs(47,9, "linking " << slotId << " to " << inode.more);
        // we do not need to preserve the order
        LoadingEntry &slice = entries[slotId];
        assert(!slice.freed);
        assert(slice.more < 0);
        slice.more = inode.more;

    if (header.firstSlot == slotId && !importEntry(anchor, fileno, header)) {
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "corrupted metainfo");

    // set total entry size and/or check it for consistency
    debugs(47, 8, "header.entrySize: " << header.entrySize << " swap_file_sz: " << anchor.basics.swap_file_sz);
    uint64_t totalSize = header.entrySize;
    assert(totalSize != static_cast<uint64_t>(-1));
    if (!totalSize && anchor.basics.swap_file_sz) {
        assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
        // perhaps we loaded a later slot (with entrySize) earlier
        totalSize = anchor.basics.swap_file_sz;
    } else if (totalSize && !anchor.basics.swap_file_sz) {
        anchor.basics.swap_file_sz = totalSize;
        assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
    } else if (totalSize != anchor.basics.swap_file_sz) {
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "size mismatch");
    le.size += header.payloadSize;

    if (totalSize > 0 && le.size > totalSize) { // overflow
        debugs(47, 8, "overflow: " << le.size << " > " << totalSize);
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "overflowing");

    mapSlot(slotId, header);

    if (totalSize > 0 && le.size == totalSize) {
        // entry fully loaded, unlock it
        // we have validated that all db cells for this entry were loaded
        EBIT_SET(anchor.basics.flags, ENTRY_VALIDATED);
        le.state = LoadingEntry::leLoaded;
        sd->map->closeForWriting(fileno, false);

/// initialize housekeeping information for a newly accepted entry
Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor &anchor, const sfileno fileno, const DbCellHeader &header)
    anchor.setKey(reinterpret_cast<const cache_key*>(header.key));
    assert(header.firstSlot >= 0);
    anchor.start = header.firstSlot;

    assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));

    LoadingEntry &le = entries[fileno];
    le.state = LoadingEntry::leLoading;
    le.version = header.version;

/// handle a slot from an entry that we have not seen before
Rock::Rebuild::startNewEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
    // If some other from-disk entry is/was using this slot as its inode OR
    // if some other from-disk entry is/was using our inode slot, then the
    // entries are conflicting. We cannot identify other entries, so we just
    // remove ours and hope that the others were/will be handled correctly.
    const LoadingEntry &slice = entries[slotId];
    const LoadingEntry &inode = entries[header.firstSlot];
    if (slice.used() || inode.used()) {
        debugs(47,8, "slice/inode used: " << slice.used() << inode.used());
        LoadingEntry &le = entries[fileno];
        le.state = LoadingEntry::leCorrupted;
        freeSlotIfIdle(slotId, slotId == header.firstSlot);
        // if not idle, the other entry will handle its slice

    // A miss may have been stored at our fileno while we were loading other
    // slots from disk. We ought to preserve that entry because it is fresher.
    const bool overwriteExisting = false;
    if (Ipc::StoreMap::Anchor *anchor = sd->map->openForWritingAt(fileno, overwriteExisting)) {
        primeNewEntry(*anchor, fileno, header);
        addSlotToEntry(fileno, slotId, header); // may fail
        assert(anchor->basics.swap_file_sz != static_cast<uint64_t>(-1));
    } else {
        // A new from-network entry is occupying our map slot; let it be, but
        // save us from the trouble of going through the above motions again.
        LoadingEntry &le = entries[fileno];
        le.state = LoadingEntry::leIgnored;
        freeSlotIfIdle(slotId, false);

/// does the header belong to the fileno entry being loaded?
Rock::Rebuild::sameEntry(const sfileno fileno, const DbCellHeader &header) const
    const Ipc::StoreMap::Anchor &anchor = sd->map->writeableEntry(fileno);
    const LoadingEntry &le = entries[fileno];
    // any order will work, but do fast comparisons first:
    return le.version == header.version &&
           anchor.start == static_cast<Ipc::StoreMapSliceId>(header.firstSlot) &&
           anchor.sameKey(reinterpret_cast<const cache_key*>(header.key));

/// is the new header consistent with information already loaded?
Rock::Rebuild::canAdd(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) const
    if (!sameEntry(fileno, header)) {
        debugs(79, 7, "cannot add; wrong entry");

    const LoadingEntry &le = entries[slotId];
    // We cannot add a slot that was already declared free or mapped.
    if (le.freed || le.mapped) {
        debugs(79, 7, "cannot add; freed/mapped: " << le.freed << le.mapped);

    if (slotId == header.firstSlot) {
        // If we are the inode, the anchored flag cannot be set yet.
        if (entries[fileno].anchored) {
            debugs(79, 7, "cannot add; extra anchor");

        // And there should have been some other slot for this entry to exist.
            debugs(79, 7, "cannot add; missing slots");

    // We are the continuation slice so the more field is reserved for us.
        debugs(79, 7, "cannot add; foreign slot");

/// handle freshly loaded (and validated) db slot header
Rock::Rebuild::useNewSlot(const SlotId slotId, const DbCellHeader &header)
    LoadingEntry &slice = entries[slotId];
    assert(!slice.freed); // we cannot free what was not loaded
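
    // the key stored in the slot header determines which map anchor (fileno) this slot belongs to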
    const cache_key *const key =
        reinterpret_cast<const cache_key*>(header.key);
    const sfileno fileno = sd->map->anchorIndexByKey(key);
    assert(0 <= fileno && fileno < dbEntryLimit);

    LoadingEntry &le = entries[fileno];
    debugs(47,9, "entry " << fileno << " state: " << le.state << ", inode: " <<
           header.firstSlot << ", size: " << header.payloadSize);

    switch (le.state) {

    case LoadingEntry::leEmpty: {
        startNewEntry(fileno, slotId, header);

    case LoadingEntry::leLoading: {
        if (canAdd(fileno, slotId, header)) {
            addSlotToEntry(fileno, slotId, header);
        } else {
            // either the loading chain or this slot is stale;
            // be conservative and ignore both (and any future ones)
            le.state = LoadingEntry::leCorrupted;
            freeBadEntry(fileno, "duplicated");
            freeSlotIfIdle(slotId, slotId == header.firstSlot);

    case LoadingEntry::leLoaded: {
        // either the previously loaded chain or this slot is stale;
        // be conservative and ignore both (and any future ones)
        le.state = LoadingEntry::leCorrupted;
        sd->map->freeEntry(fileno); // may not be immediately successful
        freeSlotIfIdle(slotId, slotId == header.firstSlot);

    case LoadingEntry::leCorrupted: {
        // previously seen slots messed things up so we must ignore this one
        freeSlotIfIdle(slotId, false);

    case LoadingEntry::leIgnored: {
        // already replaced by a fresher or colliding from-network entry
        freeSlotIfIdle(slotId, false);