]>
Commit | Line | Data |
---|---|---|
e2851fe7 | 1 | /* |
f70aedc4 | 2 | * Copyright (C) 1996-2021 The Squid Software Foundation and contributors |
bbc27441 AJ |
3 | * |
4 | * Squid software is distributed under GPLv2+ license and includes | |
5 | * contributions from numerous individuals and organizations. | |
6 | * Please see the COPYING and CONTRIBUTORS files for details. | |
e2851fe7 AR |
7 | */ |
8 | ||
bbc27441 AJ |
9 | /* DEBUG: section 79 Disk IO Routines */ |
10 | ||
f7f3304a | 11 | #include "squid.h" |
2745fea5 | 12 | #include "base/AsyncJobCalls.h" |
c59baaa8 | 13 | #include "DebugMessages.h" |
602d9612 | 14 | #include "fs/rock/RockDbCell.h" |
e2851fe7 AR |
15 | #include "fs/rock/RockRebuild.h" |
16 | #include "fs/rock/RockSwapDir.h" | |
b3f7fd88 | 17 | #include "fs_io.h" |
67679543 | 18 | #include "globals.h" |
582c2af2 | 19 | #include "md5.h" |
8ecbe78d | 20 | #include "sbuf/Stream.h" |
386d28bf | 21 | #include "SquidTime.h" |
2745fea5 | 22 | #include "Store.h" |
602d9612 | 23 | #include "tools.h" |
e2851fe7 | 24 | |
1a30fdf5 | 25 | #include <cerrno> |
21d845b1 | 26 | |
e2851fe7 AR |
27 | CBDATA_NAMESPACED_CLASS_INIT(Rock, Rebuild); |
28 | ||
e4d13993 AR |
29 | /** |
30 | \defgroup RockFsRebuild Rock Store Rebuild | |
31 | \ingroup Filesystems | |
32 | * | |
f439fbd2 | 33 | \section RockFsRebuildOverview Overview |
e4d13993 AR |
 34 | * Several layers of information are manipulated during the rebuild: |
35 | \par | |
36 | * Store Entry: Response message plus all the metainformation associated with | |
37 | * it. Identified by store key. At any given time, from Squid point | |
38 | * of view, there is only one entry with a given key, but several | |
39 | * different entries with the same key can be observed in any historical | |
40 | * archive (such as an access log or a store database). | |
41 | \par | |
42 | * Slot chain: A sequence of db slots representing a Store Entry state at | |
43 | * some point in time. Identified by key+version combination. Due to | |
44 | * transaction aborts, crashes, and idle periods, some chains may contain | |
45 | * incomplete or stale information. We assume that no two different chains | |
46 | * have the same key and version. If that assumption fails, we may serve a | |
47 | * hodgepodge entry during rebuild, until "extra" slots are loaded/noticed. | |
48 | \par | |
abf396ec AR |
49 | * iNode: The very first db slot in an entry slot chain. This slot contains |
50 | * at least the beginning of Store Entry metadata, but most 32KB inodes contain | |
51 | * the entire metadata, HTTP headers, and HTTP body. | |
52 | \par | |
e4d13993 AR |
53 | * Db slot: A db record containing a piece of a single store entry and linked |
54 | * to other slots with the same key and version fields, forming a chain. | |
55 | * Slots are identified by their absolute position in the database file, | |
56 | * which is naturally unique. | |
57 | \par | |
e4d13993 AR |
58 | * When information from the newly loaded db slot contradicts the entry-level |
59 | * information collected so far (e.g., the versions do not match or the total | |
60 | * chain size after the slot contribution exceeds the expected number), the | |
61 | * whole entry (and not just the chain or the slot!) is declared corrupted. | |
62 | \par | |
63 | * Why invalidate the whole entry? Rock Store is written for high-load | |
64 | * environments with large caches, where there is usually very few idle slots | |
65 | * in the database. A space occupied by a purged entry is usually immediately | |
66 | * reclaimed. A Squid crash or a transaction abort is rather unlikely to | |
67 | * leave a relatively large number of stale slots in the database. Thus, the | |
68 | * number of potentially corrupted entries is relatively small. On the other | |
 69 | * hand, the damage from serving a single hodgepodge entry may be significant | |
70 | * to the user. In such an environment, invalidating the whole entry has | |
71 | * negligible performance impact but saves us from high-damage bugs. | |
72 | */ | |
73 | ||
9d4e9cfb AR |
namespace Rock
{

/// whether the loading stage has scanned all db slots
static bool
DoneLoading(const int64_t loadingPos, const int64_t dbSlotLimit)
{
    return loadingPos >= dbSlotLimit;
}

/// whether the validation stage has covered all entries (and, with
/// opt_store_doublecheck, all slots as well)
static bool
DoneValidating(const int64_t validationPos, const int64_t dbSlotLimit, const int64_t dbEntryLimit)
{
    // paranoid slot checking is only enabled with squid -S
    const auto extraWork = opt_store_doublecheck ? dbSlotLimit : 0;
    return validationPos >= (dbEntryLimit + extraWork);
}

/// low-level anti-padding storage class for LoadingEntry and LoadingSlot flags
class LoadingFlags
{
public:
    LoadingFlags(): state(0), anchored(0), mapped(0), finalized(0), freed(0) {}

    /* for LoadingEntry */
    uint8_t state:3; ///< current entry state (one of the LoadingEntry::State values)
    uint8_t anchored:1; ///< whether we loaded the inode slot for this entry

    /* for LoadingSlot */
    uint8_t mapped:1; ///< whether the slot was added to a mapped entry
    uint8_t finalized:1; ///< whether finalizeOrThrow() has scanned the slot
    uint8_t freed:1; ///< whether the slot was given to the map as free space
};

/// smart StoreEntry-level info pointer (hides anti-padding LoadingParts arrays)
class LoadingEntry
{
public:
    LoadingEntry(const sfileno fileNo, LoadingParts &source);

    uint64_t &size; ///< payload seen so far
    uint32_t &version; ///< DbCellHeader::version to distinguish same-URL chains

    /// possible store entry states during index rebuild
    typedef enum { leEmpty = 0, leLoading, leLoaded, leCorrupted, leIgnored } State;

    /* LoadingFlags::state */
    State state() const { return static_cast<State>(flags.state); }
    // const is viable here because the write goes through the external
    // LoadingFlags reference, not through this object's own state
    void state(State aState) const { flags.state = aState; }

    /* LoadingFlags::anchored */
    bool anchored() const { return flags.anchored; }
    void anchored(const bool beAnchored) { flags.anchored = beAnchored; }

private:
    LoadingFlags &flags; ///< entry flags (see the above accessors) are ours
};

/// smart db slot-level info pointer (hides anti-padding LoadingParts arrays)
class LoadingSlot
{
public:
    LoadingSlot(const SlotId slotId, LoadingParts &source);

    /// another slot in some chain belonging to the same entry (unordered!)
    Ipc::StoreMapSliceId &more;

    /* LoadingFlags::mapped */
    bool mapped() const { return flags.mapped; }
    void mapped(const bool beMapped) { flags.mapped = beMapped; }

    /* LoadingFlags::finalized */
    bool finalized() const { return flags.finalized; }
    void finalized(const bool beFinalized) { flags.finalized = beFinalized; }

    /* LoadingFlags::freed */
    bool freed() const { return flags.freed; }
    void freed(const bool beFreed) { flags.freed = beFreed; }

    /// whether the slot was already consumed: freed, mapped, or chained
    /// (-1 is the nil "more" value; see LoadingParts::LoadingParts())
    bool used() const { return freed() || mapped() || more != -1; }

private:
    LoadingFlags &flags; ///< slot flags (see the above accessors) are ours
};

/// information about store entries being loaded from disk (and their slots)
/// used for identifying partially stored/loaded entries
class LoadingParts
{
public:
    using Sizes = Ipc::StoreMapItems<uint64_t>;
    using Versions = Ipc::StoreMapItems<uint32_t>;
    using Mores = Ipc::StoreMapItems<Ipc::StoreMapSliceId>;
    using Flags = Ipc::StoreMapItems<LoadingFlags>;

    LoadingParts(const SwapDir &dir, const bool resuming);
    ~LoadingParts();

    // lacking copying/moving code and often too huge to copy
    LoadingParts(LoadingParts&&) = delete;

    /* accessors for the shared-memory-backed arrays */
    Sizes &sizes() const { return *sizesOwner->object(); }
    Versions &versions() const { return *versionsOwner->object(); }
    Mores &mores() const { return *moresOwner->object(); }
    Flags &flags() const { return *flagsOwner->object(); }

private:
    /* Anti-padding storage. With millions of entries, padding matters! */

    /* indexed by sfileno */
    Sizes::Owner *sizesOwner; ///< LoadingEntry::size for all entries
    Versions::Owner *versionsOwner; ///< LoadingEntry::version for all entries

    /* indexed by SlotId */
    Mores::Owner *moresOwner; ///< LoadingSlot::more for all slots

    /* entry flags are indexed by sfileno; slot flags -- by SlotId */
    Flags::Owner *flagsOwner; ///< all LoadingEntry and LoadingSlot flags
};

} /* namespace Rock */
194 | ||
abf396ec AR |
/* LoadingEntry */

/// binds this entry-level view to the fileNo-indexed LoadingParts cells
Rock::LoadingEntry::LoadingEntry(const sfileno fileNo, LoadingParts &source):
    size(source.sizes().at(fileNo)),
    version(source.versions().at(fileNo)),
    flags(source.flags().at(fileNo))
{
}
203 | ||
/* LoadingSlot */

/// binds this slot-level view to the slotId-indexed LoadingParts cells
Rock::LoadingSlot::LoadingSlot(const SlotId slotId, LoadingParts &source):
    more(source.mores().at(slotId)),
    flags(source.flags().at(slotId))
{
}
211 | ||
/* LoadingParts */

/// attaches to an existing segment (when resuming an interrupted rebuild) or
/// creates a fresh one for a single LoadingParts shared-memory array
template <class T>
inline typename T::Owner *
createOwner(const char *dirPath, const char *sfx, const int64_t limit, const bool resuming)
{
    auto id = Ipc::Mem::Segment::Name(SBuf(dirPath), sfx);
    return resuming ? Ipc::Mem::Owner<T>::Old(id.c_str()) : shm_new(T)(id.c_str(), limit);
}
221 | ||
/// allocates (or, when resuming, reattaches) all four per-entry/per-slot
/// shared-memory arrays for the given cache_dir
Rock::LoadingParts::LoadingParts(const SwapDir &dir, const bool resuming):
    sizesOwner(createOwner<Sizes>(dir.path, "rebuild_sizes", dir.entryLimitActual(), resuming)),
    versionsOwner(createOwner<Versions>(dir.path, "rebuild_versions", dir.entryLimitActual(), resuming)),
    moresOwner(createOwner<Mores>(dir.path, "rebuild_mores", dir.slotLimitActual(), resuming)),
    flagsOwner(createOwner<Flags>(dir.path, "rebuild_flags", dir.slotLimitActual(), resuming))
{
    assert(sizes().capacity == versions().capacity); // every entry has both fields
    assert(sizes().capacity <= mores().capacity); // every entry needs slot(s)
    assert(mores().capacity == flags().capacity); // every slot needs a set of flags

    if (!resuming) {
        // other parts rely on shared memory segments being zero-initialized
        // TODO: refactor the next slot pointer to use 0 for nil values
        mores().fill(-1);
    }
}
238 | ||
/// releases the indexing-only shared memory segments
Rock::LoadingParts::~LoadingParts()
{
    delete sizesOwner;
    delete versionsOwner;
    delete moresOwner;
    delete flagsOwner;
}
246 | ||
/* Rock::Rebuild::Stats */

/// the name of the shared memory segment holding rebuild statistics for dirPath
SBuf
Rock::Rebuild::Stats::Path(const char *dirPath)
{
    return Ipc::Mem::Segment::Name(SBuf(dirPath), "rebuild_stats");
}
254 | ||
/// creates the shared memory segment that stores rebuild statistics
Ipc::Mem::Owner<Rock::Rebuild::Stats>*
Rock::Rebuild::Stats::Init(const SwapDir &dir)
{
    return shm_new(Stats)(Path(dir.path).c_str());
}
260 | ||
/// whether both rebuild stages (loading and validation) have finished
/// for the given cache_dir, according to the persisted counters
bool
Rock::Rebuild::Stats::completed(const SwapDir &dir) const
{
    return DoneLoading(counts.scancount, dir.slotLimitActual()) &&
           DoneValidating(counts.validations, dir.slotLimitActual(), dir.entryLimitActual());
}
267 | ||
/* Rebuild */

/// whether the current process is the one that should index this cache_dir
bool
Rock::Rebuild::IsResponsible(const SwapDir &)
{
    // in SMP mode, only the disker is responsible for populating the map
    return !UsingSmp() || IamDiskProcess();
}
276 | ||
/// starts a Rebuild job for the given cache_dir unless this process is not
/// responsible for it or the indexing is already complete
/// \returns whether a new rebuild job was started
bool
Rock::Rebuild::Start(SwapDir &dir)
{
    if (!IsResponsible(dir)) {
        debugs(47, 2, "not responsible for indexing cache_dir #" <<
               dir.index << " from " << dir.filePath);
        return false;
    }

    const auto stats = shm_old(Rebuild::Stats)(Stats::Path(dir.path).c_str());
    if (stats->completed(dir)) {
        debugs(47, 2, "already indexed cache_dir #" <<
               dir.index << " from " << dir.filePath);
        return false;
    }

    AsyncJob::Start(new Rebuild(&dir, stats));
    return true;
}
296 | ||
/// \param s shared rebuild statistics; loadingPos/validationPos resume from
///          its counters so an interrupted rebuild continues where it stopped
Rock::Rebuild::Rebuild(SwapDir *dir, const Ipc::Mem::Pointer<Stats> &s): AsyncJob("Rock::Rebuild"),
    sd(dir),
    parts(nullptr),
    stats(s),
    dbSize(0),
    dbSlotSize(0),
    dbSlotLimit(0),
    dbEntryLimit(0),
    fd(-1),
    dbOffset(0),
    loadingPos(stats->counts.scancount),
    validationPos(stats->counts.validations),
    counts(stats->counts),
    resuming(stats->counts.started())
{
    assert(sd);
    dbSize = sd->diskOffsetLimit(); // we do not care about the trailer waste
    dbSlotSize = sd->slotSize;
    dbEntryLimit = sd->entryLimitActual();
    dbSlotLimit = sd->slotLimitActual();
    assert(dbEntryLimit <= dbSlotLimit);
    registerRunner(); // so that startShutdown() is called on shutdown
}
320 | ||
Rock::Rebuild::~Rebuild()
{
    // release the db file descriptor, if we opened one in start()
    if (fd >= 0)
        file_close(fd);
    // normally, segments are used until the Squid instance quits,
    // but these indexing-only segments are no longer needed
    delete parts;
}
329 | ||
8ecbe78d EB |
/// RegisteredRunner API: stop indexing so that Squid can shut down promptly;
/// the persisted Stats counters let a future instance resume where we stopped
void
Rock::Rebuild::startShutdown()
{
    mustStop("startShutdown");
}
335 | ||
e2851fe7 AR |
/// prepares and initiates entry loading sequence
void
Rock::Rebuild::start()
{
    assert(IsResponsible(*sd));

    if (!resuming) {
        debugs(47, Important(18), "Loading cache_dir #" << sd->index <<
               " from " << sd->filePath);
    } else {
        debugs(47, Important(63), "Resuming indexing cache_dir #" << sd->index <<
               " from " << sd->filePath << ':' << progressDescription());
    }

    fd = file_open(sd->filePath, O_RDONLY | O_BINARY);
    if (fd < 0)
        failure("cannot open db", errno);

    // consume (and discard) the db header before slot-by-slot scanning
    char hdrBuf[SwapDir::HeaderSize];
    if (read(fd, hdrBuf, sizeof(hdrBuf)) != SwapDir::HeaderSize)
        failure("cannot read db header", errno);

    // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours
    assert(sizeof(DbCellHeader) < SM_PAGE_SIZE);
    buf.init(SM_PAGE_SIZE, SM_PAGE_SIZE);

    // when resuming, loadingPos is non-zero, so we seek past the done slots
    dbOffset = SwapDir::HeaderSize + loadingPos * dbSlotSize;

    assert(!parts);
    parts = new LoadingParts(*sd, resuming);

    counts.updateStartTime(current_time);

    checkpoint();
}
371 | ||
078274f6 | 372 | /// continues after a pause if not done |
e2851fe7 AR |
373 | void |
374 | Rock::Rebuild::checkpoint() | |
375 | { | |
50dc81ec | 376 | if (!done()) |
e2851fe7 | 377 | eventAdd("Rock::Rebuild", Rock::Rebuild::Steps, this, 0.01, 1, true); |
078274f6 AR |
378 | } |
379 | ||
abf396ec AR |
/// whether we have loaded all db slots for this cache_dir
bool
Rock::Rebuild::doneLoading() const
{
    return DoneLoading(loadingPos, dbSlotLimit);
}
385 | ||
/// whether we have validated all loaded entries (and, with -S, slots)
bool
Rock::Rebuild::doneValidating() const
{
    return DoneValidating(validationPos, dbSlotLimit, dbEntryLimit);
}
391 | ||
078274f6 AR |
/// AsyncJob API: the job ends when both rebuild stages are finished
bool
Rock::Rebuild::doneAll() const
{
    return doneLoading() && doneValidating() && AsyncJob::doneAll();
}
397 | ||
/// eventAdd() callback: resumes the rebuild job identified by `data`
void
Rock::Rebuild::Steps(void *data)
{
    // use async call to enable job call protection that time events lack
    CallJobHere(47, 5, static_cast<Rebuild*>(data), Rock::Rebuild, steps);
}
404 | ||
93910d5c | 405 | void |
50dc81ec | 406 | Rock::Rebuild::steps() |
93910d5c | 407 | { |
abf396ec | 408 | if (!doneLoading()) |
50dc81ec AR |
409 | loadingSteps(); |
410 | else | |
411 | validationSteps(); | |
412 | ||
413 | checkpoint(); | |
93910d5c AR |
414 | } |
415 | ||
e2851fe7 | 416 | void |
50dc81ec | 417 | Rock::Rebuild::loadingSteps() |
9199139f | 418 | { |
539283df | 419 | debugs(47,5, sd->index << " slot " << loadingPos << " at " << |
9199139f | 420 | dbOffset << " <= " << dbSize); |
e2851fe7 | 421 | |
386d28bf | 422 | // Balance our desire to maximize the number of entries processed at once |
9199139f | 423 | // (and, hence, minimize overheads and total rebuild time) with a |
386d28bf AR |
424 | // requirement to also process Coordinator events, disk I/Os, etc. |
425 | const int maxSpentMsec = 50; // keep small: most RAM I/Os are under 1ms | |
426 | const timeval loopStart = current_time; | |
427 | ||
8ecbe78d | 428 | int64_t loaded = 0; |
abf396ec | 429 | while (!doneLoading()) { |
50dc81ec | 430 | loadOneSlot(); |
36c84e19 | 431 | dbOffset += dbSlotSize; |
6d68a230 | 432 | ++loadingPos; |
386d28bf | 433 | ++loaded; |
e2851fe7 AR |
434 | |
435 | if (counts.scancount % 1000 == 0) | |
36c84e19 | 436 | storeRebuildProgress(sd->index, dbSlotLimit, counts.scancount); |
386d28bf AR |
437 | |
438 | if (opt_foreground_rebuild) | |
439 | continue; // skip "few entries at a time" check below | |
440 | ||
441 | getCurrentTime(); | |
442 | const double elapsedMsec = tvSubMsec(loopStart, current_time); | |
443 | if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) { | |
444 | debugs(47, 5, HERE << "pausing after " << loaded << " entries in " << | |
9199139f | 445 | elapsedMsec << "ms; " << (elapsedMsec/loaded) << "ms per entry"); |
386d28bf AR |
446 | break; |
447 | } | |
448 | } | |
e2851fe7 AR |
449 | } |
450 | ||
abf396ec AR |
/// a LoadingEntry view of the already-allocated LoadingParts cells for fileNo
Rock::LoadingEntry
Rock::Rebuild::loadingEntry(const sfileno fileNo)
{
    Must(0 <= fileNo && fileNo < dbEntryLimit);
    return LoadingEntry(fileNo, *parts);
}
457 | ||
/// a LoadingSlot view of the already-allocated LoadingParts cells for slotId
Rock::LoadingSlot
Rock::Rebuild::loadingSlot(const SlotId slotId)
{
    Must(0 <= slotId && slotId < dbSlotLimit);
    Must(slotId <= loadingPos); // cannot look ahead
    return LoadingSlot(slotId, *parts);
}
465 | ||
93910d5c | 466 | void |
50dc81ec | 467 | Rock::Rebuild::loadOneSlot() |
93910d5c | 468 | { |
539283df | 469 | debugs(47,5, sd->index << " slot " << loadingPos << " at " << |
9199139f | 470 | dbOffset << " <= " << dbSize); |
e2851fe7 | 471 | |
8ecbe78d EB |
472 | // increment before loadingPos to avoid getting stuck at a slot |
473 | // in a case of crash | |
c728b6f9 AR |
474 | ++counts.scancount; |
475 | ||
e2851fe7 AR |
476 | if (lseek(fd, dbOffset, SEEK_SET) < 0) |
477 | failure("cannot seek to db entry", errno); | |
478 | ||
50dc81ec | 479 | buf.reset(); |
c728b6f9 AR |
480 | |
481 | if (!storeRebuildLoadEntry(fd, sd->index, buf, counts)) | |
482 | return; | |
483 | ||
6d68a230 | 484 | const SlotId slotId = loadingPos; |
93910d5c | 485 | |
50dc81ec AR |
486 | // get our header |
487 | DbCellHeader header; | |
c728b6f9 | 488 | if (buf.contentSize() < static_cast<mb_size_t>(sizeof(header))) { |
51618c6a | 489 | debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " << |
ce44c1ea AR |
490 | "Ignoring truncated " << buf.contentSize() << "-byte " << |
491 | "cache entry meta data at " << dbOffset); | |
abf396ec | 492 | freeUnusedSlot(slotId, true); |
c728b6f9 AR |
493 | return; |
494 | } | |
50dc81ec AR |
495 | memcpy(&header, buf.content(), sizeof(header)); |
496 | if (header.empty()) { | |
abf396ec | 497 | freeUnusedSlot(slotId, false); |
50dc81ec AR |
498 | return; |
499 | } | |
36c84e19 | 500 | if (!header.sane(dbSlotSize, dbSlotLimit)) { |
51618c6a | 501 | debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " << |
9199139f | 502 | "Ignoring malformed cache entry meta data at " << dbOffset); |
abf396ec | 503 | freeUnusedSlot(slotId, true); |
e2851fe7 | 504 | return; |
9199139f | 505 | } |
50dc81ec AR |
506 | buf.consume(sizeof(header)); // optimize to avoid memmove() |
507 | ||
508 | useNewSlot(slotId, header); | |
509 | } | |
510 | ||
/// parse StoreEntry basics and add them to the map, returning true on success
bool
Rock::Rebuild::importEntry(Ipc::StoreMapAnchor &anchor, const sfileno fileno, const DbCellHeader &header)
{
    cache_key key[SQUID_MD5_DIGEST_LENGTH];
    StoreEntry loadedE;
    // prefer the inode-recorded size; fall back on any size already anchored
    const uint64_t knownSize = header.entrySize > 0 ?
                               header.entrySize : anchor.basics.swap_file_sz.load();
    if (!storeRebuildParseEntry(buf, loadedE, key, counts, knownSize))
        return false;

    // the entry size may be unknown, but if it is known, it is authoritative

    debugs(47, 8, "importing basics for entry " << fileno <<
           " inode.entrySize: " << header.entrySize <<
           " swap_file_sz: " << loadedE.swap_file_sz);
    anchor.set(loadedE);

    // we have not validated whether all db cells for this entry were loaded
    EBIT_CLR(anchor.basics.flags, ENTRY_VALIDATED);

    // loadedE->dump(5);

    return true;
}
e2851fe7 | 536 | |
93910d5c | 537 | void |
50dc81ec | 538 | Rock::Rebuild::validationSteps() |
93910d5c | 539 | { |
50dc81ec | 540 | debugs(47, 5, sd->index << " validating from " << validationPos); |
93910d5c | 541 | |
50dc81ec AR |
542 | // see loadingSteps() for the rationale; TODO: avoid duplication |
543 | const int maxSpentMsec = 50; // keep small: validation does not do I/O | |
544 | const timeval loopStart = current_time; | |
e2851fe7 | 545 | |
8ecbe78d | 546 | int64_t validated = 0; |
abf396ec | 547 | while (!doneValidating()) { |
8ecbe78d EB |
548 | // increment before validationPos to avoid getting stuck at a slot |
549 | // in a case of crash | |
550 | ++counts.validations; | |
abf396ec AR |
551 | if (validationPos < dbEntryLimit) |
552 | validateOneEntry(validationPos); | |
553 | else | |
554 | validateOneSlot(validationPos - dbEntryLimit); | |
50dc81ec AR |
555 | ++validationPos; |
556 | ++validated; | |
93910d5c | 557 | |
50dc81ec AR |
558 | if (validationPos % 1000 == 0) |
559 | debugs(20, 2, "validated: " << validationPos); | |
e2851fe7 | 560 | |
50dc81ec AR |
561 | if (opt_foreground_rebuild) |
562 | continue; // skip "few entries at a time" check below | |
563 | ||
564 | getCurrentTime(); | |
565 | const double elapsedMsec = tvSubMsec(loopStart, current_time); | |
566 | if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) { | |
567 | debugs(47, 5, "pausing after " << validated << " entries in " << | |
568 | elapsedMsec << "ms; " << (elapsedMsec/validated) << "ms per entry"); | |
569 | break; | |
570 | } | |
571 | } | |
572 | } | |
573 | ||
abf396ec AR |
/// Either make the entry accessible to all or throw.
/// This method assumes it is called only when no more entry slots are expected.
void
Rock::Rebuild::finalizeOrThrow(const sfileno fileNo, LoadingEntry &le)
{
    // walk all map-linked slots, starting from inode, and mark each
    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileNo);
    Must(le.size > 0); // paranoid
    uint64_t mappedSize = 0;
    SlotId slotId = anchor.start;
    while (slotId >= 0 && mappedSize < le.size) {
        LoadingSlot slot = loadingSlot(slotId); // throws if we have not loaded that slot
        Must(!slot.finalized()); // no loops or stealing from other entries
        Must(slot.mapped()); // all our slots should be in the sd->map
        Must(!slot.freed()); // all our slots should still be present
        slot.finalized(true);

        Ipc::StoreMapSlice &mapSlice = sd->map->writeableSlice(fileNo, slotId);
        Must(mapSlice.size > 0); // paranoid
        mappedSize += mapSlice.size;
        slotId = mapSlice.next;
    }
    /* no hodgepodge entries: one entry - one full chain and no leftovers */
    Must(slotId < 0);
    Must(mappedSize == le.size);

    if (!anchor.basics.swap_file_sz)
        anchor.basics.swap_file_sz = le.size;
    EBIT_SET(anchor.basics.flags, ENTRY_VALIDATED);
    le.state(LoadingEntry::leLoaded);
    sd->map->closeForWriting(fileNo); // makes the entry readable by others
    ++counts.objcount;
}
607 | ||
/// Either make the entry accessible to all or free it.
/// This method must only be called when no more entry slots are expected.
void
Rock::Rebuild::finalizeOrFree(const sfileno fileNo, LoadingEntry &le)
{
    try {
        finalizeOrThrow(fileNo, le);
    } catch (const std::exception &ex) {
        // any validation failure invalidates the whole entry (see Overview)
        freeBadEntry(fileNo, ex.what());
    }
}
50dc81ec | 619 | |
abf396ec AR |
/// finalizes the entry at fileNo if it is still in the leLoading state
void
Rock::Rebuild::validateOneEntry(const sfileno fileNo)
{
    LoadingEntry entry = loadingEntry(fileNo);
    switch (entry.state()) {

    case LoadingEntry::leLoading:
        finalizeOrFree(fileNo, entry);
        break;

    case LoadingEntry::leEmpty: // no entry hashed to this position
    case LoadingEntry::leLoaded: // we have already unlocked this entry
    case LoadingEntry::leCorrupted: // we have already removed this entry
    case LoadingEntry::leIgnored: // we have already discarded this entry
        break;
    }
}
637 | ||
abf396ec AR |
/// checks that the slot at slotId was fully processed during the rebuild
void
Rock::Rebuild::validateOneSlot(const SlotId slotId)
{
    const LoadingSlot slot = loadingSlot(slotId);
    // there should not be any unprocessed slots left
    Must(slot.freed() || (slot.mapped() && slot.finalized()));
}
645 | ||
50dc81ec AR |
/// Marks remaining bad entry slots as free and unlocks the entry. The map
/// cannot do this because Loading entries may have holes in the slots chain.
void
Rock::Rebuild::freeBadEntry(const sfileno fileno, const char *eDescription)
{
    debugs(47, 2, "cache_dir #" << sd->index << ' ' << eDescription <<
           " entry " << fileno << " is ignored during rebuild");

    LoadingEntry le = loadingEntry(fileno);
    le.state(LoadingEntry::leCorrupted);

    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);
    assert(anchor.start < 0 || le.size > 0);
    // free every slot reachable through the rebuild-time "more" links
    for (SlotId slotId = anchor.start; slotId >= 0;) {
        const SlotId next = loadingSlot(slotId).more;
        freeSlot(slotId, true);
        slotId = next;
    }

    sd->map->forgetWritingEntry(fileno);
}
667 | ||
668 | void | |
9199139f AR |
669 | Rock::Rebuild::swanSong() |
670 | { | |
078274f6 | 671 | debugs(47,3, HERE << "cache_dir #" << sd->index << " rebuild level: " << |
9199139f | 672 | StoreController::store_dirs_rebuilding); |
e2851fe7 | 673 | storeRebuildComplete(&counts); |
e2851fe7 AR |
674 | } |
675 | ||
/// logs the rebuild failure context and terminates Squid
/// \param errNo errno value at the time of failure; 0 if not applicable
void
Rock::Rebuild::failure(const char *msg, int errNo)
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    if (errNo)
        debugs(47, DBG_CRITICAL, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo));
    debugs(47, DBG_CRITICAL, "Do you need to run 'squid -z' to initialize storage?");

    assert(sd);
    fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.",
           sd->index, sd->filePath, msg);
}
93910d5c | 690 | |
50dc81ec AR |
/// adds slot to the free slot index
void
Rock::Rebuild::freeSlot(const SlotId slotId, const bool invalid)
{
    debugs(47,5, sd->index << " frees slot " << slotId);
    LoadingSlot slot = loadingSlot(slotId);
    assert(!slot.freed());
    slot.freed(true);

    if (invalid) {
        ++counts.invalid;
        //sd->unlink(fileno); leave garbage on disk, it should not hurt
    }

    Ipc::Mem::PageId pageId;
    pageId.pool = Ipc::Mem::PageStack::IdForSwapDirSpace(sd->index);
    pageId.number = slotId+1; // the free slot index uses 1-based page numbers
    sd->freeSlots->push(pageId);
}
710 | ||
abf396ec | 711 | /// freeSlot() for never-been-mapped slots |
50dc81ec | 712 | void |
abf396ec | 713 | Rock::Rebuild::freeUnusedSlot(const SlotId slotId, const bool invalid) |
50dc81ec | 714 | { |
abf396ec | 715 | LoadingSlot slot = loadingSlot(slotId); |
50dc81ec | 716 | // mapped slots must be freed via freeBadEntry() to keep the map in sync |
abf396ec AR |
717 | assert(!slot.mapped()); |
718 | freeSlot(slotId, invalid); | |
50dc81ec AR |
719 | } |
720 | ||
/// adds slot to the entry chain in the map
void
Rock::Rebuild::mapSlot(const SlotId slotId, const DbCellHeader &header)
{
    LoadingSlot slot = loadingSlot(slotId);
    assert(!slot.mapped());
    assert(!slot.freed());
    slot.mapped(true);

    // convert the on-disk header into an in-memory map slice
    Ipc::StoreMapSlice slice;
    slice.next = header.nextSlot;
    slice.size = header.payloadSize;
    sd->map->importSlice(slotId, slice);
}
735 | ||
abf396ec AR |
/// prepends slot `to` to the rebuild-time chain that starts at `from`
template <class SlotIdType> // accommodates atomic and simple SlotIds.
void
Rock::Rebuild::chainSlots(SlotIdType &from, const SlotId to)
{
    LoadingSlot slot = loadingSlot(to);
    assert(slot.more < 0); // the slot must not be chained yet
    slot.more = from; // may still be unset
    from = to;
}
745 | ||
/// adds slot to an existing entry chain; caller must check that the slot
/// belongs to the chain it is being added to
void
Rock::Rebuild::addSlotToEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry le = loadingEntry(fileno);
    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    debugs(47,9, "adding " << slotId << " to entry " << fileno);
    // we do not need to preserve the order
    if (le.anchored()) {
        // chain after the already-known inode slot
        LoadingSlot inode = loadingSlot(anchor.start);
        chainSlots(inode.more, slotId);
    } else {
        // no inode seen yet; chain off the anchor itself
        chainSlots(anchor.start, slotId);
    }

    le.size += header.payloadSize; // must precede freeBadEntry() calls

    // is this slot the chain inode (i.e., the first slot of the entry)?
    if (header.firstSlot == slotId) {
        debugs(47,5, "added inode");

        if (le.anchored()) { // we have already added another inode slot
            freeBadEntry(fileno, "inode conflict");
            ++counts.clashcount;
            return;
        }

        le.anchored(true);

        if (!importEntry(anchor, fileno, header)) {
            freeBadEntry(fileno, "corrupted metainfo");
            return;
        }

        // set total entry size and/or check it for consistency
        if (const uint64_t totalSize = header.entrySize) {
            assert(totalSize != static_cast<uint64_t>(-1));
            if (!anchor.basics.swap_file_sz) {
                anchor.basics.swap_file_sz = totalSize;
                assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
            } else if (totalSize != anchor.basics.swap_file_sz) {
                freeBadEntry(fileno, "size mismatch");
                return;
            }
        }
    }

    const uint64_t totalSize = anchor.basics.swap_file_sz; // may be 0/unknown

    if (totalSize > 0 && le.size > totalSize) { // overflow
        debugs(47, 8, "overflow: " << le.size << " > " << totalSize);
        freeBadEntry(fileno, "overflowing");
        return;
    }

    mapSlot(slotId, header);
    if (totalSize > 0 && le.size == totalSize)
        finalizeOrFree(fileno, le); // entry is probably fully loaded now
}
806 | ||
807 | /// initialize housekeeping information for a newly accepted entry | |
808 | void | |
809 | Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor &anchor, const sfileno fileno, const DbCellHeader &header) | |
810 | { | |
811 | anchor.setKey(reinterpret_cast<const cache_key*>(header.key)); | |
812 | assert(header.firstSlot >= 0); | |
abf396ec | 813 | anchor.start = -1; // addSlotToEntry() will set it |
50dc81ec AR |
814 | |
815 | assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1)); | |
816 | ||
abf396ec AR |
817 | LoadingEntry le = loadingEntry(fileno); |
818 | le.state(LoadingEntry::leLoading); | |
50dc81ec AR |
819 | le.version = header.version; |
820 | le.size = 0; | |
821 | } | |
822 | ||
823 | /// handle a slot from an entry that we have not seen before | |
824 | void | |
825 | Rock::Rebuild::startNewEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) | |
826 | { | |
50dc81ec AR |
827 | // A miss may have been stored at our fileno while we were loading other |
828 | // slots from disk. We ought to preserve that entry because it is fresher. | |
829 | const bool overwriteExisting = false; | |
830 | if (Ipc::StoreMap::Anchor *anchor = sd->map->openForWritingAt(fileno, overwriteExisting)) { | |
831 | primeNewEntry(*anchor, fileno, header); | |
832 | addSlotToEntry(fileno, slotId, header); // may fail | |
833 | assert(anchor->basics.swap_file_sz != static_cast<uint64_t>(-1)); | |
834 | } else { | |
835 | // A new from-network entry is occupying our map slot; let it be, but | |
836 | // save us from the trouble of going through the above motions again. | |
abf396ec AR |
837 | LoadingEntry le = loadingEntry(fileno); |
838 | le.state(LoadingEntry::leIgnored); | |
839 | freeUnusedSlot(slotId, false); | |
50dc81ec AR |
840 | } |
841 | } | |
842 | ||
843 | /// does the header belong to the fileno entry being loaded? | |
844 | bool | |
845 | Rock::Rebuild::sameEntry(const sfileno fileno, const DbCellHeader &header) const | |
846 | { | |
abf396ec AR |
847 | // Header updates always result in multi-start chains and often |
848 | // result in multi-version chains so we can only compare the keys. | |
50dc81ec | 849 | const Ipc::StoreMap::Anchor &anchor = sd->map->writeableEntry(fileno); |
abf396ec | 850 | return anchor.sameKey(reinterpret_cast<const cache_key*>(header.key)); |
50dc81ec AR |
851 | } |
852 | ||
853 | /// handle freshly loaded (and validated) db slot header | |
854 | void | |
855 | Rock::Rebuild::useNewSlot(const SlotId slotId, const DbCellHeader &header) | |
856 | { | |
50dc81ec AR |
857 | const cache_key *const key = |
858 | reinterpret_cast<const cache_key*>(header.key); | |
abf396ec | 859 | const sfileno fileno = sd->map->fileNoByKey(key); |
50dc81ec AR |
860 | assert(0 <= fileno && fileno < dbEntryLimit); |
861 | ||
abf396ec AR |
862 | LoadingEntry le = loadingEntry(fileno); |
863 | debugs(47,9, "entry " << fileno << " state: " << le.state() << ", inode: " << | |
9d4e9cfb | 864 | header.firstSlot << ", size: " << header.payloadSize); |
50dc81ec | 865 | |
abf396ec | 866 | switch (le.state()) { |
50dc81ec AR |
867 | |
868 | case LoadingEntry::leEmpty: { | |
869 | startNewEntry(fileno, slotId, header); | |
870 | break; | |
871 | } | |
872 | ||
873 | case LoadingEntry::leLoading: { | |
abf396ec AR |
874 | if (sameEntry(fileno, header)) { |
875 | addSlotToEntry(fileno, slotId, header); // may fail | |
50dc81ec AR |
876 | } else { |
877 | // either the loading chain or this slot is stale; | |
878 | // be conservative and ignore both (and any future ones) | |
50dc81ec | 879 | freeBadEntry(fileno, "duplicated"); |
abf396ec | 880 | freeUnusedSlot(slotId, true); |
50dc81ec AR |
881 | ++counts.dupcount; |
882 | } | |
883 | break; | |
884 | } | |
885 | ||
886 | case LoadingEntry::leLoaded: { | |
887 | // either the previously loaded chain or this slot is stale; | |
888 | // be conservative and ignore both (and any future ones) | |
abf396ec | 889 | le.state(LoadingEntry::leCorrupted); |
50dc81ec | 890 | sd->map->freeEntry(fileno); // may not be immediately successful |
abf396ec | 891 | freeUnusedSlot(slotId, true); |
50dc81ec AR |
892 | ++counts.dupcount; |
893 | break; | |
894 | } | |
895 | ||
896 | case LoadingEntry::leCorrupted: { | |
897 | // previously seen slots messed things up so we must ignore this one | |
abf396ec | 898 | freeUnusedSlot(slotId, true); |
50dc81ec AR |
899 | break; |
900 | } | |
901 | ||
902 | case LoadingEntry::leIgnored: { | |
903 | // already replaced by a fresher or colliding from-network entry | |
abf396ec | 904 | freeUnusedSlot(slotId, false); |
50dc81ec AR |
905 | break; |
906 | } | |
907 | } | |
93910d5c | 908 | } |
f53969cc | 909 | |
8ecbe78d EB |
910 | SBuf |
911 | Rock::Rebuild::progressDescription() const | |
912 | { | |
913 | SBufStream str; | |
914 | ||
915 | str << Debug::Extra << "slots loaded: " << Progress(loadingPos, dbSlotLimit); | |
916 | ||
917 | const auto validatingEntries = validationPos < dbEntryLimit; | |
918 | const auto entriesValidated = validatingEntries ? validationPos : dbEntryLimit; | |
919 | str << Debug::Extra << "entries validated: " << Progress(entriesValidated, dbEntryLimit); | |
920 | if (opt_store_doublecheck) { | |
921 | const auto slotsValidated = validatingEntries ? 0 : (validationPos - dbEntryLimit); | |
922 | str << Debug::Extra << "slots validated: " << Progress(slotsValidated, dbSlotLimit); | |
923 | } | |
924 | ||
925 | return str.buf(); | |
926 | } | |
927 |