]>
Commit | Line | Data |
---|---|---|
e2851fe7 | 1 | /* |
5b74111a | 2 | * Copyright (C) 1996-2018 The Squid Software Foundation and contributors |
bbc27441 AJ |
3 | * |
4 | * Squid software is distributed under GPLv2+ license and includes | |
5 | * contributions from numerous individuals and organizations. | |
6 | * Please see the COPYING and CONTRIBUTORS files for details. | |
e2851fe7 AR |
7 | */ |
8 | ||
bbc27441 AJ |
9 | /* DEBUG: section 79 Disk IO Routines */ |
10 | ||
f7f3304a | 11 | #include "squid.h" |
2745fea5 | 12 | #include "base/AsyncJobCalls.h" |
602d9612 | 13 | #include "fs/rock/RockDbCell.h" |
e2851fe7 AR |
14 | #include "fs/rock/RockRebuild.h" |
15 | #include "fs/rock/RockSwapDir.h" | |
b3f7fd88 | 16 | #include "fs_io.h" |
67679543 | 17 | #include "globals.h" |
dcd84f80 | 18 | #include "ipc/StoreMap.h" |
582c2af2 | 19 | #include "md5.h" |
386d28bf | 20 | #include "SquidTime.h" |
2745fea5 | 21 | #include "Store.h" |
b3f7fd88 | 22 | #include "store_rebuild.h" |
602d9612 | 23 | #include "tools.h" |
e2851fe7 | 24 | |
1a30fdf5 | 25 | #include <cerrno> |
21d845b1 | 26 | |
e2851fe7 AR |
27 | CBDATA_NAMESPACED_CLASS_INIT(Rock, Rebuild); |
28 | ||
e4d13993 AR |
29 | /** |
30 | \defgroup RockFsRebuild Rock Store Rebuild | |
31 | \ingroup Filesystems | |
32 | * | |
33 | \section Overview Overview | |
34 | * Several layers of information are manipulated during the rebuild: |
35 | \par | |
36 | * Store Entry: Response message plus all the metainformation associated with | |
37 | * it. Identified by store key. At any given time, from Squid point | |
38 | * of view, there is only one entry with a given key, but several | |
39 | * different entries with the same key can be observed in any historical | |
40 | * archive (such as an access log or a store database). | |
41 | \par | |
42 | * Slot chain: A sequence of db slots representing a Store Entry state at | |
43 | * some point in time. Identified by key+version combination. Due to | |
44 | * transaction aborts, crashes, and idle periods, some chains may contain | |
45 | * incomplete or stale information. We assume that no two different chains | |
46 | * have the same key and version. If that assumption fails, we may serve a | |
47 | * hodgepodge entry during rebuild, until "extra" slots are loaded/noticed. | |
48 | \par | |
abf396ec AR |
49 | * iNode: The very first db slot in an entry slot chain. This slot contains |
50 | * at least the beginning of Store Entry metadata, but most 32KB inodes contain | |
51 | * the entire metadata, HTTP headers, and HTTP body. | |
52 | \par | |
e4d13993 AR |
53 | * Db slot: A db record containing a piece of a single store entry and linked |
54 | * to other slots with the same key and version fields, forming a chain. | |
55 | * Slots are identified by their absolute position in the database file, | |
56 | * which is naturally unique. | |
57 | \par | |
e4d13993 AR |
58 | * When information from the newly loaded db slot contradicts the entry-level |
59 | * information collected so far (e.g., the versions do not match or the total | |
60 | * chain size after the slot contribution exceeds the expected number), the | |
61 | * whole entry (and not just the chain or the slot!) is declared corrupted. | |
62 | \par | |
63 | * Why invalidate the whole entry? Rock Store is written for high-load | |
64 | * environments with large caches, where there are usually very few idle slots |
65 | * in the database. A space occupied by a purged entry is usually immediately | |
66 | * reclaimed. A Squid crash or a transaction abort is rather unlikely to | |
67 | * leave a relatively large number of stale slots in the database. Thus, the | |
68 | * number of potentially corrupted entries is relatively small. On the other | |
69 | * hand, the damage from serving a single hodgepodge entry may be significant |
70 | * to the user. In such an environment, invalidating the whole entry has | |
71 | * negligible performance impact but saves us from high-damage bugs. | |
72 | */ | |
73 | ||
9d4e9cfb AR |
74 | namespace Rock |
75 | { | |
50dc81ec | 76 | |
abf396ec AR |
77 | /// low-level anti-padding storage class for LoadingEntry and LoadingSlot flags |
78 | class LoadingFlags | |
9d4e9cfb | 79 | { |
50dc81ec | 80 | public: |
abf396ec | 81 | LoadingFlags(): state(0), anchored(0), mapped(0), finalized(0), freed(0) {} |
50dc81ec | 82 | |
abf396ec AR |
83 | /* for LoadingEntry */ |
84 | uint8_t state:3; ///< current entry state (one of the LoadingEntry::State values) | |
36c84e19 | 85 | uint8_t anchored:1; ///< whether we loaded the inode slot for this entry |
50dc81ec | 86 | |
abf396ec AR |
87 | /* for LoadingSlot */ |
88 | uint8_t mapped:1; ///< whether the slot was added to a mapped entry | |
89 | uint8_t finalized:1; ///< whether finalizeOrThrow() has scanned the slot | |
90 | uint8_t freed:1; ///< whether the slot was given to the map as free space | |
91 | }; | |
92 | ||
93 | /// smart StoreEntry-level info pointer (hides anti-padding LoadingParts arrays) | |
94 | class LoadingEntry | |
95 | { | |
96 | public: | |
97 | LoadingEntry(const sfileno fileNo, LoadingParts &source); | |
98 | ||
99 | uint64_t &size; ///< payload seen so far | |
100 | uint32_t &version; ///< DbCellHeader::version to distinguish same-URL chains | |
50dc81ec | 101 | |
abf396ec | 102 | /// possible store entry states during index rebuild |
50dc81ec | 103 | typedef enum { leEmpty = 0, leLoading, leLoaded, leCorrupted, leIgnored } State; |
abf396ec AR |
104 | |
105 | /* LoadingFlags::state */ | |
106 | State state() const { return static_cast<State>(flags.state); } | |
107 | void state(State aState) const { flags.state = aState; } | |
108 | ||
109 | /* LoadingFlags::anchored */ | |
110 | bool anchored() const { return flags.anchored; } | |
111 | void anchored(const bool beAnchored) { flags.anchored = beAnchored; } | |
112 | ||
113 | private: | |
114 | LoadingFlags &flags; ///< entry flags (see the above accessors) are ours | |
115 | }; | |
116 | ||
117 | /// smart db slot-level info pointer (hides anti-padding LoadingParts arrays) | |
118 | class LoadingSlot | |
119 | { | |
120 | public: | |
121 | LoadingSlot(const SlotId slotId, LoadingParts &source); | |
122 | ||
123 | /// another slot in some chain belonging to the same entry (unordered!) | |
124 | Ipc::StoreMapSliceId &more; | |
125 | ||
126 | /* LoadingFlags::mapped */ | |
127 | bool mapped() const { return flags.mapped; } | |
128 | void mapped(const bool beMapped) { flags.mapped = beMapped; } | |
129 | ||
130 | /* LoadingFlags::finalized */ | |
131 | bool finalized() const { return flags.finalized; } | |
132 | void finalized(const bool beFinalized) { flags.finalized = beFinalized; } | |
133 | ||
134 | /* LoadingFlags::freed */ | |
135 | bool freed() const { return flags.freed; } | |
136 | void freed(const bool beFreed) { flags.freed = beFreed; } | |
137 | ||
138 | bool used() const { return freed() || mapped() || more != -1; } | |
139 | ||
140 | private: | |
141 | LoadingFlags &flags; ///< slot flags (see the above accessors) are ours | |
142 | }; | |
143 | ||
144 | /// information about store entries being loaded from disk (and their slots) | |
145 | /// used for identifying partially stored/loaded entries | |
146 | class LoadingParts | |
147 | { | |
148 | public: | |
149 | LoadingParts(int dbSlotLimit, int dbEntryLimit); | |
150 | LoadingParts(LoadingParts&&) = delete; // paranoid (often too huge to copy) | |
151 | ||
152 | private: | |
153 | friend class LoadingEntry; | |
154 | friend class LoadingSlot; | |
155 | ||
156 | /* Anti-padding storage. With millions of entries, padding matters! */ | |
157 | ||
158 | /* indexed by sfileno */ | |
159 | std::vector<uint64_t> sizes; ///< LoadingEntry::size for all entries | |
160 | std::vector<uint32_t> versions; ///< LoadingEntry::version for all entries | |
161 | ||
162 | /* indexed by SlotId */ | |
163 | std::vector<Ipc::StoreMapSliceId> mores; ///< LoadingSlot::more for all slots | |
164 | ||
165 | /* entry flags are indexed by sfileno; slot flags -- by SlotId */ | |
166 | std::vector<LoadingFlags> flags; ///< all LoadingEntry and LoadingSlot flags | |
50dc81ec AR |
167 | }; |
168 | ||
169 | } /* namespace Rock */ | |
170 | ||
/* LoadingEntry */

/// binds this smart pointer to the fileNo-indexed cells of the shared arrays;
/// entry flags share the flags vector with slot flags (see LoadingParts)
Rock::LoadingEntry::LoadingEntry(const sfileno fileNo, LoadingParts &source):
    size(source.sizes.at(fileNo)),
    version(source.versions.at(fileNo)),
    flags(source.flags.at(fileNo))
{
}

/* LoadingSlot */

/// binds this smart pointer to the slotId-indexed cells of the shared arrays
Rock::LoadingSlot::LoadingSlot(const SlotId slotId, LoadingParts &source):
    more(source.mores.at(slotId)),
    flags(source.flags.at(slotId))
{
}

/* LoadingParts */

/// pre-allocates all per-entry and per-slot arrays;
/// mores start at -1 (no chain link), sizes/versions at zero
Rock::LoadingParts::LoadingParts(const int dbEntryLimit, const int dbSlotLimit):
    sizes(dbEntryLimit, 0),
    versions(dbEntryLimit, 0),
    mores(dbSlotLimit, -1),
    flags(dbSlotLimit)
{
    assert(sizes.size() == versions.size()); // every entry has both fields
    assert(sizes.size() <= mores.size()); // every entry needs slot(s)
    assert(mores.size() == flags.size()); // every slot needs a set of flags
}

/* Rebuild */

/// caches the dir geometry (slot size, entry/slot limits) needed for scanning;
/// actual work starts in start()
Rock::Rebuild::Rebuild(SwapDir *dir): AsyncJob("Rock::Rebuild"),
    sd(dir),
    parts(nullptr),
    dbSize(0),
    dbSlotSize(0),
    dbSlotLimit(0),
    dbEntryLimit(0),
    fd(-1),
    dbOffset(0),
    loadingPos(0),
    validationPos(0)
{
    assert(sd);
    memset(&counts, 0, sizeof(counts));
    dbSize = sd->diskOffsetLimit(); // we do not care about the trailer waste
    dbSlotSize = sd->slotSize;
    dbEntryLimit = sd->entryLimitActual();
    dbSlotLimit = sd->slotLimitActual();
    assert(dbEntryLimit <= dbSlotLimit);
}

/// releases the db file descriptor (if open) and the loading bookkeeping
Rock::Rebuild::~Rebuild()
{
    if (fd >= 0)
        file_close(fd);
    delete parts;
}

/// prepares and initiates entry loading sequence:
/// opens the db file, skips its header, allocates loading state, and
/// schedules the first loading step via checkpoint()
void
Rock::Rebuild::start()
{
    // in SMP mode, only the disker is responsible for populating the map
    if (UsingSmp() && !IamDiskProcess()) {
        debugs(47, 2, "Non-disker skips rebuilding of cache_dir #" <<
               sd->index << " from " << sd->filePath);
        mustStop("non-disker");
        return;
    }

    debugs(47, DBG_IMPORTANT, "Loading cache_dir #" << sd->index <<
           " from " << sd->filePath);

    fd = file_open(sd->filePath, O_RDONLY | O_BINARY);
    if (fd < 0)
        failure("cannot open db", errno);

    // read and discard the db header; we only need to get past it
    char hdrBuf[SwapDir::HeaderSize];
    if (read(fd, hdrBuf, sizeof(hdrBuf)) != SwapDir::HeaderSize)
        failure("cannot read db header", errno);

    // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours
    assert(sizeof(DbCellHeader) < SM_PAGE_SIZE);
    buf.init(SM_PAGE_SIZE, SM_PAGE_SIZE);

    dbOffset = SwapDir::HeaderSize;

    parts = new LoadingParts(dbEntryLimit, dbSlotLimit);

    checkpoint();
}

078274f6 | 265 | /// continues after a pause if not done |
e2851fe7 AR |
266 | void |
267 | Rock::Rebuild::checkpoint() | |
268 | { | |
50dc81ec | 269 | if (!done()) |
e2851fe7 | 270 | eventAdd("Rock::Rebuild", Rock::Rebuild::Steps, this, 0.01, 1, true); |
078274f6 AR |
271 | } |
272 | ||
abf396ec AR |
273 | bool |
274 | Rock::Rebuild::doneLoading() const | |
275 | { | |
276 | return loadingPos >= dbSlotLimit; | |
277 | } | |
278 | ||
279 | bool | |
280 | Rock::Rebuild::doneValidating() const | |
281 | { | |
282 | // paranoid slot checking is only enabled with squid -S | |
283 | return validationPos >= dbEntryLimit + | |
284 | (opt_store_doublecheck ? dbSlotLimit : 0); | |
285 | } | |
286 | ||
078274f6 AR |
287 | bool |
288 | Rock::Rebuild::doneAll() const | |
289 | { | |
abf396ec | 290 | return doneLoading() && doneValidating() && AsyncJob::doneAll(); |
e2851fe7 AR |
291 | } |
292 | ||
/// eventAdd() callback scheduled by checkpoint(); trampolines into steps()
void
Rock::Rebuild::Steps(void *data)
{
    // use async call to enable job call protection that time events lack
    CallJobHere(47, 5, static_cast<Rebuild*>(data), Rock::Rebuild, steps);
}

93910d5c | 300 | void |
50dc81ec | 301 | Rock::Rebuild::steps() |
93910d5c | 302 | { |
abf396ec | 303 | if (!doneLoading()) |
50dc81ec AR |
304 | loadingSteps(); |
305 | else | |
306 | validationSteps(); | |
307 | ||
308 | checkpoint(); | |
93910d5c AR |
309 | } |
310 | ||
/// loads db slots in a time-bounded batch, advancing loadingPos/dbOffset
void
Rock::Rebuild::loadingSteps()
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    // Balance our desire to maximize the number of entries processed at once
    // (and, hence, minimize overheads and total rebuild time) with a
    // requirement to also process Coordinator events, disk I/Os, etc.
    const int maxSpentMsec = 50; // keep small: most RAM I/Os are under 1ms
    const timeval loopStart = current_time;

    int loaded = 0;
    while (!doneLoading()) {
        loadOneSlot();
        dbOffset += dbSlotSize;
        ++loadingPos;
        ++loaded;

        if (counts.scancount % 1000 == 0)
            storeRebuildProgress(sd->index, dbSlotLimit, counts.scancount);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        getCurrentTime();
        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            // elapsedMsec < 0 guards against clock jumps
            debugs(47, 5, HERE << "pausing after " << loaded << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/loaded) << "ms per entry");
            break;
        }
    }
}

/// checked access to per-entry loading state; throws on bad fileNo
Rock::LoadingEntry
Rock::Rebuild::loadingEntry(const sfileno fileNo)
{
    Must(0 <= fileNo && fileNo < dbEntryLimit);
    return LoadingEntry(fileNo, *parts);
}

/// checked access to per-slot loading state; throws on bad or not-yet-loaded slotId
Rock::LoadingSlot
Rock::Rebuild::loadingSlot(const SlotId slotId)
{
    Must(0 <= slotId && slotId < dbSlotLimit);
    Must(slotId <= loadingPos); // cannot look ahead
    return LoadingSlot(slotId, *parts);
}

/// reads and sanity-checks the db slot at loadingPos/dbOffset;
/// bad or empty slots are returned to the free-slot index
void
Rock::Rebuild::loadOneSlot()
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    ++counts.scancount;

    if (lseek(fd, dbOffset, SEEK_SET) < 0)
        failure("cannot seek to db entry", errno);

    buf.reset();

    if (!storeRebuildLoadEntry(fd, sd->index, buf, counts))
        return; // could not read the slot; nothing to free or use

    const SlotId slotId = loadingPos;

    // get our header
    DbCellHeader header;
    if (buf.contentSize() < static_cast<mb_size_t>(sizeof(header))) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring truncated " << buf.contentSize() << "-byte " <<
               "cache entry meta data at " << dbOffset);
        freeUnusedSlot(slotId, true);
        return;
    }
    memcpy(&header, buf.content(), sizeof(header));
    if (header.empty()) {
        // never-used slot: recycle it without counting it as invalid
        freeUnusedSlot(slotId, false);
        return;
    }
    if (!header.sane(dbSlotSize, dbSlotLimit)) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring malformed cache entry meta data at " << dbOffset);
        freeUnusedSlot(slotId, true);
        return;
    }
    buf.consume(sizeof(header)); // optimize to avoid memmove()

    useNewSlot(slotId, header);
}

/// parse StoreEntry basics and add them to the map, returning true on success
/// \param anchor  the writeable map anchor for this entry
/// \param header  the inode slot header that triggered the import
bool
Rock::Rebuild::importEntry(Ipc::StoreMapAnchor &anchor, const sfileno fileno, const DbCellHeader &header)
{
    cache_key key[SQUID_MD5_DIGEST_LENGTH];
    StoreEntry loadedE;
    // prefer the inode-recorded size; fall back on any previously seen size
    const uint64_t knownSize = header.entrySize > 0 ?
                               header.entrySize : anchor.basics.swap_file_sz.load();
    if (!storeRebuildParseEntry(buf, loadedE, key, counts, knownSize))
        return false;

    // the entry size may be unknown, but if it is known, it is authoritative

    debugs(47, 8, "importing basics for entry " << fileno <<
           " inode.entrySize: " << header.entrySize <<
           " swap_file_sz: " << loadedE.swap_file_sz);
    anchor.set(loadedE);

    // we have not validated whether all db cells for this entry were loaded
    EBIT_CLR(anchor.basics.flags, ENTRY_VALIDATED);

    // loadedE->dump(5);

    return true;
}

/// validates loaded entries (and, with squid -S, slots) in a time-bounded batch
void
Rock::Rebuild::validationSteps()
{
    debugs(47, 5, sd->index << " validating from " << validationPos);

    // see loadingSteps() for the rationale; TODO: avoid duplication
    const int maxSpentMsec = 50; // keep small: validation does not do I/O
    const timeval loopStart = current_time;

    int validated = 0;
    while (!doneValidating()) {
        // entry positions come first; slot positions follow (squid -S only)
        if (validationPos < dbEntryLimit)
            validateOneEntry(validationPos);
        else
            validateOneSlot(validationPos - dbEntryLimit);
        ++validationPos;
        ++validated;

        if (validationPos % 1000 == 0)
            debugs(20, 2, "validated: " << validationPos);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        getCurrentTime();
        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            debugs(47, 5, "pausing after " << validated << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/validated) << "ms per entry");
            break;
        }
    }
}

/// Either make the entry accessible to all or throw.
/// This method assumes it is called only when no more entry slots are expected.
void
Rock::Rebuild::finalizeOrThrow(const sfileno fileNo, LoadingEntry &le)
{
    // walk all map-linked slots, starting from inode, and mark each
    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileNo);
    Must(le.size > 0); // paranoid
    uint64_t mappedSize = 0;
    SlotId slotId = anchor.start;
    while (slotId >= 0 && mappedSize < le.size) {
        LoadingSlot slot = loadingSlot(slotId); // throws if we have not loaded that slot
        Must(!slot.finalized()); // no loops or stealing from other entries
        Must(slot.mapped()); // all our slots should be in the sd->map
        Must(!slot.freed()); // all our slots should still be present
        slot.finalized(true);

        Ipc::StoreMapSlice &mapSlice = sd->map->writeableSlice(fileNo, slotId);
        Must(mapSlice.size > 0); // paranoid
        mappedSize += mapSlice.size;
        slotId = mapSlice.next;
    }
    /* no hodgepodge entries: one entry - one full chain and no leftovers */
    Must(slotId < 0); // the chain ended exactly where the sizes say it should
    Must(mappedSize == le.size);

    if (!anchor.basics.swap_file_sz)
        anchor.basics.swap_file_sz = le.size;
    EBIT_SET(anchor.basics.flags, ENTRY_VALIDATED);
    le.state(LoadingEntry::leLoaded);
    sd->map->closeForWriting(fileNo, false);
    ++counts.objcount;
}

498 | /// Either make the entry accessible to all or free it. | |
499 | /// This method must only be called when no more entry slots are expected. | |
50dc81ec | 500 | void |
abf396ec | 501 | Rock::Rebuild::finalizeOrFree(const sfileno fileNo, LoadingEntry &le) |
50dc81ec | 502 | { |
abf396ec AR |
503 | try { |
504 | finalizeOrThrow(fileNo, le); | |
505 | } catch (const std::exception &ex) { | |
506 | freeBadEntry(fileNo, ex.what()); | |
507 | } | |
508 | } | |
50dc81ec | 509 | |
/// finalizes (or discards) an entry still in the leLoading state;
/// entries in any other state need no further work at this point
void
Rock::Rebuild::validateOneEntry(const sfileno fileNo)
{
    LoadingEntry entry = loadingEntry(fileNo);
    switch (entry.state()) {

    case LoadingEntry::leLoading:
        finalizeOrFree(fileNo, entry);
        break;

    case LoadingEntry::leEmpty: // no entry hashed to this position
    case LoadingEntry::leLoaded: // we have already unlocked this entry
    case LoadingEntry::leCorrupted: // we have already removed this entry
    case LoadingEntry::leIgnored: // we have already discarded this entry
        break;
    }
}

abf396ec AR |
528 | void |
529 | Rock::Rebuild::validateOneSlot(const SlotId slotId) | |
530 | { | |
531 | const LoadingSlot slot = loadingSlot(slotId); | |
532 | // there should not be any unprocessed slots left | |
533 | Must(slot.freed() || (slot.mapped() && slot.finalized())); | |
534 | } | |
535 | ||
/// Marks remaining bad entry slots as free and unlocks the entry. The map
/// cannot do this because Loading entries may have holes in the slots chain.
void
Rock::Rebuild::freeBadEntry(const sfileno fileno, const char *eDescription)
{
    debugs(47, 2, "cache_dir #" << sd->index << ' ' << eDescription <<
           " entry " << fileno << " is ignored during rebuild");

    LoadingEntry le = loadingEntry(fileno);
    le.state(LoadingEntry::leCorrupted);

    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);
    assert(anchor.start < 0 || le.size > 0); // a started chain implies data seen
    // free every slot already linked into the map chain
    for (SlotId slotId = anchor.start; slotId >= 0;) {
        const SlotId next = loadingSlot(slotId).more;
        freeSlot(slotId, true);
        slotId = next;
    }

    sd->map->forgetWritingEntry(fileno);
}

/// AsyncJob termination hook: reports completion and updates the global
/// rebuilding-dirs counter (fd and parts are released by the destructor)
void
Rock::Rebuild::swanSong()
{
    debugs(47,3, HERE << "cache_dir #" << sd->index << " rebuild level: " <<
           StoreController::store_dirs_rebuilding);
    --StoreController::store_dirs_rebuilding;
    storeRebuildComplete(&counts);
}

/// reports a fatal rebuild error and terminates Squid via fatalf()
/// \param errNo  errno value to report, or 0 when not applicable
void
Rock::Rebuild::failure(const char *msg, int errNo)
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    if (errNo)
        debugs(47, DBG_CRITICAL, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo));
    debugs(47, DBG_CRITICAL, "Do you need to run 'squid -z' to initialize storage?");

    assert(sd);
    fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.",
           sd->index, sd->filePath, msg);
}

93910d5c | 581 | |
/// adds slot to the free slot index
/// \param invalid  whether the slot held garbage (counted in counts.invalid)
void
Rock::Rebuild::freeSlot(const SlotId slotId, const bool invalid)
{
    debugs(47,5, sd->index << " frees slot " << slotId);
    LoadingSlot slot = loadingSlot(slotId);
    assert(!slot.freed()); // never free the same slot twice
    slot.freed(true);

    if (invalid) {
        ++counts.invalid;
        //sd->unlink(fileno); leave garbage on disk, it should not hurt
    }

    // page IDs are 1-based while dir indexes and slot IDs are 0-based
    Ipc::Mem::PageId pageId;
    pageId.pool = sd->index+1;
    pageId.number = slotId+1;
    sd->freeSlots->push(pageId);
}

abf396ec | 602 | /// freeSlot() for never-been-mapped slots |
50dc81ec | 603 | void |
abf396ec | 604 | Rock::Rebuild::freeUnusedSlot(const SlotId slotId, const bool invalid) |
50dc81ec | 605 | { |
abf396ec | 606 | LoadingSlot slot = loadingSlot(slotId); |
50dc81ec | 607 | // mapped slots must be freed via freeBadEntry() to keep the map in sync |
abf396ec AR |
608 | assert(!slot.mapped()); |
609 | freeSlot(slotId, invalid); | |
50dc81ec AR |
610 | } |
611 | ||
/// adds slot to the entry chain in the map
void
Rock::Rebuild::mapSlot(const SlotId slotId, const DbCellHeader &header)
{
    LoadingSlot slot = loadingSlot(slotId);
    assert(!slot.mapped()); // each slot is mapped at most once
    assert(!slot.freed()); // freed slots must never enter the map
    slot.mapped(true);

    // mirror the on-disk chain link and payload size into the shared map
    Ipc::StoreMapSlice slice;
    slice.next = header.nextSlot;
    slice.size = header.payloadSize;
    sd->map->importSlice(slotId, slice);
}

template <class SlotIdType> // accommodates atomic and simple SlotIds.
/// prepends slot `to` to the unordered chain currently headed by `from`
void
Rock::Rebuild::chainSlots(SlotIdType &from, const SlotId to)
{
    LoadingSlot slot = loadingSlot(to);
    assert(slot.more < 0); // the new slot must not be chained yet
    slot.more = from; // may still be unset
    from = to;
}

/// adds slot to an existing entry chain; caller must check that the slot
/// belongs to the chain it is being added to
void
Rock::Rebuild::addSlotToEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry le = loadingEntry(fileno);
    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    debugs(47,9, "adding " << slotId << " to entry " << fileno);
    // we do not need to preserve the order
    if (le.anchored()) {
        // the inode is already at anchor.start; chain after it
        LoadingSlot inode = loadingSlot(anchor.start);
        chainSlots(inode.more, slotId);
    } else {
        chainSlots(anchor.start, slotId);
    }

    le.size += header.payloadSize; // must precede freeBadEntry() calls

    if (header.firstSlot == slotId) { // this slot is the entry inode
        debugs(47,5, "added inode");

        if (le.anchored()) { // we have already added another inode slot
            freeBadEntry(fileno, "inode conflict");
            ++counts.clashcount;
            return;
        }

        le.anchored(true);

        if (!importEntry(anchor, fileno, header)) {
            freeBadEntry(fileno, "corrupted metainfo");
            return;
        }

        // set total entry size and/or check it for consistency
        if (const uint64_t totalSize = header.entrySize) {
            assert(totalSize != static_cast<uint64_t>(-1));
            if (!anchor.basics.swap_file_sz) {
                anchor.basics.swap_file_sz = totalSize;
                assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
            } else if (totalSize != anchor.basics.swap_file_sz) {
                freeBadEntry(fileno, "size mismatch");
                return;
            }
        }
    }

    const uint64_t totalSize = anchor.basics.swap_file_sz; // may be 0/unknown

    if (totalSize > 0 && le.size > totalSize) { // overflow
        debugs(47, 8, "overflow: " << le.size << " > " << totalSize);
        freeBadEntry(fileno, "overflowing");
        return;
    }

    mapSlot(slotId, header);
    if (totalSize > 0 && le.size == totalSize)
        finalizeOrFree(fileno, le); // entry is probably fully loaded now
}

/// initialize housekeeping information for a newly accepted entry
void
Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor &anchor, const sfileno fileno, const DbCellHeader &header)
{
    anchor.setKey(reinterpret_cast<const cache_key*>(header.key));
    assert(header.firstSlot >= 0); // a sane header names a real inode slot
    anchor.start = -1; // addSlotToEntry() will set it

    assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));

    LoadingEntry le = loadingEntry(fileno);
    le.state(LoadingEntry::leLoading);
    le.version = header.version;
    le.size = 0; // grows as slots are added
}

/// handle a slot from an entry that we have not seen before
void
Rock::Rebuild::startNewEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
{
    // A miss may have been stored at our fileno while we were loading other
    // slots from disk. We ought to preserve that entry because it is fresher.
    const bool overwriteExisting = false;
    if (Ipc::StoreMap::Anchor *anchor = sd->map->openForWritingAt(fileno, overwriteExisting)) {
        primeNewEntry(*anchor, fileno, header);
        addSlotToEntry(fileno, slotId, header); // may fail
        assert(anchor->basics.swap_file_sz != static_cast<uint64_t>(-1));
    } else {
        // A new from-network entry is occupying our map slot; let it be, but
        // save us from the trouble of going through the above motions again.
        LoadingEntry le = loadingEntry(fileno);
        le.state(LoadingEntry::leIgnored);
        freeUnusedSlot(slotId, false);
    }
}

734 | /// does the header belong to the fileno entry being loaded? | |
735 | bool | |
736 | Rock::Rebuild::sameEntry(const sfileno fileno, const DbCellHeader &header) const | |
737 | { | |
abf396ec AR |
738 | // Header updates always result in multi-start chains and often |
739 | // result in multi-version chains so we can only compare the keys. | |
50dc81ec | 740 | const Ipc::StoreMap::Anchor &anchor = sd->map->writeableEntry(fileno); |
abf396ec | 741 | return anchor.sameKey(reinterpret_cast<const cache_key*>(header.key)); |
50dc81ec AR |
742 | } |
743 | ||
/// handle freshly loaded (and validated) db slot header:
/// dispatches on the current state of the entry the slot hashes to
void
Rock::Rebuild::useNewSlot(const SlotId slotId, const DbCellHeader &header)
{
    const cache_key *const key =
        reinterpret_cast<const cache_key*>(header.key);
    const sfileno fileno = sd->map->fileNoByKey(key);
    assert(0 <= fileno && fileno < dbEntryLimit);

    LoadingEntry le = loadingEntry(fileno);
    debugs(47,9, "entry " << fileno << " state: " << le.state() << ", inode: " <<
           header.firstSlot << ", size: " << header.payloadSize);

    switch (le.state()) {

    case LoadingEntry::leEmpty: {
        startNewEntry(fileno, slotId, header);
        break;
    }

    case LoadingEntry::leLoading: {
        if (sameEntry(fileno, header)) {
            addSlotToEntry(fileno, slotId, header); // may fail
        } else {
            // either the loading chain or this slot is stale;
            // be conservative and ignore both (and any future ones)
            freeBadEntry(fileno, "duplicated");
            freeUnusedSlot(slotId, true);
            ++counts.dupcount;
        }
        break;
    }

    case LoadingEntry::leLoaded: {
        // either the previously loaded chain or this slot is stale;
        // be conservative and ignore both (and any future ones)
        le.state(LoadingEntry::leCorrupted);
        sd->map->freeEntry(fileno); // may not be immediately successful
        freeUnusedSlot(slotId, true);
        ++counts.dupcount;
        break;
    }

    case LoadingEntry::leCorrupted: {
        // previously seen slots messed things up so we must ignore this one
        freeUnusedSlot(slotId, true);
        break;
    }

    case LoadingEntry::leIgnored: {
        // already replaced by a fresher or colliding from-network entry
        freeUnusedSlot(slotId, false);
        break;
    }
    }
}
