/*
 * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

/* DEBUG: section 79 Disk IO Routines */
f7f3304a | 11 | #include "squid.h" |
438b04d4 | 12 | #include "disk.h" |
602d9612 | 13 | #include "fs/rock/RockDbCell.h" |
e2851fe7 AR |
14 | #include "fs/rock/RockRebuild.h" |
15 | #include "fs/rock/RockSwapDir.h" | |
67679543 | 16 | #include "globals.h" |
dcd84f80 | 17 | #include "ipc/StoreMap.h" |
582c2af2 | 18 | #include "md5.h" |
386d28bf | 19 | #include "SquidTime.h" |
687f5275 | 20 | #include "store_rebuild.h" |
602d9612 A |
21 | #include "tools.h" |
22 | #include "typedefs.h" | |
e2851fe7 | 23 | |
1a30fdf5 | 24 | #include <cerrno> |
21d845b1 | 25 | |
e2851fe7 AR |
26 | CBDATA_NAMESPACED_CLASS_INIT(Rock, Rebuild); |
27 | ||
/**
 \defgroup RockFsRebuild Rock Store Rebuild
 \ingroup Filesystems
 *
 \section Overview Overview
 * Several layers of information are manipulated during the rebuild:
 \par
 * Store Entry: Response message plus all the metainformation associated with
 * it. Identified by store key. At any given time, from Squid point
 * of view, there is only one entry with a given key, but several
 * different entries with the same key can be observed in any historical
 * archive (such as an access log or a store database).
 \par
 * Slot chain: A sequence of db slots representing a Store Entry state at
 * some point in time. Identified by key+version combination. Due to
 * transaction aborts, crashes, and idle periods, some chains may contain
 * incomplete or stale information. We assume that no two different chains
 * have the same key and version. If that assumption fails, we may serve a
 * hodgepodge entry during rebuild, until "extra" slots are loaded/noticed.
 \par
 * Db slot: A db record containing a piece of a single store entry and linked
 * to other slots with the same key and version fields, forming a chain.
 * Slots are identified by their absolute position in the database file,
 * which is naturally unique.
 \par
 * Except for the "mapped", "freed", and "more" fields, LoadingEntry info is
 * entry-level and is stored at fileno position. In other words, the array of
 * LoadingEntries should be interpreted as two arrays, one that maps slot ID
 * to the LoadingEntry::mapped/free/more members, and the second one that maps
 * fileno to all other LoadingEntry members. StoreMap maps slot key to fileno.
 \par
 * When information from the newly loaded db slot contradicts the entry-level
 * information collected so far (e.g., the versions do not match or the total
 * chain size after the slot contribution exceeds the expected number), the
 * whole entry (and not just the chain or the slot!) is declared corrupted.
 \par
 * Why invalidate the whole entry? Rock Store is written for high-load
 * environments with large caches, where there are usually very few idle slots
 * in the database. A space occupied by a purged entry is usually immediately
 * reclaimed. A Squid crash or a transaction abort is rather unlikely to
 * leave a relatively large number of stale slots in the database. Thus, the
 * number of potentially corrupted entries is relatively small. On the other
 * hand, the damage from serving a single hodgepodge entry may be significant
 * to the user. In such an environment, invalidating the whole entry has
 * negligible performance impact but saves us from high-damage bugs.
 */
9d4e9cfb AR |
75 | namespace Rock |
76 | { | |
50dc81ec AR |
77 | |
78 | /// maintains information about the store entry being loaded from disk | |
79 | /// used for identifying partially stored/loaded entries | |
9d4e9cfb AR |
80 | class LoadingEntry |
81 | { | |
50dc81ec AR |
82 | public: |
83 | LoadingEntry(): size(0), version(0), state(leEmpty), anchored(0), | |
f53969cc | 84 | mapped(0), freed(0), more(-1) {} |
50dc81ec AR |
85 | |
86 | /* store entry-level information indexed by sfileno */ | |
87 | uint64_t size; ///< payload seen so far | |
88 | uint32_t version; ///< DbCellHeader::version to distinguish same-URL chains | |
36c84e19 AR |
89 | uint8_t state:3; ///< current entry state (one of the State values) |
90 | uint8_t anchored:1; ///< whether we loaded the inode slot for this entry | |
50dc81ec AR |
91 | |
92 | /* db slot-level information indexed by slotId, starting with firstSlot */ | |
36c84e19 AR |
93 | uint8_t mapped:1; ///< whether this slot was added to a mapped entry |
94 | uint8_t freed:1; ///< whether this slot was marked as free | |
95 | Ipc::StoreMapSliceId more; ///< another slot in some entry chain (unordered) | |
50dc81ec AR |
96 | bool used() const { return freed || mapped || more != -1; } |
97 | ||
98 | /// possible entry states | |
99 | typedef enum { leEmpty = 0, leLoading, leLoaded, leCorrupted, leIgnored } State; | |
100 | }; | |
101 | ||
102 | } /* namespace Rock */ | |
103 | ||
078274f6 | 104 | Rock::Rebuild::Rebuild(SwapDir *dir): AsyncJob("Rock::Rebuild"), |
f53969cc SM |
105 | sd(dir), |
106 | entries(NULL), | |
107 | dbSize(0), | |
108 | dbSlotSize(0), | |
109 | dbSlotLimit(0), | |
110 | dbEntryLimit(0), | |
111 | fd(-1), | |
112 | dbOffset(0), | |
113 | loadingPos(0), | |
114 | validationPos(0) | |
e2851fe7 AR |
115 | { |
116 | assert(sd); | |
117 | memset(&counts, 0, sizeof(counts)); | |
118 | dbSize = sd->diskOffsetLimit(); // we do not care about the trailer waste | |
36c84e19 AR |
119 | dbSlotSize = sd->slotSize; |
120 | dbEntryLimit = sd->entryLimitActual(); | |
121 | dbSlotLimit = sd->slotLimitActual(); | |
122 | assert(dbEntryLimit <= dbSlotLimit); | |
e2851fe7 AR |
123 | } |
124 | ||
125 | Rock::Rebuild::~Rebuild() | |
126 | { | |
127 | if (fd >= 0) | |
128 | file_close(fd); | |
50dc81ec | 129 | delete[] entries; |
e2851fe7 AR |
130 | } |
131 | ||
132 | /// prepares and initiates entry loading sequence | |
133 | void | |
9199139f AR |
134 | Rock::Rebuild::start() |
135 | { | |
078274f6 AR |
136 | // in SMP mode, only the disker is responsible for populating the map |
137 | if (UsingSmp() && !IamDiskProcess()) { | |
138 | debugs(47, 2, "Non-disker skips rebuilding of cache_dir #" << | |
9199139f | 139 | sd->index << " from " << sd->filePath); |
078274f6 AR |
140 | mustStop("non-disker"); |
141 | return; | |
142 | } | |
143 | ||
095ec2b1 AR |
144 | debugs(47, DBG_IMPORTANT, "Loading cache_dir #" << sd->index << |
145 | " from " << sd->filePath); | |
e2851fe7 AR |
146 | |
147 | fd = file_open(sd->filePath, O_RDONLY | O_BINARY); | |
148 | if (fd < 0) | |
149 | failure("cannot open db", errno); | |
150 | ||
50dc81ec AR |
151 | char hdrBuf[SwapDir::HeaderSize]; |
152 | if (read(fd, hdrBuf, sizeof(hdrBuf)) != SwapDir::HeaderSize) | |
e2851fe7 AR |
153 | failure("cannot read db header", errno); |
154 | ||
50dc81ec AR |
155 | // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours |
156 | assert(sizeof(DbCellHeader) < SM_PAGE_SIZE); | |
157 | buf.init(SM_PAGE_SIZE, SM_PAGE_SIZE); | |
158 | ||
e2851fe7 | 159 | dbOffset = SwapDir::HeaderSize; |
50dc81ec | 160 | |
36c84e19 | 161 | entries = new LoadingEntry[dbSlotLimit]; |
e2851fe7 AR |
162 | |
163 | checkpoint(); | |
164 | } | |
165 | ||
078274f6 | 166 | /// continues after a pause if not done |
e2851fe7 AR |
167 | void |
168 | Rock::Rebuild::checkpoint() | |
169 | { | |
50dc81ec | 170 | if (!done()) |
e2851fe7 | 171 | eventAdd("Rock::Rebuild", Rock::Rebuild::Steps, this, 0.01, 1, true); |
078274f6 AR |
172 | } |
173 | ||
174 | bool | |
175 | Rock::Rebuild::doneAll() const | |
176 | { | |
36c84e19 | 177 | return loadingPos >= dbSlotLimit && validationPos >= dbSlotLimit && |
9d4e9cfb | 178 | AsyncJob::doneAll(); |
e2851fe7 AR |
179 | } |
180 | ||
181 | void | |
182 | Rock::Rebuild::Steps(void *data) | |
183 | { | |
078274f6 AR |
184 | // use async call to enable job call protection that time events lack |
185 | CallJobHere(47, 5, static_cast<Rebuild*>(data), Rock::Rebuild, steps); | |
e2851fe7 AR |
186 | } |
187 | ||
93910d5c | 188 | void |
50dc81ec | 189 | Rock::Rebuild::steps() |
93910d5c | 190 | { |
36c84e19 | 191 | if (loadingPos < dbSlotLimit) |
50dc81ec AR |
192 | loadingSteps(); |
193 | else | |
194 | validationSteps(); | |
195 | ||
196 | checkpoint(); | |
93910d5c AR |
197 | } |
198 | ||
e2851fe7 | 199 | void |
50dc81ec | 200 | Rock::Rebuild::loadingSteps() |
9199139f | 201 | { |
539283df | 202 | debugs(47,5, sd->index << " slot " << loadingPos << " at " << |
9199139f | 203 | dbOffset << " <= " << dbSize); |
e2851fe7 | 204 | |
386d28bf | 205 | // Balance our desire to maximize the number of entries processed at once |
9199139f | 206 | // (and, hence, minimize overheads and total rebuild time) with a |
386d28bf AR |
207 | // requirement to also process Coordinator events, disk I/Os, etc. |
208 | const int maxSpentMsec = 50; // keep small: most RAM I/Os are under 1ms | |
209 | const timeval loopStart = current_time; | |
210 | ||
211 | int loaded = 0; | |
36c84e19 | 212 | while (loadingPos < dbSlotLimit) { |
50dc81ec | 213 | loadOneSlot(); |
36c84e19 | 214 | dbOffset += dbSlotSize; |
6d68a230 | 215 | ++loadingPos; |
386d28bf | 216 | ++loaded; |
e2851fe7 AR |
217 | |
218 | if (counts.scancount % 1000 == 0) | |
36c84e19 | 219 | storeRebuildProgress(sd->index, dbSlotLimit, counts.scancount); |
386d28bf AR |
220 | |
221 | if (opt_foreground_rebuild) | |
222 | continue; // skip "few entries at a time" check below | |
223 | ||
224 | getCurrentTime(); | |
225 | const double elapsedMsec = tvSubMsec(loopStart, current_time); | |
226 | if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) { | |
227 | debugs(47, 5, HERE << "pausing after " << loaded << " entries in " << | |
9199139f | 228 | elapsedMsec << "ms; " << (elapsedMsec/loaded) << "ms per entry"); |
386d28bf AR |
229 | break; |
230 | } | |
231 | } | |
e2851fe7 AR |
232 | } |
233 | ||
93910d5c | 234 | void |
50dc81ec | 235 | Rock::Rebuild::loadOneSlot() |
93910d5c | 236 | { |
539283df | 237 | debugs(47,5, sd->index << " slot " << loadingPos << " at " << |
9199139f | 238 | dbOffset << " <= " << dbSize); |
e2851fe7 | 239 | |
c728b6f9 AR |
240 | ++counts.scancount; |
241 | ||
e2851fe7 AR |
242 | if (lseek(fd, dbOffset, SEEK_SET) < 0) |
243 | failure("cannot seek to db entry", errno); | |
244 | ||
50dc81ec | 245 | buf.reset(); |
c728b6f9 AR |
246 | |
247 | if (!storeRebuildLoadEntry(fd, sd->index, buf, counts)) | |
248 | return; | |
249 | ||
6d68a230 | 250 | const SlotId slotId = loadingPos; |
93910d5c | 251 | |
50dc81ec AR |
252 | // get our header |
253 | DbCellHeader header; | |
c728b6f9 | 254 | if (buf.contentSize() < static_cast<mb_size_t>(sizeof(header))) { |
51618c6a | 255 | debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " << |
ce44c1ea AR |
256 | "Ignoring truncated " << buf.contentSize() << "-byte " << |
257 | "cache entry meta data at " << dbOffset); | |
50dc81ec | 258 | freeSlotIfIdle(slotId, true); |
c728b6f9 AR |
259 | return; |
260 | } | |
50dc81ec AR |
261 | memcpy(&header, buf.content(), sizeof(header)); |
262 | if (header.empty()) { | |
263 | freeSlotIfIdle(slotId, false); | |
264 | return; | |
265 | } | |
36c84e19 | 266 | if (!header.sane(dbSlotSize, dbSlotLimit)) { |
51618c6a | 267 | debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " << |
9199139f | 268 | "Ignoring malformed cache entry meta data at " << dbOffset); |
50dc81ec | 269 | freeSlotIfIdle(slotId, true); |
e2851fe7 | 270 | return; |
9199139f | 271 | } |
50dc81ec AR |
272 | buf.consume(sizeof(header)); // optimize to avoid memmove() |
273 | ||
274 | useNewSlot(slotId, header); | |
275 | } | |
276 | ||
277 | /// parse StoreEntry basics and add them to the map, returning true on success | |
278 | bool | |
279 | Rock::Rebuild::importEntry(Ipc::StoreMapAnchor &anchor, const sfileno fileno, const DbCellHeader &header) | |
280 | { | |
281 | cache_key key[SQUID_MD5_DIGEST_LENGTH]; | |
282 | StoreEntry loadedE; | |
50dc81ec | 283 | const uint64_t knownSize = header.entrySize > 0 ? |
9d4e9cfb | 284 | header.entrySize : anchor.basics.swap_file_sz.get(); |
ce44c1ea AR |
285 | if (!storeRebuildParseEntry(buf, loadedE, key, counts, knownSize)) |
286 | return false; | |
287 | ||
50dc81ec AR |
288 | // the entry size may still be unknown at this time |
289 | ||
ce44c1ea AR |
290 | debugs(47, 8, "importing basics for entry " << fileno << |
291 | " swap_file_sz: " << loadedE.swap_file_sz); | |
50dc81ec AR |
292 | anchor.set(loadedE); |
293 | ||
294 | // we have not validated whether all db cells for this entry were loaded | |
295 | EBIT_CLR(anchor.basics.flags, ENTRY_VALIDATED); | |
296 | ||
297 | // loadedE->dump(5); | |
298 | ||
299 | return true; | |
93910d5c | 300 | } |
e2851fe7 | 301 | |
93910d5c | 302 | void |
50dc81ec | 303 | Rock::Rebuild::validationSteps() |
93910d5c | 304 | { |
50dc81ec | 305 | debugs(47, 5, sd->index << " validating from " << validationPos); |
93910d5c | 306 | |
50dc81ec AR |
307 | // see loadingSteps() for the rationale; TODO: avoid duplication |
308 | const int maxSpentMsec = 50; // keep small: validation does not do I/O | |
309 | const timeval loopStart = current_time; | |
e2851fe7 | 310 | |
50dc81ec | 311 | int validated = 0; |
36c84e19 | 312 | while (validationPos < dbSlotLimit) { |
50dc81ec AR |
313 | validateOneEntry(); |
314 | ++validationPos; | |
315 | ++validated; | |
93910d5c | 316 | |
50dc81ec AR |
317 | if (validationPos % 1000 == 0) |
318 | debugs(20, 2, "validated: " << validationPos); | |
e2851fe7 | 319 | |
50dc81ec AR |
320 | if (opt_foreground_rebuild) |
321 | continue; // skip "few entries at a time" check below | |
322 | ||
323 | getCurrentTime(); | |
324 | const double elapsedMsec = tvSubMsec(loopStart, current_time); | |
325 | if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) { | |
326 | debugs(47, 5, "pausing after " << validated << " entries in " << | |
327 | elapsedMsec << "ms; " << (elapsedMsec/validated) << "ms per entry"); | |
328 | break; | |
329 | } | |
330 | } | |
331 | } | |
332 | ||
333 | void | |
334 | Rock::Rebuild::validateOneEntry() | |
335 | { | |
336 | LoadingEntry &e = entries[validationPos]; | |
337 | switch (e.state) { | |
338 | ||
339 | case LoadingEntry::leEmpty: | |
340 | break; // no entry hashed to this position | |
341 | ||
342 | case LoadingEntry::leLoading: | |
343 | freeBadEntry(validationPos, "partially stored"); | |
344 | break; | |
345 | ||
346 | case LoadingEntry::leLoaded: | |
347 | break; // we have already unlocked this entry | |
348 | ||
349 | case LoadingEntry::leCorrupted: | |
350 | break; // we have already removed this entry | |
351 | } | |
352 | } | |
353 | ||
354 | /// Marks remaining bad entry slots as free and unlocks the entry. The map | |
355 | /// cannot do this because Loading entries may have holes in the slots chain. | |
356 | void | |
357 | Rock::Rebuild::freeBadEntry(const sfileno fileno, const char *eDescription) | |
358 | { | |
359 | debugs(47, 2, "cache_dir #" << sd->index << ' ' << eDescription << | |
360 | " entry " << fileno << " is ignored during rebuild"); | |
361 | ||
362 | Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno); | |
363 | ||
364 | bool freedSome = false; | |
365 | // free all loaded non-anchor slots | |
366 | SlotId slotId = entries[anchor.start].more; | |
367 | while (slotId >= 0) { | |
368 | const SlotId next = entries[slotId].more; | |
369 | freeSlot(slotId, false); | |
370 | slotId = next; | |
371 | freedSome = true; | |
372 | } | |
373 | // free anchor slot if it was loaded | |
374 | if (entries[fileno].anchored) { | |
375 | freeSlot(anchor.start, false); | |
376 | freedSome = true; | |
377 | } | |
378 | assert(freedSome); | |
379 | ||
380 | sd->map->forgetWritingEntry(fileno); | |
381 | ++counts.invalid; | |
e2851fe7 AR |
382 | } |
383 | ||
384 | void | |
9199139f AR |
385 | Rock::Rebuild::swanSong() |
386 | { | |
078274f6 | 387 | debugs(47,3, HERE << "cache_dir #" << sd->index << " rebuild level: " << |
9199139f | 388 | StoreController::store_dirs_rebuilding); |
078274f6 | 389 | --StoreController::store_dirs_rebuilding; |
e2851fe7 | 390 | storeRebuildComplete(&counts); |
e2851fe7 AR |
391 | } |
392 | ||
393 | void | |
9199139f AR |
394 | Rock::Rebuild::failure(const char *msg, int errNo) |
395 | { | |
539283df | 396 | debugs(47,5, sd->index << " slot " << loadingPos << " at " << |
9199139f | 397 | dbOffset << " <= " << dbSize); |
e2851fe7 AR |
398 | |
399 | if (errNo) | |
f5adb654 AR |
400 | debugs(47, DBG_CRITICAL, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo)); |
401 | debugs(47, DBG_CRITICAL, "Do you need to run 'squid -z' to initialize storage?"); | |
e2851fe7 AR |
402 | |
403 | assert(sd); | |
404 | fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.", | |
9199139f | 405 | sd->index, sd->filePath, msg); |
e2851fe7 | 406 | } |
93910d5c | 407 | |
50dc81ec AR |
408 | /// adds slot to the free slot index |
409 | void | |
410 | Rock::Rebuild::freeSlot(const SlotId slotId, const bool invalid) | |
93910d5c | 411 | { |
50dc81ec AR |
412 | debugs(47,5, sd->index << " frees slot " << slotId); |
413 | LoadingEntry &le = entries[slotId]; | |
414 | assert(!le.freed); | |
415 | le.freed = 1; | |
416 | ||
417 | if (invalid) { | |
418 | ++counts.invalid; | |
419 | //sd->unlink(fileno); leave garbage on disk, it should not hurt | |
420 | } | |
421 | ||
422 | Ipc::Mem::PageId pageId; | |
423 | pageId.pool = sd->index+1; | |
424 | pageId.number = slotId+1; | |
425 | sd->freeSlots->push(pageId); | |
426 | } | |
427 | ||
428 | /// adds slot to the free slot index but only if the slot is unused | |
429 | void | |
430 | Rock::Rebuild::freeSlotIfIdle(const SlotId slotId, const bool invalid) | |
431 | { | |
432 | const LoadingEntry &le = entries[slotId]; | |
433 | ||
434 | // mapped slots must be freed via freeBadEntry() to keep the map in sync | |
435 | assert(!le.mapped); | |
436 | ||
437 | if (!le.used()) | |
438 | freeSlot(slotId, invalid); | |
439 | } | |
440 | ||
441 | /// adds slot to the entry chain in the map | |
442 | void | |
443 | Rock::Rebuild::mapSlot(const SlotId slotId, const DbCellHeader &header) | |
444 | { | |
445 | LoadingEntry &le = entries[slotId]; | |
446 | assert(!le.mapped); | |
447 | assert(!le.freed); | |
448 | le.mapped = 1; | |
449 | ||
450 | Ipc::StoreMapSlice slice; | |
451 | slice.next = header.nextSlot; | |
452 | slice.size = header.payloadSize; | |
453 | sd->map->importSlice(slotId, slice); | |
454 | } | |
455 | ||
456 | /// adds slot to an existing entry chain; caller must check that the slot | |
457 | /// belongs to the chain it is being added to | |
458 | void | |
459 | Rock::Rebuild::addSlotToEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) | |
460 | { | |
461 | LoadingEntry &le = entries[fileno]; | |
462 | Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno); | |
463 | ||
464 | assert(le.version == header.version); | |
465 | ||
466 | // mark anchor as loaded or add the secondary slot to the chain | |
467 | LoadingEntry &inode = entries[header.firstSlot]; | |
468 | if (header.firstSlot == slotId) { | |
469 | debugs(47,5, "adding inode"); | |
470 | assert(!inode.freed); | |
471 | le.anchored = 1; | |
472 | } else { | |
473 | debugs(47,9, "linking " << slotId << " to " << inode.more); | |
474 | // we do not need to preserve the order | |
475 | LoadingEntry &slice = entries[slotId]; | |
476 | assert(!slice.freed); | |
477 | assert(slice.more < 0); | |
478 | slice.more = inode.more; | |
479 | inode.more = slotId; | |
480 | } | |
481 | ||
482 | if (header.firstSlot == slotId && !importEntry(anchor, fileno, header)) { | |
483 | le.state = LoadingEntry::leCorrupted; | |
484 | freeBadEntry(fileno, "corrupted metainfo"); | |
485 | return; | |
486 | } | |
487 | ||
488 | // set total entry size and/or check it for consistency | |
ce44c1ea | 489 | debugs(47, 8, "header.entrySize: " << header.entrySize << " swap_file_sz: " << anchor.basics.swap_file_sz); |
50dc81ec AR |
490 | uint64_t totalSize = header.entrySize; |
491 | assert(totalSize != static_cast<uint64_t>(-1)); | |
492 | if (!totalSize && anchor.basics.swap_file_sz) { | |
493 | assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1)); | |
494 | // perhaps we loaded a later slot (with entrySize) earlier | |
495 | totalSize = anchor.basics.swap_file_sz; | |
9d4e9cfb | 496 | } else if (totalSize && !anchor.basics.swap_file_sz) { |
50dc81ec AR |
497 | anchor.basics.swap_file_sz = totalSize; |
498 | assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1)); | |
9d4e9cfb | 499 | } else if (totalSize != anchor.basics.swap_file_sz) { |
50dc81ec AR |
500 | le.state = LoadingEntry::leCorrupted; |
501 | freeBadEntry(fileno, "size mismatch"); | |
502 | return; | |
503 | } | |
504 | ||
505 | le.size += header.payloadSize; | |
506 | ||
507 | if (totalSize > 0 && le.size > totalSize) { // overflow | |
ce44c1ea | 508 | debugs(47, 8, "overflow: " << le.size << " > " << totalSize); |
50dc81ec AR |
509 | le.state = LoadingEntry::leCorrupted; |
510 | freeBadEntry(fileno, "overflowing"); | |
511 | return; | |
512 | } | |
513 | ||
514 | mapSlot(slotId, header); | |
515 | if (totalSize > 0 && le.size == totalSize) { | |
516 | // entry fully loaded, unlock it | |
517 | // we have validated that all db cells for this entry were loaded | |
518 | EBIT_SET(anchor.basics.flags, ENTRY_VALIDATED); | |
519 | le.state = LoadingEntry::leLoaded; | |
520 | sd->map->closeForWriting(fileno, false); | |
521 | ++counts.objcount; | |
522 | } | |
523 | } | |
524 | ||
525 | /// initialize housekeeping information for a newly accepted entry | |
526 | void | |
527 | Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor &anchor, const sfileno fileno, const DbCellHeader &header) | |
528 | { | |
529 | anchor.setKey(reinterpret_cast<const cache_key*>(header.key)); | |
530 | assert(header.firstSlot >= 0); | |
531 | anchor.start = header.firstSlot; | |
532 | ||
533 | assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1)); | |
534 | ||
535 | LoadingEntry &le = entries[fileno]; | |
536 | le.state = LoadingEntry::leLoading; | |
537 | le.version = header.version; | |
538 | le.size = 0; | |
539 | } | |
540 | ||
541 | /// handle a slot from an entry that we have not seen before | |
542 | void | |
543 | Rock::Rebuild::startNewEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) | |
544 | { | |
545 | // If some other from-disk entry is/was using this slot as its inode OR | |
9d4e9cfb | 546 | // if some other from-disk entry is/was using our inode slot, then the |
50dc81ec AR |
547 | // entries are conflicting. We cannot identify other entries, so we just |
548 | // remove ours and hope that the others were/will be handled correctly. | |
549 | const LoadingEntry &slice = entries[slotId]; | |
550 | const LoadingEntry &inode = entries[header.firstSlot]; | |
551 | if (slice.used() || inode.used()) { | |
552 | debugs(47,8, "slice/inode used: " << slice.used() << inode.used()); | |
553 | LoadingEntry &le = entries[fileno]; | |
554 | le.state = LoadingEntry::leCorrupted; | |
555 | freeSlotIfIdle(slotId, slotId == header.firstSlot); | |
556 | // if not idle, the other entry will handle its slice | |
557 | ++counts.clashcount; | |
558 | return; | |
559 | } | |
560 | ||
561 | // A miss may have been stored at our fileno while we were loading other | |
562 | // slots from disk. We ought to preserve that entry because it is fresher. | |
563 | const bool overwriteExisting = false; | |
564 | if (Ipc::StoreMap::Anchor *anchor = sd->map->openForWritingAt(fileno, overwriteExisting)) { | |
565 | primeNewEntry(*anchor, fileno, header); | |
566 | addSlotToEntry(fileno, slotId, header); // may fail | |
567 | assert(anchor->basics.swap_file_sz != static_cast<uint64_t>(-1)); | |
568 | } else { | |
569 | // A new from-network entry is occupying our map slot; let it be, but | |
570 | // save us from the trouble of going through the above motions again. | |
571 | LoadingEntry &le = entries[fileno]; | |
572 | le.state = LoadingEntry::leIgnored; | |
573 | freeSlotIfIdle(slotId, false); | |
574 | } | |
575 | } | |
576 | ||
577 | /// does the header belong to the fileno entry being loaded? | |
578 | bool | |
579 | Rock::Rebuild::sameEntry(const sfileno fileno, const DbCellHeader &header) const | |
580 | { | |
581 | const Ipc::StoreMap::Anchor &anchor = sd->map->writeableEntry(fileno); | |
582 | const LoadingEntry &le = entries[fileno]; | |
583 | // any order will work, but do fast comparisons first: | |
584 | return le.version == header.version && | |
9d4e9cfb AR |
585 | anchor.start == static_cast<Ipc::StoreMapSliceId>(header.firstSlot) && |
586 | anchor.sameKey(reinterpret_cast<const cache_key*>(header.key)); | |
50dc81ec AR |
587 | } |
588 | ||
589 | /// is the new header consistent with information already loaded? | |
590 | bool | |
591 | Rock::Rebuild::canAdd(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) const | |
592 | { | |
593 | if (!sameEntry(fileno, header)) { | |
594 | debugs(79, 7, "cannot add; wrong entry"); | |
595 | return false; | |
596 | } | |
597 | ||
598 | const LoadingEntry &le = entries[slotId]; | |
599 | // We cannot add a slot that was already declared free or mapped. | |
600 | if (le.freed || le.mapped) { | |
601 | debugs(79, 7, "cannot add; freed/mapped: " << le.freed << le.mapped); | |
602 | return false; | |
603 | } | |
604 | ||
605 | if (slotId == header.firstSlot) { | |
606 | // If we are the inode, the anchored flag cannot be set yet. | |
607 | if (entries[fileno].anchored) { | |
608 | debugs(79, 7, "cannot add; extra anchor"); | |
609 | return false; | |
610 | } | |
611 | ||
612 | // And there should have been some other slot for this entry to exist. | |
613 | if (le.more < 0) { | |
614 | debugs(79, 7, "cannot add; missing slots"); | |
615 | return false; | |
616 | } | |
617 | ||
618 | return true; | |
619 | } | |
620 | ||
621 | // We are the continuation slice so the more field is reserved for us. | |
622 | if (le.more >= 0) { | |
623 | debugs(79, 7, "cannot add; foreign slot"); | |
624 | return false; | |
625 | } | |
626 | ||
627 | return true; | |
628 | } | |
629 | ||
630 | /// handle freshly loaded (and validated) db slot header | |
631 | void | |
632 | Rock::Rebuild::useNewSlot(const SlotId slotId, const DbCellHeader &header) | |
633 | { | |
634 | LoadingEntry &slice = entries[slotId]; | |
635 | assert(!slice.freed); // we cannot free what was not loaded | |
636 | ||
637 | const cache_key *const key = | |
638 | reinterpret_cast<const cache_key*>(header.key); | |
639 | const sfileno fileno = sd->map->anchorIndexByKey(key); | |
640 | assert(0 <= fileno && fileno < dbEntryLimit); | |
641 | ||
642 | LoadingEntry &le = entries[fileno]; | |
643 | debugs(47,9, "entry " << fileno << " state: " << le.state << ", inode: " << | |
9d4e9cfb | 644 | header.firstSlot << ", size: " << header.payloadSize); |
50dc81ec AR |
645 | |
646 | switch (le.state) { | |
647 | ||
648 | case LoadingEntry::leEmpty: { | |
649 | startNewEntry(fileno, slotId, header); | |
650 | break; | |
651 | } | |
652 | ||
653 | case LoadingEntry::leLoading: { | |
654 | if (canAdd(fileno, slotId, header)) { | |
655 | addSlotToEntry(fileno, slotId, header); | |
656 | } else { | |
657 | // either the loading chain or this slot is stale; | |
658 | // be conservative and ignore both (and any future ones) | |
659 | le.state = LoadingEntry::leCorrupted; | |
660 | freeBadEntry(fileno, "duplicated"); | |
661 | freeSlotIfIdle(slotId, slotId == header.firstSlot); | |
662 | ++counts.dupcount; | |
663 | } | |
664 | break; | |
665 | } | |
666 | ||
667 | case LoadingEntry::leLoaded: { | |
668 | // either the previously loaded chain or this slot is stale; | |
669 | // be conservative and ignore both (and any future ones) | |
670 | le.state = LoadingEntry::leCorrupted; | |
671 | sd->map->freeEntry(fileno); // may not be immediately successful | |
672 | freeSlotIfIdle(slotId, slotId == header.firstSlot); | |
673 | ++counts.dupcount; | |
674 | break; | |
675 | } | |
676 | ||
677 | case LoadingEntry::leCorrupted: { | |
678 | // previously seen slots messed things up so we must ignore this one | |
679 | freeSlotIfIdle(slotId, false); | |
680 | break; | |
681 | } | |
682 | ||
683 | case LoadingEntry::leIgnored: { | |
684 | // already replaced by a fresher or colliding from-network entry | |
685 | freeSlotIfIdle(slotId, false); | |
686 | break; | |
687 | } | |
688 | } | |
93910d5c | 689 | } |
f53969cc | 690 |