]>
Commit | Line | Data |
---|---|---|
e2851fe7 | 1 | /* |
e2851fe7 AR |
2 | * DEBUG: section 79 Disk IO Routines |
3 | */ | |
4 | ||
f7f3304a | 5 | #include "squid.h" |
438b04d4 | 6 | #include "disk.h" |
e2851fe7 AR |
7 | #include "fs/rock/RockRebuild.h" |
8 | #include "fs/rock/RockSwapDir.h" | |
fcd789da | 9 | #include "fs/rock/RockDbCell.h" |
93910d5c | 10 | #include "ipc/StoreMap.h" |
67679543 | 11 | #include "globals.h" |
582c2af2 | 12 | #include "md5.h" |
5bed43d6 | 13 | #include "tools.h" |
582c2af2 | 14 | #include "typedefs.h" |
386d28bf | 15 | #include "SquidTime.h" |
687f5275 | 16 | #include "store_rebuild.h" |
e2851fe7 | 17 | |
21d845b1 FC |
18 | #if HAVE_ERRNO_H |
19 | #include <errno.h> | |
20 | #endif | |
21 | ||
e2851fe7 AR |
22 | CBDATA_NAMESPACED_CLASS_INIT(Rock, Rebuild); |
23 | ||
50dc81ec AR |
24 | namespace Rock { |
25 | ||
/// maintains information about the store entry being loaded from disk
/// used for identifying partially stored/loaded entries
class LoadingEntry {
public:
    LoadingEntry(): size(0), version(0), state(leEmpty), anchored(0),
        mapped(0), freed(0), more(-1) {}

    /* store entry-level information indexed by sfileno */
    uint64_t size; ///< payload seen so far
    uint32_t version; ///< DbCellHeader::version to distinguish same-URL chains
    uint32_t state:3; ///< current entry state (one of the State values)
    uint32_t anchored:1; ///< whether we loaded the inode slot for this entry

    /* db slot-level information indexed by slotId, starting with firstSlot */
    uint32_t mapped:1; ///< whether this slot was added to a mapped entry
    uint32_t freed:1; ///< whether this slot was marked as free
    sfileno more:25; ///< another slot in some entry chain (unordered)
    /// whether this slot was touched in any way: freed, mapped, or chained
    bool used() const { return freed || mapped || more != -1; }

    /// possible entry states
    typedef enum { leEmpty = 0, leLoading, leLoaded, leCorrupted, leIgnored } State;
};
48 | ||
49 | } /* namespace Rock */ | |
50 | ||
51 | /** | |
 52 | Several layers of information are manipulated during the rebuild: |
53 | ||
54 | Store Entry: Response message plus all the metainformation associated with | |
55 | it. Identified by store key. At any given time, from Squid point | |
56 | of view, there is only one entry with a given key, but several | |
57 | different entries with the same key can be observed in any historical | |
58 | archive (such as an access log or a store database). | |
59 | ||
60 | Slot chain: A sequence of db slots representing a Store Entry state at | |
61 | some point in time. Identified by key+version combination. Due to | |
62 | transaction aborts, crashes, and idle periods, some chains may contain | |
63 | incomplete or stale information. We assume that no two different chains | |
64 | have the same key and version. If that assumption fails, we may serve a | |
65 | hodgepodge entry during rebuild, until "extra" slots are loaded/noticed. | |
66 | ||
67 | Db slot: A db record containing a piece of a single store entry and linked | |
68 | to other slots with the same key and version fields, forming a chain. | |
69 | Slots are identified by their absolute position in the database file, | |
70 | which is naturally unique. | |
71 | ||
72 | ||
73 | Except for the "mapped", "freed", and "more" fields, LoadingEntry info is | |
74 | entry-level and is stored at fileno position. In other words, the array of | |
75 | LoadingEntries should be interpreted as two arrays, one that maps slot ID | |
76 | to the LoadingEntry::mapped/free/more members, and the second one that maps | |
77 | fileno to all other LoadingEntry members. StoreMap maps slot key to fileno. | |
78 | ||
79 | ||
80 | When information from the newly loaded db slot contradicts the entry-level | |
81 | information collected so far (e.g., the versions do not match or the total | |
82 | chain size after the slot contribution exceeds the expected number), the | |
83 | whole entry (and not just the chain or the slot!) is declared corrupted. | |
84 | ||
85 | Why invalidate the whole entry? Rock Store is written for high-load | |
 86 | environments with large caches, where there are usually very few idle slots |
87 | in the database. A space occupied by a purged entry is usually immediately | |
88 | reclaimed. A Squid crash or a transaction abort is rather unlikely to | |
89 | leave a relatively large number of stale slots in the database. Thus, the | |
90 | number of potentially corrupted entries is relatively small. On the other | |
 91 | hand, the damage from serving a single hodgepodge entry may be significant |
92 | to the user. In such an environment, invalidating the whole entry has | |
93 | negligible performance impact but saves us from high-damage bugs. | |
94 | */ | |
95 | ||
96 | ||
/// prepares rebuild state for the given cache_dir; the scan starts in start()
Rock::Rebuild::Rebuild(SwapDir *dir): AsyncJob("Rock::Rebuild"),
    sd(dir),
    entries(NULL),
    dbSize(0),
    dbEntrySize(0),
    dbEntryLimit(0),
    dbSlot(0),
    fd(-1),
    dbOffset(0),
    slotPos(0),
    validationPos(0)
{
    assert(sd);
    memset(&counts, 0, sizeof(counts)); // rebuild statistics start at zero
    dbSize = sd->diskOffsetLimit(); // we do not care about the trailer waste
    dbEntrySize = sd->slotSize;
    dbEntryLimit = sd->entryLimit();
}
115 | ||
Rock::Rebuild::~Rebuild()
{
    // release the db file descriptor if start() managed to open one
    if (fd >= 0)
        file_close(fd);
    delete[] entries;
}
122 | ||
/// prepares and initiates entry loading sequence
void
Rock::Rebuild::start()
{
    // in SMP mode, only the disker is responsible for populating the map
    if (UsingSmp() && !IamDiskProcess()) {
        debugs(47, 2, "Non-disker skips rebuilding of cache_dir #" <<
               sd->index << " from " << sd->filePath);
        mustStop("non-disker");
        return;
    }

    debugs(47, DBG_IMPORTANT, "Loading cache_dir #" << sd->index <<
           " from " << sd->filePath);

    fd = file_open(sd->filePath, O_RDONLY | O_BINARY);
    if (fd < 0)
        failure("cannot open db", errno); // failure() terminates via fatalf()

    // require a complete db header before scanning any slots
    char hdrBuf[SwapDir::HeaderSize];
    if (read(fd, hdrBuf, sizeof(hdrBuf)) != SwapDir::HeaderSize)
        failure("cannot read db header", errno);

    // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours
    assert(sizeof(DbCellHeader) < SM_PAGE_SIZE);
    buf.init(SM_PAGE_SIZE, SM_PAGE_SIZE);

    dbOffset = SwapDir::HeaderSize; // slots start right after the header
    slotPos = 0;

    // per-slot/per-entry loading state; see the dual-array notes above
    entries = new LoadingEntry[dbEntryLimit];

    checkpoint(); // schedule the first loading step
}
157 | ||
078274f6 | 158 | /// continues after a pause if not done |
e2851fe7 AR |
159 | void |
160 | Rock::Rebuild::checkpoint() | |
161 | { | |
50dc81ec | 162 | if (!done()) |
e2851fe7 | 163 | eventAdd("Rock::Rebuild", Rock::Rebuild::Steps, this, 0.01, 1, true); |
078274f6 AR |
164 | } |
165 | ||
166 | bool | |
167 | Rock::Rebuild::doneAll() const | |
168 | { | |
50dc81ec AR |
169 | return dbOffset >= dbSize && validationPos >= dbEntryLimit && |
170 | AsyncJob::doneAll(); | |
e2851fe7 AR |
171 | } |
172 | ||
/// a static wrapper for the event queue; resumes the rebuild job
void
Rock::Rebuild::Steps(void *data)
{
    // use async call to enable job call protection that time events lack
    CallJobHere(47, 5, static_cast<Rebuild*>(data), Rock::Rebuild, steps);
}
179 | ||
93910d5c | 180 | void |
50dc81ec | 181 | Rock::Rebuild::steps() |
93910d5c | 182 | { |
50dc81ec AR |
183 | if (dbOffset < dbSize) |
184 | loadingSteps(); | |
185 | else | |
186 | validationSteps(); | |
187 | ||
188 | checkpoint(); | |
93910d5c AR |
189 | } |
190 | ||
/// loads db slots, a few at a time, pausing to keep the process responsive
void
Rock::Rebuild::loadingSteps()
{
    debugs(47,5, HERE << sd->index << " slot " << slotPos << " at " <<
           dbOffset << " <= " << dbSize);

    // Balance our desire to maximize the number of entries processed at once
    // (and, hence, minimize overheads and total rebuild time) with a
    // requirement to also process Coordinator events, disk I/Os, etc.
    const int maxSpentMsec = 50; // keep small: most RAM I/Os are under 1ms
    const timeval loopStart = current_time;

    int loaded = 0;
    while (loaded < dbEntryLimit && dbOffset < dbSize) {
        loadOneSlot();
        dbOffset += dbEntrySize; // advance to the next fixed-size slot
        ++slotPos;
        ++loaded;

        if (counts.scancount % 1000 == 0)
            storeRebuildProgress(sd->index, dbEntryLimit, counts.scancount);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        getCurrentTime();
        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        // negative elapsed time means the clock jumped back; pause to be safe
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            debugs(47, 5, HERE << "pausing after " << loaded << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/loaded) << "ms per entry");
            break;
        }
    }
}
225 | ||
/// reads and parses a single db slot at the current dbOffset
void
Rock::Rebuild::loadOneSlot()
{
    debugs(47,5, HERE << sd->index << " slot " << slotPos << " at " <<
           dbOffset << " <= " << dbSize);

    ++counts.scancount;

    if (lseek(fd, dbOffset, SEEK_SET) < 0)
        failure("cannot seek to db entry", errno);

    buf.reset(); // reuse the same read buffer for every slot

    if (!storeRebuildLoadEntry(fd, sd->index, buf, counts))
        return; // read problem; skip this slot

    const SlotId slotId = slotPos;

    // get our header
    DbCellHeader header;
    if (buf.contentSize() < static_cast<mb_size_t>(sizeof(header))) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring truncated cache entry meta data at " << dbOffset);
        freeSlotIfIdle(slotId, true);
        return;
    }

    memcpy(&header, buf.content(), sizeof(header));
    if (header.empty()) { // a never-used slot; make it available for writing
        freeSlotIfIdle(slotId, false);
        return;
    }
    if (!header.sane()) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring malformed cache entry meta data at " << dbOffset);
        freeSlotIfIdle(slotId, true);
        return;
    }
    buf.consume(sizeof(header)); // optimize to avoid memmove()

    useNewSlot(slotId, header);
}
268 | ||
/// parse StoreEntry basics and add them to the map, returning true on success
bool
Rock::Rebuild::importEntry(Ipc::StoreMapAnchor &anchor, const sfileno fileno, const DbCellHeader &header)
{
    cache_key key[SQUID_MD5_DIGEST_LENGTH];
    StoreEntry loadedE;
    if (!storeRebuildParseEntry(buf, loadedE, key, counts, 0))
        return false;

    // prefer the size recorded in this slot header; fall back on any size
    // already accumulated in the anchor from previously loaded slots
    const uint64_t knownSize = header.entrySize > 0 ?
                               header.entrySize : anchor.basics.swap_file_sz;
    if (!loadedE.swap_file_sz && knownSize)
        loadedE.swap_file_sz = knownSize;
    // the entry size may still be unknown at this time

    debugs(47, 8, "importing entry basics for " << fileno);
    anchor.set(loadedE);

    // we have not validated whether all db cells for this entry were loaded
    EBIT_CLR(anchor.basics.flags, ENTRY_VALIDATED);

    // loadedE->dump(5);

    return true;
}
e2851fe7 | 294 | |
/// validates loaded entries, a few at a time, pausing to stay responsive
void
Rock::Rebuild::validationSteps()
{
    debugs(47, 5, sd->index << " validating from " << validationPos);

    // see loadingSteps() for the rationale; TODO: avoid duplication
    const int maxSpentMsec = 50; // keep small: validation does not do I/O
    const timeval loopStart = current_time;

    int validated = 0;
    while (validationPos < dbEntryLimit) {
        validateOneEntry();
        ++validationPos;
        ++validated;

        if (validationPos % 1000 == 0)
            debugs(20, 2, "validated: " << validationPos);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        getCurrentTime();
        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        // negative elapsed time means the clock jumped back; pause to be safe
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            debugs(47, 5, "pausing after " << validated << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/validated) << "ms per entry");
            break;
        }
    }
}
325 | ||
326 | void | |
327 | Rock::Rebuild::validateOneEntry() | |
328 | { | |
329 | LoadingEntry &e = entries[validationPos]; | |
330 | switch (e.state) { | |
331 | ||
332 | case LoadingEntry::leEmpty: | |
333 | break; // no entry hashed to this position | |
334 | ||
335 | case LoadingEntry::leLoading: | |
336 | freeBadEntry(validationPos, "partially stored"); | |
337 | break; | |
338 | ||
339 | case LoadingEntry::leLoaded: | |
340 | break; // we have already unlocked this entry | |
341 | ||
342 | case LoadingEntry::leCorrupted: | |
343 | break; // we have already removed this entry | |
344 | } | |
345 | } | |
346 | ||
/// Marks remaining bad entry slots as free and unlocks the entry. The map
/// cannot do this because Loading entries may have holes in the slots chain.
/// \param fileno map position of the entry being discarded
/// \param eDescription brief reason, used for the debugging log only
void
Rock::Rebuild::freeBadEntry(const sfileno fileno, const char *eDescription)
{
    debugs(47, 2, "cache_dir #" << sd->index << ' ' << eDescription <<
           " entry " << fileno << " is ignored during rebuild");

    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    bool freedSome = false;
    // free all loaded non-anchor slots
    SlotId slotId = entries[anchor.start].more;
    while (slotId >= 0) {
        const SlotId next = entries[slotId].more; // save before freeing
        freeSlot(slotId, false);
        slotId = next;
        freedSome = true;
    }
    // free anchor slot if it was loaded
    if (entries[fileno].anchored) {
        freeSlot(anchor.start, false);
        freedSome = true;
    }
    assert(freedSome); // a bad entry must have had at least one loaded slot

    sd->map->forgetWritingEntry(fileno);
    ++counts.invalid;
}
376 | ||
/// AsyncJob termination hook: updates global rebuild progress accounting
void
Rock::Rebuild::swanSong()
{
    debugs(47,3, HERE << "cache_dir #" << sd->index << " rebuild level: " <<
           StoreController::store_dirs_rebuilding);
    --StoreController::store_dirs_rebuilding;
    storeRebuildComplete(&counts);
}
385 | ||
/// reports a fatal rebuild error and terminates Squid
/// \param msg brief problem description, included in the fatal message
/// \param errNo errno value at the time of the failure, or 0 if none
void
Rock::Rebuild::failure(const char *msg, int errNo)
{
    debugs(47,5, HERE << sd->index << " slot " << slotPos << " at " <<
           dbOffset << " <= " << dbSize);

    if (errNo)
        debugs(47, DBG_CRITICAL, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo));
    debugs(47, DBG_CRITICAL, "Do you need to run 'squid -z' to initialize storage?");

    assert(sd);
    // fatalf() does not return
    fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.",
           sd->index, sd->filePath, msg);
}
93910d5c | 400 | |
50dc81ec AR |
/// adds slot to the free slot index
/// \param invalid whether the slot held invalid data (counted separately)
void
Rock::Rebuild::freeSlot(const SlotId slotId, const bool invalid)
{
    debugs(47,5, sd->index << " frees slot " << slotId);
    LoadingEntry &le = entries[slotId];
    assert(!le.freed); // freeing the same slot twice is a rebuild logic bug
    le.freed = 1;

    if (invalid) {
        ++counts.invalid;
        //sd->unlink(fileno); leave garbage on disk, it should not hurt
    }

    // page pool/number values are 1-based while dir index and slot ID are not
    Ipc::Mem::PageId pageId;
    pageId.pool = sd->index+1;
    pageId.number = slotId+1;
    sd->freeSlots->push(pageId);
}
420 | ||
421 | /// adds slot to the free slot index but only if the slot is unused | |
422 | void | |
423 | Rock::Rebuild::freeSlotIfIdle(const SlotId slotId, const bool invalid) | |
424 | { | |
425 | const LoadingEntry &le = entries[slotId]; | |
426 | ||
427 | // mapped slots must be freed via freeBadEntry() to keep the map in sync | |
428 | assert(!le.mapped); | |
429 | ||
430 | if (!le.used()) | |
431 | freeSlot(slotId, invalid); | |
432 | } | |
433 | ||
434 | /// adds slot to the entry chain in the map | |
435 | void | |
436 | Rock::Rebuild::mapSlot(const SlotId slotId, const DbCellHeader &header) | |
437 | { | |
438 | LoadingEntry &le = entries[slotId]; | |
439 | assert(!le.mapped); | |
440 | assert(!le.freed); | |
441 | le.mapped = 1; | |
442 | ||
443 | Ipc::StoreMapSlice slice; | |
444 | slice.next = header.nextSlot; | |
445 | slice.size = header.payloadSize; | |
446 | sd->map->importSlice(slotId, slice); | |
447 | } | |
448 | ||
/// adds slot to an existing entry chain; caller must check that the slot
/// belongs to the chain it is being added to
void
Rock::Rebuild::addSlotToEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry &le = entries[fileno];
    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    assert(le.version == header.version); // guaranteed by canAdd() callers

    // mark anchor as loaded or add the secondary slot to the chain
    LoadingEntry &inode = entries[header.firstSlot];
    if (header.firstSlot == slotId) {
        debugs(47,5, "adding inode");
        assert(!inode.freed);
        le.anchored = 1;
    } else {
        debugs(47,9, "linking " << slotId << " to " << inode.more);
        // we do not need to preserve the order
        LoadingEntry &slice = entries[slotId];
        assert(!slice.freed);
        assert(slice.more < 0);
        slice.more = inode.more; // prepend this slot to the unordered chain
        inode.more = slotId;
    }

    // the inode slot also carries the entry metadata; parse and import it
    if (header.firstSlot == slotId && !importEntry(anchor, fileno, header)) {
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "corrupted metainfo");
        return;
    }

    // set total entry size and/or check it for consistency
    uint64_t totalSize = header.entrySize;
    assert(totalSize != static_cast<uint64_t>(-1));
    if (!totalSize && anchor.basics.swap_file_sz) {
        assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
        // perhaps we loaded a later slot (with entrySize) earlier
        totalSize = anchor.basics.swap_file_sz;
    } else
    if (totalSize && !anchor.basics.swap_file_sz) {
        anchor.basics.swap_file_sz = totalSize;
        assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
    } else
    if (totalSize != anchor.basics.swap_file_sz) {
        // both sizes are known but differ: a contradicting slot corrupts
        // the whole entry (see the file-top rationale)
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "size mismatch");
        return;
    }

    le.size += header.payloadSize;

    if (totalSize > 0 && le.size > totalSize) { // overflow
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "overflowing");
        return;
    }

    mapSlot(slotId, header);
    if (totalSize > 0 && le.size == totalSize) {
        // entry fully loaded, unlock it
        // we have validated that all db cells for this entry were loaded
        EBIT_SET(anchor.basics.flags, ENTRY_VALIDATED);
        le.state = LoadingEntry::leLoaded;
        sd->map->closeForWriting(fileno, false);
        ++counts.objcount;
    }
}
517 | ||
/// initialize housekeeping information for a newly accepted entry
void
Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor &anchor, const sfileno fileno, const DbCellHeader &header)
{
    anchor.setKey(reinterpret_cast<const cache_key*>(header.key));
    assert(header.firstSlot >= 0);
    anchor.start = header.firstSlot;

    assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));

    LoadingEntry &le = entries[fileno];
    le.state = LoadingEntry::leLoading;
    le.version = header.version;
    le.size = 0; // accumulated as each slot of the chain gets loaded
}
533 | ||
/// handle a slot from an entry that we have not seen before
void
Rock::Rebuild::startNewEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
{
    // If some other from-disk entry is/was using this slot as its inode OR
    // if some other from-disk entry is/was using our inode slot, then the
    // entries are conflicting. We cannot identify other entries, so we just
    // remove ours and hope that the others were/will be handled correctly.
    const LoadingEntry &slice = entries[slotId];
    const LoadingEntry &inode = entries[header.firstSlot];
    if (slice.used() || inode.used()) {
        debugs(47,8, "slice/inode used: " << slice.used() << inode.used());
        LoadingEntry &le = entries[fileno];
        le.state = LoadingEntry::leCorrupted;
        freeSlotIfIdle(slotId, slotId == header.firstSlot);
        // if not idle, the other entry will handle its slice
        ++counts.clashcount;
        return;
    }

    // A miss may have been stored at our fileno while we were loading other
    // slots from disk. We ought to preserve that entry because it is fresher.
    const bool overwriteExisting = false;
    if (Ipc::StoreMap::Anchor *anchor = sd->map->openForWritingAt(fileno, overwriteExisting)) {
        primeNewEntry(*anchor, fileno, header);
        addSlotToEntry(fileno, slotId, header); // may fail
        assert(anchor->basics.swap_file_sz != static_cast<uint64_t>(-1));
    } else {
        // A new from-network entry is occupying our map slot; let it be, but
        // save us from the trouble of going through the above motions again.
        LoadingEntry &le = entries[fileno];
        le.state = LoadingEntry::leIgnored;
        freeSlotIfIdle(slotId, false);
    }
}
569 | ||
570 | /// does the header belong to the fileno entry being loaded? | |
571 | bool | |
572 | Rock::Rebuild::sameEntry(const sfileno fileno, const DbCellHeader &header) const | |
573 | { | |
574 | const Ipc::StoreMap::Anchor &anchor = sd->map->writeableEntry(fileno); | |
575 | const LoadingEntry &le = entries[fileno]; | |
576 | // any order will work, but do fast comparisons first: | |
577 | return le.version == header.version && | |
578 | anchor.start == static_cast<Ipc::StoreMapSliceId>(header.firstSlot) && | |
579 | anchor.sameKey(reinterpret_cast<const cache_key*>(header.key)); | |
580 | } | |
581 | ||
/// is the new header consistent with information already loaded?
bool
Rock::Rebuild::canAdd(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) const
{
    if (!sameEntry(fileno, header)) {
        debugs(79, 7, "cannot add; wrong entry");
        return false;
    }

    const LoadingEntry &le = entries[slotId];
    // We cannot add a slot that was already declared free or mapped.
    if (le.freed || le.mapped) {
        debugs(79, 7, "cannot add; freed/mapped: " << le.freed << le.mapped);
        return false;
    }

    if (slotId == header.firstSlot) {
        // If we are the inode, the anchored flag cannot be set yet.
        if (entries[fileno].anchored) {
            debugs(79, 7, "cannot add; extra anchor");
            return false;
        }

        // And there should have been some other slot for this entry to exist.
        if (le.more < 0) {
            debugs(79, 7, "cannot add; missing slots");
            return false;
        }

        return true;
    }

    // We are the continuation slice so the more field is reserved for us.
    if (le.more >= 0) {
        debugs(79, 7, "cannot add; foreign slot");
        return false;
    }

    return true;
}
622 | ||
/// handle freshly loaded (and validated) db slot header
/// dispatches on the loading state of the entry the slot hashes to
void
Rock::Rebuild::useNewSlot(const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry &slice = entries[slotId];
    assert(!slice.freed); // we cannot free what was not loaded

    // the header key determines which map anchor (fileno) this slot belongs to
    const cache_key *const key =
        reinterpret_cast<const cache_key*>(header.key);
    const sfileno fileno = sd->map->anchorIndexByKey(key);
    assert(0 <= fileno && fileno < dbEntryLimit);

    LoadingEntry &le = entries[fileno];
    debugs(47,9, "entry " << fileno << " state: " << le.state << ", inode: " <<
           header.firstSlot << ", size: " << header.payloadSize);

    switch (le.state) {

    case LoadingEntry::leEmpty: {
        startNewEntry(fileno, slotId, header);
        break;
    }

    case LoadingEntry::leLoading: {
        if (canAdd(fileno, slotId, header)) {
            addSlotToEntry(fileno, slotId, header);
        } else {
            // either the loading chain or this slot is stale;
            // be conservative and ignore both (and any future ones)
            le.state = LoadingEntry::leCorrupted;
            freeBadEntry(fileno, "duplicated");
            freeSlotIfIdle(slotId, slotId == header.firstSlot);
            ++counts.dupcount;
        }
        break;
    }

    case LoadingEntry::leLoaded: {
        // either the previously loaded chain or this slot is stale;
        // be conservative and ignore both (and any future ones)
        le.state = LoadingEntry::leCorrupted;
        sd->map->freeEntry(fileno); // may not be immediately successful
        freeSlotIfIdle(slotId, slotId == header.firstSlot);
        ++counts.dupcount;
        break;
    }

    case LoadingEntry::leCorrupted: {
        // previously seen slots messed things up so we must ignore this one
        freeSlotIfIdle(slotId, false);
        break;
    }

    case LoadingEntry::leIgnored: {
        // already replaced by a fresher or colliding from-network entry
        freeSlotIfIdle(slotId, false);
        break;
    }
    }
}