]>
Commit | Line | Data |
---|---|---|
e2851fe7 | 1 | /* |
ef57eb7b | 2 | * Copyright (C) 1996-2016 The Squid Software Foundation and contributors |
bbc27441 AJ |
3 | * |
4 | * Squid software is distributed under GPLv2+ license and includes | |
5 | * contributions from numerous individuals and organizations. | |
6 | * Please see the COPYING and CONTRIBUTORS files for details. | |
e2851fe7 AR |
7 | */ |
8 | ||
bbc27441 AJ |
9 | /* DEBUG: section 79 Disk IO Routines */ |
10 | ||
f7f3304a | 11 | #include "squid.h" |
2745fea5 | 12 | #include "base/AsyncJobCalls.h" |
602d9612 | 13 | #include "fs/rock/RockDbCell.h" |
e2851fe7 AR |
14 | #include "fs/rock/RockRebuild.h" |
15 | #include "fs/rock/RockSwapDir.h" | |
b3f7fd88 | 16 | #include "fs_io.h" |
67679543 | 17 | #include "globals.h" |
dcd84f80 | 18 | #include "ipc/StoreMap.h" |
582c2af2 | 19 | #include "md5.h" |
386d28bf | 20 | #include "SquidTime.h" |
2745fea5 | 21 | #include "Store.h" |
b3f7fd88 | 22 | #include "store_rebuild.h" |
602d9612 | 23 | #include "tools.h" |
e2851fe7 | 24 | |
1a30fdf5 | 25 | #include <cerrno> |
21d845b1 | 26 | |
e2851fe7 AR |
27 | CBDATA_NAMESPACED_CLASS_INIT(Rock, Rebuild); |
28 | ||
e4d13993 AR |
29 | /** |
30 | \defgroup RockFsRebuild Rock Store Rebuild | |
31 | \ingroup Filesystems | |
32 | * | |
33 | \section Overview Overview | |
34 | * Several layers of information are manipulated during the rebuild: | |
35 | \par | |
36 | * Store Entry: Response message plus all the metainformation associated with | |
37 | * it. Identified by store key. At any given time, from Squid point | |
38 | * of view, there is only one entry with a given key, but several | |
39 | * different entries with the same key can be observed in any historical | |
40 | * archive (such as an access log or a store database). | |
41 | \par | |
42 | * Slot chain: A sequence of db slots representing a Store Entry state at | |
43 | * some point in time. Identified by key+version combination. Due to | |
44 | * transaction aborts, crashes, and idle periods, some chains may contain | |
45 | * incomplete or stale information. We assume that no two different chains | |
46 | * have the same key and version. If that assumption fails, we may serve a | |
47 | * hodgepodge entry during rebuild, until "extra" slots are loaded/noticed. | |
48 | \par | |
49 | * Db slot: A db record containing a piece of a single store entry and linked | |
50 | * to other slots with the same key and version fields, forming a chain. | |
51 | * Slots are identified by their absolute position in the database file, | |
52 | * which is naturally unique. | |
53 | \par | |
54 | * Except for the "mapped", "freed", and "more" fields, LoadingEntry info is | |
55 | * entry-level and is stored at fileno position. In other words, the array of | |
56 | * LoadingEntries should be interpreted as two arrays, one that maps slot ID | |
57 | * to the LoadingEntry::mapped/free/more members, and the second one that maps | |
58 | * fileno to all other LoadingEntry members. StoreMap maps slot key to fileno. | |
59 | \par | |
60 | * When information from the newly loaded db slot contradicts the entry-level | |
61 | * information collected so far (e.g., the versions do not match or the total | |
62 | * chain size after the slot contribution exceeds the expected number), the | |
63 | * whole entry (and not just the chain or the slot!) is declared corrupted. | |
64 | \par | |
65 | * Why invalidate the whole entry? Rock Store is written for high-load | |
66 | * environments with large caches, where there are usually very few idle slots | |
67 | * in the database. A space occupied by a purged entry is usually immediately | |
68 | * reclaimed. A Squid crash or a transaction abort is rather unlikely to | |
69 | * leave a relatively large number of stale slots in the database. Thus, the | |
70 | * number of potentially corrupted entries is relatively small. On the other | |
71 | * hand, the damage from serving a single hodgepodge entry may be significant | |
72 | * to the user. In such an environment, invalidating the whole entry has | |
73 | * negligible performance impact but saves us from high-damage bugs. | |
74 | */ | |
75 | ||
namespace Rock
{

/// maintains information about the store entry being loaded from disk
/// used for identifying partially stored/loaded entries
/// NOTE(review): each array element serves double duty -- the first group of
/// fields is indexed by sfileno while the slot-level group is indexed by
/// slot ID; see the RockFsRebuild overview comment earlier in this file
class LoadingEntry
{
public:
    LoadingEntry(): size(0), version(0), state(leEmpty), anchored(0),
        mapped(0), freed(0), more(-1) {}

    /* store entry-level information indexed by sfileno */
    uint64_t size; ///< payload seen so far
    uint32_t version; ///< DbCellHeader::version to distinguish same-URL chains
    uint8_t state:3; ///< current entry state (one of the State values)
    uint8_t anchored:1; ///< whether we loaded the inode slot for this entry

    /* db slot-level information indexed by slotId, starting with firstSlot */
    uint8_t mapped:1; ///< whether this slot was added to a mapped entry
    uint8_t freed:1; ///< whether this slot was marked as free
    Ipc::StoreMapSliceId more; ///< another slot in some entry chain (unordered)

    /// whether this slot has been accounted for (freed, mapped, or chained)
    bool used() const { return freed || mapped || more != -1; }

    /// possible entry states
    typedef enum { leEmpty = 0, leLoading, leLoaded, leCorrupted, leIgnored } State;
};

} /* namespace Rock */
104 | ||
/// Prepares rebuild bookkeeping for the given cache_dir. The real work
/// begins in start(), once the AsyncJob machinery schedules this job.
Rock::Rebuild::Rebuild(SwapDir *dir): AsyncJob("Rock::Rebuild"),
    sd(dir),
    entries(NULL),
    dbSize(0),
    dbSlotSize(0),
    dbSlotLimit(0),
    dbEntryLimit(0),
    fd(-1),
    dbOffset(0),
    loadingPos(0),
    validationPos(0)
{
    assert(sd);
    memset(&counts, 0, sizeof(counts));
    dbSize = sd->diskOffsetLimit(); // we do not care about the trailer waste
    dbSlotSize = sd->slotSize;
    dbEntryLimit = sd->entryLimitActual();
    dbSlotLimit = sd->slotLimitActual();
    // every entry occupies at least one slot, so entries cannot outnumber slots
    assert(dbEntryLimit <= dbSlotLimit);
}
125 | ||
Rock::Rebuild::~Rebuild()
{
    // close the db file and drop per-slot bookkeeping, if start() got that far
    if (fd >= 0)
        file_close(fd);
    delete[] entries;
}
132 | ||
/// prepares and initiates entry loading sequence
void
Rock::Rebuild::start()
{
    // in SMP mode, only the disker is responsible for populating the map
    if (UsingSmp() && !IamDiskProcess()) {
        debugs(47, 2, "Non-disker skips rebuilding of cache_dir #" <<
               sd->index << " from " << sd->filePath);
        mustStop("non-disker");
        return;
    }

    debugs(47, DBG_IMPORTANT, "Loading cache_dir #" << sd->index <<
           " from " << sd->filePath);

    fd = file_open(sd->filePath, O_RDONLY | O_BINARY);
    if (fd < 0)
        failure("cannot open db", errno); // fatal; does not return

    // consume (and discard) the db header so scanning starts at the first slot
    char hdrBuf[SwapDir::HeaderSize];
    if (read(fd, hdrBuf, sizeof(hdrBuf)) != SwapDir::HeaderSize)
        failure("cannot read db header", errno); // fatal; does not return

    // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours
    assert(sizeof(DbCellHeader) < SM_PAGE_SIZE);
    buf.init(SM_PAGE_SIZE, SM_PAGE_SIZE);

    dbOffset = SwapDir::HeaderSize;

    // one LoadingEntry per db slot; shared between both indexing schemes
    entries = new LoadingEntry[dbSlotLimit];

    checkpoint(); // schedules the first steps() batch
}
166 | ||
078274f6 | 167 | /// continues after a pause if not done |
e2851fe7 AR |
168 | void |
169 | Rock::Rebuild::checkpoint() | |
170 | { | |
50dc81ec | 171 | if (!done()) |
e2851fe7 | 172 | eventAdd("Rock::Rebuild", Rock::Rebuild::Steps, this, 0.01, 1, true); |
078274f6 AR |
173 | } |
174 | ||
175 | bool | |
176 | Rock::Rebuild::doneAll() const | |
177 | { | |
36c84e19 | 178 | return loadingPos >= dbSlotLimit && validationPos >= dbSlotLimit && |
9d4e9cfb | 179 | AsyncJob::doneAll(); |
e2851fe7 AR |
180 | } |
181 | ||
/// timed-event callback that bridges into the AsyncJob call protection
void
Rock::Rebuild::Steps(void *data)
{
    // use async call to enable job call protection that time events lack
    CallJobHere(47, 5, static_cast<Rebuild*>(data), Rock::Rebuild, steps);
}
188 | ||
93910d5c | 189 | void |
50dc81ec | 190 | Rock::Rebuild::steps() |
93910d5c | 191 | { |
36c84e19 | 192 | if (loadingPos < dbSlotLimit) |
50dc81ec AR |
193 | loadingSteps(); |
194 | else | |
195 | validationSteps(); | |
196 | ||
197 | checkpoint(); | |
93910d5c AR |
198 | } |
199 | ||
/// loads db slots until we run out of slots or out of our time budget
void
Rock::Rebuild::loadingSteps()
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    // Balance our desire to maximize the number of entries processed at once
    // (and, hence, minimize overheads and total rebuild time) with a
    // requirement to also process Coordinator events, disk I/Os, etc.
    const int maxSpentMsec = 50; // keep small: most RAM I/Os are under 1ms
    const timeval loopStart = current_time;

    int loaded = 0;
    while (loadingPos < dbSlotLimit) {
        loadOneSlot();
        dbOffset += dbSlotSize;
        ++loadingPos;
        ++loaded;

        // periodic progress report for the admin
        if (counts.scancount % 1000 == 0)
            storeRebuildProgress(sd->index, dbSlotLimit, counts.scancount);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        getCurrentTime();
        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            // loaded >= 1 here, so the per-entry average is well-defined
            debugs(47, 5, HERE << "pausing after " << loaded << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/loaded) << "ms per entry");
            break;
        }
    }
}
234 | ||
/// reads the db slot at loadingPos and dispatches it to useNewSlot()
void
Rock::Rebuild::loadOneSlot()
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    ++counts.scancount;

    if (lseek(fd, dbOffset, SEEK_SET) < 0)
        failure("cannot seek to db entry", errno); // fatal; does not return

    buf.reset();

    if (!storeRebuildLoadEntry(fd, sd->index, buf, counts))
        return; // the loader has already accounted for this failure

    const SlotId slotId = loadingPos;

    // get our header
    DbCellHeader header;
    if (buf.contentSize() < static_cast<mb_size_t>(sizeof(header))) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring truncated " << buf.contentSize() << "-byte " <<
               "cache entry meta data at " << dbOffset);
        freeSlotIfIdle(slotId, true); // count as invalid
        return;
    }
    memcpy(&header, buf.content(), sizeof(header));
    if (header.empty()) {
        freeSlotIfIdle(slotId, false); // never written; not an error
        return;
    }
    if (!header.sane(dbSlotSize, dbSlotLimit)) {
        debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " <<
               "Ignoring malformed cache entry meta data at " << dbOffset);
        freeSlotIfIdle(slotId, true); // count as invalid
        return;
    }
    buf.consume(sizeof(header)); // optimize to avoid memmove()

    useNewSlot(slotId, header);
}
277 | ||
278 | /// parse StoreEntry basics and add them to the map, returning true on success | |
279 | bool | |
280 | Rock::Rebuild::importEntry(Ipc::StoreMapAnchor &anchor, const sfileno fileno, const DbCellHeader &header) | |
281 | { | |
282 | cache_key key[SQUID_MD5_DIGEST_LENGTH]; | |
283 | StoreEntry loadedE; | |
50dc81ec | 284 | const uint64_t knownSize = header.entrySize > 0 ? |
d2b13bab | 285 | header.entrySize : anchor.basics.swap_file_sz.load(); |
ce44c1ea AR |
286 | if (!storeRebuildParseEntry(buf, loadedE, key, counts, knownSize)) |
287 | return false; | |
288 | ||
50dc81ec AR |
289 | // the entry size may still be unknown at this time |
290 | ||
ce44c1ea AR |
291 | debugs(47, 8, "importing basics for entry " << fileno << |
292 | " swap_file_sz: " << loadedE.swap_file_sz); | |
50dc81ec AR |
293 | anchor.set(loadedE); |
294 | ||
295 | // we have not validated whether all db cells for this entry were loaded | |
296 | EBIT_CLR(anchor.basics.flags, ENTRY_VALIDATED); | |
297 | ||
298 | // loadedE->dump(5); | |
299 | ||
300 | return true; | |
93910d5c | 301 | } |
e2851fe7 | 302 | |
/// validates loaded entries until we run out of entries or time
void
Rock::Rebuild::validationSteps()
{
    debugs(47, 5, sd->index << " validating from " << validationPos);

    // see loadingSteps() for the rationale; TODO: avoid duplication
    const int maxSpentMsec = 50; // keep small: validation does not do I/O
    const timeval loopStart = current_time;

    int validated = 0;
    while (validationPos < dbSlotLimit) {
        validateOneEntry();
        ++validationPos;
        ++validated;

        // periodic progress report
        if (validationPos % 1000 == 0)
            debugs(20, 2, "validated: " << validationPos);

        if (opt_foreground_rebuild)
            continue; // skip "few entries at a time" check below

        getCurrentTime();
        const double elapsedMsec = tvSubMsec(loopStart, current_time);
        if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) {
            // validated >= 1 here, so the per-entry average is well-defined
            debugs(47, 5, "pausing after " << validated << " entries in " <<
                   elapsedMsec << "ms; " << (elapsedMsec/validated) << "ms per entry");
            break;
        }
    }
}
333 | ||
334 | void | |
335 | Rock::Rebuild::validateOneEntry() | |
336 | { | |
337 | LoadingEntry &e = entries[validationPos]; | |
338 | switch (e.state) { | |
339 | ||
340 | case LoadingEntry::leEmpty: | |
341 | break; // no entry hashed to this position | |
342 | ||
343 | case LoadingEntry::leLoading: | |
344 | freeBadEntry(validationPos, "partially stored"); | |
345 | break; | |
346 | ||
347 | case LoadingEntry::leLoaded: | |
348 | break; // we have already unlocked this entry | |
349 | ||
350 | case LoadingEntry::leCorrupted: | |
351 | break; // we have already removed this entry | |
352 | } | |
353 | } | |
354 | ||
/// Marks remaining bad entry slots as free and unlocks the entry. The map
/// cannot do this because Loading entries may have holes in the slots chain.
void
Rock::Rebuild::freeBadEntry(const sfileno fileno, const char *eDescription)
{
    debugs(47, 2, "cache_dir #" << sd->index << ' ' << eDescription <<
           " entry " << fileno << " is ignored during rebuild");

    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    bool freedSome = false;
    // free all loaded non-anchor slots
    SlotId slotId = entries[anchor.start].more;
    while (slotId >= 0) {
        const SlotId next = entries[slotId].more; // save before freeSlot()
        freeSlot(slotId, false);
        slotId = next;
        freedSome = true;
    }
    // free anchor slot if it was loaded
    if (entries[fileno].anchored) {
        freeSlot(anchor.start, false);
        freedSome = true;
    }
    assert(freedSome); // callers only invoke us for entries with loaded slots

    sd->map->forgetWritingEntry(fileno);
    ++counts.invalid;
}
384 | ||
void
Rock::Rebuild::swanSong()
{
    debugs(47,3, HERE << "cache_dir #" << sd->index << " rebuild level: " <<
           StoreController::store_dirs_rebuilding);
    // this cache_dir is done rebuilding, whether we succeeded or not
    --StoreController::store_dirs_rebuilding;
    storeRebuildComplete(&counts);
}
393 | ||
/// reports a fatal rebuild error and terminates Squid; never returns
void
Rock::Rebuild::failure(const char *msg, int errNo)
{
    debugs(47,5, sd->index << " slot " << loadingPos << " at " <<
           dbOffset << " <= " << dbSize);

    if (errNo)
        debugs(47, DBG_CRITICAL, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo));
    debugs(47, DBG_CRITICAL, "Do you need to run 'squid -z' to initialize storage?");

    assert(sd);
    fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.",
           sd->index, sd->filePath, msg);
}
93910d5c | 408 | |
50dc81ec AR |
409 | /// adds slot to the free slot index |
410 | void | |
411 | Rock::Rebuild::freeSlot(const SlotId slotId, const bool invalid) | |
93910d5c | 412 | { |
50dc81ec AR |
413 | debugs(47,5, sd->index << " frees slot " << slotId); |
414 | LoadingEntry &le = entries[slotId]; | |
415 | assert(!le.freed); | |
416 | le.freed = 1; | |
417 | ||
418 | if (invalid) { | |
419 | ++counts.invalid; | |
420 | //sd->unlink(fileno); leave garbage on disk, it should not hurt | |
421 | } | |
422 | ||
423 | Ipc::Mem::PageId pageId; | |
424 | pageId.pool = sd->index+1; | |
425 | pageId.number = slotId+1; | |
426 | sd->freeSlots->push(pageId); | |
427 | } | |
428 | ||
429 | /// adds slot to the free slot index but only if the slot is unused | |
430 | void | |
431 | Rock::Rebuild::freeSlotIfIdle(const SlotId slotId, const bool invalid) | |
432 | { | |
433 | const LoadingEntry &le = entries[slotId]; | |
434 | ||
435 | // mapped slots must be freed via freeBadEntry() to keep the map in sync | |
436 | assert(!le.mapped); | |
437 | ||
438 | if (!le.used()) | |
439 | freeSlot(slotId, invalid); | |
440 | } | |
441 | ||
442 | /// adds slot to the entry chain in the map | |
443 | void | |
444 | Rock::Rebuild::mapSlot(const SlotId slotId, const DbCellHeader &header) | |
445 | { | |
446 | LoadingEntry &le = entries[slotId]; | |
447 | assert(!le.mapped); | |
448 | assert(!le.freed); | |
449 | le.mapped = 1; | |
450 | ||
451 | Ipc::StoreMapSlice slice; | |
452 | slice.next = header.nextSlot; | |
453 | slice.size = header.payloadSize; | |
454 | sd->map->importSlice(slotId, slice); | |
455 | } | |
456 | ||
/// adds slot to an existing entry chain; caller must check that the slot
/// belongs to the chain it is being added to
void
Rock::Rebuild::addSlotToEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry &le = entries[fileno];
    Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno);

    assert(le.version == header.version); // canAdd() checked this already

    // mark anchor as loaded or add the secondary slot to the chain
    LoadingEntry &inode = entries[header.firstSlot];
    if (header.firstSlot == slotId) {
        debugs(47,5, "adding inode");
        assert(!inode.freed);
        le.anchored = 1;
    } else {
        debugs(47,9, "linking " << slotId << " to " << inode.more);
        // we do not need to preserve the order
        LoadingEntry &slice = entries[slotId];
        assert(!slice.freed);
        assert(slice.more < 0);
        slice.more = inode.more;
        inode.more = slotId;
    }

    // only the inode slot carries the StoreEntry metadata to import
    if (header.firstSlot == slotId && !importEntry(anchor, fileno, header)) {
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "corrupted metainfo");
        return;
    }

    // set total entry size and/or check it for consistency
    debugs(47, 8, "header.entrySize: " << header.entrySize << " swap_file_sz: " << anchor.basics.swap_file_sz);
    uint64_t totalSize = header.entrySize;
    assert(totalSize != static_cast<uint64_t>(-1));
    if (!totalSize && anchor.basics.swap_file_sz) {
        assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
        // perhaps we loaded a later slot (with entrySize) earlier
        totalSize = anchor.basics.swap_file_sz;
    } else if (totalSize && !anchor.basics.swap_file_sz) {
        anchor.basics.swap_file_sz = totalSize;
        assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1));
    } else if (totalSize != anchor.basics.swap_file_sz) {
        // both known but disagreeing -- the chain is inconsistent
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "size mismatch");
        return;
    }

    le.size += header.payloadSize;

    if (totalSize > 0 && le.size > totalSize) { // overflow
        debugs(47, 8, "overflow: " << le.size << " > " << totalSize);
        le.state = LoadingEntry::leCorrupted;
        freeBadEntry(fileno, "overflowing");
        return;
    }

    mapSlot(slotId, header);
    if (totalSize > 0 && le.size == totalSize) {
        // entry fully loaded, unlock it
        // we have validated that all db cells for this entry were loaded
        EBIT_SET(anchor.basics.flags, ENTRY_VALIDATED);
        le.state = LoadingEntry::leLoaded;
        sd->map->closeForWriting(fileno, false);
        ++counts.objcount;
    }
}
525 | ||
526 | /// initialize housekeeping information for a newly accepted entry | |
527 | void | |
528 | Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor &anchor, const sfileno fileno, const DbCellHeader &header) | |
529 | { | |
530 | anchor.setKey(reinterpret_cast<const cache_key*>(header.key)); | |
531 | assert(header.firstSlot >= 0); | |
532 | anchor.start = header.firstSlot; | |
533 | ||
534 | assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1)); | |
535 | ||
536 | LoadingEntry &le = entries[fileno]; | |
537 | le.state = LoadingEntry::leLoading; | |
538 | le.version = header.version; | |
539 | le.size = 0; | |
540 | } | |
541 | ||
/// handle a slot from an entry that we have not seen before
void
Rock::Rebuild::startNewEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header)
{
    // If some other from-disk entry is/was using this slot as its inode OR
    // if some other from-disk entry is/was using our inode slot, then the
    // entries are conflicting. We cannot identify other entries, so we just
    // remove ours and hope that the others were/will be handled correctly.
    const LoadingEntry &slice = entries[slotId];
    const LoadingEntry &inode = entries[header.firstSlot];
    if (slice.used() || inode.used()) {
        debugs(47,8, "slice/inode used: " << slice.used() << inode.used());
        LoadingEntry &le = entries[fileno];
        le.state = LoadingEntry::leCorrupted;
        freeSlotIfIdle(slotId, slotId == header.firstSlot);
        // if not idle, the other entry will handle its slice
        ++counts.clashcount;
        return;
    }

    // A miss may have been stored at our fileno while we were loading other
    // slots from disk. We ought to preserve that entry because it is fresher.
    const bool overwriteExisting = false;
    if (Ipc::StoreMap::Anchor *anchor = sd->map->openForWritingAt(fileno, overwriteExisting)) {
        primeNewEntry(*anchor, fileno, header);
        addSlotToEntry(fileno, slotId, header); // may fail
        assert(anchor->basics.swap_file_sz != static_cast<uint64_t>(-1));
    } else {
        // A new from-network entry is occupying our map slot; let it be, but
        // save us from the trouble of going through the above motions again.
        LoadingEntry &le = entries[fileno];
        le.state = LoadingEntry::leIgnored;
        freeSlotIfIdle(slotId, false);
    }
}
577 | ||
578 | /// does the header belong to the fileno entry being loaded? | |
579 | bool | |
580 | Rock::Rebuild::sameEntry(const sfileno fileno, const DbCellHeader &header) const | |
581 | { | |
582 | const Ipc::StoreMap::Anchor &anchor = sd->map->writeableEntry(fileno); | |
583 | const LoadingEntry &le = entries[fileno]; | |
584 | // any order will work, but do fast comparisons first: | |
585 | return le.version == header.version && | |
9d4e9cfb AR |
586 | anchor.start == static_cast<Ipc::StoreMapSliceId>(header.firstSlot) && |
587 | anchor.sameKey(reinterpret_cast<const cache_key*>(header.key)); | |
50dc81ec AR |
588 | } |
589 | ||
590 | /// is the new header consistent with information already loaded? | |
591 | bool | |
592 | Rock::Rebuild::canAdd(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) const | |
593 | { | |
594 | if (!sameEntry(fileno, header)) { | |
595 | debugs(79, 7, "cannot add; wrong entry"); | |
596 | return false; | |
597 | } | |
598 | ||
599 | const LoadingEntry &le = entries[slotId]; | |
600 | // We cannot add a slot that was already declared free or mapped. | |
601 | if (le.freed || le.mapped) { | |
602 | debugs(79, 7, "cannot add; freed/mapped: " << le.freed << le.mapped); | |
603 | return false; | |
604 | } | |
605 | ||
606 | if (slotId == header.firstSlot) { | |
607 | // If we are the inode, the anchored flag cannot be set yet. | |
608 | if (entries[fileno].anchored) { | |
609 | debugs(79, 7, "cannot add; extra anchor"); | |
610 | return false; | |
611 | } | |
612 | ||
613 | // And there should have been some other slot for this entry to exist. | |
614 | if (le.more < 0) { | |
615 | debugs(79, 7, "cannot add; missing slots"); | |
616 | return false; | |
617 | } | |
618 | ||
619 | return true; | |
620 | } | |
621 | ||
622 | // We are the continuation slice so the more field is reserved for us. | |
623 | if (le.more >= 0) { | |
624 | debugs(79, 7, "cannot add; foreign slot"); | |
625 | return false; | |
626 | } | |
627 | ||
628 | return true; | |
629 | } | |
630 | ||
/// handle freshly loaded (and validated) db slot header
void
Rock::Rebuild::useNewSlot(const SlotId slotId, const DbCellHeader &header)
{
    LoadingEntry &slice = entries[slotId];
    assert(!slice.freed); // we cannot free what was not loaded

    // the slot header names the entry key; hash it to find our map position
    const cache_key *const key =
        reinterpret_cast<const cache_key*>(header.key);
    const sfileno fileno = sd->map->anchorIndexByKey(key);
    assert(0 <= fileno && fileno < dbEntryLimit);

    LoadingEntry &le = entries[fileno];
    debugs(47,9, "entry " << fileno << " state: " << le.state << ", inode: " <<
           header.firstSlot << ", size: " << header.payloadSize);

    // dispatch on what we know about the entry this slot belongs to
    switch (le.state) {

    case LoadingEntry::leEmpty: {
        startNewEntry(fileno, slotId, header);
        break;
    }

    case LoadingEntry::leLoading: {
        if (canAdd(fileno, slotId, header)) {
            addSlotToEntry(fileno, slotId, header);
        } else {
            // either the loading chain or this slot is stale;
            // be conservative and ignore both (and any future ones)
            le.state = LoadingEntry::leCorrupted;
            freeBadEntry(fileno, "duplicated");
            freeSlotIfIdle(slotId, slotId == header.firstSlot);
            ++counts.dupcount;
        }
        break;
    }

    case LoadingEntry::leLoaded: {
        // either the previously loaded chain or this slot is stale;
        // be conservative and ignore both (and any future ones)
        le.state = LoadingEntry::leCorrupted;
        sd->map->freeEntry(fileno); // may not be immediately successful
        freeSlotIfIdle(slotId, slotId == header.firstSlot);
        ++counts.dupcount;
        break;
    }

    case LoadingEntry::leCorrupted: {
        // previously seen slots messed things up so we must ignore this one
        freeSlotIfIdle(slotId, false);
        break;
    }

    case LoadingEntry::leIgnored: {
        // already replaced by a fresher or colliding from-network entry
        freeSlotIfIdle(slotId, false);
        break;
    }
    }
}
f53969cc | 691 |