]>
Commit | Line | Data |
---|---|---|
e2851fe7 | 1 | /* |
bde978a6 | 2 | * Copyright (C) 1996-2015 The Squid Software Foundation and contributors |
bbc27441 AJ |
3 | * |
4 | * Squid software is distributed under GPLv2+ license and includes | |
5 | * contributions from numerous individuals and organizations. | |
6 | * Please see the COPYING and CONTRIBUTORS files for details. | |
e2851fe7 AR |
7 | */ |
8 | ||
bbc27441 AJ |
9 | /* DEBUG: section 79 Disk IO Routines */ |
10 | ||
f7f3304a | 11 | #include "squid.h" |
438b04d4 | 12 | #include "disk.h" |
602d9612 | 13 | #include "fs/rock/RockDbCell.h" |
e2851fe7 AR |
14 | #include "fs/rock/RockRebuild.h" |
15 | #include "fs/rock/RockSwapDir.h" | |
67679543 | 16 | #include "globals.h" |
dcd84f80 | 17 | #include "ipc/StoreMap.h" |
582c2af2 | 18 | #include "md5.h" |
386d28bf | 19 | #include "SquidTime.h" |
687f5275 | 20 | #include "store_rebuild.h" |
602d9612 | 21 | #include "tools.h" |
e2851fe7 | 22 | |
1a30fdf5 | 23 | #include <cerrno> |
21d845b1 | 24 | |
e2851fe7 AR |
25 | CBDATA_NAMESPACED_CLASS_INIT(Rock, Rebuild); |
26 | ||
e4d13993 AR |
27 | /** |
28 | \defgroup RockFsRebuild Rock Store Rebuild | |
29 | \ingroup Filesystems | |
30 | * | |
31 | \section Overview Overview | |
32 | * Several layers of information are manipulated during the rebuild: | |
33 | \par | |
34 | * Store Entry: Response message plus all the metainformation associated with | |
35 | * it. Identified by store key. At any given time, from Squid point | |
36 | * of view, there is only one entry with a given key, but several | |
37 | * different entries with the same key can be observed in any historical | |
38 | * archive (such as an access log or a store database). | |
39 | \par | |
40 | * Slot chain: A sequence of db slots representing a Store Entry state at | |
41 | * some point in time. Identified by key+version combination. Due to | |
42 | * transaction aborts, crashes, and idle periods, some chains may contain | |
43 | * incomplete or stale information. We assume that no two different chains | |
44 | * have the same key and version. If that assumption fails, we may serve a | |
45 | * hodgepodge entry during rebuild, until "extra" slots are loaded/noticed. | |
46 | \par | |
47 | * Db slot: A db record containing a piece of a single store entry and linked | |
48 | * to other slots with the same key and version fields, forming a chain. | |
49 | * Slots are identified by their absolute position in the database file, | |
50 | * which is naturally unique. | |
51 | \par | |
52 | * Except for the "mapped", "freed", and "more" fields, LoadingEntry info is | |
53 | * entry-level and is stored at fileno position. In other words, the array of | |
54 | * LoadingEntries should be interpreted as two arrays, one that maps slot ID | |
55 | * to the LoadingEntry::mapped/free/more members, and the second one that maps | |
56 | * fileno to all other LoadingEntry members. StoreMap maps slot key to fileno. | |
57 | \par | |
58 | * When information from the newly loaded db slot contradicts the entry-level | |
59 | * information collected so far (e.g., the versions do not match or the total | |
60 | * chain size after the slot contribution exceeds the expected number), the | |
61 | * whole entry (and not just the chain or the slot!) is declared corrupted. | |
62 | \par | |
63 | * Why invalidate the whole entry? Rock Store is written for high-load | |
64 | * environments with large caches, where there are usually very few idle slots | |
65 | * in the database. A space occupied by a purged entry is usually immediately | |
66 | * reclaimed. A Squid crash or a transaction abort is rather unlikely to | |
67 | * leave a relatively large number of stale slots in the database. Thus, the | |
68 | * number of potentially corrupted entries is relatively small. On the other | |
69 | * hand, the damage from serving a single hodgepodge entry may be significant | |
70 | * to the user. In such an environment, invalidating the whole entry has | |
71 | * negligible performance impact but saves us from high-damage bugs. | |
72 | */ | |
73 | ||
9d4e9cfb AR |
74 | namespace Rock |
75 | { | |
50dc81ec AR |
76 | |
77 | /// maintains information about the store entry being loaded from disk | |
78 | /// used for identifying partially stored/loaded entries | |
9d4e9cfb AR |
79 | class LoadingEntry |
80 | { | |
50dc81ec AR |
81 | public: |
82 | LoadingEntry(): size(0), version(0), state(leEmpty), anchored(0), | |
f53969cc | 83 | mapped(0), freed(0), more(-1) {} |
50dc81ec AR |
84 | |
85 | /* store entry-level information indexed by sfileno */ | |
86 | uint64_t size; ///< payload seen so far | |
87 | uint32_t version; ///< DbCellHeader::version to distinguish same-URL chains | |
36c84e19 AR |
88 | uint8_t state:3; ///< current entry state (one of the State values) |
89 | uint8_t anchored:1; ///< whether we loaded the inode slot for this entry | |
50dc81ec AR |
90 | |
91 | /* db slot-level information indexed by slotId, starting with firstSlot */ | |
36c84e19 AR |
92 | uint8_t mapped:1; ///< whether this slot was added to a mapped entry |
93 | uint8_t freed:1; ///< whether this slot was marked as free | |
94 | Ipc::StoreMapSliceId more; ///< another slot in some entry chain (unordered) | |
50dc81ec AR |
95 | bool used() const { return freed || mapped || more != -1; } |
96 | ||
97 | /// possible entry states | |
98 | typedef enum { leEmpty = 0, leLoading, leLoaded, leCorrupted, leIgnored } State; | |
99 | }; | |
100 | ||
101 | } /* namespace Rock */ | |
102 | ||
078274f6 | 103 | Rock::Rebuild::Rebuild(SwapDir *dir): AsyncJob("Rock::Rebuild"), |
f53969cc SM |
104 | sd(dir), |
105 | entries(NULL), | |
106 | dbSize(0), | |
107 | dbSlotSize(0), | |
108 | dbSlotLimit(0), | |
109 | dbEntryLimit(0), | |
110 | fd(-1), | |
111 | dbOffset(0), | |
112 | loadingPos(0), | |
113 | validationPos(0) | |
e2851fe7 AR |
114 | { |
115 | assert(sd); | |
116 | memset(&counts, 0, sizeof(counts)); | |
117 | dbSize = sd->diskOffsetLimit(); // we do not care about the trailer waste | |
36c84e19 AR |
118 | dbSlotSize = sd->slotSize; |
119 | dbEntryLimit = sd->entryLimitActual(); | |
120 | dbSlotLimit = sd->slotLimitActual(); | |
121 | assert(dbEntryLimit <= dbSlotLimit); | |
e2851fe7 AR |
122 | } |
123 | ||
124 | Rock::Rebuild::~Rebuild() | |
125 | { | |
126 | if (fd >= 0) | |
127 | file_close(fd); | |
50dc81ec | 128 | delete[] entries; |
e2851fe7 AR |
129 | } |
130 | ||
131 | /// prepares and initiates entry loading sequence | |
132 | void | |
9199139f AR |
133 | Rock::Rebuild::start() |
134 | { | |
078274f6 AR |
135 | // in SMP mode, only the disker is responsible for populating the map |
136 | if (UsingSmp() && !IamDiskProcess()) { | |
137 | debugs(47, 2, "Non-disker skips rebuilding of cache_dir #" << | |
9199139f | 138 | sd->index << " from " << sd->filePath); |
078274f6 AR |
139 | mustStop("non-disker"); |
140 | return; | |
141 | } | |
142 | ||
095ec2b1 AR |
143 | debugs(47, DBG_IMPORTANT, "Loading cache_dir #" << sd->index << |
144 | " from " << sd->filePath); | |
e2851fe7 AR |
145 | |
146 | fd = file_open(sd->filePath, O_RDONLY | O_BINARY); | |
147 | if (fd < 0) | |
148 | failure("cannot open db", errno); | |
149 | ||
50dc81ec AR |
150 | char hdrBuf[SwapDir::HeaderSize]; |
151 | if (read(fd, hdrBuf, sizeof(hdrBuf)) != SwapDir::HeaderSize) | |
e2851fe7 AR |
152 | failure("cannot read db header", errno); |
153 | ||
50dc81ec AR |
154 | // slot prefix of SM_PAGE_SIZE should fit both core entry header and ours |
155 | assert(sizeof(DbCellHeader) < SM_PAGE_SIZE); | |
156 | buf.init(SM_PAGE_SIZE, SM_PAGE_SIZE); | |
157 | ||
e2851fe7 | 158 | dbOffset = SwapDir::HeaderSize; |
50dc81ec | 159 | |
36c84e19 | 160 | entries = new LoadingEntry[dbSlotLimit]; |
e2851fe7 AR |
161 | |
162 | checkpoint(); | |
163 | } | |
164 | ||
078274f6 | 165 | /// continues after a pause if not done |
e2851fe7 AR |
166 | void |
167 | Rock::Rebuild::checkpoint() | |
168 | { | |
50dc81ec | 169 | if (!done()) |
e2851fe7 | 170 | eventAdd("Rock::Rebuild", Rock::Rebuild::Steps, this, 0.01, 1, true); |
078274f6 AR |
171 | } |
172 | ||
173 | bool | |
174 | Rock::Rebuild::doneAll() const | |
175 | { | |
36c84e19 | 176 | return loadingPos >= dbSlotLimit && validationPos >= dbSlotLimit && |
9d4e9cfb | 177 | AsyncJob::doneAll(); |
e2851fe7 AR |
178 | } |
179 | ||
180 | void | |
181 | Rock::Rebuild::Steps(void *data) | |
182 | { | |
078274f6 AR |
183 | // use async call to enable job call protection that time events lack |
184 | CallJobHere(47, 5, static_cast<Rebuild*>(data), Rock::Rebuild, steps); | |
e2851fe7 AR |
185 | } |
186 | ||
93910d5c | 187 | void |
50dc81ec | 188 | Rock::Rebuild::steps() |
93910d5c | 189 | { |
36c84e19 | 190 | if (loadingPos < dbSlotLimit) |
50dc81ec AR |
191 | loadingSteps(); |
192 | else | |
193 | validationSteps(); | |
194 | ||
195 | checkpoint(); | |
93910d5c AR |
196 | } |
197 | ||
e2851fe7 | 198 | void |
50dc81ec | 199 | Rock::Rebuild::loadingSteps() |
9199139f | 200 | { |
539283df | 201 | debugs(47,5, sd->index << " slot " << loadingPos << " at " << |
9199139f | 202 | dbOffset << " <= " << dbSize); |
e2851fe7 | 203 | |
386d28bf | 204 | // Balance our desire to maximize the number of entries processed at once |
9199139f | 205 | // (and, hence, minimize overheads and total rebuild time) with a |
386d28bf AR |
206 | // requirement to also process Coordinator events, disk I/Os, etc. |
207 | const int maxSpentMsec = 50; // keep small: most RAM I/Os are under 1ms | |
208 | const timeval loopStart = current_time; | |
209 | ||
210 | int loaded = 0; | |
36c84e19 | 211 | while (loadingPos < dbSlotLimit) { |
50dc81ec | 212 | loadOneSlot(); |
36c84e19 | 213 | dbOffset += dbSlotSize; |
6d68a230 | 214 | ++loadingPos; |
386d28bf | 215 | ++loaded; |
e2851fe7 AR |
216 | |
217 | if (counts.scancount % 1000 == 0) | |
36c84e19 | 218 | storeRebuildProgress(sd->index, dbSlotLimit, counts.scancount); |
386d28bf AR |
219 | |
220 | if (opt_foreground_rebuild) | |
221 | continue; // skip "few entries at a time" check below | |
222 | ||
223 | getCurrentTime(); | |
224 | const double elapsedMsec = tvSubMsec(loopStart, current_time); | |
225 | if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) { | |
226 | debugs(47, 5, HERE << "pausing after " << loaded << " entries in " << | |
9199139f | 227 | elapsedMsec << "ms; " << (elapsedMsec/loaded) << "ms per entry"); |
386d28bf AR |
228 | break; |
229 | } | |
230 | } | |
e2851fe7 AR |
231 | } |
232 | ||
93910d5c | 233 | void |
50dc81ec | 234 | Rock::Rebuild::loadOneSlot() |
93910d5c | 235 | { |
539283df | 236 | debugs(47,5, sd->index << " slot " << loadingPos << " at " << |
9199139f | 237 | dbOffset << " <= " << dbSize); |
e2851fe7 | 238 | |
c728b6f9 AR |
239 | ++counts.scancount; |
240 | ||
e2851fe7 AR |
241 | if (lseek(fd, dbOffset, SEEK_SET) < 0) |
242 | failure("cannot seek to db entry", errno); | |
243 | ||
50dc81ec | 244 | buf.reset(); |
c728b6f9 AR |
245 | |
246 | if (!storeRebuildLoadEntry(fd, sd->index, buf, counts)) | |
247 | return; | |
248 | ||
6d68a230 | 249 | const SlotId slotId = loadingPos; |
93910d5c | 250 | |
50dc81ec AR |
251 | // get our header |
252 | DbCellHeader header; | |
c728b6f9 | 253 | if (buf.contentSize() < static_cast<mb_size_t>(sizeof(header))) { |
51618c6a | 254 | debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " << |
ce44c1ea AR |
255 | "Ignoring truncated " << buf.contentSize() << "-byte " << |
256 | "cache entry meta data at " << dbOffset); | |
50dc81ec | 257 | freeSlotIfIdle(slotId, true); |
c728b6f9 AR |
258 | return; |
259 | } | |
50dc81ec AR |
260 | memcpy(&header, buf.content(), sizeof(header)); |
261 | if (header.empty()) { | |
262 | freeSlotIfIdle(slotId, false); | |
263 | return; | |
264 | } | |
36c84e19 | 265 | if (!header.sane(dbSlotSize, dbSlotLimit)) { |
51618c6a | 266 | debugs(47, DBG_IMPORTANT, "WARNING: cache_dir[" << sd->index << "]: " << |
9199139f | 267 | "Ignoring malformed cache entry meta data at " << dbOffset); |
50dc81ec | 268 | freeSlotIfIdle(slotId, true); |
e2851fe7 | 269 | return; |
9199139f | 270 | } |
50dc81ec AR |
271 | buf.consume(sizeof(header)); // optimize to avoid memmove() |
272 | ||
273 | useNewSlot(slotId, header); | |
274 | } | |
275 | ||
276 | /// parse StoreEntry basics and add them to the map, returning true on success | |
277 | bool | |
278 | Rock::Rebuild::importEntry(Ipc::StoreMapAnchor &anchor, const sfileno fileno, const DbCellHeader &header) | |
279 | { | |
280 | cache_key key[SQUID_MD5_DIGEST_LENGTH]; | |
281 | StoreEntry loadedE; | |
50dc81ec | 282 | const uint64_t knownSize = header.entrySize > 0 ? |
d2b13bab | 283 | header.entrySize : anchor.basics.swap_file_sz.load(); |
ce44c1ea AR |
284 | if (!storeRebuildParseEntry(buf, loadedE, key, counts, knownSize)) |
285 | return false; | |
286 | ||
50dc81ec AR |
287 | // the entry size may still be unknown at this time |
288 | ||
ce44c1ea AR |
289 | debugs(47, 8, "importing basics for entry " << fileno << |
290 | " swap_file_sz: " << loadedE.swap_file_sz); | |
50dc81ec AR |
291 | anchor.set(loadedE); |
292 | ||
293 | // we have not validated whether all db cells for this entry were loaded | |
294 | EBIT_CLR(anchor.basics.flags, ENTRY_VALIDATED); | |
295 | ||
296 | // loadedE->dump(5); | |
297 | ||
298 | return true; | |
93910d5c | 299 | } |
e2851fe7 | 300 | |
93910d5c | 301 | void |
50dc81ec | 302 | Rock::Rebuild::validationSteps() |
93910d5c | 303 | { |
50dc81ec | 304 | debugs(47, 5, sd->index << " validating from " << validationPos); |
93910d5c | 305 | |
50dc81ec AR |
306 | // see loadingSteps() for the rationale; TODO: avoid duplication |
307 | const int maxSpentMsec = 50; // keep small: validation does not do I/O | |
308 | const timeval loopStart = current_time; | |
e2851fe7 | 309 | |
50dc81ec | 310 | int validated = 0; |
36c84e19 | 311 | while (validationPos < dbSlotLimit) { |
50dc81ec AR |
312 | validateOneEntry(); |
313 | ++validationPos; | |
314 | ++validated; | |
93910d5c | 315 | |
50dc81ec AR |
316 | if (validationPos % 1000 == 0) |
317 | debugs(20, 2, "validated: " << validationPos); | |
e2851fe7 | 318 | |
50dc81ec AR |
319 | if (opt_foreground_rebuild) |
320 | continue; // skip "few entries at a time" check below | |
321 | ||
322 | getCurrentTime(); | |
323 | const double elapsedMsec = tvSubMsec(loopStart, current_time); | |
324 | if (elapsedMsec > maxSpentMsec || elapsedMsec < 0) { | |
325 | debugs(47, 5, "pausing after " << validated << " entries in " << | |
326 | elapsedMsec << "ms; " << (elapsedMsec/validated) << "ms per entry"); | |
327 | break; | |
328 | } | |
329 | } | |
330 | } | |
331 | ||
332 | void | |
333 | Rock::Rebuild::validateOneEntry() | |
334 | { | |
335 | LoadingEntry &e = entries[validationPos]; | |
336 | switch (e.state) { | |
337 | ||
338 | case LoadingEntry::leEmpty: | |
339 | break; // no entry hashed to this position | |
340 | ||
341 | case LoadingEntry::leLoading: | |
342 | freeBadEntry(validationPos, "partially stored"); | |
343 | break; | |
344 | ||
345 | case LoadingEntry::leLoaded: | |
346 | break; // we have already unlocked this entry | |
347 | ||
348 | case LoadingEntry::leCorrupted: | |
349 | break; // we have already removed this entry | |
350 | } | |
351 | } | |
352 | ||
353 | /// Marks remaining bad entry slots as free and unlocks the entry. The map | |
354 | /// cannot do this because Loading entries may have holes in the slots chain. | |
355 | void | |
356 | Rock::Rebuild::freeBadEntry(const sfileno fileno, const char *eDescription) | |
357 | { | |
358 | debugs(47, 2, "cache_dir #" << sd->index << ' ' << eDescription << | |
359 | " entry " << fileno << " is ignored during rebuild"); | |
360 | ||
361 | Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno); | |
362 | ||
363 | bool freedSome = false; | |
364 | // free all loaded non-anchor slots | |
365 | SlotId slotId = entries[anchor.start].more; | |
366 | while (slotId >= 0) { | |
367 | const SlotId next = entries[slotId].more; | |
368 | freeSlot(slotId, false); | |
369 | slotId = next; | |
370 | freedSome = true; | |
371 | } | |
372 | // free anchor slot if it was loaded | |
373 | if (entries[fileno].anchored) { | |
374 | freeSlot(anchor.start, false); | |
375 | freedSome = true; | |
376 | } | |
377 | assert(freedSome); | |
378 | ||
379 | sd->map->forgetWritingEntry(fileno); | |
380 | ++counts.invalid; | |
e2851fe7 AR |
381 | } |
382 | ||
383 | void | |
9199139f AR |
384 | Rock::Rebuild::swanSong() |
385 | { | |
078274f6 | 386 | debugs(47,3, HERE << "cache_dir #" << sd->index << " rebuild level: " << |
9199139f | 387 | StoreController::store_dirs_rebuilding); |
078274f6 | 388 | --StoreController::store_dirs_rebuilding; |
e2851fe7 | 389 | storeRebuildComplete(&counts); |
e2851fe7 AR |
390 | } |
391 | ||
392 | void | |
9199139f AR |
393 | Rock::Rebuild::failure(const char *msg, int errNo) |
394 | { | |
539283df | 395 | debugs(47,5, sd->index << " slot " << loadingPos << " at " << |
9199139f | 396 | dbOffset << " <= " << dbSize); |
e2851fe7 AR |
397 | |
398 | if (errNo) | |
f5adb654 AR |
399 | debugs(47, DBG_CRITICAL, "ERROR: Rock cache_dir rebuild failure: " << xstrerr(errNo)); |
400 | debugs(47, DBG_CRITICAL, "Do you need to run 'squid -z' to initialize storage?"); | |
e2851fe7 AR |
401 | |
402 | assert(sd); | |
403 | fatalf("Rock cache_dir[%d] rebuild of %s failed: %s.", | |
9199139f | 404 | sd->index, sd->filePath, msg); |
e2851fe7 | 405 | } |
93910d5c | 406 | |
50dc81ec AR |
407 | /// adds slot to the free slot index |
408 | void | |
409 | Rock::Rebuild::freeSlot(const SlotId slotId, const bool invalid) | |
93910d5c | 410 | { |
50dc81ec AR |
411 | debugs(47,5, sd->index << " frees slot " << slotId); |
412 | LoadingEntry &le = entries[slotId]; | |
413 | assert(!le.freed); | |
414 | le.freed = 1; | |
415 | ||
416 | if (invalid) { | |
417 | ++counts.invalid; | |
418 | //sd->unlink(fileno); leave garbage on disk, it should not hurt | |
419 | } | |
420 | ||
421 | Ipc::Mem::PageId pageId; | |
422 | pageId.pool = sd->index+1; | |
423 | pageId.number = slotId+1; | |
424 | sd->freeSlots->push(pageId); | |
425 | } | |
426 | ||
427 | /// adds slot to the free slot index but only if the slot is unused | |
428 | void | |
429 | Rock::Rebuild::freeSlotIfIdle(const SlotId slotId, const bool invalid) | |
430 | { | |
431 | const LoadingEntry &le = entries[slotId]; | |
432 | ||
433 | // mapped slots must be freed via freeBadEntry() to keep the map in sync | |
434 | assert(!le.mapped); | |
435 | ||
436 | if (!le.used()) | |
437 | freeSlot(slotId, invalid); | |
438 | } | |
439 | ||
440 | /// adds slot to the entry chain in the map | |
441 | void | |
442 | Rock::Rebuild::mapSlot(const SlotId slotId, const DbCellHeader &header) | |
443 | { | |
444 | LoadingEntry &le = entries[slotId]; | |
445 | assert(!le.mapped); | |
446 | assert(!le.freed); | |
447 | le.mapped = 1; | |
448 | ||
449 | Ipc::StoreMapSlice slice; | |
450 | slice.next = header.nextSlot; | |
451 | slice.size = header.payloadSize; | |
452 | sd->map->importSlice(slotId, slice); | |
453 | } | |
454 | ||
455 | /// adds slot to an existing entry chain; caller must check that the slot | |
456 | /// belongs to the chain it is being added to | |
457 | void | |
458 | Rock::Rebuild::addSlotToEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) | |
459 | { | |
460 | LoadingEntry &le = entries[fileno]; | |
461 | Ipc::StoreMapAnchor &anchor = sd->map->writeableEntry(fileno); | |
462 | ||
463 | assert(le.version == header.version); | |
464 | ||
465 | // mark anchor as loaded or add the secondary slot to the chain | |
466 | LoadingEntry &inode = entries[header.firstSlot]; | |
467 | if (header.firstSlot == slotId) { | |
468 | debugs(47,5, "adding inode"); | |
469 | assert(!inode.freed); | |
470 | le.anchored = 1; | |
471 | } else { | |
472 | debugs(47,9, "linking " << slotId << " to " << inode.more); | |
473 | // we do not need to preserve the order | |
474 | LoadingEntry &slice = entries[slotId]; | |
475 | assert(!slice.freed); | |
476 | assert(slice.more < 0); | |
477 | slice.more = inode.more; | |
478 | inode.more = slotId; | |
479 | } | |
480 | ||
481 | if (header.firstSlot == slotId && !importEntry(anchor, fileno, header)) { | |
482 | le.state = LoadingEntry::leCorrupted; | |
483 | freeBadEntry(fileno, "corrupted metainfo"); | |
484 | return; | |
485 | } | |
486 | ||
487 | // set total entry size and/or check it for consistency | |
ce44c1ea | 488 | debugs(47, 8, "header.entrySize: " << header.entrySize << " swap_file_sz: " << anchor.basics.swap_file_sz); |
50dc81ec AR |
489 | uint64_t totalSize = header.entrySize; |
490 | assert(totalSize != static_cast<uint64_t>(-1)); | |
491 | if (!totalSize && anchor.basics.swap_file_sz) { | |
492 | assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1)); | |
493 | // perhaps we loaded a later slot (with entrySize) earlier | |
494 | totalSize = anchor.basics.swap_file_sz; | |
9d4e9cfb | 495 | } else if (totalSize && !anchor.basics.swap_file_sz) { |
50dc81ec AR |
496 | anchor.basics.swap_file_sz = totalSize; |
497 | assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1)); | |
9d4e9cfb | 498 | } else if (totalSize != anchor.basics.swap_file_sz) { |
50dc81ec AR |
499 | le.state = LoadingEntry::leCorrupted; |
500 | freeBadEntry(fileno, "size mismatch"); | |
501 | return; | |
502 | } | |
503 | ||
504 | le.size += header.payloadSize; | |
505 | ||
506 | if (totalSize > 0 && le.size > totalSize) { // overflow | |
ce44c1ea | 507 | debugs(47, 8, "overflow: " << le.size << " > " << totalSize); |
50dc81ec AR |
508 | le.state = LoadingEntry::leCorrupted; |
509 | freeBadEntry(fileno, "overflowing"); | |
510 | return; | |
511 | } | |
512 | ||
513 | mapSlot(slotId, header); | |
514 | if (totalSize > 0 && le.size == totalSize) { | |
515 | // entry fully loaded, unlock it | |
516 | // we have validated that all db cells for this entry were loaded | |
517 | EBIT_SET(anchor.basics.flags, ENTRY_VALIDATED); | |
518 | le.state = LoadingEntry::leLoaded; | |
519 | sd->map->closeForWriting(fileno, false); | |
520 | ++counts.objcount; | |
521 | } | |
522 | } | |
523 | ||
524 | /// initialize housekeeping information for a newly accepted entry | |
525 | void | |
526 | Rock::Rebuild::primeNewEntry(Ipc::StoreMap::Anchor &anchor, const sfileno fileno, const DbCellHeader &header) | |
527 | { | |
528 | anchor.setKey(reinterpret_cast<const cache_key*>(header.key)); | |
529 | assert(header.firstSlot >= 0); | |
530 | anchor.start = header.firstSlot; | |
531 | ||
532 | assert(anchor.basics.swap_file_sz != static_cast<uint64_t>(-1)); | |
533 | ||
534 | LoadingEntry &le = entries[fileno]; | |
535 | le.state = LoadingEntry::leLoading; | |
536 | le.version = header.version; | |
537 | le.size = 0; | |
538 | } | |
539 | ||
540 | /// handle a slot from an entry that we have not seen before | |
541 | void | |
542 | Rock::Rebuild::startNewEntry(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) | |
543 | { | |
544 | // If some other from-disk entry is/was using this slot as its inode OR | |
9d4e9cfb | 545 | // if some other from-disk entry is/was using our inode slot, then the |
50dc81ec AR |
546 | // entries are conflicting. We cannot identify other entries, so we just |
547 | // remove ours and hope that the others were/will be handled correctly. | |
548 | const LoadingEntry &slice = entries[slotId]; | |
549 | const LoadingEntry &inode = entries[header.firstSlot]; | |
550 | if (slice.used() || inode.used()) { | |
551 | debugs(47,8, "slice/inode used: " << slice.used() << inode.used()); | |
552 | LoadingEntry &le = entries[fileno]; | |
553 | le.state = LoadingEntry::leCorrupted; | |
554 | freeSlotIfIdle(slotId, slotId == header.firstSlot); | |
555 | // if not idle, the other entry will handle its slice | |
556 | ++counts.clashcount; | |
557 | return; | |
558 | } | |
559 | ||
560 | // A miss may have been stored at our fileno while we were loading other | |
561 | // slots from disk. We ought to preserve that entry because it is fresher. | |
562 | const bool overwriteExisting = false; | |
563 | if (Ipc::StoreMap::Anchor *anchor = sd->map->openForWritingAt(fileno, overwriteExisting)) { | |
564 | primeNewEntry(*anchor, fileno, header); | |
565 | addSlotToEntry(fileno, slotId, header); // may fail | |
566 | assert(anchor->basics.swap_file_sz != static_cast<uint64_t>(-1)); | |
567 | } else { | |
568 | // A new from-network entry is occupying our map slot; let it be, but | |
569 | // save us from the trouble of going through the above motions again. | |
570 | LoadingEntry &le = entries[fileno]; | |
571 | le.state = LoadingEntry::leIgnored; | |
572 | freeSlotIfIdle(slotId, false); | |
573 | } | |
574 | } | |
575 | ||
576 | /// does the header belong to the fileno entry being loaded? | |
577 | bool | |
578 | Rock::Rebuild::sameEntry(const sfileno fileno, const DbCellHeader &header) const | |
579 | { | |
580 | const Ipc::StoreMap::Anchor &anchor = sd->map->writeableEntry(fileno); | |
581 | const LoadingEntry &le = entries[fileno]; | |
582 | // any order will work, but do fast comparisons first: | |
583 | return le.version == header.version && | |
9d4e9cfb AR |
584 | anchor.start == static_cast<Ipc::StoreMapSliceId>(header.firstSlot) && |
585 | anchor.sameKey(reinterpret_cast<const cache_key*>(header.key)); | |
50dc81ec AR |
586 | } |
587 | ||
588 | /// is the new header consistent with information already loaded? | |
589 | bool | |
590 | Rock::Rebuild::canAdd(const sfileno fileno, const SlotId slotId, const DbCellHeader &header) const | |
591 | { | |
592 | if (!sameEntry(fileno, header)) { | |
593 | debugs(79, 7, "cannot add; wrong entry"); | |
594 | return false; | |
595 | } | |
596 | ||
597 | const LoadingEntry &le = entries[slotId]; | |
598 | // We cannot add a slot that was already declared free or mapped. | |
599 | if (le.freed || le.mapped) { | |
600 | debugs(79, 7, "cannot add; freed/mapped: " << le.freed << le.mapped); | |
601 | return false; | |
602 | } | |
603 | ||
604 | if (slotId == header.firstSlot) { | |
605 | // If we are the inode, the anchored flag cannot be set yet. | |
606 | if (entries[fileno].anchored) { | |
607 | debugs(79, 7, "cannot add; extra anchor"); | |
608 | return false; | |
609 | } | |
610 | ||
611 | // And there should have been some other slot for this entry to exist. | |
612 | if (le.more < 0) { | |
613 | debugs(79, 7, "cannot add; missing slots"); | |
614 | return false; | |
615 | } | |
616 | ||
617 | return true; | |
618 | } | |
619 | ||
620 | // We are the continuation slice so the more field is reserved for us. | |
621 | if (le.more >= 0) { | |
622 | debugs(79, 7, "cannot add; foreign slot"); | |
623 | return false; | |
624 | } | |
625 | ||
626 | return true; | |
627 | } | |
628 | ||
629 | /// handle freshly loaded (and validated) db slot header | |
630 | void | |
631 | Rock::Rebuild::useNewSlot(const SlotId slotId, const DbCellHeader &header) | |
632 | { | |
633 | LoadingEntry &slice = entries[slotId]; | |
634 | assert(!slice.freed); // we cannot free what was not loaded | |
635 | ||
636 | const cache_key *const key = | |
637 | reinterpret_cast<const cache_key*>(header.key); | |
638 | const sfileno fileno = sd->map->anchorIndexByKey(key); | |
639 | assert(0 <= fileno && fileno < dbEntryLimit); | |
640 | ||
641 | LoadingEntry &le = entries[fileno]; | |
642 | debugs(47,9, "entry " << fileno << " state: " << le.state << ", inode: " << | |
9d4e9cfb | 643 | header.firstSlot << ", size: " << header.payloadSize); |
50dc81ec AR |
644 | |
645 | switch (le.state) { | |
646 | ||
647 | case LoadingEntry::leEmpty: { | |
648 | startNewEntry(fileno, slotId, header); | |
649 | break; | |
650 | } | |
651 | ||
652 | case LoadingEntry::leLoading: { | |
653 | if (canAdd(fileno, slotId, header)) { | |
654 | addSlotToEntry(fileno, slotId, header); | |
655 | } else { | |
656 | // either the loading chain or this slot is stale; | |
657 | // be conservative and ignore both (and any future ones) | |
658 | le.state = LoadingEntry::leCorrupted; | |
659 | freeBadEntry(fileno, "duplicated"); | |
660 | freeSlotIfIdle(slotId, slotId == header.firstSlot); | |
661 | ++counts.dupcount; | |
662 | } | |
663 | break; | |
664 | } | |
665 | ||
666 | case LoadingEntry::leLoaded: { | |
667 | // either the previously loaded chain or this slot is stale; | |
668 | // be conservative and ignore both (and any future ones) | |
669 | le.state = LoadingEntry::leCorrupted; | |
670 | sd->map->freeEntry(fileno); // may not be immediately successful | |
671 | freeSlotIfIdle(slotId, slotId == header.firstSlot); | |
672 | ++counts.dupcount; | |
673 | break; | |
674 | } | |
675 | ||
676 | case LoadingEntry::leCorrupted: { | |
677 | // previously seen slots messed things up so we must ignore this one | |
678 | freeSlotIfIdle(slotId, false); | |
679 | break; | |
680 | } | |
681 | ||
682 | case LoadingEntry::leIgnored: { | |
683 | // already replaced by a fresher or colliding from-network entry | |
684 | freeSlotIfIdle(slotId, false); | |
685 | break; | |
686 | } | |
687 | } | |
93910d5c | 688 | } |
f53969cc | 689 |