1 /*
2 * Copyright (C) 1996-2017 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9 /* DEBUG: section 47 Store Directory Routines */
10
11 #include "squid.h"
12 #include "Debug.h"
13 #include "globals.h"
14 #include "profiler/Profiler.h"
15 #include "SquidConfig.h"
16 #include "Store.h"
17 #include "store/Disk.h"
18 #include "store/Disks.h"
19 #include "swap_log_op.h"
20 #include "util.h" // for tvSubDsec() which should be in SquidTime.h
21
22 static STDIRSELECT storeDirSelectSwapDirRoundRobin;
23 static STDIRSELECT storeDirSelectSwapDirLeastLoad;
24 /**
25 * This function pointer is set according to 'store_dir_select_algorithm'
26 * in squid.conf.
27 */
28 STDIRSELECT *storeDirSelectSwapDir = storeDirSelectSwapDirLeastLoad;
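// Illustrative configuration sketch (not part of this file): the selection
// policy above is chosen by the 'store_dir_select_algorithm' directive in
// squid.conf; "least-load" is the default, and "round-robin" selects the
// alternative scheme, e.g.:
//
//   store_dir_select_algorithm round-robin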
29
30 /// The entry size to use for Disk::canStore() size limit checks.
31 /// This is an optimization to avoid similar calculations in every cache_dir.
32 static int64_t
33 objectSizeForDirSelection(const StoreEntry &entry)
34 {
35 // entry.objectLen() is negative here when we are still STORE_PENDING
36 int64_t minSize = entry.mem_obj->expectedReplySize();
37
38 // If entry size is unknown, use already accumulated bytes as an estimate.
39 // Controller::accumulateMore() guarantees that there are enough of them.
40 if (minSize < 0)
41 minSize = entry.mem_obj->endOffset();
42
43 assert(minSize >= 0);
44 minSize += entry.mem_obj->swap_hdr_sz;
45 return minSize;
46 }
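// Worked example (hypothetical numbers): for an entry whose expected reply
// size is known to be 4096 bytes and whose swap header takes, say, 80 bytes,
// the size used for cache_dir limit checks is 4096 + 80 = 4176 bytes. For a
// still-growing entry with an unknown reply size, the bytes accumulated so
// far (endOffset()) are used instead of expectedReplySize().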
47
48 /**
49 * This new selection scheme simply does round-robin on all SwapDirs.
50 * A SwapDir is skipped if it is over the max_size (100%) limit, or
51 * overloaded.
52 */
53 static int
54 storeDirSelectSwapDirRoundRobin(const StoreEntry * e)
55 {
56 const int64_t objsize = objectSizeForDirSelection(*e);
57
58 // Increment the first candidate once per selection (not once per
59 // iteration) to reduce bias when some disk(s) attract more entries.
60 static int firstCandidate = 0;
61 if (++firstCandidate >= Config.cacheSwap.n_configured)
62 firstCandidate = 0;
63
64 for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
65 const int dirn = (firstCandidate + i) % Config.cacheSwap.n_configured;
66 const SwapDir *sd = dynamic_cast<SwapDir*>(INDEXSD(dirn));
67
68 int load = 0;
69 if (!sd->canStore(*e, objsize, load))
70 continue;
71
72 if (load < 0 || load > 1000) {
73 continue;
74 }
75
76 return dirn;
77 }
78
79 return -1;
80 }
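// Illustrative walk-through (assuming three configured cache_dirs): successive
// selections start probing at dir 1, then 2, then 0, and so on. Each probe
// skips a dir that rejects the object (canStore() returns false) or reports a
// load outside the accepted [0, 1000] range; -1 is returned only when every
// dir is skipped.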
81
82 /**
83 * Spread load across all of the store directories
84 *
85  * Note: We should modify this later on to prefer placing objects
86  * in the *tightest-fitting* swapdir, taking actual swapdir usage
87  * into account, to conserve space. But for now, this hack will do
88  * while testing, so order your swapdirs in the config file from
89  * smallest max-size= to largest max-size= (see the example below).
90 *
91 * We also have to choose nleast == nconf since we need to consider
92 * ALL swapdirs, regardless of state. Again, this is a hack while
93 * we sort out the real usefulness of this algorithm.
94 */
95 static int
96 storeDirSelectSwapDirLeastLoad(const StoreEntry * e)
97 {
98 int64_t most_free = 0;
99 int64_t best_objsize = -1;
100 int least_load = INT_MAX;
101 int load;
102 int dirn = -1;
103 int i;
104 RefCount<SwapDir> SD;
105
106 const int64_t objsize = objectSizeForDirSelection(*e);
107
108 for (i = 0; i < Config.cacheSwap.n_configured; ++i) {
109 SD = dynamic_cast<SwapDir *>(INDEXSD(i));
110 SD->flags.selected = false;
111
112 if (!SD->canStore(*e, objsize, load))
113 continue;
114
115 if (load < 0 || load > 1000)
116 continue;
117
118 if (load > least_load)
119 continue;
120
121 const int64_t cur_free = SD->maxSize() - SD->currentSize();
122
123         /* If the load is equal, then look in more detail */
124 if (load == least_load) {
125 /* best max-size fit */
126 if (best_objsize != -1) {
127 // cache_dir with the smallest max-size gets the known-size object
128 // cache_dir with the largest max-size gets the unknown-size object
129 if ((objsize != -1 && SD->maxObjectSize() > best_objsize) ||
130 (objsize == -1 && SD->maxObjectSize() < best_objsize))
131 continue;
132 }
133
134 /* most free */
135 if (cur_free < most_free)
136 continue;
137 }
138
139 least_load = load;
140 best_objsize = SD->maxObjectSize();
141 most_free = cur_free;
142 dirn = i;
143 }
144
145 if (dirn >= 0)
146 dynamic_cast<SwapDir *>(INDEXSD(dirn))->flags.selected = true;
147
148 return dirn;
149 }
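// Illustrative cache_dir ordering matching the note above, smallest max-size
// first (paths and sizes are hypothetical):
//
//   cache_dir ufs /cache/small 1024 16 256 max-size=65536
//   cache_dir ufs /cache/large 8192 16 256 max-size=4194304
//
// With equal load, a known-size object that fits both dirs is placed in the
// smaller-max-size dir.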
150
151 Store::Disks::Disks():
152 largestMinimumObjectSize(-1),
153 largestMaximumObjectSize(-1),
154 secondLargestMaximumObjectSize(-1)
155 {
156 }
157
158 SwapDir *
159 Store::Disks::store(int const x) const
160 {
161 return INDEXSD(x);
162 }
163
164 SwapDir &
165 Store::Disks::dir(const int i) const
166 {
167 SwapDir *sd = INDEXSD(i);
168 assert(sd);
169 return *sd;
170 }
171
172 int
173 Store::Disks::callback()
174 {
175 int result = 0;
176 int j;
177 static int ndir = 0;
178
179 do {
180 j = 0;
181
182 for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
183 if (ndir >= Config.cacheSwap.n_configured)
184 ndir = ndir % Config.cacheSwap.n_configured;
185
186 int temp_result = store(ndir)->callback();
187
188 ++ndir;
189
190 j += temp_result;
191
192 result += temp_result;
193
194 if (j > 100)
195 fatal ("too much io\n");
196 }
197 } while (j > 0);
198
199 ++ndir;
200
201 return result;
202 }
203
204 void
205 Store::Disks::create()
206 {
207 if (Config.cacheSwap.n_configured == 0) {
208 debugs(0, DBG_PARSE_NOTE(DBG_CRITICAL), "No cache_dir stores are configured.");
209 }
210
211 for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
212 if (dir(i).active())
213 store(i)->create();
214 }
215 }
216
217 StoreEntry *
218 Store::Disks::get(const cache_key *key)
219 {
220 if (const int cacheDirs = Config.cacheSwap.n_configured) {
221 // ask each cache_dir until the entry is found; use static starting
222 // point to avoid asking the same subset of disks more often
223 // TODO: coordinate with put() to be able to guess the right disk often
224 static int idx = 0;
225 for (int n = 0; n < cacheDirs; ++n) {
226 idx = (idx + 1) % cacheDirs;
227 SwapDir *sd = dynamic_cast<SwapDir*>(INDEXSD(idx));
228 if (!sd->active())
229 continue;
230
231 if (StoreEntry *e = sd->get(key)) {
232 debugs(20, 7, "cache_dir " << idx << " has: " << *e);
233 return e;
234 }
235 }
236 }
237
238 debugs(20, 6, "none of " << Config.cacheSwap.n_configured <<
239 " cache_dirs have " << storeKeyText(key));
240 return nullptr;
241 }
242
243 void
244 Store::Disks::init()
245 {
246 if (Config.Store.objectsPerBucket <= 0)
247 fatal("'store_objects_per_bucket' should be larger than 0.");
248
249 if (Config.Store.avgObjectSize <= 0)
250 fatal("'store_avg_object_size' should be larger than 0.");
251
252 /* Calculate size of hash table (maximum currently 64k buckets). */
253     /* this is very bogus; it is specific to any Store maintaining an
254      * in-core index, not global */
255 size_t buckets = (Store::Root().maxSize() + Config.memMaxSize) / Config.Store.avgObjectSize;
256 debugs(20, DBG_IMPORTANT, "Swap maxSize " << (Store::Root().maxSize() >> 10) <<
257 " + " << ( Config.memMaxSize >> 10) << " KB, estimated " << buckets << " objects");
258 buckets /= Config.Store.objectsPerBucket;
259 debugs(20, DBG_IMPORTANT, "Target number of buckets: " << buckets);
260     /* ideally the full scan period should be configurable; for the
261      * moment it remains at approximately 24 hours. */
262 store_hash_buckets = storeKeyHashBuckets(buckets);
263 debugs(20, DBG_IMPORTANT, "Using " << store_hash_buckets << " Store buckets");
264 debugs(20, DBG_IMPORTANT, "Max Mem size: " << ( Config.memMaxSize >> 10) << " KB" <<
265 (Config.memShared ? " [shared]" : ""));
266 debugs(20, DBG_IMPORTANT, "Max Swap size: " << (Store::Root().maxSize() >> 10) << " KB");
267
268 store_table = hash_create(storeKeyHashCmp,
269 store_hash_buckets, storeKeyHashHash);
270
271 for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
272         /* This starts a search of the store dirs, loading their
273          * index. Under the new Store API this should be
274          * driven by the StoreHashIndex, not by each store.
275 *
276 * That is, the HashIndex should perform a search of each dir it is
277 * indexing to do the hash insertions. The search is then able to
278 * decide 'from-memory', or 'from-clean-log' or 'from-dirty-log' or
279 * 'from-no-log'.
280 *
281 * Step 1: make the store rebuilds use a search internally
282 * Step 2: change the search logic to use the four modes described
283 * above
284 * Step 3: have the hash index walk the searches itself.
285 */
286 if (dir(i).active())
287 store(i)->init();
288 }
289
290 if (strcasecmp(Config.store_dir_select_algorithm, "round-robin") == 0) {
291 storeDirSelectSwapDir = storeDirSelectSwapDirRoundRobin;
292 debugs(47, DBG_IMPORTANT, "Using Round Robin store dir selection");
293 } else {
294 storeDirSelectSwapDir = storeDirSelectSwapDirLeastLoad;
295 debugs(47, DBG_IMPORTANT, "Using Least Load store dir selection");
296 }
297 }
298
299 uint64_t
300 Store::Disks::maxSize() const
301 {
302 uint64_t result = 0;
303
304 for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
305 if (dir(i).doReportStat())
306 result += store(i)->maxSize();
307 }
308
309 return result;
310 }
311
312 uint64_t
313 Store::Disks::minSize() const
314 {
315 uint64_t result = 0;
316
317 for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
318 if (dir(i).doReportStat())
319 result += store(i)->minSize();
320 }
321
322 return result;
323 }
324
325 uint64_t
326 Store::Disks::currentSize() const
327 {
328 uint64_t result = 0;
329
330 for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
331 if (dir(i).doReportStat())
332 result += store(i)->currentSize();
333 }
334
335 return result;
336 }
337
338 uint64_t
339 Store::Disks::currentCount() const
340 {
341 uint64_t result = 0;
342
343 for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
344 if (dir(i).doReportStat())
345 result += store(i)->currentCount();
346 }
347
348 return result;
349 }
350
351 int64_t
352 Store::Disks::maxObjectSize() const
353 {
354 return largestMaximumObjectSize;
355 }
356
357 void
358 Store::Disks::updateLimits()
359 {
360 largestMinimumObjectSize = -1;
361 largestMaximumObjectSize = -1;
362 secondLargestMaximumObjectSize = -1;
363
364 for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
365 const auto &disk = dir(i);
366 if (!disk.active())
367 continue;
368
369 if (disk.minObjectSize() > largestMinimumObjectSize)
370 largestMinimumObjectSize = disk.minObjectSize();
371
372 const auto diskMaxObjectSize = disk.maxObjectSize();
373 if (diskMaxObjectSize > largestMaximumObjectSize) {
374 if (largestMaximumObjectSize >= 0) // was set
375 secondLargestMaximumObjectSize = largestMaximumObjectSize;
376 largestMaximumObjectSize = diskMaxObjectSize;
377 }
378 }
379 }
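// Illustrative example: with three active cache_dirs configured in this order
// and with max-size limits of 64 KB, 1 MB, and 4 MB (plus a 512 KB min-size on
// the last one), the loop above ends with largestMaximumObjectSize = 4 MB,
// secondLargestMaximumObjectSize = 1 MB, and largestMinimumObjectSize = 512 KB.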
380
381 int64_t
382 Store::Disks::accumulateMore(const StoreEntry &entry) const
383 {
384 const auto accumulated = entry.mem_obj->availableForSwapOut();
385
386 /*
387 * Keep accumulating more bytes until the set of disks eligible to accept
388 * the entry becomes stable, and, hence, accumulating more is not going to
389 * affect the cache_dir selection. A stable set is usually reached
390 * immediately (or soon) because most configurations either do not use
391 * cache_dirs with explicit min-size/max-size limits or use the same
392 * max-size limit for all cache_dirs (and low min-size limits).
393 */
394
395 // Can the set of min-size cache_dirs accepting this entry change?
396 if (accumulated < largestMinimumObjectSize)
397 return largestMinimumObjectSize - accumulated;
398
399 // Can the set of max-size cache_dirs accepting this entry change
400 // (other than when the entry exceeds the largest maximum; see below)?
401 if (accumulated <= secondLargestMaximumObjectSize)
402 return secondLargestMaximumObjectSize - accumulated + 1;
403
404 /*
405 * Checking largestMaximumObjectSize instead eliminates the risk of starting
406 * to swap out an entry that later grows too big, but also implies huge
407 * accumulation in most environments. Accumulating huge entries not only
408 * consumes lots of RAM but also creates a burst of doPages() write requests
409 * that overwhelm the disk. To avoid these problems, we take the risk and
410 * allow swap out now. The disk will quit swapping out if the entry
411 * eventually grows too big for its selected cache_dir.
412 */
413 debugs(20, 3, "no: " << accumulated << '>' <<
414 secondLargestMaximumObjectSize << ',' << largestMinimumObjectSize);
415 return 0;
416 }
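// Worked example, reusing the limits from the updateLimits() illustration
// above (512 KB largest min-size, 1 MB second-largest max-size): an entry with
// 100 KB available for swap out is asked to accumulate another 412 KB; at
// 800 KB it is asked for another 224 KB plus one byte; beyond 1 MB the method
// returns 0 and swap out may begin, accepting the risk that the entry later
// outgrows the largest 4 MB max-size limit.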
417
418 void
419 Store::Disks::getStats(StoreInfoStats &stats) const
420 {
421 // accumulate per-disk cache stats
422 for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
423 StoreInfoStats dirStats;
424 store(i)->getStats(dirStats);
425 stats += dirStats;
426 }
427
428 // common to all disks
429 stats.swap.open_disk_fd = store_open_disk_fd;
430
431 // memory cache stats are collected in StoreController::getStats(), for now
432 }
433
434 void
435 Store::Disks::stat(StoreEntry & output) const
436 {
437 int i;
438
439 /* Now go through each store, calling its stat routine */
440
441 for (i = 0; i < Config.cacheSwap.n_configured; ++i) {
442 storeAppendPrintf(&output, "\n");
443 store(i)->stat(output);
444 }
445 }
446
447 void
448 Store::Disks::reference(StoreEntry &e)
449 {
450 e.disk().reference(e);
451 }
452
453 bool
454 Store::Disks::dereference(StoreEntry &e)
455 {
456 return e.disk().dereference(e);
457 }
458
459 void
460 Store::Disks::updateHeaders(StoreEntry *e)
461 {
462 Must(e);
463 return e->disk().updateHeaders(e);
464 }
465
466 void
467 Store::Disks::maintain()
468 {
469 int i;
470 /* walk each fs */
471
472 for (i = 0; i < Config.cacheSwap.n_configured; ++i) {
473         /* XXX FixMe: This should be done "in parallel" on the different
474 * cache_dirs, not one at a time.
475 */
476 /* call the maintain function .. */
477 store(i)->maintain();
478 }
479 }
480
481 void
482 Store::Disks::sync()
483 {
484 for (int i = 0; i < Config.cacheSwap.n_configured; ++i)
485 store(i)->sync();
486 }
487
488 void
489 Store::Disks::markForUnlink(StoreEntry &e) {
490 if (e.swap_filen >= 0)
491 store(e.swap_dirn)->markForUnlink(e);
492 }
493
494 void
495 Store::Disks::unlink(StoreEntry &e) {
496 if (e.swap_filen >= 0)
497 store(e.swap_dirn)->unlink(e);
498 }
499
500 bool
501 Store::Disks::anchorCollapsed(StoreEntry &collapsed, bool &inSync)
502 {
503 if (const int cacheDirs = Config.cacheSwap.n_configured) {
504 // ask each cache_dir until the entry is found; use static starting
505 // point to avoid asking the same subset of disks more often
506 // TODO: coordinate with put() to be able to guess the right disk often
507 static int idx = 0;
508 for (int n = 0; n < cacheDirs; ++n) {
509 idx = (idx + 1) % cacheDirs;
510 SwapDir &sd = dir(idx);
511 if (!sd.active())
512 continue;
513
514 if (sd.anchorCollapsed(collapsed, inSync)) {
515 debugs(20, 3, "cache_dir " << idx << " anchors " << collapsed);
516 return true;
517 }
518 }
519 }
520
521 debugs(20, 4, "none of " << Config.cacheSwap.n_configured <<
522 " cache_dirs have " << collapsed);
523 return false;
524 }
525
526 bool
527 Store::Disks::updateCollapsed(StoreEntry &collapsed)
528 {
529 return collapsed.swap_filen >= 0 &&
530 dir(collapsed.swap_dirn).updateCollapsed(collapsed);
531 }
532
533 bool
534 Store::Disks::smpAware() const
535 {
536 for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
537 // A mix is not supported, but we conservatively check every
538 // dir because features like collapsed revalidation should
539 // currently be disabled if any dir is SMP-aware
540 if (dir(i).smpAware())
541 return true;
542 }
543 return false;
544 }
545
546 /* Store::Disks globals that should be converted to use RegisteredRunner */
547
548 void
549 storeDirOpenSwapLogs()
550 {
551 for (int dirn = 0; dirn < Config.cacheSwap.n_configured; ++dirn)
552 INDEXSD(dirn)->openLog();
553 }
554
555 void
556 storeDirCloseSwapLogs()
557 {
558 for (int dirn = 0; dirn < Config.cacheSwap.n_configured; ++dirn)
559 INDEXSD(dirn)->closeLog();
560 }
561
562 /**
563 * storeDirWriteCleanLogs
564 *
565 * Writes a "clean" swap log file from in-memory metadata.
566  * This is a rewrite of the original function to walk each
567  * StoreDir and write the logs, flushing at the end of
568  * the run. Thanks go to Eric Stern, since this solution
569  * came out of his COSS code.
570 */
571 int
572 storeDirWriteCleanLogs(int reopen)
573 {
574 const StoreEntry *e = NULL;
575 int n = 0;
576
577 struct timeval start;
578 double dt;
579 RefCount<SwapDir> sd;
580 int dirn;
581 int notdone = 1;
582
583 // Check for store_dirs_rebuilding because fatal() often calls us in early
584 // initialization phases, before store log is initialized and ready. Also,
585 // some stores do not support log cleanup during Store rebuilding.
586 if (StoreController::store_dirs_rebuilding) {
587 debugs(20, DBG_IMPORTANT, "Not currently OK to rewrite swap log.");
588 debugs(20, DBG_IMPORTANT, "storeDirWriteCleanLogs: Operation aborted.");
589 return 0;
590 }
591
592 debugs(20, DBG_IMPORTANT, "storeDirWriteCleanLogs: Starting...");
593 getCurrentTime();
594 start = current_time;
595
596 for (dirn = 0; dirn < Config.cacheSwap.n_configured; ++dirn) {
597 sd = dynamic_cast<SwapDir *>(INDEXSD(dirn));
598
599 if (sd->writeCleanStart() < 0) {
600 debugs(20, DBG_IMPORTANT, "log.clean.start() failed for dir #" << sd->index);
601 continue;
602 }
603 }
604
605 /*
606      * This may look inefficient: CPU-wise it would be cheaper to do this
607      * sequentially, but I/O-wise the parallelism helps because it keeps
608      * more HDD spindles active.
609 */
610 while (notdone) {
611 notdone = 0;
612
613 for (dirn = 0; dirn < Config.cacheSwap.n_configured; ++dirn) {
614 sd = dynamic_cast<SwapDir *>(INDEXSD(dirn));
615
616 if (NULL == sd->cleanLog)
617 continue;
618
619 e = sd->cleanLog->nextEntry();
620
621 if (!e)
622 continue;
623
624 notdone = 1;
625
626 if (!sd->canLog(*e))
627 continue;
628
629 sd->cleanLog->write(*e);
630
631 if ((++n & 0xFFFF) == 0) {
632 getCurrentTime();
633 debugs(20, DBG_IMPORTANT, " " << std::setw(7) << n <<
634 " entries written so far.");
635 }
636 }
637 }
638
639 /* Flush */
640 for (dirn = 0; dirn < Config.cacheSwap.n_configured; ++dirn)
641 dynamic_cast<SwapDir *>(INDEXSD(dirn))->writeCleanDone();
642
643 if (reopen)
644 storeDirOpenSwapLogs();
645
646 getCurrentTime();
647
648 dt = tvSubDsec(start, current_time);
649
650 debugs(20, DBG_IMPORTANT, " Finished. Wrote " << n << " entries.");
651 debugs(20, DBG_IMPORTANT, " Took "<< std::setw(3)<< std::setprecision(2) << dt <<
652 " seconds ("<< std::setw(6) << ((double) n / (dt > 0.0 ? dt : 1.0)) << " entries/sec).");
653
654 return n;
655 }
656
657 /* Globals that should be converted to static Store::Disks methods */
658
659 void
660 allocate_new_swapdir(Store::DiskConfig *swap)
661 {
662 if (swap->swapDirs == NULL) {
663 swap->n_allocated = 4;
664 swap->swapDirs = static_cast<SwapDir::Pointer *>(xcalloc(swap->n_allocated, sizeof(SwapDir::Pointer)));
665 }
666
667 if (swap->n_allocated == swap->n_configured) {
668 swap->n_allocated <<= 1;
669 SwapDir::Pointer *const tmp = static_cast<SwapDir::Pointer *>(xcalloc(swap->n_allocated, sizeof(SwapDir::Pointer)));
670 memcpy(tmp, swap->swapDirs, swap->n_configured * sizeof(SwapDir *));
671 xfree(swap->swapDirs);
672 swap->swapDirs = tmp;
673 }
674 }
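// Illustrative growth pattern: the first call allocates room for 4
// SwapDir::Pointer slots; whenever n_configured catches up with n_allocated,
// capacity doubles (4 -> 8 -> 16 -> ...) and the existing pointers are copied
// into the freshly allocated array.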
675
676 void
677 free_cachedir(Store::DiskConfig *swap)
678 {
679 int i;
680 /* DON'T FREE THESE FOR RECONFIGURE */
681
682 if (reconfiguring)
683 return;
684
685 for (i = 0; i < swap->n_configured; ++i) {
686         /* TODO XXX: calling swap->swapDirs[i]->deactivate() here would let
687          * the swapdir free its resources asynchronously, but there may be
688          * such a means already.
689 * RBC 20041225
690 */
691 swap->swapDirs[i] = NULL;
692 }
693
694 safe_free(swap->swapDirs);
695 swap->swapDirs = NULL;
696 swap->n_allocated = 0;
697 swap->n_configured = 0;
698 }
699
700 /* Globals that should be moved to some Store::UFS-specific logging module */
701
702 /**
703 * An entry written to the swap log MUST have the following
704 * properties.
705 * 1. It MUST be a public key. It does no good to log
706 * a public ADD, change the key, then log a private
707 * DEL. So we need to log a DEL before we change a
708 * key from public to private.
709 * 2. It MUST have a valid (> -1) swap_filen.
710 */
711 void
712 storeDirSwapLog(const StoreEntry * e, int op)
713 {
714 assert (e);
715 assert(!EBIT_TEST(e->flags, KEY_PRIVATE));
716 assert(e->swap_filen >= 0);
717 /*
718 * icons and such; don't write them to the swap log
719 */
720
721 if (EBIT_TEST(e->flags, ENTRY_SPECIAL))
722 return;
723
724 assert(op > SWAP_LOG_NOP && op < SWAP_LOG_MAX);
725
726 debugs(20, 3, "storeDirSwapLog: " <<
727 swap_log_op_str[op] << " " <<
728 e->getMD5Text() << " " <<
729 e->swap_dirn << " " <<
730 std::hex << std::uppercase << std::setfill('0') << std::setw(8) << e->swap_filen);
731
732 dynamic_cast<SwapDir *>(INDEXSD(e->swap_dirn))->logEntry(*e, op);
733 }
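// Illustrative call sequence honoring property 1 above (SWAP_LOG_ADD and
// SWAP_LOG_DEL are the relevant swap_log_op values): when a public entry is
// about to get a private key, callers are expected to log the deletion first,
//
//   storeDirSwapLog(e, SWAP_LOG_DEL);   // while e's key is still public
//   // ... then switch the key from public to private ...
//
// so that the swap log never records a private key.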
734