/*
 * Copyright (C) 1996-2016 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

/* DEBUG: section 47    Store Directory Routines */

#include "profiler/Profiler.h"
#include "SquidConfig.h"
#include "store/Disk.h"
#include "store/Disks.h"
#include "swap_log_op.h"
#include "util.h" // for tvSubDsec() which should be in SquidTime.h

static STDIRSELECT storeDirSelectSwapDirRoundRobin;
static STDIRSELECT storeDirSelectSwapDirLeastLoad;

/*
 * This function pointer is set according to 'store_dir_select_algorithm'
 * in squid.conf.
 */
STDIRSELECT *storeDirSelectSwapDir = storeDirSelectSwapDirLeastLoad;
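
// Note: STDIRSELECT (declared in Squid's store headers) is the common
// selector signature: each selector below receives the candidate StoreEntry
// and returns the chosen cache_dir index, or -1 when no configured
// cache_dir can accept the entry.
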
/// The entry size to use for Disk::canStore() size limit checks.
/// This is an optimization to avoid similar calculations in every cache_dir.
static int64_t
objectSizeForDirSelection(const StoreEntry &entry)
{
    // entry.objectLen() is negative here when we are still STORE_PENDING
    int64_t minSize = entry.mem_obj->expectedReplySize();

    // If entry size is unknown, use already accumulated bytes as an estimate.
    // Controller::accumulateMore() guarantees that there are enough of them.
    if (minSize < 0)
        minSize = entry.mem_obj->endOffset();

    assert(minSize >= 0);
    minSize += entry.mem_obj->swap_hdr_sz;
    return minSize;
}
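
// Illustration: while a STORE_PENDING entry's reply size is still unknown,
// an entry that has accumulated 4096 body bytes is checked against dir size
// limits as (4096 + swap_hdr_sz) bytes, not as its eventual full size.
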
/**
 * This new selection scheme simply does round-robin on all SwapDirs.
 * A SwapDir is skipped if it is over the max_size (100%) limit, or
 * overloaded.
 */
static int
storeDirSelectSwapDirRoundRobin(const StoreEntry * e)
{
    const int64_t objsize = objectSizeForDirSelection(*e);

    // Increment the first candidate once per selection (not once per
    // iteration) to reduce bias when some disk(s) attract more entries.
    static int firstCandidate = 0;
    if (++firstCandidate >= Config.cacheSwap.n_configured)
        firstCandidate = 0;

    for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
        const int dirn = (firstCandidate + i) % Config.cacheSwap.n_configured;
        const SwapDir *sd = dynamic_cast<SwapDir *>(INDEXSD(dirn));

        int load = 0;
        if (!sd->canStore(*e, objsize, load))
            continue;

        if (load < 0 || load > 1000) {
            continue;
        }

        return dirn;
    }

    return -1;
}
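
// The load checks above follow the canStore() convention used throughout
// this file: an output load in 0..1000 marks a usable dir; anything outside
// that range disqualifies the dir for this selection round.
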
/**
 * Spread load across all of the store directories
 *
 * Note: We should modify this later on to prefer sticking objects
 * in the *tightest fit* swapdir to conserve space, along with the
 * actual swapdir usage. But for now, this hack will do while
 * testing, so you should order your swapdirs in the config file
 * from smallest max-size= to largest max-size=.
 *
 * We also have to choose nleast == nconf since we need to consider
 * ALL swapdirs, regardless of state. Again, this is a hack while
 * we sort out the real usefulness of this algorithm.
 */
static int
storeDirSelectSwapDirLeastLoad(const StoreEntry * e)
{
    int64_t most_free = 0;
    int64_t best_objsize = -1;
    int least_load = INT_MAX;
    int load;
    int dirn = -1;
    int i;
    RefCount<SwapDir> SD;

    const int64_t objsize = objectSizeForDirSelection(*e);

    for (i = 0; i < Config.cacheSwap.n_configured; ++i) {
        SD = dynamic_cast<SwapDir *>(INDEXSD(i));
        SD->flags.selected = false;

        if (!SD->canStore(*e, objsize, load))
            continue;

        if (load < 0 || load > 1000)
            continue;

        if (load > least_load)
            continue;

        const int64_t cur_free = SD->maxSize() - SD->currentSize();

        /* If the load is equal, then look in more details */
        if (load == least_load) {
            /* best max-size fit */
            if (best_objsize != -1) {
                // cache_dir with the smallest max-size gets the known-size object
                // cache_dir with the largest max-size gets the unknown-size object
                if ((objsize != -1 && SD->maxObjectSize() > best_objsize) ||
                        (objsize == -1 && SD->maxObjectSize() < best_objsize))
                    continue;
            }

            /* most free */
            if (cur_free < most_free)
                continue;
        }

        least_load = load;
        best_objsize = SD->maxObjectSize();
        most_free = cur_free;
        dirn = i;
    }

    if (dirn >= 0)
        dynamic_cast<SwapDir *>(INDEXSD(dirn))->flags.selected = true;

    return dirn;
}
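
// Tie-break example (two dirs reporting equal load): a known-size object
// prefers the candidate with the *smaller* max-size that still accepts it,
// an unknown-size object prefers the *larger* max-size, and only when that
// fit also ties does the dir with more free space (cur_free) win.
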
Store::Disks::Disks():
    largestMinimumObjectSize(-1),
    largestMaximumObjectSize(-1),
    secondLargestMaximumObjectSize(-1)
{
}
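
// All three limits start at -1 ("not yet known") and are recomputed by
// updateLimits() below; accumulateMore() depends on these sentinels when
// deciding how many bytes to buffer before cache_dir selection.
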
SwapDir *
Store::Disks::store(int const x) const
{
    return INDEXSD(x);
}

SwapDir &
Store::Disks::dir(const int i) const
{
    SwapDir *sd = INDEXSD(i);
    assert(sd);
    return *sd;
}

int
Store::Disks::callback()
{
    int result = 0;
    int j;
    static int ndir = 0;

    do {
        j = 0;

        for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
            if (ndir >= Config.cacheSwap.n_configured)
                ndir = ndir % Config.cacheSwap.n_configured;

            int temp_result = store(ndir)->callback();

            ++ndir;

            j += temp_result;

            result += temp_result;

            if (j > 100)
                fatal ("too much io\n");
        }
    } while (j > 0);

    ++ndir;

    return result;
}

void
Store::Disks::create()
{
    if (Config.cacheSwap.n_configured == 0) {
        debugs(0, DBG_PARSE_NOTE(DBG_CRITICAL), "No cache_dir stores are configured.");
    }

    for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
        if (dir(i).active())
            store(i)->create();
    }
}

StoreEntry *
Store::Disks::get(const cache_key *key)
{
    if (const int cacheDirs = Config.cacheSwap.n_configured) {
        // ask each cache_dir until the entry is found; use static starting
        // point to avoid asking the same subset of disks more often
        // TODO: coordinate with put() to be able to guess the right disk often
        static int idx = 0;
        for (int n = 0; n < cacheDirs; ++n) {
            idx = (idx + 1) % cacheDirs;
            SwapDir *sd = dynamic_cast<SwapDir *>(INDEXSD(idx));
            if (!sd->active())
                continue;

            if (StoreEntry *e = sd->get(key)) {
                debugs(20, 7, "cache_dir " << idx << " has: " << *e);
                return e;
            }
        }
    }

    debugs(20, 6, "none of " << Config.cacheSwap.n_configured <<
           " cache_dirs have " << storeKeyText(key));
    return nullptr;
}

void
Store::Disks::init()
{
    if (Config.Store.objectsPerBucket <= 0)
        fatal("'store_objects_per_bucket' should be larger than 0.");

    if (Config.Store.avgObjectSize <= 0)
        fatal("'store_avg_object_size' should be larger than 0.");

    /* Calculate size of hash table (maximum currently 64k buckets). */
    /* this is very bogus, it is specific to any Store maintaining an
     * in-core index, not global */
    size_t buckets = (Store::Root().maxSize() + Config.memMaxSize) / Config.Store.avgObjectSize;
    debugs(20, DBG_IMPORTANT, "Swap maxSize " << (Store::Root().maxSize() >> 10) <<
           " + " << ( Config.memMaxSize >> 10) << " KB, estimated " << buckets << " objects");
    buckets /= Config.Store.objectsPerBucket;
    debugs(20, DBG_IMPORTANT, "Target number of buckets: " << buckets);
    /* ideally the full scan period should be configurable, for the
     * moment it remains at approximately 24 hours. */
    store_hash_buckets = storeKeyHashBuckets(buckets);
    debugs(20, DBG_IMPORTANT, "Using " << store_hash_buckets << " Store buckets");
    debugs(20, DBG_IMPORTANT, "Max Mem size: " << ( Config.memMaxSize >> 10) << " KB" <<
           (Config.memShared ? " [shared]" : ""));
    debugs(20, DBG_IMPORTANT, "Max Swap size: " << (Store::Root().maxSize() >> 10) << " KB");

    store_table = hash_create(storeKeyHashCmp,
                              store_hash_buckets, storeKeyHashHash);

    for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
        /* this starts a search of the store dirs, loading their
         * index. under the new Store api this should be
         * driven by the StoreHashIndex, not by each store.
         *
         * That is, the HashIndex should perform a search of each dir it is
         * indexing to do the hash insertions. The search is then able to
         * decide 'from-memory', or 'from-clean-log' or 'from-dirty-log' or
         * 'from-no-log'.
         *
         * Step 1: make the store rebuilds use a search internally
         * Step 2: change the search logic to use the four modes described
         *         above
         * Step 3: have the hash index walk the searches itself.
         */
        if (dir(i).active())
            store(i)->init();
    }

    if (strcasecmp(Config.store_dir_select_algorithm, "round-robin") == 0) {
        storeDirSelectSwapDir = storeDirSelectSwapDirRoundRobin;
        debugs(47, DBG_IMPORTANT, "Using Round Robin store dir selection");
    } else {
        storeDirSelectSwapDir = storeDirSelectSwapDirLeastLoad;
        debugs(47, DBG_IMPORTANT, "Using Least Load store dir selection");
    }
}
uint64_t
Store::Disks::maxSize() const
{
    uint64_t result = 0;

    for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
        if (dir(i).doReportStat())
            result += store(i)->maxSize();
    }

    return result;
}

uint64_t
Store::Disks::minSize() const
{
    uint64_t result = 0;

    for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
        if (dir(i).doReportStat())
            result += store(i)->minSize();
    }

    return result;
}

uint64_t
Store::Disks::currentSize() const
{
    uint64_t result = 0;

    for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
        if (dir(i).doReportStat())
            result += store(i)->currentSize();
    }

    return result;
}

uint64_t
Store::Disks::currentCount() const
{
    uint64_t result = 0;

    for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
        if (dir(i).doReportStat())
            result += store(i)->currentCount();
    }

    return result;
}

int64_t
Store::Disks::maxObjectSize() const
{
    return largestMaximumObjectSize;
}

void
Store::Disks::updateLimits()
{
    largestMinimumObjectSize = -1;
    largestMaximumObjectSize = -1;
    secondLargestMaximumObjectSize = -1;

    for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
        const auto &disk = dir(i);
        if (!disk.active())
            continue;

        if (disk.minObjectSize() > largestMinimumObjectSize)
            largestMinimumObjectSize = disk.minObjectSize();

        const auto diskMaxObjectSize = disk.maxObjectSize();
        if (diskMaxObjectSize > largestMaximumObjectSize) {
            if (largestMaximumObjectSize >= 0) // was set
                secondLargestMaximumObjectSize = largestMaximumObjectSize;
            largestMaximumObjectSize = diskMaxObjectSize;
        }
    }
}
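
// Worked example (hypothetical limits): three active cache_dirs with
// max-size 1 MB, 4 MB, and 10 MB leave largestMaximumObjectSize = 10 MB and
// secondLargestMaximumObjectSize = 4 MB after the loop above.
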
int64_t
Store::Disks::accumulateMore(const StoreEntry &entry) const
{
    const auto accumulated = entry.mem_obj->availableForSwapOut();

    /*
     * Keep accumulating more bytes until the set of disks eligible to accept
     * the entry becomes stable, and, hence, accumulating more is not going to
     * affect the cache_dir selection. A stable set is usually reached
     * immediately (or soon) because most configurations either do not use
     * cache_dirs with explicit min-size/max-size limits or use the same
     * max-size limit for all cache_dirs (and low min-size limits).
     */

    // Can the set of min-size cache_dirs accepting this entry change?
    if (accumulated < largestMinimumObjectSize)
        return largestMinimumObjectSize - accumulated;

    // Can the set of max-size cache_dirs accepting this entry change
    // (other than when the entry exceeds the largest maximum; see below)?
    if (accumulated <= secondLargestMaximumObjectSize)
        return secondLargestMaximumObjectSize - accumulated + 1;

    /*
     * Checking largestMaximumObjectSize instead eliminates the risk of starting
     * to swap out an entry that later grows too big, but also implies huge
     * accumulation in most environments. Accumulating huge entries not only
     * consumes lots of RAM but also creates a burst of doPages() write requests
     * that overwhelm the disk. To avoid these problems, we take the risk and
     * allow swap out now. The disk will quit swapping out if the entry
     * eventually grows too big for its selected cache_dir.
     */
    debugs(20, 3, "no: " << accumulated << '>' <<
           secondLargestMaximumObjectSize << ',' << largestMinimumObjectSize);
    return 0;
}
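
// Continuing the example above: with secondLargestMaximumObjectSize = 4 MB
// and 1 MB accumulated, the caller is told to buffer another 3 MB + 1 byte;
// past that point the eligible cache_dir set can no longer change (short of
// the entry outgrowing the largest maximum).
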
void
Store::Disks::getStats(StoreInfoStats &stats) const
{
    // accumulate per-disk cache stats
    for (int i = 0; i < Config.cacheSwap.n_configured; ++i) {
        StoreInfoStats dirStats;
        store(i)->getStats(dirStats);
        stats += dirStats;
    }

    // common to all disks
    stats.swap.open_disk_fd = store_open_disk_fd;

    // memory cache stats are collected in StoreController::getStats(), for now
}

void
Store::Disks::stat(StoreEntry & output) const
{
    int i;

    /* Now go through each store, calling its stat routine */

    for (i = 0; i < Config.cacheSwap.n_configured; ++i) {
        storeAppendPrintf(&output, "\n");
        store(i)->stat(output);
    }
}

void
Store::Disks::reference(StoreEntry &e)
{
    e.disk().reference(e);
}

bool
Store::Disks::dereference(StoreEntry &e)
{
    return e.disk().dereference(e);
}

void
Store::Disks::updateHeaders(StoreEntry *e)
{
    Must(e);
    return e->disk().updateHeaders(e);
}

void
Store::Disks::maintain()
{
    int i;

    for (i = 0; i < Config.cacheSwap.n_configured; ++i) {
        /* XXX FixMe: This should be done "in parallel" on the different
         * cache_dirs, not one at a time.
         */
        /* call the maintain function .. */
        store(i)->maintain();
    }
}

void
Store::Disks::sync()
{
    for (int i = 0; i < Config.cacheSwap.n_configured; ++i)
        store(i)->sync();
}

void
Store::Disks::markForUnlink(StoreEntry &e) {
    if (e.swap_filen >= 0)
        store(e.swap_dirn)->markForUnlink(e);
}

void
Store::Disks::unlink(StoreEntry &e) {
    if (e.swap_filen >= 0)
        store(e.swap_dirn)->unlink(e);
}

bool
Store::Disks::anchorCollapsed(StoreEntry &collapsed, bool &inSync)
{
    if (const int cacheDirs = Config.cacheSwap.n_configured) {
        // ask each cache_dir until the entry is found; use static starting
        // point to avoid asking the same subset of disks more often
        // TODO: coordinate with put() to be able to guess the right disk often
        static int idx = 0;
        for (int n = 0; n < cacheDirs; ++n) {
            idx = (idx + 1) % cacheDirs;
            SwapDir &sd = dir(idx);
            if (!sd.active())
                continue;

            if (sd.anchorCollapsed(collapsed, inSync)) {
                debugs(20, 3, "cache_dir " << idx << " anchors " << collapsed);
                return true;
            }
        }
    }

    debugs(20, 4, "none of " << Config.cacheSwap.n_configured <<
           " cache_dirs have " << collapsed);
    return false;
}

bool
Store::Disks::updateCollapsed(StoreEntry &collapsed)
{
    return collapsed.swap_filen >= 0 &&
           dir(collapsed.swap_dirn).updateCollapsed(collapsed);
}

/* Store::Disks globals that should be converted to use RegisteredRunner */

void
storeDirOpenSwapLogs()
{
    for (int dirn = 0; dirn < Config.cacheSwap.n_configured; ++dirn)
        INDEXSD(dirn)->openLog();
}

void
storeDirCloseSwapLogs()
{
    for (int dirn = 0; dirn < Config.cacheSwap.n_configured; ++dirn)
        INDEXSD(dirn)->closeLog();
}

/**
 * storeDirWriteCleanLogs
 *
 * Writes a "clean" swap log file from in-memory metadata.
 * This is a rewrite of the original function to troll each
 * StoreDir and write the logs, and flush at the end of
 * the run. Thanks go to Eric Stern, since this solution
 * came out of his COSS code.
 */
int
storeDirWriteCleanLogs(int reopen)
{
    const StoreEntry *e = NULL;
    int n = 0;

    struct timeval start;
    double dt;
    RefCount<SwapDir> sd;
    int dirn;
    int notdone = 1;

    // Check for store_dirs_rebuilding because fatal() often calls us in early
    // initialization phases, before store log is initialized and ready. Also,
    // some stores probably do not support log cleanup during Store rebuilding.
    if (StoreController::store_dirs_rebuilding) {
        debugs(20, DBG_IMPORTANT, "Not currently OK to rewrite swap log.");
        debugs(20, DBG_IMPORTANT, "storeDirWriteCleanLogs: Operation aborted.");
        return 0;
    }

    debugs(20, DBG_IMPORTANT, "storeDirWriteCleanLogs: Starting...");

    getCurrentTime();

    start = current_time;

    for (dirn = 0; dirn < Config.cacheSwap.n_configured; ++dirn) {
        sd = dynamic_cast<SwapDir *>(INDEXSD(dirn));

        if (sd->writeCleanStart() < 0) {
            debugs(20, DBG_IMPORTANT, "log.clean.start() failed for dir #" << sd->index);
            continue;
        }
    }

    /*
     * This may look inefficient as CPU wise it is more efficient to do this
     * sequentially, but I/O wise the parallelism helps as it allows more
     * hdd spindles to be active.
     */
    while (notdone) {
        notdone = 0;

        for (dirn = 0; dirn < Config.cacheSwap.n_configured; ++dirn) {
            sd = dynamic_cast<SwapDir *>(INDEXSD(dirn));

            if (NULL == sd->cleanLog)
                continue;

            e = sd->cleanLog->nextEntry();

            if (!e)
                continue;

            notdone = 1;

            if (!sd->canLog(*e))
                continue;

            sd->cleanLog->write(*e);

            if ((++n & 0xFFFF) == 0) {
                getCurrentTime();
                debugs(20, DBG_IMPORTANT, "  " << std::setw(7) << n <<
                       " entries written so far.");
            }
        }
    }

    /* Flush */
    for (dirn = 0; dirn < Config.cacheSwap.n_configured; ++dirn)
        dynamic_cast<SwapDir *>(INDEXSD(dirn))->writeCleanDone();

    if (reopen)
        storeDirOpenSwapLogs();

    getCurrentTime();

    dt = tvSubDsec(start, current_time);

    debugs(20, DBG_IMPORTANT, "  Finished. Wrote " << n << " entries.");
    debugs(20, DBG_IMPORTANT, "  Took " << std::setw(3) << std::setprecision(2) << dt <<
           " seconds (" << std::setw(6) << ((double) n / (dt > 0.0 ? dt : 1.0)) << " entries/sec).");

    return n;
}
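
// Progress cadence: the (++n & 0xFFFF) test above reports once per 65536
// entries written, so the rewrite stays quiet even on very large caches.
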
/* Globals that should be converted to static Store::Disks methods */

void
allocate_new_swapdir(Store::DiskConfig *swap)
{
    if (swap->swapDirs == NULL) {
        swap->n_allocated = 4;
        swap->swapDirs = static_cast<SwapDir::Pointer *>(xcalloc(swap->n_allocated, sizeof(SwapDir::Pointer)));
    }

    if (swap->n_allocated == swap->n_configured) {
        swap->n_allocated <<= 1;
        SwapDir::Pointer *const tmp = static_cast<SwapDir::Pointer *>(xcalloc(swap->n_allocated, sizeof(SwapDir::Pointer)));
        memcpy(tmp, swap->swapDirs, swap->n_configured * sizeof(SwapDir *));
        xfree(swap->swapDirs);
        swap->swapDirs = tmp;
    }
}
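
/* Note: the memcpy() above moves the RefCount'd SwapDir::Pointer slots
 * bitwise, without touching reference counts. That is safe here only
 * because the old array is xfree()d without destroying its elements, so
 * each reference simply changes address rather than being duplicated. */
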
void
free_cachedir(Store::DiskConfig *swap)
{
    int i;
    /* DON'T FREE THESE FOR RECONFIGURE */

    if (reconfiguring)
        return;

    for (i = 0; i < swap->n_configured; ++i) {
        /* TODO XXX this lets the swapdir free resources asynchronously
         * swap->swapDirs[i]->deactivate();
         * but there may be such a means already.
         */
        swap->swapDirs[i] = NULL;
    }

    safe_free(swap->swapDirs);
    swap->swapDirs = NULL;
    swap->n_allocated = 0;
    swap->n_configured = 0;
}

/* Globals that should be moved to some Store::UFS-specific logging module */

/**
 * An entry written to the swap log MUST have the following
 * properties.
 *   1. It MUST be a public key. It does no good to log
 *      a public ADD, change the key, then log a private
 *      DEL. So we need to log a DEL before we change a
 *      key from public to private.
 *   2. It MUST have a valid (> -1) swap_filen.
 */
void
storeDirSwapLog(const StoreEntry * e, int op)
{
    assert(e);
    assert(!EBIT_TEST(e->flags, KEY_PRIVATE));
    assert(e->swap_filen >= 0);
    /*
     * icons and such; don't write them to the swap log
     */
    if (EBIT_TEST(e->flags, ENTRY_SPECIAL))
        return;

    assert(op > SWAP_LOG_NOP && op < SWAP_LOG_MAX);

    debugs(20, 3, "storeDirSwapLog: " <<
           swap_log_op_str[op] << " " <<
           e->getMD5Text() << " " <<
           e->swap_dirn << " " <<
           std::hex << std::uppercase << std::setfill('0') << std::setw(8) << e->swap_filen);

    dynamic_cast<SwapDir *>(INDEXSD(e->swap_dirn))->logEntry(*e, op);
}
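
// The swap_filen above is printed as a zero-filled, 8-digit uppercase hex
// number (std::hex/std::setfill/std::setw), the conventional rendering of
// UFS-style swap file numbers in Squid debug output.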