From 5ca027f05122959ef5a27bd3e7bd66b89d0357aa Mon Sep 17 00:00:00 2001 From: Alex Rousskov Date: Sun, 1 May 2016 15:37:52 -0600 Subject: [PATCH] Accumulate fewer unknown-size responses to avoid overwhelming disks. Start swapping out an unknown-size entry as soon as size-based cache_dir selection is no longer affected by the entry growth. If the entry eventually exceeds the selected cache_dir entry size limits, terminate the swapout. The following description assumes that Squid deals with a cachable response that lacks a Content-Length header. These changes should not affect other responses. Prior to these changes, StoreEntry::mayStartSwapOut() delayed swapout decision until the entire response was received or the already accumulated portion of the response exceeded the [global] store entry size limit, whichever came first. This logic protected Store from entries with unknown sizes. AFAICT, that protection existed for two reasons: * Incorrect size-based cache_dir selection: When cache_dirs use different min-size and/or max-size settings, Squid cannot tell which cache_dir the unknown-size entry belongs to and, hence, may select the wrong cache_dir. * Disk bandwidth/space waste: If the growing entry exceeds all cache_dir max-size limits, the swapout has to be aborted, resulting in waste of previously spent resources (primarily disk bandwidth and space). The cost of those protections include RAM waste (up to maximum cachable object size for each of the concurrent unknown-size entry downloads) and sudden disk overload (when the entire delayed entry is written to disk in a large burst of write requests initiated from a tight doPages() loop at the end of the swapout sequence when the entry size becomes known). The latter cost is especially high because swapping the entire large object out in one function call can easily overflow disker queues and/or block Squid while the OS drains disk write buffers in an emergency mode. 
FWIW, AFAICT, cache_dir selection protection was the only reason for introducing response accumulation (trunk r4446). The RAM cost was realized a year later (r4954), and the disk costs were realized during this project. This change reduces those costs by starting to swap out an unknown-size entry ASAP, usually immediately. In most caching environments, most large cachable entries should be cached. It is usually better to spend [disk] resources gradually than to deal with sudden bursts [of disk write requests]. Occasional jolts make high performance unsustainable. This change does not affect size-based cache_dir selection: Squid still delays swapout until future entry growth cannot affect that selection. Fortunately, in most configurations, the correct selection can happen immediately because cache_dirs lack explicit min-size and/or max-size settings and simply rely on the *same-for-all* minimum_object_size and maximum_object_size values. We could make the trade-off between costly protections (i.e., accumulate until the entry size is known) and occasional gradual resource waste (i.e., start swapping out ASAP) configurable. However, I think it is best to wait for the use case that requires such configuration and can guide the design of those new configuration options. Side changes: * Honor forgotten minimum_object_size for cache_dirs without min-size in Store::Disk::objectSizeIsAcceptable() and fix its initial value to correctly detect a manually configured cache_dir min-size (which may be zero). However, the fixed bug is probably hidden by another (yet unfixed) bug: checkTooSmall() forgets about cache_dirs with min-size! * Allow unknown-size objects into the shared memory cache, which code could handle partial writes (since collapsed forwarding changes?). * Fixed Rock::SwapDir::canStore() handling of unknown-size objects. 
I do not see how such objects could get that far before, but if they could, most would probably be cached because the bug would hide the unknown size from Store::Disk::canStore() that declares them unstorable. --- src/MemStore.cc | 16 +++-- src/fs/rock/RockIoState.cc | 3 + src/fs/rock/RockSwapDir.cc | 4 +- src/fs/ufs/UFSStoreState.cc | 10 ++++ src/store.cc | 32 +--------- src/store/Controller.cc | 24 ++++++++ src/store/Controller.h | 7 +++ src/store/Disk.cc | 27 ++++----- src/store/Disk.h | 3 + src/store/Disks.cc | 113 ++++++++++++++++++++++++++++++------ src/store/Disks.h | 13 +++++ src/store_swapout.cc | 19 ++---- src/tests/stub_SwapDir.cc | 1 + 13 files changed, 183 insertions(+), 89 deletions(-) diff --git a/src/MemStore.cc b/src/MemStore.cc index c8db5ca52f..ffece9beba 100644 --- a/src/MemStore.cc +++ b/src/MemStore.cc @@ -625,16 +625,8 @@ MemStore::shouldCache(StoreEntry &e) const } const int64_t expectedSize = e.mem_obj->expectedReplySize(); // may be < 0 - - // objects of unknown size are not allowed into memory cache, for now - if (expectedSize < 0) { - debugs(20, 5, "Unknown expected size: " << e); - return false; - } - const int64_t loadedSize = e.mem_obj->endOffset(); const int64_t ramSize = max(loadedSize, expectedSize); - if (ramSize > maxObjectSize()) { debugs(20, 5, HERE << "Too big max(" << loadedSize << ", " << expectedSize << "): " << e); @@ -674,7 +666,10 @@ MemStore::startCaching(StoreEntry &e) e.mem_obj->memCache.index = index; e.mem_obj->memCache.io = MemObject::ioWriting; slot->set(e); - map->startAppending(index); + // Do not allow others to feed off an unknown-size entry because we will + // stop swapping it out if it grows too large. 
+ if (e.mem_obj->expectedReplySize() >= 0) + map->startAppending(index); e.memOutDecision(true); return true; } @@ -700,6 +695,9 @@ MemStore::copyToShm(StoreEntry &e) return; // nothing to do (yet) } + // throw if an accepted unknown-size entry grew too big or max-size changed + Must(eSize <= maxObjectSize()); + const int32_t index = e.mem_obj->memCache.index; assert(index >= 0); Ipc::StoreMapAnchor &anchor = map->writeableEntry(index); diff --git a/src/fs/rock/RockIoState.cc b/src/fs/rock/RockIoState.cc index e43ea441a5..983adc45c0 100644 --- a/src/fs/rock/RockIoState.cc +++ b/src/fs/rock/RockIoState.cc @@ -184,6 +184,9 @@ Rock::IoState::tryWrite(char const *buf, size_t size, off_t coreOff) // either this is the first write or append; we do not support write gaps assert(!coreOff || coreOff == -1); + // throw if an accepted unknown-size entry grew too big or max-size changed + Must(offset_ + size <= static_cast<uint64_t>(dir->maxObjectSize())); + // allocate the first slice during the first write if (!coreOff) { assert(sidCurrent < 0); diff --git a/src/fs/rock/RockSwapDir.cc b/src/fs/rock/RockSwapDir.cc index 34d26f1200..cbcc70b392 100644 --- a/src/fs/rock/RockSwapDir.cc +++ b/src/fs/rock/RockSwapDir.cc @@ -604,7 +604,9 @@ Rock::SwapDir::rebuild() bool Rock::SwapDir::canStore(const StoreEntry &e, int64_t diskSpaceNeeded, int &load) const { - if (!::SwapDir::canStore(e, sizeof(DbCellHeader)+diskSpaceNeeded, load)) + if (diskSpaceNeeded >= 0) + diskSpaceNeeded += sizeof(DbCellHeader); + if (!::SwapDir::canStore(e, diskSpaceNeeded, load)) return false; if (!theFile || !theFile->canWrite()) diff --git a/src/fs/ufs/UFSStoreState.cc b/src/fs/ufs/UFSStoreState.cc index 678df172e2..e4527a0c09 100644 --- a/src/fs/ufs/UFSStoreState.cc +++ b/src/fs/ufs/UFSStoreState.cc @@ -14,6 +14,7 @@ #include "DiskIO/ReadRequest.h" #include "DiskIO/WriteRequest.h" #include "Generic.h" +#include "SquidConfig.h" #include "SquidList.h" #include "Store.h" #include "store/Disk.h" @@ -167,6 +168,15 @@ 
Fs::Ufs::UFSStoreState::write(char const *buf, size_t size, off_t aOffset, FREE return false; } + const Store::Disk &dir = *INDEXSD(swap_dirn); + if (offset_ + size > static_cast<uint64_t>(dir.maxObjectSize())) { + debugs(79, 2, "accepted unknown-size entry grew too big: " << + (offset_ + size) << " > " << dir.maxObjectSize()); + free_func((void*)buf); + tryClosing(); + return false; + } + queueWrite(buf, size, aOffset, free_func); drainWriteQueue(); return true; diff --git a/src/store.cc b/src/store.cc index 3f91d468ef..fbd8a7f2a0 100644 --- a/src/store.cc +++ b/src/store.cc @@ -1370,40 +1370,10 @@ storeInit(void) storeRegisterWithCacheManager(); } -/// computes maximum size of a cachable object -/// larger objects are rejected by all (disk and memory) cache stores -static int64_t -storeCalcMaxObjSize() -{ - int64_t ms = 0; // nothing can be cached without at least one store consent - - // global maximum is at least the disk store maximum - for (int i = 0; i < Config.cacheSwap.n_configured; ++i) { - assert (Config.cacheSwap.swapDirs[i].getRaw()); - const int64_t storeMax = dynamic_cast<SwapDir *>(Config.cacheSwap.swapDirs[i].getRaw())->maxObjectSize(); - if (ms < storeMax) - ms = storeMax; - } - - // global maximum is at least the memory store maximum - // TODO: move this into a memory cache class when we have one - const int64_t memMax = static_cast<int64_t>(min(Config.Store.maxInMemObjSize, Config.memMaxSize)); - if (ms < memMax) - ms = memMax; - - return ms; -} - void storeConfigure(void) { - store_swap_high = (long) (((float) Store::Root().maxSize() * - (float) Config.Swap.highWaterMark) / (float) 100); - store_swap_low = (long) (((float) Store::Root().maxSize() * - (float) Config.Swap.lowWaterMark) / (float) 100); - store_pages_max = Config.memMaxSize / sizeof(mem_node); - - store_maxobjsize = storeCalcMaxObjSize(); + Store::Root().updateLimits(); } bool diff --git a/src/store/Controller.cc b/src/store/Controller.cc index 9af6e609a4..352370c6e6 100644 --- a/src/store/Controller.cc +++ 
b/src/store/Controller.cc @@ -183,6 +183,23 @@ Store::Controller::maxObjectSize() const return swapDir->maxObjectSize(); } +void +Store::Controller::updateLimits() +{ + swapDir->updateLimits(); + + store_swap_high = (long) (((float) maxSize() * + (float) Config.Swap.highWaterMark) / (float) 100); + store_swap_low = (long) (((float) maxSize() * + (float) Config.Swap.lowWaterMark) / (float) 100); + store_pages_max = Config.memMaxSize / sizeof(mem_node); + + // TODO: move this into a memory cache class when we have one + const int64_t memMax = static_cast<int64_t>(min(Config.Store.maxInMemObjSize, Config.memMaxSize)); + const int64_t disksMax = swapDir ? swapDir->maxObjectSize() : 0; + store_maxobjsize = std::max(disksMax, memMax); +} + StoreSearch * Store::Controller::search() { @@ -326,6 +343,13 @@ Store::Controller::find(const cache_key *key) return nullptr; } +int64_t +Store::Controller::accumulateMore(StoreEntry &entry) const +{ + return swapDir ? swapDir->accumulateMore(entry) : 0; + // The memory cache should not influence for-swapout accumulation decision. +} + void Store::Controller::markForUnlink(StoreEntry &e) { diff --git a/src/store/Controller.h b/src/store/Controller.h index 65eb26ea8a..71e4eebf88 100644 --- a/src/store/Controller.h +++ b/src/store/Controller.h @@ -42,6 +42,13 @@ public: virtual void unlink(StoreEntry &) override; virtual int callback() override; + /// Additional unknown-size entry bytes required by Store in order to + /// reduce the risk of selecting the wrong disk cache for the growing entry. 
+ int64_t accumulateMore(StoreEntry &) const; + + /// slowly calculate (and cache) hi/lo watermarks and similar limits + void updateLimits(); + /// called when the entry is no longer needed by any transaction void handleIdleEntry(StoreEntry &); diff --git a/src/store/Disk.cc b/src/store/Disk.cc index 48d323dcab..d7e3edb129 100644 --- a/src/store/Disk.cc +++ b/src/store/Disk.cc @@ -22,7 +22,7 @@ #include "tools.h" Store::Disk::Disk(char const *aType): theType(aType), - max_size(0), min_objsize(0), max_objsize (-1), + max_size(0), min_objsize(-1), max_objsize (-1), path(NULL), index(-1), disker(-1), repl(NULL), removals(0), scanned(0), cleanLog(NULL) @@ -92,6 +92,13 @@ Store::Disk::minSize() const return ((maxSize() * Config.Swap.lowWaterMark) / 100); } +int64_t +Store::Disk::minObjectSize() const +{ + // per-store min-size=N value is authoritative + return min_objsize > -1 ? min_objsize : Config.Store.minObjectSize; +} + int64_t Store::Disk::maxObjectSize() const { @@ -148,19 +155,9 @@ Store::Disk::diskFull() bool Store::Disk::objectSizeIsAcceptable(int64_t objsize) const { - // without limits, all object sizes are acceptable, including unknown ones - if (min_objsize <= 0 && max_objsize == -1) - return true; - - // with limits, objects with unknown sizes are not acceptable - if (objsize == -1) - return false; - - // without the upper limit, just check the lower limit - if (max_objsize == -1) - return min_objsize <= objsize; - - return min_objsize <= objsize && objsize < max_objsize; + // need either the expected or the already accumulated object size + assert(objsize >= 0); + return minObjectSize() <= objsize && objsize <= maxObjectSize(); } bool @@ -380,7 +377,7 @@ Store::Disk::optionObjectSizeParse(char const *option, const char *value, int is void Store::Disk::optionObjectSizeDump(StoreEntry * e) const { - if (min_objsize != 0) + if (min_objsize != -1) storeAppendPrintf(e, " min-size=%" PRId64, min_objsize); if (max_objsize != -1) diff --git a/src/store/Disk.h 
b/src/store/Disk.h index 054be9523f..4b47b49048 100644 --- a/src/store/Disk.h +++ b/src/store/Disk.h @@ -54,6 +54,9 @@ public: virtual bool dereference(StoreEntry &e) override; virtual void maintain() override; + /// the size of the smallest entry this cache_dir can store + int64_t minObjectSize() const; + /// configure the maximum object size for this storage area. /// May be any size up to the total storage area. void maxObjectSize(int64_t newMax); diff --git a/src/store/Disks.cc b/src/store/Disks.cc index f1bea0caef..53377e43f7 100644 --- a/src/store/Disks.cc +++ b/src/store/Disks.cc @@ -27,6 +27,24 @@ static STDIRSELECT storeDirSelectSwapDirLeastLoad; */ STDIRSELECT *storeDirSelectSwapDir = storeDirSelectSwapDirLeastLoad; +/// The entry size to use for Disk::canStore() size limit checks. +/// This is an optimization to avoid similar calculations in every cache_dir. +static int64_t +objectSizeForDirSelection(const StoreEntry &entry) +{ + // entry.objectLen() is negative here when we are still STORE_PENDING + int64_t minSize = entry.mem_obj->expectedReplySize(); + + // If entry size is unknown, use already accumulated bytes as an estimate. + // Controller::accumulateMore() guarantees that there are enough of them. + if (minSize < 0) + minSize = entry.mem_obj->endOffset(); + + assert(minSize >= 0); + minSize += entry.mem_obj->swap_hdr_sz; + return minSize; +} + /** * This new selection scheme simply does round-robin on all SwapDirs. 
* A SwapDir is skipped if it is over the max_size (100%) limit, or @@ -35,10 +53,7 @@ STDIRSELECT *storeDirSelectSwapDir = storeDirSelectSwapDirLeastLoad; static int storeDirSelectSwapDirRoundRobin(const StoreEntry * e) { - // e->objectLen() is negative at this point when we are still STORE_PENDING - ssize_t objsize = e->mem_obj->expectedReplySize(); - if (objsize != -1) - objsize += e->mem_obj->swap_hdr_sz; + const int64_t objsize = objectSizeForDirSelection(*e); // Increment the first candidate once per selection (not once per // iteration) to reduce bias when some disk(s) attract more entries. @@ -81,18 +96,14 @@ static int storeDirSelectSwapDirLeastLoad(const StoreEntry * e) { int64_t most_free = 0; - ssize_t least_objsize = -1; + int64_t best_objsize = -1; int least_load = INT_MAX; int load; int dirn = -1; int i; RefCount<SwapDir> SD; - // e->objectLen() is negative at this point when we are still STORE_PENDING - ssize_t objsize = e->mem_obj->expectedReplySize(); - - if (objsize != -1) - objsize += e->mem_obj->swap_hdr_sz; + const int64_t objsize = objectSizeForDirSelection(*e); for (i = 0; i < Config.cacheSwap.n_configured; ++i) { SD = dynamic_cast<SwapDir *>(INDEXSD(i)); @@ -111,11 +122,14 @@ storeDirSelectSwapDirLeastLoad(const StoreEntry * e) /* If the load is equal, then look in more details */ if (load == least_load) { - /* closest max-size fit */ - - if (least_objsize != -1) - if (SD->maxObjectSize() > least_objsize) + /* best max-size fit */ + if (best_objsize != -1) { + // cache_dir with the smallest max-size gets the known-size object + // cache_dir with the largest max-size gets the unknown-size object + if ((objsize != -1 && SD->maxObjectSize() > best_objsize) || + (objsize == -1 && SD->maxObjectSize() < best_objsize)) continue; + } /* most free */ if (cur_free < most_free) @@ -123,7 +137,7 @@ storeDirSelectSwapDirLeastLoad(const StoreEntry * e) } least_load = load; - least_objsize = SD->maxObjectSize(); + best_objsize = SD->maxObjectSize(); most_free = cur_free; dirn 
= i; } @@ -134,6 +148,13 @@ storeDirSelectSwapDirLeastLoad(const StoreEntry * e) return dirn; } +Store::Disks::Disks(): + largestMinimumObjectSize(-1), + largestMaximumObjectSize(-1), + secondLargestMaximumObjectSize(-1) +{ +} + SwapDir * Store::Disks::store(int const x) const { @@ -330,14 +351,68 @@ Store::Disks::currentCount() const int64_t Store::Disks::maxObjectSize() const { - int64_t result = -1; + return largestMaximumObjectSize; +} + +void +Store::Disks::updateLimits() +{ + largestMinimumObjectSize = -1; + largestMaximumObjectSize = -1; + secondLargestMaximumObjectSize = -1; for (int i = 0; i < Config.cacheSwap.n_configured; ++i) { - if (dir(i).active() && store(i)->maxObjectSize() > result) - result = store(i)->maxObjectSize(); + const auto &disk = dir(i); + if (!disk.active()) + continue; + + if (disk.minObjectSize() > largestMinimumObjectSize) + largestMinimumObjectSize = disk.minObjectSize(); + + const auto diskMaxObjectSize = disk.maxObjectSize(); + if (diskMaxObjectSize > largestMaximumObjectSize) { + if (largestMaximumObjectSize >= 0) // was set + secondLargestMaximumObjectSize = largestMaximumObjectSize; + largestMaximumObjectSize = diskMaxObjectSize; + } } +} - return result; +int64_t +Store::Disks::accumulateMore(const StoreEntry &entry) const +{ + const auto accumulated = entry.mem_obj->availableForSwapOut(); + + /* + * Keep accumulating more bytes until the set of disks eligible to accept + * the entry becomes stable, and, hence, accumulating more is not going to + * affect the cache_dir selection. A stable set is usually reached + * immediately (or soon) because most configurations either do not use + * cache_dirs with explicit min-size/max-size limits or use the same + * max-size limit for all cache_dirs (and low min-size limits). + */ + + // Can the set of min-size cache_dirs accepting this entry change? 
+ if (accumulated < largestMinimumObjectSize) + return largestMinimumObjectSize - accumulated; + + // Can the set of max-size cache_dirs accepting this entry change + // (other than when the entry exceeds the largest maximum; see below)? + if (accumulated <= secondLargestMaximumObjectSize) + return secondLargestMaximumObjectSize - accumulated + 1; + + /* + * Checking largestMaximumObjectSize instead eliminates the risk of starting + * to swap out an entry that later grows too big, but also implies huge + * accumulation in most environments. Accumulating huge entries not only + * consumes lots of RAM but also creates a burst of doPages() write requests + * that overwhelm the disk. To avoid these problems, we take the risk and + * allow swap out now. The disk will quit swapping out if the entry + * eventually grows too big for its selected cache_dir. + */ + debugs(20, 3, "no: " << accumulated << '>' << + secondLargestMaximumObjectSize << ',' << largestMinimumObjectSize); + return 0; } void diff --git a/src/store/Disks.h b/src/store/Disks.h index 1203e1c815..3b72034f9f 100644 --- a/src/store/Disks.h +++ b/src/store/Disks.h @@ -18,6 +18,8 @@ namespace Store { class Disks: public Controlled { public: + Disks(); + /* Storage API */ virtual void create() override; virtual void init() override; @@ -40,10 +42,21 @@ public: virtual void unlink(StoreEntry &) override; virtual int callback() override; + /// slowly calculate (and cache) hi/lo watermarks and similar limits + void updateLimits(); + + /// Additional unknown-size entry bytes required by disks in order to + /// reduce the risk of selecting the wrong disk cache for the growing entry. 
+ int64_t accumulateMore(const StoreEntry&) const; + private: /* migration logic */ SwapDir *store(int const x) const; SwapDir &dir(int const idx) const; + + int64_t largestMinimumObjectSize; ///< maximum of all Disk::minObjectSize()s + int64_t largestMaximumObjectSize; ///< maximum of all Disk::maxObjectSize()s + int64_t secondLargestMaximumObjectSize; ///< the second-biggest Disk::maxObjectSize() }; } // namespace Store diff --git a/src/store_swapout.cc b/src/store_swapout.cc index d8ff59d6c2..ac18bdacfb 100644 --- a/src/store_swapout.cc +++ b/src/store_swapout.cc @@ -427,20 +427,11 @@ StoreEntry::mayStartSwapOut() // prevent final default swPossible answer for yet unknown length if (expectedEnd < 0 && store_status != STORE_OK) { - const int64_t maxKnownSize = mem_obj->availableForSwapOut(); - debugs(20, 7, HERE << "maxKnownSize= " << maxKnownSize); - /* - * NOTE: the store_maxobjsize here is the global maximum - * size of object cacheable in any of Squid cache stores - * both disk and memory stores. - * - * However, I am worried that this - * deferance may consume a lot of memory in some cases. - * Should we add an option to limit this memory consumption? 
- */ - debugs(20, 5, HERE << "Deferring swapout start for " << - (store_maxobjsize - maxKnownSize) << " bytes"); - return true; // may still fit, but no final decision yet + const int64_t more = Store::Root().accumulateMore(*this); + if (more > 0) { + debugs(20, 5, "got " << currentEnd << "; defer decision for " << more << " more bytes"); + return true; // may still fit, but no final decision yet + } } } diff --git a/src/tests/stub_SwapDir.cc b/src/tests/stub_SwapDir.cc index 9e0d5a961d..773fbf0222 100644 --- a/src/tests/stub_SwapDir.cc +++ b/src/tests/stub_SwapDir.cc @@ -22,6 +22,7 @@ void SwapDir::stat(StoreEntry &) const STUB void SwapDir::statfs(StoreEntry &)const STUB void SwapDir::maintain() STUB uint64_t SwapDir::minSize() const STUB_RETVAL(0) +int64_t SwapDir::minObjectSize() const STUB_RETVAL(0) int64_t SwapDir::maxObjectSize() const STUB_RETVAL(0) void SwapDir::maxObjectSize(int64_t) STUB void SwapDir::reference(StoreEntry &) STUB -- 2.47.2