From c203754768afb698246d9dd1f9f650403eca0382 Mon Sep 17 00:00:00 2001
From: Eduard Bagdasaryan
Date: Wed, 23 Jan 2019 03:51:12 +0000
Subject: [PATCH] Initialize StoreMapSlice when reserving a new cache slot (#350)

Rock sets the StoreMapSlice::next field when sending a slice to disk.
To avoid writing slice A twice, Rock allocates a new slice B to prime
A.next right before writing A. Scheduling A's writing and, sometimes,
lack of data to fill B create a gap between B's allocation and B's
writing (which sets B.next). During that time, A.next points to B, but
B.next is untouched.

If writing slice A or swapout in general fails, the chain of failed
entry slices (now containing both A and B) is freed. If untouched
B.next contains garbage, then freeChainAt() adds "random" slices after
B to the free slice pool. Subsequent swapouts use those incorrectly
freed slices, effectively overwriting portions of random cache entries,
corrupting the cache.

How did B.next get dirty in the first place? freeChainAt() cleans the
slices it frees, but Rock also makes direct noteFreeMapSlice() calls.
Shared memory cache may have avoided this corruption because it makes
no such calls.

Ipc::StoreMap::prepFreeSlice() now clears allocated slices. Long-term,
we may be able to move free slice management into StoreMap to automate
this cleanup.

Also simplified and polished slot allocation code a little, removing
the Rock::IoState::reserveSlotForWriting() middleman. This change also
improves the symmetry between Rock and shared memory cache code.
---
 src/MemStore.cc            | 14 +++++++++-----
 src/fs/rock/RockIoState.cc | 19 ++-----------------
 src/fs/rock/RockIoState.h  |  1 -
 src/fs/rock/RockSwapDir.cc | 24 +++++++++++++++++-------
 src/fs/rock/RockSwapDir.h  |  6 ++++--
 src/ipc/StoreMap.cc        | 11 +++++++++--
 src/ipc/StoreMap.h         |  6 ++++++
 7 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/src/MemStore.cc b/src/MemStore.cc
index a315f7adef..af7624eb6c 100644
--- a/src/MemStore.cc
+++ b/src/MemStore.cc
@@ -772,13 +772,15 @@ MemStore::reserveSapForWriting(Ipc::Mem::PageId &page)
 {
     Ipc::Mem::PageId slot;
     if (freeSlots->pop(slot)) {
-        debugs(20, 5, "got a previously free slot: " << slot);
+        const auto slotId = slot.number - 1;
+        debugs(20, 5, "got a previously free slot: " << slotId);
 
         if (Ipc::Mem::GetPage(Ipc::Mem::PageId::cachePage, page)) {
             debugs(20, 5, "and got a previously free page: " << page);
-            return slot.number - 1;
+            map->prepFreeSlice(slotId);
+            return slotId;
         } else {
-            debugs(20, 3, "but there is no free page, returning " << slot);
+            debugs(20, 3, "but there is no free page, returning " << slotId);
             freeSlots->push(slot);
         }
     }
@@ -791,8 +793,10 @@ MemStore::reserveSapForWriting(Ipc::Mem::PageId &page)
         assert(!waitingFor); // noteFreeMapSlice() should have cleared it
         assert(slot.set());
         assert(page.set());
-        debugs(20, 5, "got previously busy " << slot << " and " << page);
-        return slot.number - 1;
+        const auto slotId = slot.number - 1;
+        map->prepFreeSlice(slotId);
+        debugs(20, 5, "got previously busy " << slotId << " and " << page);
+        return slotId;
     }
     assert(waitingFor.slot == &slot && waitingFor.page == &page);
     waitingFor.slot = NULL;
diff --git a/src/fs/rock/RockIoState.cc b/src/fs/rock/RockIoState.cc
index bba24ae5c4..50b6e6d66c 100644
--- a/src/fs/rock/RockIoState.cc
+++ b/src/fs/rock/RockIoState.cc
@@ -190,7 +190,7 @@ Rock::IoState::tryWrite(char const *buf, size_t size, off_t coreOff)
     // allocate the first slice during the first write
     if (!coreOff) {
         assert(sidCurrent < 0);
-        sidCurrent = reserveSlotForWriting(); // throws on failures
+        sidCurrent = dir->reserveSlotForWriting(); // throws on failures
         assert(sidCurrent >= 0);
         writeAnchor().start = sidCurrent;
     }
@@ -207,7 +207,7 @@ Rock::IoState::tryWrite(char const *buf, size_t size, off_t coreOff)
         // We do not write a full buffer without overflow because
         // we would not yet know what to set the nextSlot to.
         if (overflow) {
-            const SlotId sidNext = reserveSlotForWriting(); // throws
+            const auto sidNext = dir->reserveSlotForWriting(); // throws
             assert(sidNext >= 0);
             writeToDisk(sidNext);
         } else if (Store::Root().transientReaders(*e)) {
@@ -306,21 +306,6 @@ Rock::IoState::writeBufToDisk(const SlotId sidNext, const bool eof, const bool l
     theFile->write(r);
 }
 
-/// finds and returns a free db slot to fill or throws
-Rock::SlotId
-Rock::IoState::reserveSlotForWriting()
-{
-    Ipc::Mem::PageId pageId;
-    if (dir->useFreeSlot(pageId))
-        return pageId.number-1;
-
-    // This may happen when the number of available db slots is close to the
-    // number of concurrent requests reading or writing those slots, which may
-    // happen when the db is "small" compared to the request traffic OR when we
-    // are rebuilding and have not loaded "many" entries or empty slots yet.
-    throw TexcHere("ran out of free db slots");
-}
-
 void
 Rock::IoState::finishedWriting(const int errFlag)
 {
diff --git a/src/fs/rock/RockIoState.h b/src/fs/rock/RockIoState.h
index 3c213558e6..b8f2061e42 100644
--- a/src/fs/rock/RockIoState.h
+++ b/src/fs/rock/RockIoState.h
@@ -66,7 +66,6 @@ private:
     size_t writeToBuffer(char const *buf, size_t size);
     void writeToDisk(const SlotId nextSlot);
    void writeBufToDisk(const SlotId nextSlot, const bool eof, const bool lastWrite);
-    SlotId reserveSlotForWriting();
 
     void callBack(int errflag);
 
diff --git a/src/fs/rock/RockSwapDir.cc b/src/fs/rock/RockSwapDir.cc
index 7eec16381c..894c582c90 100644
--- a/src/fs/rock/RockSwapDir.cc
+++ b/src/fs/rock/RockSwapDir.cc
@@ -710,12 +710,16 @@ Rock::SwapDir::diskOffsetLimit() const
     return diskOffset(map->sliceLimit());
 }
 
-bool
-Rock::SwapDir::useFreeSlot(Ipc::Mem::PageId &pageId)
+Rock::SlotId
+Rock::SwapDir::reserveSlotForWriting()
 {
+    Ipc::Mem::PageId pageId;
+
     if (freeSlots->pop(pageId)) {
-        debugs(47, 5, "got a previously free slot: " << pageId);
-        return true;
+        const auto slotId = pageId.number - 1;
+        debugs(47, 5, "got a previously free slot: " << slotId);
+        map->prepFreeSlice(slotId);
+        return slotId;
     }
 
     // catch free slots delivered to noteFreeMapSlice()
@@ -724,14 +728,20 @@
     if (map->purgeOne()) {
         assert(!waitingForPage); // noteFreeMapSlice() should have cleared it
         assert(pageId.set());
-        debugs(47, 5, "got a previously busy slot: " << pageId);
-        return true;
+        const auto slotId = pageId.number - 1;
+        debugs(47, 5, "got a previously busy slot: " << slotId);
+        map->prepFreeSlice(slotId);
+        return slotId;
     }
     assert(waitingForPage == &pageId);
     waitingForPage = NULL;
 
+    // This may happen when the number of available db slots is close to the
+    // number of concurrent requests reading or writing those slots, which may
+    // happen when the db is "small" compared to the request traffic OR when we
+    // are rebuilding and have not loaded "many" entries or empty slots yet.
     debugs(47, 3, "cannot get a slot; entries: " << map->entryCount());
-    return false;
+    throw TexcHere("ran out of free db slots");
 }
 
 bool
diff --git a/src/fs/rock/RockSwapDir.h b/src/fs/rock/RockSwapDir.h
index 879bd45ed0..aa88283758 100644
--- a/src/fs/rock/RockSwapDir.h
+++ b/src/fs/rock/RockSwapDir.h
@@ -62,10 +62,12 @@ public:
     int64_t slotLimitAbsolute() const; ///< Rock store implementation limit
     int64_t slotLimitActual() const; ///< total number of slots in this db
 
-    /// removes a slot from a list of free slots or returns false
-    bool useFreeSlot(Ipc::Mem::PageId &pageId);
     /// whether the given slot ID may point to a slot in this db
     bool validSlotId(const SlotId slotId) const;
+
+    /// finds and returns a free db slot to fill or throws
+    SlotId reserveSlotForWriting();
+
     /// purges one or more entries to make full() false and free some slots
     void purgeSome();
 
diff --git a/src/ipc/StoreMap.cc b/src/ipc/StoreMap.cc
index 203b1804ef..072e3fc014 100644
--- a/src/ipc/StoreMap.cc
+++ b/src/ipc/StoreMap.cc
@@ -346,8 +346,7 @@ Ipc::StoreMap::freeChainAt(SliceId sliceId, const SliceId splicingPoint)
     while (sliceId >= 0) {
         Slice &slice = sliceAt(sliceId);
         const SliceId nextId = slice.next;
-        slice.size = 0;
-        slice.next = -1;
+        slice.clear();
         if (cleaner)
             cleaner->noteFreeMapSlice(sliceId); // might change slice state
         if (sliceId == splicingPoint) {
@@ -360,6 +359,14 @@
     debugs(54, 7, "freed chain #" << chainId << " in " << path);
 }
 
+void
+Ipc::StoreMap::prepFreeSlice(const SliceId sliceId)
+{
+    // TODO: Move freeSlots here, along with reserveSlotForWriting() logic.
+    assert(validSlice(sliceId));
+    sliceAt(sliceId).clear();
+}
+
 Ipc::StoreMap::SliceId
 Ipc::StoreMap::sliceContaining(const sfileno fileno, const uint64_t bytesNeeded) const
 {
diff --git a/src/ipc/StoreMap.h b/src/ipc/StoreMap.h
index 5d44379b61..8a4909e779 100644
--- a/src/ipc/StoreMap.h
+++ b/src/ipc/StoreMap.h
@@ -42,6 +42,9 @@ public:
         return *this;
     }
 
+    /// restore default-constructed state
+    void clear() { size = 0; next = -1; }
+
     std::atomic<Size> size; ///< slice contents size
     std::atomic<StoreMapSliceId> next; ///< ID of the next entry slice
 };
@@ -292,6 +295,9 @@ public:
     /// readable anchor for the entry created by openForReading()
     const Anchor &readableEntry(const AnchorId anchorId) const;
 
+    /// prepare a chain-unaffiliated slice for being added to an entry chain
+    void prepFreeSlice(const SliceId sliceId);
+
     /// Returns the ID of the entry slice containing n-th byte or
     /// a negative ID if the entry does not store that many bytes (yet).
     /// Requires a read lock.
-- 
2.39.5
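
Editor's note: the failure mode described in the commit message is easy to reproduce outside of Squid. The standalone C++ sketch below is only an illustration built on assumed toy types (Slice, ToyMap, reserveSlot(), and its freeChainAt() are hypothetical stand-ins, not Squid's Ipc::StoreMap API). It shows how a stale next value in a never-written slice B lets a free-chain walk wander into a slice owned by another entry, and how clearing every slice at reservation time, the prepFreeSlice() idea, stops the walk where the chain really ends.

// toy_slice_chain.cc -- hypothetical model, not Squid code
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// A minimal stand-in for a store map slice: contents size plus a link to the
// next slice of the same cache entry (-1 terminates the chain).
struct Slice {
    uint32_t size = 0;
    int32_t next = -1;
};

// A minimal stand-in for the slice map plus its free-slot pool.
struct ToyMap {
    std::vector<Slice> slices;
    std::vector<int32_t> freeSlots;
    bool clearOnReserve; // emulate prepFreeSlice() when true

    ToyMap(const int32_t n, const bool clear): slices(n), clearOnReserve(clear) {
        for (int32_t i = n - 1; i >= 0; --i)
            freeSlots.push_back(i);
    }

    // pops a free slot, optionally restoring its default-constructed state
    int32_t reserveSlot() {
        assert(!freeSlots.empty());
        const auto id = freeSlots.back();
        freeSlots.pop_back();
        if (clearOnReserve)
            slices[id] = Slice(); // the prepFreeSlice() step
        return id;
    }

    // frees every slice reachable from sliceId via next links
    void freeChainAt(int32_t sliceId) {
        while (sliceId >= 0) {
            const auto nextId = slices[sliceId].next;
            slices[sliceId] = Slice();
            freeSlots.push_back(sliceId);
            sliceId = nextId;
        }
    }
};

static void demo(const bool clearOnReserve) {
    ToyMap map(8, clearOnReserve);

    // A slice owned by some other, healthy cache entry.
    const auto other = map.reserveSlot();
    map.slices[other].size = 100;

    // Simulate dirt left behind by earlier, uncleaned reuse: every still-free
    // slot carries a stale next pointer into the other entry's slice.
    for (const auto id : map.freeSlots)
        map.slices[id].next = other;

    // Build a failing swapout chain A -> B: B is allocated only to prime
    // A.next and is never written, so B.next keeps whatever the slot held
    // before, unless reservation cleared it.
    const auto a = map.reserveSlot();
    const auto b = map.reserveSlot();
    map.slices[a].next = b;

    map.freeChainAt(a); // the write failed; free the whole chain

    std::cout << (clearOnReserve ? "with" : "without")
              << " clearing on reserve: the other entry's slice "
              << (map.slices[other].size ? "is intact" : "was freed (corruption)")
              << "\n";
}

int main() {
    demo(false); // reproduces the corruption described above
    demo(true);  // clearing at reservation time prevents it
}

Running the sketch prints "was freed (corruption)" for the uncleared case and "is intact" once slices are cleared on reservation, mirroring the corruption and the fix described in the commit message.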