Fixed and redefined total peering time (%<tt) (#1828)

author Alex Rousskov <rousskov@measurement-factory.com>

Mon, 19 Aug 2024 17:20:06 +0000 (17:20 +0000)

committer Squid Anubis <squid-anubis@squid-cache.org>

Tue, 20 Aug 2024 20:44:07 +0000 (20:44 +0000)
author Alex Rousskov <rousskov@measurement-factory.com>
Mon, 19 Aug 2024 17:20:06 +0000 (17:20 +0000)
committer Squid Anubis <squid-anubis@squid-cache.org>
Tue, 20 Aug 2024 20:44:07 +0000 (20:44 +0000)
diff --git a/src/FwdState.cc b/src/FwdState.cc

index bac0aa5df87143c60901b2305edcbd86efbcdabc..bbca57e56f6ea16c4f4bd9c50cd56664484b7e6d 100644 (file)
--- a/src/FwdState.cc
+++ b/src/FwdState.cc
@@ -128,7 +128,8 @@ FwdState::FwdState(const Comm::ConnectionPointer &client, StoreEntry * e, HttpRe
      waitingForDispatched(false),
      destinations(new ResolvedPeers()),
      pconnRace(raceImpossible),
-    storedWholeReply_(nullptr)
+    storedWholeReply_(nullptr),
+    peeringTimer(r)
  {
      debugs(17, 2, "Forwarding client request " << client << ", url=" << e->url());
      HTTPMSGLOCK(request);
@@ -186,6 +187,8 @@ FwdState::stopAndDestroy(const char *reason)
  
      cancelStep(reason);
  
+    peeringTimer.stop();
+
      PeerSelectionInitiator::subscribed = false; // may already be false
      self = nullptr; // we hope refcounting destroys us soon; may already be nil
      /* do not place any code here as this object may be gone by now */
@@ -259,8 +262,6 @@ FwdState::completed()
  
      flags.forward_completed = true;
  
-    request->hier.stopPeerClock(false);
-
      if (EBIT_TEST(entry->flags, ENTRY_ABORTED)) {
          debugs(17, 3, "entry aborted");
          return ;
@@ -780,8 +781,6 @@ FwdState::retryOrBail()
      // TODO: should we call completed() here and move doneWithRetries there?
      doneWithRetries();
  
-    request->hier.stopPeerClock(false);
-
      if (self != nullptr && !err && shutting_down && entry->isEmpty()) {
          const auto anErr = new ErrorState(ERR_SHUTTING_DOWN, Http::scServiceUnavailable, request, al);
          errorAppendEntry(entry, anErr);
@@ -1128,8 +1127,6 @@ FwdState::connectStart()
      err = nullptr;
      request->clearError();
  
-    request->hier.startPeerClock();
-
      const auto callback = asyncCallback(17, 5, FwdState::noteConnection, this);
      HttpRequest::Pointer cause = request;
      const auto cs = new HappyConnOpener(destinations, callback, cause, start_t, n_tries, al);
@@ -1572,3 +1569,25 @@ ResetMarkingsToServer(HttpRequest * request, Comm::Connection &conn)
          Ip::Qos::setSockNfmark(&conn, conn.nfmark);
  }
  
+/* PeeringActivityTimer */
+
+// The simple methods below are not inlined to avoid exposing some of the
+// current FwdState.h users to a full HttpRequest definition they do not need.
+
+PeeringActivityTimer::PeeringActivityTimer(const HttpRequestPointer &r): request(r)
+{
+    Assure(request);
+    timer().resume();
+}
+
+PeeringActivityTimer::~PeeringActivityTimer()
+{
+    stop();
+}
+
+Stopwatch &
+PeeringActivityTimer::timer()
+{
+    return request->hier.totalPeeringTime;
+}
+
diff --git a/src/FwdState.h b/src/FwdState.h

index c435609e3bcffed8d4fd73bb68ca67afc07824d4..316243c9030c4fe65475503abbc3eaf33ee8460a 100644 (file)
--- a/src/FwdState.h
+++ b/src/FwdState.h
@@ -49,6 +49,40 @@ void ResetMarkingsToServer(HttpRequest *, Comm::Connection &);
  
  class HelperReply;
  
+/// Eliminates excessive Stopwatch pause() calls in a task with multiple code
+/// locations that pause a stopwatch. Ideally, there would be just one such
+/// location (e.g., a task class destructor), but current code idiosyncrasies
+/// necessitate this state. For simplicity sake, this class currently manages a
+/// Stopwatch at a hard-coded location: HttpRequest::hier.totalPeeringTime.
+class PeeringActivityTimer
+{
+public:
+    PeeringActivityTimer(const HttpRequestPointer &); ///< resumes timer
+    ~PeeringActivityTimer(); ///< \copydoc stop()
+
+    /// pauses timer if stop() has not been called
+    void stop()
+    {
+        if (!stopped) {
+            timer().pause();
+            stopped = true;
+        }
+    }
+
+private:
+    /// managed Stopwatch object within HierarchyLogEntry
+    Stopwatch &timer();
+
+    /// the owner of managed HierarchyLogEntry
+    HttpRequestPointer request;
+
+    // We cannot rely on timer().ran(): This class eliminates excessive calls
+    // within a single task (e.g., an AsyncJob) while the timer (and its ran()
+    // state) may be shared/affected by multiple concurrent tasks.
+    /// Whether the task is done participating in the managed activity.
+    bool stopped = false;
+};
+
  class FwdState: public RefCountable, public PeerSelectionInitiator
  {
      CBDATA_CHILD(FwdState);
@@ -212,6 +246,9 @@ private:
      /// Whether the entire reply (including any body) was written to Store.
      /// The string literal value is only used for debugging.
      const char *storedWholeReply_;
+
+    /// Measures time spent on selecting and communicating with peers.
+    PeeringActivityTimer peeringTimer;
  };
  
  class acl_tos;
diff --git a/src/HierarchyLogEntry.h b/src/HierarchyLogEntry.h

index 5f5d8b13f41e2397fed400dfdb6783be1366f31f..053cf9b134ab8a651f2b19bfd5a1d9e8a1051fea 100644 (file)
--- a/src/HierarchyLogEntry.h
+++ b/src/HierarchyLogEntry.h
@@ -9,6 +9,7 @@
  #ifndef SQUID_SRC_HIERARCHYLOGENTRY_H
  #define SQUID_SRC_HIERARCHYLOGENTRY_H
  
+#include "base/Stopwatch.h"
  #include "comm/Connection.h"
  #include "enums.h"
  #include "hier_code.h"
@@ -37,22 +38,10 @@ public:
      /// Call this after each peer socket write(2), including failed ones.
      void notePeerWrite();
  
-    /// Start recording total time spent communicating with peers
-    void startPeerClock();
-    /**
-     * Record total time spent communicating with peers
-     * \param force whether to overwrite old recorded value if any
-     */
-    void stopPeerClock(const bool force);
-
      /// Estimates response generation and sending delay at the last peer.
      /// \returns whether the estimate (stored in `responseTime`) is available.
      bool peerResponseTime(struct timeval &responseTime);
  
-    /// Estimates the total time spent communicating with peers.
-    /// \returns whether the estimate (stored in `responseTime`) is available.
-    bool totalResponseTime(struct timeval &responseTime);
-
  public:
      hier_code code;
      char host[SQUIDHOSTNAMELEN];
@@ -70,13 +59,14 @@ public:
      Comm::ConnectionPointer tcpServer; ///< TCP/IP level details of the last peer/server connection
      int64_t bodyBytesRead;  ///< number of body bytes received from the next hop or -1
  
+    /// cumulative time spent (so far) communicating with all peers (see %<tt)
+    Stopwatch totalPeeringTime;
+
  private:
      void clearPeerNotes();
  
-    timeval firstConnStart_; ///< first connection use among all peers
      struct timeval peer_last_read_; ///< time of the last read from the last peer
      struct timeval peer_last_write_; ///< time of the last write to the last peer
-    struct timeval totalResponseTime_; ///< cumulative for all peers
  };
  
  #endif /* SQUID_SRC_HIERARCHYLOGENTRY_H */
diff --git a/src/cf.data.pre b/src/cf.data.pre

index bf8eb45ee480ccfa909391c71299163f24df32f5..b90076188ae9bf83a0a7e19fe217826868e84f5d 100644 (file)
--- a/src/cf.data.pre
+++ b/src/cf.data.pre
@@ -4954,10 +4954,34 @@ DOC_START
                 [http::]<pt     Peer response time in milliseconds. The timer starts
                                 when the last request byte is sent to the next hop
                                 and stops when the last response byte is received.
-               [http::]<tt     Total time in milliseconds. The timer
-                               starts with the first connect request (or write I/O)
-                               sent to the first selected peer. The timer stops
-                               with the last I/O with the last peer.
+               [http::]<tt     Total time spent forwarding to origin servers or
+                               cache_peers (milliseconds).
+
+                               The timer starts when Squid decides to forward the request (to
+                               an origin server or cache_peer) and peer selection begins. The
+                               timer stops when relevant forwarding activities (including any
+                               retries) end.
+
+                               Between those two timer events, Squid may perform DNS lookups,
+                               query external ACL helpers, adapt responses using pre-cache
+                               RESPMOD services, and participate in other concurrent
+                               secondary activities. Most secondary activities increase
+                               peering time. In some cases, a secondary activity may start
+                               before the timer starts or end after the timer stops, leading
+                               to misleading results of simple computations like %<tt - %dt.
+
+                               If this logformat %code is used before its timer starts, the
+                               corresponding measurement has no value (and the %code expands
+                               to a single dash ("-") character).
+
+                               If this code is used while its timer is running, the time
+                               spent so far is used as the measurement value.
+
+                               When Squid re-forwards the request (e.g., after certain cache
+                               revalidation failures), the timer may restart. In this case,
+                               the new measurement is added to the value accumulated from
+                               previous forwarding attempts. The time interval between
+                               forwarding attempts is not added to the final result.
  
         Squid handling related format codes:
  
diff --git a/src/clients/Client.cc b/src/clients/Client.cc

index 551e82e936bf09d20f156c1149e1ccf15a59cd57..4f68b48e1785b5dfde1d20e5d053385c4a9d1fd8 100644 (file)
--- a/src/clients/Client.cc
+++ b/src/clients/Client.cc
@@ -187,7 +187,6 @@ Client::serverComplete()
      }
  
      completed = true;
-    originalRequest()->hier.stopPeerClock(true);
  
      if (requestBodySource != nullptr)
          stopConsumingFrom(requestBodySource);
diff --git a/src/format/Format.cc b/src/format/Format.cc

index 7685ad319f8a8ed45bccc1c9ec37109cfa0d9fc8..bd0faf69471a2a300449702f069699aba9895a28 100644 (file)
--- a/src/format/Format.cc
+++ b/src/format/Format.cc
@@ -635,9 +635,16 @@ Format::Format::assemble(MemBuf &mb, const AccessLogEntry::Pointer &al, int logS
              break;
  
          case LFT_TOTAL_SERVER_SIDE_RESPONSE_TIME: {
-            struct timeval totalResponseTime;
-            if (al->hier.totalResponseTime(totalResponseTime)) {
-                outtv = totalResponseTime;
+            // XXX: al->hier.totalPeeringTime is not updated until prepareLogWithRequestDetails().
+            // TODO: Avoid the need for updates by keeping totalPeeringTime (or even ALE::hier) in one place.
+            const auto &timer = (!al->hier.totalPeeringTime.ran() && al->request) ?
+                                al->request->hier.totalPeeringTime : al->hier.totalPeeringTime;
+            if (timer.ran()) {
+                using namespace std::chrono_literals;
+                const auto duration = timer.total();
+                outtv.tv_sec = std::chrono::duration_cast<std::chrono::seconds>(duration).count();
+                const auto totalUsec = std::chrono::duration_cast<std::chrono::microseconds>(duration);
+                outtv.tv_usec = (totalUsec % std::chrono::microseconds(1s)).count();
                  doMsec = 1;
              }
          }
diff --git a/src/log/access_log.cc b/src/log/access_log.cc

index ff01005f4658230fc266bc5c4b84de4c84de9219..1a53bf5941291ae8244f00ef2cd345da2308c537 100644 (file)
--- a/src/log/access_log.cc
+++ b/src/log/access_log.cc
@@ -187,12 +187,6 @@ HierarchyLogEntry::HierarchyLogEntry() :
      store_complete_stop.tv_usec =0;
  
      clearPeerNotes();
-
-    totalResponseTime_.tv_sec = -1;
-    totalResponseTime_.tv_usec = 0;
-
-    firstConnStart_.tv_sec = 0;
-    firstConnStart_.tv_usec = 0;
  }
  
  void
@@ -241,26 +235,6 @@ HierarchyLogEntry::notePeerWrite()
      peer_last_write_ = current_time;
  }
  
-void
-HierarchyLogEntry::startPeerClock()
-{
-    if (!firstConnStart_.tv_sec)
-        firstConnStart_ = current_time;
-}
-
-void
-HierarchyLogEntry::stopPeerClock(const bool force)
-{
-    debugs(46, 5, "First connection started: " << firstConnStart_ <<
-           ", current total response time value: " << (totalResponseTime_.tv_sec * 1000 +  totalResponseTime_.tv_usec/1000) <<
-           (force ? ", force fixing" : ""));
-    if (!force && totalResponseTime_.tv_sec != -1)
-        return;
-
-    if (firstConnStart_.tv_sec)
-        tvSub(totalResponseTime_, firstConnStart_, current_time);
-}
-
  bool
  HierarchyLogEntry::peerResponseTime(struct timeval &responseTime)
  {
@@ -288,19 +262,6 @@ HierarchyLogEntry::peerResponseTime(struct timeval &responseTime)
      return true;
  }
  
-bool
-HierarchyLogEntry::totalResponseTime(struct timeval &responseTime)
-{
-    // This should not really happen, but there may be rare code
-    // paths that lead to FwdState discarded (or transaction logged)
-    // without (or before) a stopPeerClock() call.
-    if (firstConnStart_.tv_sec && totalResponseTime_.tv_sec == -1)
-        stopPeerClock(false);
-
-    responseTime = totalResponseTime_;
-    return responseTime.tv_sec >= 0 && responseTime.tv_usec >= 0;
-}
-
  static void
  accessLogRegisterWithCacheManager(void)
  {
diff --git a/src/tests/stub_access_log.cc b/src/tests/stub_access_log.cc

index 93bfd75174f69f75f253aff4068a7753baedb41b..8dbed03787f2ef41367ed3aba0fa7afe1a872f39 100644 (file)
--- a/src/tests/stub_access_log.cc
+++ b/src/tests/stub_access_log.cc
@@ -17,7 +17,6 @@ HierarchyLogEntry::HierarchyLogEntry() STUB
  void HierarchyLogEntry::notePeerRead() STUB
  void HierarchyLogEntry::notePeerWrite() STUB
  bool HierarchyLogEntry::peerResponseTime(struct timeval &) STUB_RETVAL(false)
-bool HierarchyLogEntry::totalResponseTime(struct timeval &) STUB_RETVAL(false)
  
  ping_data::ping_data() :
      n_sent(0),
diff --git a/src/tunnel.cc b/src/tunnel.cc

index 2f527200498c4252ec3269625abf6bc7472f29d6..ff85638f0592de3e4b68392b99bd35b15bf99584 100644 (file)
--- a/src/tunnel.cc
+++ b/src/tunnel.cc
@@ -211,6 +211,9 @@ public:
      /// over the (encrypted, if needed) transport connection to that cache_peer
      JobWait<Http::Tunneler> peerWait;
  
+    /// Measures time spent on selecting and communicating with peers.
+    PeeringActivityTimer peeringTimer;
+
      void copyRead(Connection &from, IOCB *completion);
  
      /// continue to set up connection to a peer, going async for SSL peers
@@ -310,7 +313,7 @@ TunnelStateData::serverClosed()
  {
      server.noteClosure();
  
-    request->hier.stopPeerClock(false);
+    peeringTimer.stop();
  
      finishWritingAndDelete(client);
  }
@@ -386,6 +389,16 @@ TunnelStateData::deleteThis()
      delete this;
  }
  
+// TODO: Replace with a reusable API guaranteeing non-nil pointer forwarding.
+/// safely extracts HttpRequest from a never-nil ClientHttpRequest pointer
+static auto &
+guaranteedRequest(const ClientHttpRequest * const cr)
+{
+    Assure(cr);
+    Assure(cr->request);
+    return *cr->request;
+}
+
  TunnelStateData::TunnelStateData(ClientHttpRequest *clientRequest) :
      startTime(squid_curtime),
      destinations(new ResolvedPeers()),
@@ -393,7 +406,8 @@ TunnelStateData::TunnelStateData(ClientHttpRequest *clientRequest) :
      committedToServer(false),
      n_tries(0),
      banRetries(nullptr),
-    codeContext(CodeContext::Current())
+    codeContext(CodeContext::Current()),
+    peeringTimer(&guaranteedRequest(clientRequest))
  {
      debugs(26, 3, "TunnelStateData constructed this=" << this);
      client.readPendingFunc = &tunnelDelayedClientRead;
@@ -477,8 +491,7 @@ TunnelStateData::retryOrBail(const char *context)
  
      /* bail */
  
-    if (request)
-        request->hier.stopPeerClock(false);
+    peeringTimer.stop();
  
      // TODO: Add sendSavedErrorOr(err_type type, Http::StatusCode, context).
      // Then, the remaining method code (below) should become the common part of
@@ -1403,8 +1416,7 @@ TunnelStateData::sendError(ErrorState *finalError, const char *reason)
  {
      debugs(26, 3, "aborting transaction for " << reason);
  
-    if (request)
-        request->hier.stopPeerClock(false);
+    peeringTimer.stop();
  
      cancelStep(reason);
  
@@ -1438,15 +1450,12 @@ TunnelStateData::cancelStep(const char *reason)
  void
  TunnelStateData::startConnecting()
  {
-    if (request)
-        request->hier.startPeerClock();
-
      assert(!destinations->empty());
      assert(!transporting());
  
      delete savedError; // may still be nil
      savedError = nullptr;
-    request->hier.peer_reply_status = Http::scNone; // TODO: Move to startPeerClock()?
+    request->hier.peer_reply_status = Http::scNone;
  
      const auto callback = asyncCallback(17, 5, TunnelStateData::noteConnection, this);
      const auto cs = new HappyConnOpener(destinations, callback, request, startTime, n_tries, al);
author	Alex Rousskov <rousskov@measurement-factory.com>
	Mon, 19 Aug 2024 17:20:06 +0000 (17:20 +0000)
committer	Squid Anubis <squid-anubis@squid-cache.org>
	Tue, 20 Aug 2024 20:44:07 +0000 (20:44 +0000)
src/FwdState.cc		patch \| blob \| blame \| history
src/FwdState.h		patch \| blob \| blame \| history
src/HierarchyLogEntry.h		patch \| blob \| blame \| history
src/cf.data.pre		patch \| blob \| blame \| history
src/clients/Client.cc		patch \| blob \| blame \| history
src/format/Format.cc		patch \| blob \| blame \| history
src/log/access_log.cc		patch \| blob \| blame \| history
src/tests/stub_access_log.cc		patch \| blob \| blame \| history
src/tunnel.cc		patch \| blob \| blame \| history