Maintenance: Remove FIXME and \todo labels (#647)

[thirdparty/squid.git] / src / parser / Tokenizer.cc
diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc

index ecb27296314b7869e2b7e2577767ad0adb01f26b..09369f1611acdc97b12202761d7dcbb530b1670c 100644 (file)
--- a/src/parser/Tokenizer.cc
+++ b/src/parser/Tokenizer.cc
@@ -1,110 +1,252 @@
+/*
+ * Copyright (C) 1996-2020 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+/* DEBUG: section 24    SBuf */
+
  #include "squid.h"
+#include "Debug.h"
+#include "parser/forward.h"
  #include "parser/Tokenizer.h"
+#include "sbuf/Stream.h"
  
  #include <cerrno>
  #if HAVE_CTYPE_H
  #include <ctype.h>
  #endif
-#if HAVE_STDINT_H
-#include <stdint.h>
-#endif
-#ifndef INT64_MIN
-/* Native 64 bit system without strtoll() */
-#if defined(LONG_MIN) && (SIZEOF_LONG == 8)
-#define INT64_MIN LONG_MIN
-#else
-/* 32 bit system */
-#define INT64_MIN       (-9223372036854775807LL-1LL)
-#endif
-#endif
  
-#ifndef INT64_MAX
-/* Native 64 bit system without strtoll() */
-#if defined(LONG_MAX) && (SIZEOF_LONG == 8)
-#define INT64_MAX LONG_MAX
-#else
-/* 32 bit system */
-#define INT64_MAX       9223372036854775807LL
-#endif
-#endif
+/// convenience method: consumes up to n bytes, counts, and returns them
+SBuf
+Parser::Tokenizer::consume(const SBuf::size_type n)
+{
+    // careful: n may be npos!
+    debugs(24, 5, "consuming " << n << " bytes");
+    const SBuf result = buf_.consume(n);
+    parsed_ += result.length();
+    return result;
+}
+
+/// convenience method: consume()s up to n bytes and returns their count
+SBuf::size_type
+Parser::Tokenizer::success(const SBuf::size_type n)
+{
+    return consume(n).length();
+}
+
+/// convenience method: consumes up to n last bytes and returns them
+SBuf
+Parser::Tokenizer::consumeTrailing(const SBuf::size_type n)
+{
+    debugs(24, 5, "consuming " << n << " bytes");
+
+    // If n is npos, we consume everything from buf_ (and nothing from result).
+    const SBuf::size_type parsed = (n == SBuf::npos) ? buf_.length() : n;
+
+    SBuf result = buf_;
+    buf_ = result.consume(buf_.length() - parsed);
+    parsed_ += parsed;
+    return result;
+}
+
+/// convenience method: consumes up to n last bytes and returns their count
+SBuf::size_type
+Parser::Tokenizer::successTrailing(const SBuf::size_type n)
+{
+    return consumeTrailing(n).length();
+}
  
  bool
  Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
  {
-    const SBuf savebuf(buf_);
-    skip(delimiters);
+    const Tokenizer saved(*this);
+    skipAll(delimiters);
      const SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end
-    if (tokenLen == SBuf::npos && !delimiters['\0']) {
-        // no delimiter found, nor is NUL/EOS/npos acceptible as one
-        buf_ = savebuf;
+    if (tokenLen == SBuf::npos) {
+        debugs(24, 8, "no token found for delimiters " << delimiters.name);
+        *this = saved;
          return false;
      }
-    const SBuf retval = buf_.consume(tokenLen);
-    skip(delimiters);
-    returnedToken = retval;
+    returnedToken = consume(tokenLen); // cannot be empty
+    skipAll(delimiters);
+    debugs(24, DBG_DATA, "token found for delimiters " << delimiters.name << ": '" <<
+           returnedToken << '\'');
      return true;
  }
  
  bool
  Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
  {
-    const SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars);
-    if (prefixLen == 0)
+    SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars);
+    if (prefixLen == 0) {
+        debugs(24, 8, "no prefix for set " << tokenChars.name);
          return false;
-    returnedToken = buf_.consume(prefixLen);
+    }
+    if (prefixLen == SBuf::npos && (atEnd() || limit == 0)) {
+        debugs(24, 8, "no char in set " << tokenChars.name << " while looking for prefix");
+        return false;
+    }
+    if (prefixLen == SBuf::npos && limit > 0) {
+        debugs(24, 8, "whole haystack matched");
+        prefixLen = limit;
+    }
+    debugs(24, 8, "found with length " << prefixLen);
+    returnedToken = consume(prefixLen); // cannot be empty after the npos check
      return true;
  }
  
+SBuf
+Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit)
+{
+    if (atEnd())
+        throw InsufficientInput();
+
+    SBuf result;
+
+    if (!prefix(result, tokenChars, limit))
+        throw TexcHere(ToSBuf("cannot parse ", description));
+
+    if (atEnd())
+        throw InsufficientInput();
+
+    return result;
+}
+
  bool
-Parser::Tokenizer::skip(const CharacterSet &tokenChars)
+Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
  {
-    const SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
-    if (prefixLen == 0)
+    SBuf span = buf_;
+
+    if (limit < buf_.length())
+        span.consume(buf_.length() - limit); // ignore the N prefix characters
+
+    auto i = span.rbegin();
+    SBuf::size_type found = 0;
+    while (i != span.rend() && tokenChars[*i]) {
+        ++i;
+        ++found;
+    }
+    if (!found)
          return false;
-    buf_.consume(prefixLen);
+    returnedToken = consumeTrailing(found);
      return true;
  }
  
+SBuf::size_type
+Parser::Tokenizer::skipAll(const CharacterSet &tokenChars)
+{
+    const SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
+    if (prefixLen == 0) {
+        debugs(24, 8, "no match when trying to skipAll " << tokenChars.name);
+        return 0;
+    }
+    debugs(24, 8, "skipping all in " << tokenChars.name << " len " << prefixLen);
+    return success(prefixLen);
+}
+
+bool
+Parser::Tokenizer::skipOne(const CharacterSet &chars)
+{
+    if (!buf_.isEmpty() && chars[buf_[0]]) {
+        debugs(24, 8, "skipping one-of " << chars.name);
+        return success(1);
+    }
+    debugs(24, 8, "no match while skipping one-of " << chars.name);
+    return false;
+}
+
+bool
+Parser::Tokenizer::skipSuffix(const SBuf &tokenToSkip)
+{
+    if (buf_.length() < tokenToSkip.length())
+        return false;
+
+    SBuf::size_type offset = 0;
+    if (tokenToSkip.length() < buf_.length())
+        offset = buf_.length() - tokenToSkip.length();
+
+    if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
+        debugs(24, 8, "skipping " << tokenToSkip.length());
+        return successTrailing(tokenToSkip.length());
+    }
+    return false;
+}
+
  bool
  Parser::Tokenizer::skip(const SBuf &tokenToSkip)
  {
      if (buf_.startsWith(tokenToSkip)) {
-        buf_.consume(tokenToSkip.length());
-        return true;
+        debugs(24, 8, "skipping " << tokenToSkip.length());
+        return success(tokenToSkip.length());
      }
+    debugs(24, 8, "no match, not skipping '" << tokenToSkip << '\'');
      return false;
  }
  
  bool
  Parser::Tokenizer::skip(const char tokenChar)
  {
-    if (buf_[0] == tokenChar) {
-        buf_.consume(1);
-        return true;
+    if (!buf_.isEmpty() && buf_[0] == tokenChar) {
+        debugs(24, 8, "skipping char '" << tokenChar << '\'');
+        return success(1);
+    }
+    debugs(24, 8, "no match, not skipping char '" << tokenChar << '\'');
+    return false;
+}
+
+bool
+Parser::Tokenizer::skipOneTrailing(const CharacterSet &skippable)
+{
+    if (!buf_.isEmpty() && skippable[buf_[buf_.length()-1]]) {
+        debugs(24, 8, "skipping one-of " << skippable.name);
+        return successTrailing(1);
      }
+    debugs(24, 8, "no match while skipping one-of " << skippable.name);
      return false;
  }
  
+SBuf::size_type
+Parser::Tokenizer::skipAllTrailing(const CharacterSet &skippable)
+{
+    const SBuf::size_type prefixEnd = buf_.findLastNotOf(skippable);
+    const SBuf::size_type prefixLen = prefixEnd == SBuf::npos ?
+                                      0 : (prefixEnd + 1);
+    const SBuf::size_type suffixLen = buf_.length() - prefixLen;
+    if (suffixLen == 0) {
+        debugs(24, 8, "no match when trying to skip " << skippable.name);
+        return 0;
+    }
+    debugs(24, 8, "skipping in " << skippable.name << " len " << suffixLen);
+    return successTrailing(suffixLen);
+}
+
  /* reworked from compat/strtoll.c */
  bool
-Parser::Tokenizer::int64(int64_t & result, int base)
+Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf::size_type limit)
  {
-    if (buf_.isEmpty())
+    if (atEnd() || limit == 0)
          return false;
  
-    //fixme: account for buf_.size()
+    const SBuf range(buf_.substr(0,limit));
+
+    // XXX: account for buf_.size()
      bool neg = false;
-    const char *s = buf_.rawContent();
-    const char *end = buf_.rawContent() + buf_.length();
-
-    if (*s == '-') {
-        neg = true;
-        ++s;
-    } else if (*s == '+') {
-        ++s;
+    const char *s = range.rawContent();
+    const char *end = range.rawContent() + range.length();
+
+    if (allowSign) {
+        if (*s == '-') {
+            neg = true;
+            ++s;
+        } else if (*s == '+') {
+            ++s;
+        }
+        if (s >= end) return false;
      }
-    if (s >= end) return false;
-    if (( base == 0 || base == 16) && *s == '0' && (s+1 <= end ) &&
+    if (( base == 0 || base == 16) && *s == '0' && (s+1 < end ) &&
              tolower(*(s+1)) == 'x') {
          s += 2;
          base = 16;
@@ -127,7 +269,8 @@ Parser::Tokenizer::int64(int64_t & result, int base)
  
      int any = 0, c;
      int64_t acc = 0;
-    for (c = *s++; s <= end; c = *s++) {
+    do {
+        c = *s;
          if (xisdigit(c)) {
              c -= '0';
          } else if (xisalpha(c)) {
@@ -144,7 +287,7 @@ Parser::Tokenizer::int64(int64_t & result, int base)
              acc *= base;
              acc += c;
          }
-    }
+    } while (++s < end);
  
      if (any == 0) // nothing was parsed
          return false;
@@ -156,6 +299,27 @@ Parser::Tokenizer::int64(int64_t & result, int base)
          acc = -acc;
  
      result = acc;
-    buf_.consume(s - buf_.rawContent() -1);
-    return true;
+    return success(s - range.rawContent());
+}
+
+int64_t
+Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit)
+{
+    if (atEnd())
+        throw InsufficientInput();
+
+    int64_t result = 0;
+
+    // Since we only support unsigned decimals, a parsing failure with a
+    // non-empty input always implies invalid/malformed input (or a buggy
+    // limit=0 caller). TODO: Support signed and non-decimal integers by
+    // refactoring int64() to detect insufficient input.
+    if (!int64(result, 10, false, limit))
+        throw TexcHere(ToSBuf("cannot parse ", description));
+
+    if (atEnd())
+        throw InsufficientInput(); // more digits may be coming
+
+    return result;
  }
+