/*
 * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

/* DEBUG: section 24    SBuf */

#include "squid.h"
#include "debug/Stream.h"
#include "parser/forward.h"
#include "parser/Tokenizer.h"
#include "sbuf/Stream.h"

#include <cctype>
#include <cerrno>
/// convenience method: consumes up to n bytes, counts, and returns them
SBuf
Parser::Tokenizer::consume(const SBuf::size_type n)
{
    // careful: n may be npos!
    debugs(24, 5, "consuming " << n << " bytes");
    const SBuf result = buf_.consume(n);
    parsed_ += result.length();
    return result;
}
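
// Note on consume() above: SBuf::consume() caps its argument at the buffered
// length, so consume(SBuf::npos) simply takes (and counts) everything left.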

/// convenience method: consume()s up to n bytes and returns their count
SBuf::size_type
Parser::Tokenizer::success(const SBuf::size_type n)
{
    return consume(n).length();
}

/// convenience method: consumes up to n last bytes and returns them
SBuf
Parser::Tokenizer::consumeTrailing(const SBuf::size_type n)
{
    debugs(24, 5, "consuming " << n << " bytes");

    // If n is npos, we consume everything from buf_ (and nothing from result).
    const SBuf::size_type parsed = (n == SBuf::npos) ? buf_.length() : n;

    SBuf result = buf_;
    buf_ = result.consume(buf_.length() - parsed);
    parsed_ += parsed;
    return result;
}
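
// Worked example for consumeTrailing() (hypothetical values): with
// buf_ == "hello", consumeTrailing(3) returns "llo" and keeps "he":
// result starts as a copy of the whole buffer, and result.consume(2)
// moves the two leading bytes back into buf_.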

/// convenience method: consumes up to n last bytes and returns their count
SBuf::size_type
Parser::Tokenizer::successTrailing(const SBuf::size_type n)
{
    return consumeTrailing(n).length();
}

bool
Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
{
    const Tokenizer saved(*this);
    skipAll(delimiters);
    const SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // npos means no terminating delimiter in buf_
    if (tokenLen == SBuf::npos) {
        debugs(24, 8, "no token found for delimiters " << delimiters.name);
        *this = saved;
        return false;
    }
    returnedToken = consume(tokenLen); // cannot be empty
    skipAll(delimiters);
    debugs(24, DBG_DATA, "token found for delimiters " << delimiters.name << ": '" <<
           returnedToken << '\'');
    return true;
}
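
// Usage sketch for token() (hypothetical): tokenizing " foo bar " with
// space-like delimiters (e.g., CharacterSet::WSP) yields "foo" and then
// "bar"; a third call returns false and restores the tokenizer because
// no delimiter-terminated token remains.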

bool
Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
{
    SBuf::size_type prefixLen = buf_.substr(0, limit).findFirstNotOf(tokenChars);
    if (prefixLen == 0) {
        debugs(24, 8, "no prefix for set " << tokenChars.name);
        return false;
    }
    if (prefixLen == SBuf::npos && (atEnd() || limit == 0)) {
        debugs(24, 8, "no char in set " << tokenChars.name << " while looking for prefix");
        return false;
    }
    if (prefixLen == SBuf::npos && limit > 0) {
        debugs(24, 8, "whole haystack matched");
        prefixLen = limit;
    }
    debugs(24, 8, "found with length " << prefixLen);
    returnedToken = consume(prefixLen); // cannot be empty after the npos check
    return true;
}
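
// Usage sketch for prefix() (hypothetical): with buf_ == "123abc",
// prefix(digits, CharacterSet::DIGIT) consumes "123" and leaves "abc"
// buffered; it returns false when the first byte is not in tokenChars.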

SBuf
Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit)
{
    if (atEnd())
        throw InsufficientInput();

    SBuf result;

    if (!prefix(result, tokenChars, limit))
        throw TexcHere(ToSBuf("cannot parse ", description));

    if (atEnd())
        throw InsufficientInput();

    return result;
}
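
// Note on the throwing prefix() overload above: the final atEnd() check
// requires a byte beyond the parsed prefix, because a token ending exactly
// at the buffer edge may be a truncated version of a longer token still
// arriving; the caller gets InsufficientInput instead of a partial result.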

bool
Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
{
    SBuf span = buf_;

    if (limit < buf_.length())
        span.consume(buf_.length() - limit); // ignore the N prefix characters

    // count token characters from the end of the span backwards
    auto i = span.rbegin();
    SBuf::size_type found = 0;
    while (i != span.rend() && tokenChars[*i]) {
        ++i;
        ++found;
    }
    if (!found)
        return false;
    returnedToken = consumeTrailing(found);
    return true;
}
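
// Usage sketch for suffix() (hypothetical): with buf_ == "filename.txt",
// suffix(ext, CharacterSet::ALPHA) consumes "txt" and leaves "filename."
// buffered.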

SBuf::size_type
Parser::Tokenizer::skipAll(const CharacterSet &tokenChars)
{
    const SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
    if (prefixLen == 0) {
        debugs(24, 8, "no match when trying to skipAll " << tokenChars.name);
        return 0;
    }
    debugs(24, 8, "skipping all in " << tokenChars.name << " len " << prefixLen);
    return success(prefixLen);
}

void
Parser::Tokenizer::skipRequired(const char *description, const SBuf &tokenToSkip)
{
    if (skip(tokenToSkip) || tokenToSkip.isEmpty())
        return;

    // buf_ is a proper prefix of tokenToSkip: more input may complete the match
    if (tokenToSkip.startsWith(buf_))
        throw InsufficientInput();

    throw TextException(ToSBuf("cannot skip ", description), Here());
}

bool
Parser::Tokenizer::skipOne(const CharacterSet &chars)
{
    if (!buf_.isEmpty() && chars[buf_[0]]) {
        debugs(24, 8, "skipping one-of " << chars.name);
        return success(1);
    }
    debugs(24, 8, "no match while skipping one-of " << chars.name);
    return false;
}

bool
Parser::Tokenizer::skipSuffix(const SBuf &tokenToSkip)
{
    if (buf_.length() < tokenToSkip.length())
        return false;

    SBuf::size_type offset = 0;
    if (tokenToSkip.length() < buf_.length())
        offset = buf_.length() - tokenToSkip.length();

    if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
        debugs(24, 8, "skipping " << tokenToSkip.length());
        return successTrailing(tokenToSkip.length());
    }
    return false;
}
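
// Usage sketch for skipSuffix() (hypothetical): with buf_ == "data\r\n",
// skipSuffix(SBuf("\r\n")) returns true and leaves "data" buffered.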

bool
Parser::Tokenizer::skip(const SBuf &tokenToSkip)
{
    if (buf_.startsWith(tokenToSkip)) {
        debugs(24, 8, "skipping " << tokenToSkip.length());
        return success(tokenToSkip.length());
    }
    debugs(24, 8, "no match, not skipping '" << tokenToSkip << '\'');
    return false;
}

bool
Parser::Tokenizer::skip(const char tokenChar)
{
    if (!buf_.isEmpty() && buf_[0] == tokenChar) {
        debugs(24, 8, "skipping char '" << tokenChar << '\'');
        return success(1);
    }
    debugs(24, 8, "no match, not skipping char '" << tokenChar << '\'');
    return false;
}

bool
Parser::Tokenizer::skipOneTrailing(const CharacterSet &skippable)
{
    if (!buf_.isEmpty() && skippable[buf_[buf_.length()-1]]) {
        debugs(24, 8, "skipping one-of " << skippable.name);
        return successTrailing(1);
    }
    debugs(24, 8, "no match while skipping one-of " << skippable.name);
    return false;
}

SBuf::size_type
Parser::Tokenizer::skipAllTrailing(const CharacterSet &skippable)
{
    // the last byte not in skippable ends the kept prefix;
    // npos means the entire buffer is skippable
    const SBuf::size_type prefixEnd = buf_.findLastNotOf(skippable);
    const SBuf::size_type prefixLen = prefixEnd == SBuf::npos ?
                                      0 : (prefixEnd + 1);
    const SBuf::size_type suffixLen = buf_.length() - prefixLen;
    if (suffixLen == 0) {
        debugs(24, 8, "no match when trying to skip " << skippable.name);
        return 0;
    }
    debugs(24, 8, "skipping in " << skippable.name << " len " << suffixLen);
    return successTrailing(suffixLen);
}
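
// Usage sketch for skipAllTrailing() (hypothetical): with buf_ == "value   ",
// skipAllTrailing(CharacterSet::WSP) returns 3 and leaves "value" buffered.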

/* reworked from compat/strtoll.c */
bool
Parser::Tokenizer::int64(int64_t &result, int base, bool allowSign, const SBuf::size_type limit)
{
    if (atEnd() || limit == 0)
        return false;

    const SBuf range(buf_.substr(0, limit));

    // XXX: account for buf_.size()
    bool neg = false;
    const char *s = range.rawContent();
    const char *end = range.rawContent() + range.length();

    if (allowSign) {
        if (*s == '-') {
            neg = true;
            ++s;
        } else if (*s == '+') {
            ++s;
        }
        if (s >= end) return false;
    }
    if ((base == 0 || base == 16) && *s == '0' && (s+1 < end) &&
            tolower(*(s+1)) == 'x') {
        s += 2;
        base = 16;
    }
    if (base == 0) {
        if (*s == '0') {
            base = 8;
            ++s;
        } else {
            base = 10;
        }
    }
    if (s >= end) return false;

    // to catch overflow before it happens, compute the largest accumulator
    // value (cutoff) and final digit (cutlim) that can still be accepted
    uint64_t cutoff = neg ? -static_cast<uint64_t>(INT64_MIN) : INT64_MAX;
    const int cutlim = cutoff % static_cast<int64_t>(base);
    cutoff /= static_cast<uint64_t>(base);

    int any = 0, c;
    int64_t acc = 0;
    do {
        c = *s;
        if (xisdigit(c)) {
            c -= '0';
        } else if (xisalpha(c)) {
            c -= xisupper(c) ? 'A' - 10 : 'a' - 10;
        } else {
            break;
        }
        if (c >= base)
            break;
        if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim))
            any = -1; // overflow: remember it, but keep scanning the digits
        else {
            any = 1;
            acc *= base;
            acc += c;
        }
    } while (++s < end);

    if (any == 0) // nothing was parsed
        return false;
    if (any < 0) { // the accumulated value overflowed int64_t
        acc = neg ? INT64_MIN : INT64_MAX;
        errno = ERANGE;
        return false;
    } else if (neg)
        acc = -acc;

    result = acc;
    return success(s - range.rawContent());
}
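
// Worked example for int64() (hypothetical inputs): with base == 0, "0x1a"
// parses as hexadecimal 26 and "017" as octal 15; a decimal just above
// INT64_MAX, such as "9223372036854775808", trips the cutoff/cutlim test,
// sets errno to ERANGE, and fails without consuming anything.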

int64_t
Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit)
{
    if (atEnd())
        throw InsufficientInput();

    int64_t result = 0;

    // Since we only support unsigned decimals, a parsing failure with a
    // non-empty input always implies invalid/malformed input (or a buggy
    // limit=0 caller). TODO: Support signed and non-decimal integers by
    // refactoring int64() to detect insufficient input.
    if (!int64(result, 10, false, limit))
        throw TexcHere(ToSBuf("cannot parse ", description));

    if (atEnd())
        throw InsufficientInput(); // more digits may be coming

    return result;
}
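
// Usage sketch for udec64() (hypothetical): on "1234 " it returns 1234 and
// leaves " " buffered; on a bare "1234" it throws InsufficientInput because
// the next read could still append more digits.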