src/parser/Tokenizer.cc

   1 /*
   2  * Copyright (C) 1996-2020 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 /* DEBUG: section 24    SBuf */
  10
  11 #include "squid.h"
  12 #include "Debug.h"
  13 #include "parser/forward.h"
  14 #include "parser/Tokenizer.h"
  15 #include "sbuf/Stream.h"
  16
  17 #include <cerrno>
  18 #if HAVE_CTYPE_H
  19 #include <ctype.h>
  20 #endif
  21
  22 /// convenience method: consumes up to n bytes, counts, and returns them
  23 SBuf
  24 Parser::Tokenizer::consume(const SBuf::size_type n)
  25 {
  26     // careful: n may be npos!
  27     debugs(24, 5, "consuming " << n << " bytes");
  28     const SBuf result = buf_.consume(n);
  29     parsed_ += result.length();
  30     return result;
  31 }
  32
  33 /// convenience method: consume()s up to n bytes and returns their count
  34 SBuf::size_type
  35 Parser::Tokenizer::success(const SBuf::size_type n)
  36 {
  37     return consume(n).length();
  38 }
  39
  40 /// convenience method: consumes up to n last bytes and returns them
  41 SBuf
  42 Parser::Tokenizer::consumeTrailing(const SBuf::size_type n)
  43 {
  44     debugs(24, 5, "consuming " << n << " bytes");
  45
  46     // If n is npos, we consume everything from buf_ (and nothing from result).
  47     const SBuf::size_type parsed = (n == SBuf::npos) ? buf_.length() : n;
  48
  49     SBuf result = buf_;
  50     buf_ = result.consume(buf_.length() - parsed);
  51     parsed_ += parsed;
  52     return result;
  53 }
  54
  55 /// convenience method: consumes up to n last bytes and returns their count
  56 SBuf::size_type
  57 Parser::Tokenizer::successTrailing(const SBuf::size_type n)
  58 {
  59     return consumeTrailing(n).length();
  60 }
  61
  62 bool
  63 Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
  64 {
  65     const Tokenizer saved(*this);
  66     skipAll(delimiters);
  67     const SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end
  68     if (tokenLen == SBuf::npos) {
  69         debugs(24, 8, "no token found for delimiters " << delimiters.name);
  70         *this = saved;
  71         return false;
  72     }
  73     returnedToken = consume(tokenLen); // cannot be empty
  74     skipAll(delimiters);
  75     debugs(24, DBG_DATA, "token found for delimiters " << delimiters.name << ": '" <<
  76            returnedToken << '\'');
  77     return true;
  78 }
  79
  80 bool
  81 Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
  82 {
  83     SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars);
  84     if (prefixLen == 0) {
  85         debugs(24, 8, "no prefix for set " << tokenChars.name);
  86         return false;
  87     }
  88     if (prefixLen == SBuf::npos && (atEnd() || limit == 0)) {
  89         debugs(24, 8, "no char in set " << tokenChars.name << " while looking for prefix");
  90         return false;
  91     }
  92     if (prefixLen == SBuf::npos && limit > 0) {
  93         debugs(24, 8, "whole haystack matched");
  94         prefixLen = limit;
  95     }
  96     debugs(24, 8, "found with length " << prefixLen);
  97     returnedToken = consume(prefixLen); // cannot be empty after the npos check
  98     return true;
  99 }
 100
 101 SBuf
 102 Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit)
 103 {
 104     if (atEnd())
 105         throw InsufficientInput();
 106
 107     SBuf result;
 108
 109     if (!prefix(result, tokenChars, limit))
 110         throw TexcHere(ToSBuf("cannot parse ", description));
 111
 112     if (atEnd())
 113         throw InsufficientInput();
 114
 115     return result;
 116 }
 117
 118 bool
 119 Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
 120 {
 121     SBuf span = buf_;
 122
 123     if (limit < buf_.length())
 124         span.consume(buf_.length() - limit); // ignore the N prefix characters
 125
 126     auto i = span.rbegin();
 127     SBuf::size_type found = 0;
 128     while (i != span.rend() && tokenChars[*i]) {
 129         ++i;
 130         ++found;
 131     }
 132     if (!found)
 133         return false;
 134     returnedToken = consumeTrailing(found);
 135     return true;
 136 }
 137
 138 SBuf::size_type
 139 Parser::Tokenizer::skipAll(const CharacterSet &tokenChars)
 140 {
 141     const SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
 142     if (prefixLen == 0) {
 143         debugs(24, 8, "no match when trying to skipAll " << tokenChars.name);
 144         return 0;
 145     }
 146     debugs(24, 8, "skipping all in " << tokenChars.name << " len " << prefixLen);
 147     return success(prefixLen);
 148 }
 149
 150 bool
 151 Parser::Tokenizer::skipOne(const CharacterSet &chars)
 152 {
 153     if (!buf_.isEmpty() && chars[buf_[0]]) {
 154         debugs(24, 8, "skipping one-of " << chars.name);
 155         return success(1);
 156     }
 157     debugs(24, 8, "no match while skipping one-of " << chars.name);
 158     return false;
 159 }
 160
 161 bool
 162 Parser::Tokenizer::skipSuffix(const SBuf &tokenToSkip)
 163 {
 164     if (buf_.length() < tokenToSkip.length())
 165         return false;
 166
 167     SBuf::size_type offset = 0;
 168     if (tokenToSkip.length() < buf_.length())
 169         offset = buf_.length() - tokenToSkip.length();
 170
 171     if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
 172         debugs(24, 8, "skipping " << tokenToSkip.length());
 173         return successTrailing(tokenToSkip.length());
 174     }
 175     return false;
 176 }
 177
 178 bool
 179 Parser::Tokenizer::skip(const SBuf &tokenToSkip)
 180 {
 181     if (buf_.startsWith(tokenToSkip)) {
 182         debugs(24, 8, "skipping " << tokenToSkip.length());
 183         return success(tokenToSkip.length());
 184     }
 185     debugs(24, 8, "no match, not skipping '" << tokenToSkip << '\'');
 186     return false;
 187 }
 188
 189 bool
 190 Parser::Tokenizer::skip(const char tokenChar)
 191 {
 192     if (!buf_.isEmpty() && buf_[0] == tokenChar) {
 193         debugs(24, 8, "skipping char '" << tokenChar << '\'');
 194         return success(1);
 195     }
 196     debugs(24, 8, "no match, not skipping char '" << tokenChar << '\'');
 197     return false;
 198 }
 199
 200 bool
 201 Parser::Tokenizer::skipOneTrailing(const CharacterSet &skippable)
 202 {
 203     if (!buf_.isEmpty() && skippable[buf_[buf_.length()-1]]) {
 204         debugs(24, 8, "skipping one-of " << skippable.name);
 205         return successTrailing(1);
 206     }
 207     debugs(24, 8, "no match while skipping one-of " << skippable.name);
 208     return false;
 209 }
 210
 211 SBuf::size_type
 212 Parser::Tokenizer::skipAllTrailing(const CharacterSet &skippable)
 213 {
 214     const SBuf::size_type prefixEnd = buf_.findLastNotOf(skippable);
 215     const SBuf::size_type prefixLen = prefixEnd == SBuf::npos ?
 216                                       0 : (prefixEnd + 1);
 217     const SBuf::size_type suffixLen = buf_.length() - prefixLen;
 218     if (suffixLen == 0) {
 219         debugs(24, 8, "no match when trying to skip " << skippable.name);
 220         return 0;
 221     }
 222     debugs(24, 8, "skipping in " << skippable.name << " len " << suffixLen);
 223     return successTrailing(suffixLen);
 224 }
 225
 226 /* reworked from compat/strtoll.c */
 227 bool
 228 Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf::size_type limit)
 229 {
 230     if (atEnd() || limit == 0)
 231         return false;
 232
 233     const SBuf range(buf_.substr(0,limit));
 234
 235     // XXX: account for buf_.size()
 236     bool neg = false;
 237     const char *s = range.rawContent();
 238     const char *end = range.rawContent() + range.length();
 239
 240     if (allowSign) {
 241         if (*s == '-') {
 242             neg = true;
 243             ++s;
 244         } else if (*s == '+') {
 245             ++s;
 246         }
 247         if (s >= end) return false;
 248     }
 249     if (( base == 0 || base == 16) && *s == '0' && (s+1 < end ) &&
 250             tolower(*(s+1)) == 'x') {
 251         s += 2;
 252         base = 16;
 253     }
 254     if (base == 0) {
 255         if ( *s == '0') {
 256             base = 8;
 257             ++s;
 258         } else {
 259             base = 10;
 260         }
 261     }
 262     if (s >= end) return false;
 263
 264     uint64_t cutoff;
 265
 266     cutoff = neg ? -static_cast<uint64_t>(INT64_MIN) : INT64_MAX;
 267     const int cutlim = cutoff % static_cast<int64_t>(base);
 268     cutoff /= static_cast<uint64_t>(base);
 269
 270     int any = 0, c;
 271     int64_t acc = 0;
 272     do {
 273         c = *s;
 274         if (xisdigit(c)) {
 275             c -= '0';
 276         } else if (xisalpha(c)) {
 277             c -= xisupper(c) ? 'A' - 10 : 'a' - 10;
 278         } else {
 279             break;
 280         }
 281         if (c >= base)
 282             break;
 283         if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim))
 284             any = -1;
 285         else {
 286             any = 1;
 287             acc *= base;
 288             acc += c;
 289         }
 290     } while (++s < end);
 291
 292     if (any == 0) // nothing was parsed
 293         return false;
 294     if (any < 0) {
 295         acc = neg ? INT64_MIN : INT64_MAX;
 296         errno = ERANGE;
 297         return false;
 298     } else if (neg)
 299         acc = -acc;
 300
 301     result = acc;
 302     return success(s - range.rawContent());
 303 }
 304
 305 int64_t
 306 Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit)
 307 {
 308     if (atEnd())
 309         throw InsufficientInput();
 310
 311     int64_t result = 0;
 312
 313     // Since we only support unsigned decimals, a parsing failure with a
 314     // non-empty input always implies invalid/malformed input (or a buggy
 315     // limit=0 caller). TODO: Support signed and non-decimal integers by
 316     // refactoring int64() to detect insufficient input.
 317     if (!int64(result, 10, false, limit))
 318         throw TexcHere(ToSBuf("cannot parse ", description));
 319
 320     if (atEnd())
 321         throw InsufficientInput(); // more digits may be coming
 322
 323     return result;
 324 }
 325