src/tests/testTokenizer.cc

   1 /*
   2  * Copyright (C) 1996-2021 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 #include "squid.h"
  10 #include "base/CharacterSet.h"
  11 #include "parser/Tokenizer.h"
  12 #include "tests/testTokenizer.h"
  13 #include "unitTestMain.h"
  14
  15 CPPUNIT_TEST_SUITE_REGISTRATION( testTokenizer );
  16
  17 SBuf text("GET http://resource.com/path HTTP/1.1\r\n"
  18           "Host: resource.com\r\n"
  19           "Cookie: laijkpk3422r j1noin \r\n"
  20           "\r\n");
  21 const CharacterSet alpha("alpha","abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
  22 const CharacterSet whitespace("whitespace"," \r\n");
  23 const CharacterSet crlf("crlf","\r\n");
  24 const CharacterSet tab("tab","\t");
  25 const CharacterSet numbers("numbers","0123456789");
  26
  27 void
  28 testTokenizer::testTokenizerPrefix()
  29 {
  30     const SBuf canary("This text should not be changed.");
  31
  32     Parser::Tokenizer t(text);
  33     SBuf s;
  34
  35     CharacterSet all(whitespace);
  36     all += alpha;
  37     all += crlf;
  38     all += numbers;
  39     all.add(':').add('.').add('/');
  40
  41     // an empty prefix should return false (the full output buffer case)
  42     s = canary;
  43     const SBuf before = t.remaining();
  44     CPPUNIT_ASSERT(!t.prefix(s, all, 0));
  45     // ... and a false return value means no parameter changes
  46     CPPUNIT_ASSERT_EQUAL(canary, s);
  47     // ... and a false return value means no input buffer changes
  48     CPPUNIT_ASSERT_EQUAL(before, t.remaining());
  49
  50     // successful prefix tokenization
  51     CPPUNIT_ASSERT(t.prefix(s,alpha));
  52     CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
  53     CPPUNIT_ASSERT(t.prefix(s,whitespace));
  54     CPPUNIT_ASSERT_EQUAL(SBuf(" "),s);
  55
  56     //no match (first char is not in the prefix set)
  57     CPPUNIT_ASSERT(!t.prefix(s,whitespace));
  58     CPPUNIT_ASSERT_EQUAL(SBuf(" "),s);
  59
  60     // one more match to set S to something meaningful
  61     CPPUNIT_ASSERT(t.prefix(s,alpha));
  62     CPPUNIT_ASSERT_EQUAL(SBuf("http"),s);
  63
  64     //no match (no characters from the character set in the prefix)
  65     CPPUNIT_ASSERT(!t.prefix(s,tab));
  66     CPPUNIT_ASSERT_EQUAL(SBuf("http"),s); //output SBuf left untouched
  67
  68     // match until the end of the sample
  69     CPPUNIT_ASSERT(t.prefix(s,all));
  70     CPPUNIT_ASSERT_EQUAL(SBuf(),t.remaining());
  71
  72     // empty prefix should return false (the empty input buffer case)
  73     s = canary;
  74     CPPUNIT_ASSERT(!t.prefix(s, all));
  75     // ... and a false return value means no parameter changes
  76     CPPUNIT_ASSERT_EQUAL(canary, s);
  77 }
  78
  79 void
  80 testTokenizer::testTokenizerSkip()
  81 {
  82     Parser::Tokenizer t(text);
  83     SBuf s;
  84
  85     // first scenario: patterns match
  86     // prep for test
  87     CPPUNIT_ASSERT(t.prefix(s,alpha));
  88     CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
  89
  90     // test skipping one character from a character set
  91     CPPUNIT_ASSERT(t.skipOne(whitespace));
  92     // check that skip was right
  93     CPPUNIT_ASSERT(t.prefix(s,alpha));
  94     CPPUNIT_ASSERT_EQUAL(SBuf("http"),s);
  95
  96     //check skip prefix
  97     CPPUNIT_ASSERT(t.skip(SBuf("://")));
  98     // verify
  99     CPPUNIT_ASSERT(t.prefix(s,alpha));
 100     CPPUNIT_ASSERT_EQUAL(SBuf("resource"),s);
 101
 102     // no skip
 103     CPPUNIT_ASSERT(!t.skipOne(alpha));
 104     CPPUNIT_ASSERT(!t.skip(SBuf("://")));
 105     CPPUNIT_ASSERT(!t.skip('a'));
 106
 107     // test skipping all characters from a character set while looking at .com
 108     CPPUNIT_ASSERT(t.skip('.'));
 109     CPPUNIT_ASSERT_EQUAL(static_cast<SBuf::size_type>(3), t.skipAll(alpha));
 110     CPPUNIT_ASSERT(t.remaining().startsWith(SBuf("/path")));
 111 }
 112
 113 void
 114 testTokenizer::testTokenizerToken()
 115 {
 116     Parser::Tokenizer t(text);
 117     SBuf s;
 118
 119     // first scenario: patterns match
 120     CPPUNIT_ASSERT(t.token(s,whitespace));
 121     CPPUNIT_ASSERT_EQUAL(SBuf("GET"),s);
 122     CPPUNIT_ASSERT(t.token(s,whitespace));
 123     CPPUNIT_ASSERT_EQUAL(SBuf("http://resource.com/path"),s);
 124     CPPUNIT_ASSERT(t.token(s,whitespace));
 125     CPPUNIT_ASSERT_EQUAL(SBuf("HTTP/1.1"),s);
 126     CPPUNIT_ASSERT(t.token(s,whitespace));
 127     CPPUNIT_ASSERT_EQUAL(SBuf("Host:"),s);
 128
 129 }
 130
 131 void
 132 testTokenizer::testTokenizerSuffix()
 133 {
 134     const SBuf canary("This text should not be changed.");
 135
 136     Parser::Tokenizer t(text);
 137     SBuf s;
 138
 139     CharacterSet all(whitespace);
 140     all += alpha;
 141     all += crlf;
 142     all += numbers;
 143     all.add(':').add('.').add('/');
 144
 145     // an empty suffix should return false (the full output buffer case)
 146     s = canary;
 147     const SBuf before = t.remaining();
 148     CPPUNIT_ASSERT(!t.suffix(s, all, 0));
 149     // ... and a false return value means no parameter changes
 150     CPPUNIT_ASSERT_EQUAL(canary, s);
 151     // ... and a false return value means no input buffer changes
 152     CPPUNIT_ASSERT_EQUAL(before, t.remaining());
 153
 154     // consume suffix until the last CRLF, including that last CRLF
 155     SBuf::size_type remaining = t.remaining().length();
 156     while (t.remaining().findLastOf(crlf) != SBuf::npos) {
 157         CPPUNIT_ASSERT(t.remaining().length() > 0);
 158         CPPUNIT_ASSERT(t.skipOneTrailing(all));
 159         // ensure steady progress
 160         CPPUNIT_ASSERT_EQUAL(remaining, t.remaining().length() + 1);
 161         --remaining;
 162     }
 163
 164     // no match (last char is not in the suffix set)
 165     CPPUNIT_ASSERT(!t.suffix(s, crlf));
 166     CPPUNIT_ASSERT(!t.suffix(s, whitespace));
 167
 168     // successful suffix tokenization
 169     CPPUNIT_ASSERT(t.suffix(s, numbers));
 170     CPPUNIT_ASSERT_EQUAL(SBuf("1"), s);
 171     CPPUNIT_ASSERT(t.skipSuffix(SBuf("1.")));
 172     CPPUNIT_ASSERT(t.skipSuffix(SBuf("/")));
 173     CPPUNIT_ASSERT(t.suffix(s, alpha));
 174     CPPUNIT_ASSERT_EQUAL(SBuf("HTTP"), s);
 175     CPPUNIT_ASSERT(t.suffix(s, whitespace));
 176     CPPUNIT_ASSERT_EQUAL(SBuf(" "), s);
 177
 178     // match until the end of the sample
 179     CPPUNIT_ASSERT(t.suffix(s, all));
 180     CPPUNIT_ASSERT_EQUAL(SBuf(), t.remaining());
 181
 182     // an empty buffer does not end with a token
 183     s = canary;
 184     CPPUNIT_ASSERT(!t.suffix(s, all));
 185     CPPUNIT_ASSERT_EQUAL(canary, s); // no parameter changes
 186
 187     // we cannot skip an empty suffix, even in an empty buffer
 188     CPPUNIT_ASSERT(!t.skipSuffix(SBuf()));
 189 }
 190
 191 void
 192 testTokenizer::testCharacterSet()
 193 {
 194
 195 }
 196
 197 void
 198 testTokenizer::testTokenizerInt64()
 199 {
 200     // successful parse in base 10
 201     {
 202         int64_t rv;
 203         Parser::Tokenizer t(SBuf("1234"));
 204         const int64_t benchmark = 1234;
 205         CPPUNIT_ASSERT(t.int64(rv, 10));
 206         CPPUNIT_ASSERT_EQUAL(benchmark,rv);
 207         CPPUNIT_ASSERT(t.buf().isEmpty());
 208     }
 209
 210     // successful parse, autodetect base
 211     {
 212         int64_t rv;
 213         Parser::Tokenizer t(SBuf("1234"));
 214         const int64_t benchmark = 1234;
 215         CPPUNIT_ASSERT(t.int64(rv));
 216         CPPUNIT_ASSERT_EQUAL(benchmark,rv);
 217         CPPUNIT_ASSERT(t.buf().isEmpty());
 218     }
 219
 220     // successful parse, autodetect base
 221     {
 222         int64_t rv;
 223         Parser::Tokenizer t(SBuf("01234"));
 224         const int64_t benchmark = 01234;
 225         CPPUNIT_ASSERT(t.int64(rv));
 226         CPPUNIT_ASSERT_EQUAL(benchmark,rv);
 227         CPPUNIT_ASSERT(t.buf().isEmpty());
 228     }
 229
 230     // successful parse, autodetect base
 231     {
 232         int64_t rv;
 233         Parser::Tokenizer t(SBuf("0x12f4"));
 234         const int64_t benchmark = 0x12f4;
 235         CPPUNIT_ASSERT(t.int64(rv));
 236         CPPUNIT_ASSERT_EQUAL(benchmark,rv);
 237         CPPUNIT_ASSERT(t.buf().isEmpty());
 238     }
 239
 240     // API mismatch: don't eat leading space
 241     {
 242         int64_t rv;
 243         Parser::Tokenizer t(SBuf(" 1234"));
 244         CPPUNIT_ASSERT(!t.int64(rv));
 245         CPPUNIT_ASSERT_EQUAL(SBuf(" 1234"), t.buf());
 246     }
 247
 248     // API mismatch: don't eat multiple leading spaces
 249     {
 250         int64_t rv;
 251         Parser::Tokenizer t(SBuf("  1234"));
 252         CPPUNIT_ASSERT(!t.int64(rv));
 253         CPPUNIT_ASSERT_EQUAL(SBuf("  1234"), t.buf());
 254     }
 255
 256     // trailing spaces
 257     {
 258         int64_t rv;
 259         Parser::Tokenizer t(SBuf("1234  foo"));
 260         const int64_t benchmark = 1234;
 261         CPPUNIT_ASSERT(t.int64(rv));
 262         CPPUNIT_ASSERT_EQUAL(benchmark,rv);
 263         CPPUNIT_ASSERT_EQUAL(SBuf("  foo"), t.buf());
 264     }
 265
 266     // trailing nonspaces
 267     {
 268         int64_t rv;
 269         Parser::Tokenizer t(SBuf("1234foo"));
 270         const int64_t benchmark = 1234;
 271         CPPUNIT_ASSERT(t.int64(rv));
 272         CPPUNIT_ASSERT_EQUAL(benchmark,rv);
 273         CPPUNIT_ASSERT_EQUAL(SBuf("foo"), t.buf());
 274     }
 275
 276     // trailing nonspaces
 277     {
 278         int64_t rv;
 279         Parser::Tokenizer t(SBuf("0x1234foo"));
 280         const int64_t benchmark = 0x1234f;
 281         CPPUNIT_ASSERT(t.int64(rv));
 282         CPPUNIT_ASSERT_EQUAL(benchmark,rv);
 283         CPPUNIT_ASSERT_EQUAL(SBuf("oo"), t.buf());
 284     }
 285
 286     // overflow
 287     {
 288         int64_t rv;
 289         Parser::Tokenizer t(SBuf("1029397752385698678762234"));
 290         CPPUNIT_ASSERT(!t.int64(rv));
 291         CPPUNIT_ASSERT_EQUAL(SBuf("1029397752385698678762234"), t.buf());
 292     }
 293
 294     // buffered sub-string parsing
 295     {
 296         int64_t rv;
 297         SBuf base("1029397752385698678762234");
 298         const int64_t benchmark = 22;
 299         Parser::Tokenizer t(base.substr(base.length()-4,2));
 300         CPPUNIT_ASSERT_EQUAL(SBuf("22"),t.buf());
 301         CPPUNIT_ASSERT(t.int64(rv));
 302         CPPUNIT_ASSERT_EQUAL(benchmark,rv);
 303         CPPUNIT_ASSERT(t.buf().isEmpty());
 304     }
 305
 306     // base-16, prefix
 307     {
 308         int64_t rv;
 309         SBuf base("deadbeefrow");
 310         const int64_t benchmark=0xdeadbeef;
 311         Parser::Tokenizer t(base);
 312         CPPUNIT_ASSERT(t.int64(rv,16));
 313         CPPUNIT_ASSERT_EQUAL(benchmark,rv);
 314         CPPUNIT_ASSERT_EQUAL(SBuf("row"),t.buf());
 315
 316     }
 317 }
 318