pdftops/Lexer.cxx

   1 //========================================================================
   2 //
   3 // Lexer.cc
   4 //
   5 // Copyright 1996-2003 Glyph & Cog, LLC
   6 //
   7 //========================================================================
   8
   9 #include <config.h>
  10
  11 #ifdef USE_GCC_PRAGMAS
  12 #pragma implementation
  13 #endif
  14
#include <stdlib.h>
#include <stddef.h>
#include <limits.h>
#include <string.h>
#include <ctype.h>
#include "Lexer.h"
#include "Error.h"
  21
  22 //------------------------------------------------------------------------
  23
  24 // A '1' in this array means the character is white space.  A '1' or
  25 // '2' means the character ends a name or command.
  26 static char specialChars[256] = {
  27   1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
  28   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
  29   1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
  30   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
  31   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
  32   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
  33   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
  34   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
  35   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
  36   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
  37   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
  38   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
  39   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
  40   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
  41   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
  42   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
  43 };
  44
  45 //------------------------------------------------------------------------
  46 // Lexer
  47 //------------------------------------------------------------------------
  48
  49 Lexer::Lexer(XRef *xref, Stream *str) {
  50   Object obj;
  51
  52   curStr.initStream(str);
  53   streams = new Array(xref);
  54   streams->add(curStr.copy(&obj));
  55   strPtr = 0;
  56   freeArray = gTrue;
  57   curStr.streamReset();
  58 }
  59
  60 Lexer::Lexer(XRef *xref, Object *obj) {
  61   Object obj2;
  62
  63   if (obj->isStream()) {
  64     streams = new Array(xref);
  65     freeArray = gTrue;
  66     streams->add(obj->copy(&obj2));
  67   } else {
  68     streams = obj->getArray();
  69     freeArray = gFalse;
  70   }
  71   strPtr = 0;
  72   if (streams->getLength() > 0) {
  73     streams->get(strPtr, &curStr);
  74     curStr.streamReset();
  75   }
  76 }
  77
  78 Lexer::~Lexer() {
  79   if (!curStr.isNone()) {
  80     curStr.streamClose();
  81     curStr.free();
  82   }
  83   if (freeArray) {
  84     delete streams;
  85   }
  86 }
  87
  88 int Lexer::getChar() {
  89   int c;
  90
  91   c = EOF;
  92   while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) {
  93     curStr.streamClose();
  94     curStr.free();
  95     ++strPtr;
  96     if (strPtr < streams->getLength()) {
  97       streams->get(strPtr, &curStr);
  98       curStr.streamReset();
  99     }
 100   }
 101   return c;
 102 }
 103
 104 int Lexer::lookChar() {
 105   if (curStr.isNone()) {
 106     return EOF;
 107   }
 108   return curStr.streamLookChar();
 109 }
 110
 111 Object *Lexer::getObj(Object *obj) {
 112   char *p;
 113   int c, c2;
 114   GBool comment, neg, done;
 115   int numParen;
 116   int xi;
 117   double xf, scale;
 118   GString *s;
 119   int n, m;
 120
 121   // skip whitespace and comments
 122   comment = gFalse;
 123   while (1) {
 124     if ((c = getChar()) == EOF) {
 125       return obj->initEOF();
 126     }
 127     if (comment) {
 128       if (c == '\r' || c == '\n')
 129         comment = gFalse;
 130     } else if (c == '%') {
 131       comment = gTrue;
 132     } else if (specialChars[c] != 1) {
 133       break;
 134     }
 135   }
 136
 137   // start reading token
 138   switch (c) {
 139
 140   // number
 141   case '0': case '1': case '2': case '3': case '4':
 142   case '5': case '6': case '7': case '8': case '9':
 143   case '-': case '.':
 144     neg = gFalse;
 145     xi = 0;
 146     if (c == '-') {
 147       neg = gTrue;
 148     } else if (c == '.') {
 149       goto doReal;
 150     } else {
 151       xi = c - '0';
 152     }
 153     while (1) {
 154       c = lookChar();
 155       if (isdigit(c)) {
 156         getChar();
 157         xi = xi * 10 + (c - '0');
 158       } else if (c == '.') {
 159         getChar();
 160         goto doReal;
 161       } else {
 162         break;
 163       }
 164     }
 165     if (neg)
 166       xi = -xi;
 167     obj->initInt(xi);
 168     break;
 169   doReal:
 170     xf = xi;
 171     scale = 0.1;
 172     while (1) {
 173       c = lookChar();
 174       if (!isdigit(c)) {
 175         break;
 176       }
 177       getChar();
 178       xf = xf + scale * (c - '0');
 179       scale *= 0.1;
 180     }
 181     if (neg)
 182       xf = -xf;
 183     obj->initReal(xf);
 184     break;
 185
 186   // string
 187   case '(':
 188     p = tokBuf;
 189     n = 0;
 190     numParen = 1;
 191     done = gFalse;
 192     s = NULL;
 193     do {
 194       c2 = EOF;
 195       switch (c = getChar()) {
 196
 197       case EOF:
 198 #if 0
 199       // This breaks some PDF files, e.g., ones from Photoshop.
 200       case '\r':
 201       case '\n':
 202 #endif
 203         error(getPos(), "Unterminated string");
 204         done = gTrue;
 205         break;
 206
 207       case '(':
 208         ++numParen;
 209         c2 = c;
 210         break;
 211
 212       case ')':
 213         if (--numParen == 0) {
 214           done = gTrue;
 215         } else {
 216           c2 = c;
 217         }
 218         break;
 219
 220       case '\\':
 221         switch (c = getChar()) {
 222         case 'n':
 223           c2 = '\n';
 224           break;
 225         case 'r':
 226           c2 = '\r';
 227           break;
 228         case 't':
 229           c2 = '\t';
 230           break;
 231         case 'b':
 232           c2 = '\b';
 233           break;
 234         case 'f':
 235           c2 = '\f';
 236           break;
 237         case '\\':
 238         case '(':
 239         case ')':
 240           c2 = c;
 241           break;
 242         case '0': case '1': case '2': case '3':
 243         case '4': case '5': case '6': case '7':
 244           c2 = c - '0';
 245           c = lookChar();
 246           if (c >= '0' && c <= '7') {
 247             getChar();
 248             c2 = (c2 << 3) + (c - '0');
 249             c = lookChar();
 250             if (c >= '0' && c <= '7') {
 251               getChar();
 252               c2 = (c2 << 3) + (c - '0');
 253             }
 254           }
 255           break;
 256         case '\r':
 257           c = lookChar();
 258           if (c == '\n') {
 259             getChar();
 260           }
 261           break;
 262         case '\n':
 263           break;
 264         case EOF:
 265           error(getPos(), "Unterminated string");
 266           done = gTrue;
 267           break;
 268         default:
 269           c2 = c;
 270           break;
 271         }
 272         break;
 273
 274       default:
 275         c2 = c;
 276         break;
 277       }
 278
 279       if (c2 != EOF) {
 280         if (n == tokBufSize) {
 281           if (!s)
 282             s = new GString(tokBuf, tokBufSize);
 283           else
 284             s->append(tokBuf, tokBufSize);
 285           p = tokBuf;
 286           n = 0;
 287         }
 288         *p++ = (char)c2;
 289         ++n;
 290       }
 291     } while (!done);
 292     if (!s)
 293       s = new GString(tokBuf, n);
 294     else
 295       s->append(tokBuf, n);
 296     obj->initString(s);
 297     break;
 298
 299   // name
 300   case '/':
 301     p = tokBuf;
 302     n = 0;
 303     while ((c = lookChar()) != EOF && !specialChars[c]) {
 304       getChar();
 305       if (c == '#') {
 306         c2 = lookChar();
 307         if (c2 >= '0' && c2 <= '9') {
 308           c = c2 - '0';
 309         } else if (c2 >= 'A' && c2 <= 'F') {
 310           c = c2 - 'A' + 10;
 311         } else if (c2 >= 'a' && c2 <= 'f') {
 312           c = c2 - 'a' + 10;
 313         } else {
 314           goto notEscChar;
 315         }
 316         getChar();
 317         c <<= 4;
 318         c2 = getChar();
 319         if (c2 >= '0' && c2 <= '9') {
 320           c += c2 - '0';
 321         } else if (c2 >= 'A' && c2 <= 'F') {
 322           c += c2 - 'A' + 10;
 323         } else if (c2 >= 'a' && c2 <= 'f') {
 324           c += c2 - 'a' + 10;
 325         } else {
 326           error(getPos(), "Illegal digit in hex char in name");
 327         }
 328       }
 329      notEscChar:
 330       if (++n == tokBufSize) {
 331         error(getPos(), "Name token too long");
 332         break;
 333       }
 334       *p++ = c;
 335     }
 336     *p = '\0';
 337     obj->initName(tokBuf);
 338     break;
 339
 340   // array punctuation
 341   case '[':
 342   case ']':
 343     tokBuf[0] = c;
 344     tokBuf[1] = '\0';
 345     obj->initCmd(tokBuf);
 346     break;
 347
 348   // hex string or dict punctuation
 349   case '<':
 350     c = lookChar();
 351
 352     // dict punctuation
 353     if (c == '<') {
 354       getChar();
 355       tokBuf[0] = tokBuf[1] = '<';
 356       tokBuf[2] = '\0';
 357       obj->initCmd(tokBuf);
 358
 359     // hex string
 360     } else {
 361       p = tokBuf;
 362       m = n = 0;
 363       c2 = 0;
 364       s = NULL;
 365       while (1) {
 366         c = getChar();
 367         if (c == '>') {
 368           break;
 369         } else if (c == EOF) {
 370           error(getPos(), "Unterminated hex string");
 371           break;
 372         } else if (specialChars[c] != 1) {
 373           c2 = c2 << 4;
 374           if (c >= '0' && c <= '9')
 375             c2 += c - '0';
 376           else if (c >= 'A' && c <= 'F')
 377             c2 += c - 'A' + 10;
 378           else if (c >= 'a' && c <= 'f')
 379             c2 += c - 'a' + 10;
 380           else
 381             error(getPos(), "Illegal character <%02x> in hex string", c);
 382           if (++m == 2) {
 383             if (n == tokBufSize) {
 384               if (!s)
 385                 s = new GString(tokBuf, tokBufSize);
 386               else
 387                 s->append(tokBuf, tokBufSize);
 388               p = tokBuf;
 389               n = 0;
 390             }
 391             *p++ = (char)c2;
 392             ++n;
 393             c2 = 0;
 394             m = 0;
 395           }
 396         }
 397       }
 398       if (!s)
 399         s = new GString(tokBuf, n);
 400       else
 401         s->append(tokBuf, n);
 402       if (m == 1)
 403         s->append((char)(c2 << 4));
 404       obj->initString(s);
 405     }
 406     break;
 407
 408   // dict punctuation
 409   case '>':
 410     c = lookChar();
 411     if (c == '>') {
 412       getChar();
 413       tokBuf[0] = tokBuf[1] = '>';
 414       tokBuf[2] = '\0';
 415       obj->initCmd(tokBuf);
 416     } else {
 417       error(getPos(), "Illegal character '>'");
 418       obj->initError();
 419     }
 420     break;
 421
 422   // error
 423   case ')':
 424   case '{':
 425   case '}':
 426     error(getPos(), "Illegal character '%c'", c);
 427     obj->initError();
 428     break;
 429
 430   // command
 431   default:
 432     p = tokBuf;
 433     *p++ = c;
 434     n = 1;
 435     while ((c = lookChar()) != EOF && !specialChars[c]) {
 436       getChar();
 437       if (++n == tokBufSize) {
 438         error(getPos(), "Command token too long");
 439         break;
 440       }
 441       *p++ = c;
 442     }
 443     *p = '\0';
 444     if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) {
 445       obj->initBool(gTrue);
 446     } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) {
 447       obj->initBool(gFalse);
 448     } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) {
 449       obj->initNull();
 450     } else {
 451       obj->initCmd(tokBuf);
 452     }
 453     break;
 454   }
 455
 456   return obj;
 457 }
 458
 459 void Lexer::skipToNextLine() {
 460   int c;
 461
 462   while (1) {
 463     c = getChar();
 464     if (c == EOF || c == '\n') {
 465       return;
 466     }
 467     if (c == '\r') {
 468       if ((c = lookChar()) == '\n') {
 469         getChar();
 470       }
 471       return;
 472     }
 473   }
 474 }