]>
Commit | Line | Data |
---|---|---|
ef416fc2 | 1 | //======================================================================== |
2 | // | |
3 | // Lexer.cc | |
4 | // | |
5 | // Copyright 1996-2003 Glyph & Cog, LLC | |
6 | // | |
7 | //======================================================================== | |
8 | ||
9 | #include <config.h> | |
10 | ||
11 | #ifdef USE_GCC_PRAGMAS | |
12 | #pragma implementation | |
13 | #endif | |
14 | ||
#include <ctype.h>
#include <limits.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include "Lexer.h"
#include "Error.h"
21 | ||
22 | //------------------------------------------------------------------------ | |
23 | ||
// Character classification table, indexed by byte value (0x00-0xff).
//   1 = white space: NUL, TAB, LF, FF, CR, SPACE
//   2 = delimiter:   % ( ) / < > [ ] { }
//   0 = regular character
// Any non-zero entry (1 or 2) ends a name or command token.
// The table is only ever read, so it is declared const.
static const char specialChars[256] = {
  1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,   // 0x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 1x
  1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2,   // 2x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,   // 3x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 4x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 5x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 6x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0,   // 7x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 8x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // 9x
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ax
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // bx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // cx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // dx
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   // ex
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    // fx
};
44 | ||
45 | //------------------------------------------------------------------------ | |
46 | // Lexer | |
47 | //------------------------------------------------------------------------ | |
48 | ||
49 | Lexer::Lexer(XRef *xref, Stream *str) { | |
50 | Object obj; | |
51 | ||
52 | curStr.initStream(str); | |
53 | streams = new Array(xref); | |
54 | streams->add(curStr.copy(&obj)); | |
55 | strPtr = 0; | |
56 | freeArray = gTrue; | |
57 | curStr.streamReset(); | |
58 | } | |
59 | ||
60 | Lexer::Lexer(XRef *xref, Object *obj) { | |
61 | Object obj2; | |
62 | ||
63 | if (obj->isStream()) { | |
64 | streams = new Array(xref); | |
65 | freeArray = gTrue; | |
66 | streams->add(obj->copy(&obj2)); | |
67 | } else { | |
68 | streams = obj->getArray(); | |
69 | freeArray = gFalse; | |
70 | } | |
71 | strPtr = 0; | |
72 | if (streams->getLength() > 0) { | |
73 | streams->get(strPtr, &curStr); | |
74 | curStr.streamReset(); | |
75 | } | |
76 | } | |
77 | ||
78 | Lexer::~Lexer() { | |
79 | if (!curStr.isNone()) { | |
80 | curStr.streamClose(); | |
81 | curStr.free(); | |
82 | } | |
83 | if (freeArray) { | |
84 | delete streams; | |
85 | } | |
86 | } | |
87 | ||
88 | int Lexer::getChar() { | |
89 | int c; | |
90 | ||
91 | c = EOF; | |
92 | while (!curStr.isNone() && (c = curStr.streamGetChar()) == EOF) { | |
93 | curStr.streamClose(); | |
94 | curStr.free(); | |
95 | ++strPtr; | |
96 | if (strPtr < streams->getLength()) { | |
97 | streams->get(strPtr, &curStr); | |
98 | curStr.streamReset(); | |
99 | } | |
100 | } | |
101 | return c; | |
102 | } | |
103 | ||
104 | int Lexer::lookChar() { | |
105 | if (curStr.isNone()) { | |
106 | return EOF; | |
107 | } | |
108 | return curStr.streamLookChar(); | |
109 | } | |
110 | ||
111 | Object *Lexer::getObj(Object *obj) { | |
112 | char *p; | |
113 | int c, c2; | |
114 | GBool comment, neg, done; | |
115 | int numParen; | |
116 | int xi; | |
117 | double xf, scale; | |
118 | GString *s; | |
119 | int n, m; | |
120 | ||
121 | // skip whitespace and comments | |
122 | comment = gFalse; | |
123 | while (1) { | |
124 | if ((c = getChar()) == EOF) { | |
125 | return obj->initEOF(); | |
126 | } | |
127 | if (comment) { | |
128 | if (c == '\r' || c == '\n') | |
129 | comment = gFalse; | |
130 | } else if (c == '%') { | |
131 | comment = gTrue; | |
132 | } else if (specialChars[c] != 1) { | |
133 | break; | |
134 | } | |
135 | } | |
136 | ||
137 | // start reading token | |
138 | switch (c) { | |
139 | ||
140 | // number | |
141 | case '0': case '1': case '2': case '3': case '4': | |
142 | case '5': case '6': case '7': case '8': case '9': | |
143 | case '-': case '.': | |
144 | neg = gFalse; | |
145 | xi = 0; | |
146 | if (c == '-') { | |
147 | neg = gTrue; | |
148 | } else if (c == '.') { | |
149 | goto doReal; | |
150 | } else { | |
151 | xi = c - '0'; | |
152 | } | |
153 | while (1) { | |
154 | c = lookChar(); | |
155 | if (isdigit(c)) { | |
156 | getChar(); | |
157 | xi = xi * 10 + (c - '0'); | |
158 | } else if (c == '.') { | |
159 | getChar(); | |
160 | goto doReal; | |
161 | } else { | |
162 | break; | |
163 | } | |
164 | } | |
165 | if (neg) | |
166 | xi = -xi; | |
167 | obj->initInt(xi); | |
168 | break; | |
169 | doReal: | |
170 | xf = xi; | |
171 | scale = 0.1; | |
172 | while (1) { | |
173 | c = lookChar(); | |
174 | if (c == '-') { | |
175 | // ignore minus signs in the middle of numbers to match | |
176 | // Adobe's behavior | |
177 | error(getPos(), "Badly formatted number"); | |
178 | getChar(); | |
179 | continue; | |
180 | } | |
181 | if (!isdigit(c)) { | |
182 | break; | |
183 | } | |
184 | getChar(); | |
185 | xf = xf + scale * (c - '0'); | |
186 | scale *= 0.1; | |
187 | } | |
188 | if (neg) | |
189 | xf = -xf; | |
190 | obj->initReal(xf); | |
191 | break; | |
192 | ||
193 | // string | |
194 | case '(': | |
195 | p = tokBuf; | |
196 | n = 0; | |
197 | numParen = 1; | |
198 | done = gFalse; | |
199 | s = NULL; | |
200 | do { | |
201 | c2 = EOF; | |
202 | switch (c = getChar()) { | |
203 | ||
204 | case EOF: | |
205 | #if 0 | |
206 | // This breaks some PDF files, e.g., ones from Photoshop. | |
207 | case '\r': | |
208 | case '\n': | |
209 | #endif | |
210 | error(getPos(), "Unterminated string"); | |
211 | done = gTrue; | |
212 | break; | |
213 | ||
214 | case '(': | |
215 | ++numParen; | |
216 | c2 = c; | |
217 | break; | |
218 | ||
219 | case ')': | |
220 | if (--numParen == 0) { | |
221 | done = gTrue; | |
222 | } else { | |
223 | c2 = c; | |
224 | } | |
225 | break; | |
226 | ||
227 | case '\\': | |
228 | switch (c = getChar()) { | |
229 | case 'n': | |
230 | c2 = '\n'; | |
231 | break; | |
232 | case 'r': | |
233 | c2 = '\r'; | |
234 | break; | |
235 | case 't': | |
236 | c2 = '\t'; | |
237 | break; | |
238 | case 'b': | |
239 | c2 = '\b'; | |
240 | break; | |
241 | case 'f': | |
242 | c2 = '\f'; | |
243 | break; | |
244 | case '\\': | |
245 | case '(': | |
246 | case ')': | |
247 | c2 = c; | |
248 | break; | |
249 | case '0': case '1': case '2': case '3': | |
250 | case '4': case '5': case '6': case '7': | |
251 | c2 = c - '0'; | |
252 | c = lookChar(); | |
253 | if (c >= '0' && c <= '7') { | |
254 | getChar(); | |
255 | c2 = (c2 << 3) + (c - '0'); | |
256 | c = lookChar(); | |
257 | if (c >= '0' && c <= '7') { | |
258 | getChar(); | |
259 | c2 = (c2 << 3) + (c - '0'); | |
260 | } | |
261 | } | |
262 | break; | |
263 | case '\r': | |
264 | c = lookChar(); | |
265 | if (c == '\n') { | |
266 | getChar(); | |
267 | } | |
268 | break; | |
269 | case '\n': | |
270 | break; | |
271 | case EOF: | |
272 | error(getPos(), "Unterminated string"); | |
273 | done = gTrue; | |
274 | break; | |
275 | default: | |
276 | c2 = c; | |
277 | break; | |
278 | } | |
279 | break; | |
280 | ||
281 | default: | |
282 | c2 = c; | |
283 | break; | |
284 | } | |
285 | ||
286 | if (c2 != EOF) { | |
287 | if (n == tokBufSize) { | |
288 | if (!s) | |
289 | s = new GString(tokBuf, tokBufSize); | |
290 | else | |
291 | s->append(tokBuf, tokBufSize); | |
292 | p = tokBuf; | |
293 | n = 0; | |
294 | } | |
295 | *p++ = (char)c2; | |
296 | ++n; | |
297 | } | |
298 | } while (!done); | |
299 | if (!s) | |
300 | s = new GString(tokBuf, n); | |
301 | else | |
302 | s->append(tokBuf, n); | |
303 | obj->initString(s); | |
304 | break; | |
305 | ||
306 | // name | |
307 | case '/': | |
308 | p = tokBuf; | |
309 | n = 0; | |
310 | while ((c = lookChar()) != EOF && !specialChars[c]) { | |
311 | getChar(); | |
312 | if (c == '#') { | |
313 | c2 = lookChar(); | |
314 | if (c2 >= '0' && c2 <= '9') { | |
315 | c = c2 - '0'; | |
316 | } else if (c2 >= 'A' && c2 <= 'F') { | |
317 | c = c2 - 'A' + 10; | |
318 | } else if (c2 >= 'a' && c2 <= 'f') { | |
319 | c = c2 - 'a' + 10; | |
320 | } else { | |
321 | goto notEscChar; | |
322 | } | |
323 | getChar(); | |
324 | c <<= 4; | |
325 | c2 = getChar(); | |
326 | if (c2 >= '0' && c2 <= '9') { | |
327 | c += c2 - '0'; | |
328 | } else if (c2 >= 'A' && c2 <= 'F') { | |
329 | c += c2 - 'A' + 10; | |
330 | } else if (c2 >= 'a' && c2 <= 'f') { | |
331 | c += c2 - 'a' + 10; | |
332 | } else { | |
333 | error(getPos(), "Illegal digit in hex char in name"); | |
334 | } | |
335 | } | |
336 | notEscChar: | |
337 | if (++n == tokBufSize) { | |
338 | error(getPos(), "Name token too long"); | |
339 | break; | |
340 | } | |
341 | *p++ = c; | |
342 | } | |
343 | *p = '\0'; | |
344 | obj->initName(tokBuf); | |
345 | break; | |
346 | ||
347 | // array punctuation | |
348 | case '[': | |
349 | case ']': | |
350 | tokBuf[0] = c; | |
351 | tokBuf[1] = '\0'; | |
352 | obj->initCmd(tokBuf); | |
353 | break; | |
354 | ||
355 | // hex string or dict punctuation | |
356 | case '<': | |
357 | c = lookChar(); | |
358 | ||
359 | // dict punctuation | |
360 | if (c == '<') { | |
361 | getChar(); | |
362 | tokBuf[0] = tokBuf[1] = '<'; | |
363 | tokBuf[2] = '\0'; | |
364 | obj->initCmd(tokBuf); | |
365 | ||
366 | // hex string | |
367 | } else { | |
368 | p = tokBuf; | |
369 | m = n = 0; | |
370 | c2 = 0; | |
371 | s = NULL; | |
372 | while (1) { | |
373 | c = getChar(); | |
374 | if (c == '>') { | |
375 | break; | |
376 | } else if (c == EOF) { | |
377 | error(getPos(), "Unterminated hex string"); | |
378 | break; | |
379 | } else if (specialChars[c] != 1) { | |
380 | c2 = c2 << 4; | |
381 | if (c >= '0' && c <= '9') | |
382 | c2 += c - '0'; | |
383 | else if (c >= 'A' && c <= 'F') | |
384 | c2 += c - 'A' + 10; | |
385 | else if (c >= 'a' && c <= 'f') | |
386 | c2 += c - 'a' + 10; | |
387 | else | |
388 | error(getPos(), "Illegal character <%02x> in hex string", c); | |
389 | if (++m == 2) { | |
390 | if (n == tokBufSize) { | |
391 | if (!s) | |
392 | s = new GString(tokBuf, tokBufSize); | |
393 | else | |
394 | s->append(tokBuf, tokBufSize); | |
395 | p = tokBuf; | |
396 | n = 0; | |
397 | } | |
398 | *p++ = (char)c2; | |
399 | ++n; | |
400 | c2 = 0; | |
401 | m = 0; | |
402 | } | |
403 | } | |
404 | } | |
405 | if (!s) | |
406 | s = new GString(tokBuf, n); | |
407 | else | |
408 | s->append(tokBuf, n); | |
409 | if (m == 1) | |
410 | s->append((char)(c2 << 4)); | |
411 | obj->initString(s); | |
412 | } | |
413 | break; | |
414 | ||
415 | // dict punctuation | |
416 | case '>': | |
417 | c = lookChar(); | |
418 | if (c == '>') { | |
419 | getChar(); | |
420 | tokBuf[0] = tokBuf[1] = '>'; | |
421 | tokBuf[2] = '\0'; | |
422 | obj->initCmd(tokBuf); | |
423 | } else { | |
424 | error(getPos(), "Illegal character '>'"); | |
425 | obj->initError(); | |
426 | } | |
427 | break; | |
428 | ||
429 | // error | |
430 | case ')': | |
431 | case '{': | |
432 | case '}': | |
433 | error(getPos(), "Illegal character '%c'", c); | |
434 | obj->initError(); | |
435 | break; | |
436 | ||
437 | // command | |
438 | default: | |
439 | p = tokBuf; | |
440 | *p++ = c; | |
441 | n = 1; | |
442 | while ((c = lookChar()) != EOF && !specialChars[c]) { | |
443 | getChar(); | |
444 | if (++n == tokBufSize) { | |
445 | error(getPos(), "Command token too long"); | |
446 | break; | |
447 | } | |
448 | *p++ = c; | |
449 | } | |
450 | *p = '\0'; | |
451 | if (tokBuf[0] == 't' && !strcmp(tokBuf, "true")) { | |
452 | obj->initBool(gTrue); | |
453 | } else if (tokBuf[0] == 'f' && !strcmp(tokBuf, "false")) { | |
454 | obj->initBool(gFalse); | |
455 | } else if (tokBuf[0] == 'n' && !strcmp(tokBuf, "null")) { | |
456 | obj->initNull(); | |
457 | } else { | |
458 | obj->initCmd(tokBuf); | |
459 | } | |
460 | break; | |
461 | } | |
462 | ||
463 | return obj; | |
464 | } | |
465 | ||
466 | void Lexer::skipToNextLine() { | |
467 | int c; | |
468 | ||
469 | while (1) { | |
470 | c = getChar(); | |
471 | if (c == EOF || c == '\n') { | |
472 | return; | |
473 | } | |
474 | if (c == '\r') { | |
475 | if ((c = lookChar()) == '\n') { | |
476 | getChar(); | |
477 | } | |
478 | return; | |
479 | } | |
480 | } | |
481 | } | |
482 | ||
483 | GBool Lexer::isSpace(int c) { | |
484 | return c >= 0 && c <= 0xff && specialChars[c] == 1; | |
485 | } |