]>
Commit | Line | Data |
---|---|---|
ef416fc2 | 1 | //======================================================================== |
2 | // | |
3 | // CharCodeToUnicode.cc | |
4 | // | |
5 | // Copyright 2001-2003 Glyph & Cog, LLC | |
6 | // | |
7 | //======================================================================== | |
8 | ||
9 | #include <config.h> | |
10 | ||
11 | #ifdef USE_GCC_PRAGMAS | |
12 | #pragma implementation | |
13 | #endif | |
14 | ||
15 | #include <stdio.h> | |
16 | #include <string.h> | |
17 | #include "gmem.h" | |
18 | #include "gfile.h" | |
19 | #include "GString.h" | |
20 | #include "Error.h" | |
21 | #include "GlobalParams.h" | |
22 | #include "PSTokenizer.h" | |
23 | #include "CharCodeToUnicode.h" | |
24 | ||
25 | //------------------------------------------------------------------------ | |
26 | ||
27 | #define maxUnicodeString 8 | |
28 | ||
29 | struct CharCodeToUnicodeString { | |
30 | CharCode c; | |
31 | Unicode u[maxUnicodeString]; | |
32 | int len; | |
33 | }; | |
34 | ||
35 | //------------------------------------------------------------------------ | |
36 | ||
// Character source that reads from a NUL-terminated string held in
// memory.  [data] is the address of a char* cursor; the cursor is moved
// past every character handed out.  Once the terminating NUL is reached
// the function returns EOF and leaves the cursor where it is.
//
// The byte is converted through unsigned char, the same way fgetc()
// reports bytes, so values >= 0x80 are never negative and a 0xff byte
// cannot be confused with EOF.
static int getCharFromString(void *data) {
  char **cursor = (char **)data;
  char *p = *cursor;

  if (!*p) {
    return EOF;
  }
  *cursor = p + 1;
  return (int)(unsigned char)*p;
}
50 | ||
// Character source backed by a stdio stream: [data] is the FILE* to
// read from, and the result is whatever fgetc() reports for it.
static int getCharFromFile(void *data) {
  FILE *stream = static_cast<FILE *>(data);
  return fgetc(stream);
}
54 | ||
55 | //------------------------------------------------------------------------ | |
56 | ||
57 | CharCodeToUnicode *CharCodeToUnicode::parseCIDToUnicode(GString *fileName, | |
58 | GString *collection) { | |
59 | FILE *f; | |
60 | Unicode *mapA; | |
61 | CharCode size, mapLenA; | |
62 | char buf[64]; | |
63 | Unicode u; | |
64 | CharCodeToUnicode *ctu; | |
65 | ||
66 | if (!(f = fopen(fileName->getCString(), "r"))) { | |
67 | error(-1, "Couldn't open cidToUnicode file '%s'", | |
68 | fileName->getCString()); | |
69 | return NULL; | |
70 | } | |
71 | ||
72 | size = 32768; | |
73 | mapA = (Unicode *)gmallocn(size, sizeof(Unicode)); | |
74 | mapLenA = 0; | |
75 | ||
76 | while (getLine(buf, sizeof(buf), f)) { | |
77 | if (mapLenA == size) { | |
78 | size *= 2; | |
79 | mapA = (Unicode *)greallocn(mapA, size, sizeof(Unicode)); | |
80 | } | |
81 | if (sscanf(buf, "%x", &u) == 1) { | |
82 | mapA[mapLenA] = u; | |
83 | } else { | |
84 | error(-1, "Bad line (%d) in cidToUnicode file '%s'", | |
85 | (int)(mapLenA + 1), fileName->getCString()); | |
86 | mapA[mapLenA] = 0; | |
87 | } | |
88 | ++mapLenA; | |
89 | } | |
90 | fclose(f); | |
91 | ||
92 | ctu = new CharCodeToUnicode(collection->copy(), mapA, mapLenA, gTrue, | |
93 | NULL, 0, 0); | |
94 | gfree(mapA); | |
95 | return ctu; | |
96 | } | |
97 | ||
98 | CharCodeToUnicode *CharCodeToUnicode::parseUnicodeToUnicode( | |
99 | GString *fileName) { | |
100 | FILE *f; | |
101 | Unicode *mapA; | |
102 | CharCodeToUnicodeString *sMapA; | |
103 | CharCode size, oldSize, len, sMapSizeA, sMapLenA; | |
104 | char buf[256]; | |
105 | char *tok; | |
106 | Unicode u0; | |
107 | Unicode uBuf[maxUnicodeString]; | |
108 | CharCodeToUnicode *ctu; | |
109 | int line, n, i; | |
110 | ||
111 | if (!(f = fopen(fileName->getCString(), "r"))) { | |
112 | error(-1, "Couldn't open unicodeToUnicode file '%s'", | |
113 | fileName->getCString()); | |
114 | return NULL; | |
115 | } | |
116 | ||
117 | size = 4096; | |
118 | mapA = (Unicode *)gmallocn(size, sizeof(Unicode)); | |
119 | memset(mapA, 0, size * sizeof(Unicode)); | |
120 | len = 0; | |
121 | sMapA = NULL; | |
122 | sMapSizeA = sMapLenA = 0; | |
123 | ||
124 | line = 0; | |
125 | while (getLine(buf, sizeof(buf), f)) { | |
126 | ++line; | |
127 | if (!(tok = strtok(buf, " \t\r\n")) || | |
128 | sscanf(tok, "%x", &u0) != 1) { | |
129 | error(-1, "Bad line (%d) in unicodeToUnicode file '%s'", | |
130 | line, fileName->getCString()); | |
131 | continue; | |
132 | } | |
133 | n = 0; | |
134 | while (n < maxUnicodeString) { | |
135 | if (!(tok = strtok(NULL, " \t\r\n"))) { | |
136 | break; | |
137 | } | |
138 | if (sscanf(tok, "%x", &uBuf[n]) != 1) { | |
139 | error(-1, "Bad line (%d) in unicodeToUnicode file '%s'", | |
140 | line, fileName->getCString()); | |
141 | break; | |
142 | } | |
143 | ++n; | |
144 | } | |
145 | if (n < 1) { | |
146 | error(-1, "Bad line (%d) in unicodeToUnicode file '%s'", | |
147 | line, fileName->getCString()); | |
148 | continue; | |
149 | } | |
150 | if (u0 >= size) { | |
151 | oldSize = size; | |
152 | while (u0 >= size) { | |
153 | size *= 2; | |
154 | } | |
155 | mapA = (Unicode *)greallocn(mapA, size, sizeof(Unicode)); | |
156 | memset(mapA + oldSize, 0, (size - oldSize) * sizeof(Unicode)); | |
157 | } | |
158 | if (n == 1) { | |
159 | mapA[u0] = uBuf[0]; | |
160 | } else { | |
161 | mapA[u0] = 0; | |
162 | if (sMapLenA == sMapSizeA) { | |
163 | sMapSizeA += 16; | |
164 | sMapA = (CharCodeToUnicodeString *) | |
165 | greallocn(sMapA, sMapSizeA, sizeof(CharCodeToUnicodeString)); | |
166 | } | |
167 | sMapA[sMapLenA].c = u0; | |
168 | for (i = 0; i < n; ++i) { | |
169 | sMapA[sMapLenA].u[i] = uBuf[i]; | |
170 | } | |
171 | sMapA[sMapLenA].len = n; | |
172 | ++sMapLenA; | |
173 | } | |
174 | if (u0 >= len) { | |
175 | len = u0 + 1; | |
176 | } | |
177 | } | |
178 | fclose(f); | |
179 | ||
180 | ctu = new CharCodeToUnicode(fileName->copy(), mapA, len, gTrue, | |
181 | sMapA, sMapLenA, sMapSizeA); | |
182 | gfree(mapA); | |
183 | return ctu; | |
184 | } | |
185 | ||
186 | CharCodeToUnicode *CharCodeToUnicode::make8BitToUnicode(Unicode *toUnicode) { | |
187 | return new CharCodeToUnicode(NULL, toUnicode, 256, gTrue, NULL, 0, 0); | |
188 | } | |
189 | ||
190 | CharCodeToUnicode *CharCodeToUnicode::parseCMap(GString *buf, int nBits) { | |
191 | CharCodeToUnicode *ctu; | |
192 | char *p; | |
193 | ||
194 | ctu = new CharCodeToUnicode(NULL); | |
195 | p = buf->getCString(); | |
196 | ctu->parseCMap1(&getCharFromString, &p, nBits); | |
197 | return ctu; | |
198 | } | |
199 | ||
200 | void CharCodeToUnicode::mergeCMap(GString *buf, int nBits) { | |
201 | char *p; | |
202 | ||
203 | p = buf->getCString(); | |
204 | parseCMap1(&getCharFromString, &p, nBits); | |
205 | } | |
206 | ||
207 | void CharCodeToUnicode::parseCMap1(int (*getCharFunc)(void *), void *data, | |
208 | int nBits) { | |
209 | PSTokenizer *pst; | |
210 | char tok1[256], tok2[256], tok3[256]; | |
211 | int nDigits, n1, n2, n3; | |
212 | CharCode i; | |
213 | CharCode code1, code2; | |
214 | GString *name; | |
215 | FILE *f; | |
216 | ||
217 | nDigits = nBits / 4; | |
218 | pst = new PSTokenizer(getCharFunc, data); | |
219 | pst->getToken(tok1, sizeof(tok1), &n1); | |
220 | while (pst->getToken(tok2, sizeof(tok2), &n2)) { | |
221 | if (!strcmp(tok2, "usecmap")) { | |
222 | if (tok1[0] == '/') { | |
223 | name = new GString(tok1 + 1); | |
224 | if ((f = globalParams->findToUnicodeFile(name))) { | |
225 | parseCMap1(&getCharFromFile, f, nBits); | |
226 | fclose(f); | |
227 | } else { | |
228 | error(-1, "Couldn't find ToUnicode CMap file for '%s'", | |
229 | name->getCString()); | |
230 | } | |
231 | delete name; | |
232 | } | |
233 | pst->getToken(tok1, sizeof(tok1), &n1); | |
234 | } else if (!strcmp(tok2, "beginbfchar")) { | |
235 | while (pst->getToken(tok1, sizeof(tok1), &n1)) { | |
236 | if (!strcmp(tok1, "endbfchar")) { | |
237 | break; | |
238 | } | |
239 | if (!pst->getToken(tok2, sizeof(tok2), &n2) || | |
240 | !strcmp(tok2, "endbfchar")) { | |
241 | error(-1, "Illegal entry in bfchar block in ToUnicode CMap"); | |
242 | break; | |
243 | } | |
244 | if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' && | |
245 | tok2[0] == '<' && tok2[n2 - 1] == '>')) { | |
246 | error(-1, "Illegal entry in bfchar block in ToUnicode CMap"); | |
247 | continue; | |
248 | } | |
249 | tok1[n1 - 1] = tok2[n2 - 1] = '\0'; | |
250 | if (sscanf(tok1 + 1, "%x", &code1) != 1) { | |
251 | error(-1, "Illegal entry in bfchar block in ToUnicode CMap"); | |
252 | continue; | |
253 | } | |
254 | addMapping(code1, tok2 + 1, n2 - 2, 0); | |
255 | } | |
256 | pst->getToken(tok1, sizeof(tok1), &n1); | |
257 | } else if (!strcmp(tok2, "beginbfrange")) { | |
258 | while (pst->getToken(tok1, sizeof(tok1), &n1)) { | |
259 | if (!strcmp(tok1, "endbfrange")) { | |
260 | break; | |
261 | } | |
262 | if (!pst->getToken(tok2, sizeof(tok2), &n2) || | |
263 | !strcmp(tok2, "endbfrange") || | |
264 | !pst->getToken(tok3, sizeof(tok3), &n3) || | |
265 | !strcmp(tok3, "endbfrange")) { | |
266 | error(-1, "Illegal entry in bfrange block in ToUnicode CMap"); | |
267 | break; | |
268 | } | |
269 | if (!(n1 == 2 + nDigits && tok1[0] == '<' && tok1[n1 - 1] == '>' && | |
270 | n2 == 2 + nDigits && tok2[0] == '<' && tok2[n2 - 1] == '>')) { | |
271 | error(-1, "Illegal entry in bfrange block in ToUnicode CMap"); | |
272 | continue; | |
273 | } | |
274 | tok1[n1 - 1] = tok2[n2 - 1] = '\0'; | |
275 | if (sscanf(tok1 + 1, "%x", &code1) != 1 || | |
276 | sscanf(tok2 + 1, "%x", &code2) != 1) { | |
277 | error(-1, "Illegal entry in bfrange block in ToUnicode CMap"); | |
278 | continue; | |
279 | } | |
280 | if (!strcmp(tok3, "[")) { | |
281 | i = 0; | |
282 | while (pst->getToken(tok1, sizeof(tok1), &n1) && | |
283 | code1 + i <= code2) { | |
284 | if (!strcmp(tok1, "]")) { | |
285 | break; | |
286 | } | |
287 | if (tok1[0] == '<' && tok1[n1 - 1] == '>') { | |
288 | tok1[n1 - 1] = '\0'; | |
289 | addMapping(code1 + i, tok1 + 1, n1 - 2, 0); | |
290 | } else { | |
291 | error(-1, "Illegal entry in bfrange block in ToUnicode CMap"); | |
292 | } | |
293 | ++i; | |
294 | } | |
295 | } else if (tok3[0] == '<' && tok3[n3 - 1] == '>') { | |
296 | tok3[n3 - 1] = '\0'; | |
297 | for (i = 0; code1 <= code2; ++code1, ++i) { | |
298 | addMapping(code1, tok3 + 1, n3 - 2, i); | |
299 | } | |
300 | ||
301 | } else { | |
302 | error(-1, "Illegal entry in bfrange block in ToUnicode CMap"); | |
303 | } | |
304 | } | |
305 | pst->getToken(tok1, sizeof(tok1), &n1); | |
306 | } else { | |
307 | strcpy(tok1, tok2); | |
308 | } | |
309 | } | |
310 | delete pst; | |
311 | } | |
312 | ||
313 | void CharCodeToUnicode::addMapping(CharCode code, char *uStr, int n, | |
314 | int offset) { | |
315 | CharCode oldLen, i; | |
316 | Unicode u; | |
317 | char uHex[5]; | |
318 | int j; | |
319 | ||
320 | if (code >= mapLen) { | |
321 | oldLen = mapLen; | |
322 | mapLen = (code + 256) & ~255; | |
323 | map = (Unicode *)greallocn(map, mapLen, sizeof(Unicode)); | |
324 | for (i = oldLen; i < mapLen; ++i) { | |
325 | map[i] = 0; | |
326 | } | |
327 | } | |
328 | if (n <= 4) { | |
329 | if (sscanf(uStr, "%x", &u) != 1) { | |
330 | error(-1, "Illegal entry in ToUnicode CMap"); | |
331 | return; | |
332 | } | |
333 | map[code] = u + offset; | |
334 | } else { | |
335 | if (sMapLen >= sMapSize) { | |
336 | sMapSize = sMapSize + 16; | |
337 | sMap = (CharCodeToUnicodeString *) | |
338 | greallocn(sMap, sMapSize, sizeof(CharCodeToUnicodeString)); | |
339 | } | |
340 | map[code] = 0; | |
341 | sMap[sMapLen].c = code; | |
342 | sMap[sMapLen].len = n / 4; | |
343 | for (j = 0; j < sMap[sMapLen].len && j < maxUnicodeString; ++j) { | |
344 | strncpy(uHex, uStr + j*4, 4); | |
345 | uHex[4] = '\0'; | |
346 | if (sscanf(uHex, "%x", &sMap[sMapLen].u[j]) != 1) { | |
347 | error(-1, "Illegal entry in ToUnicode CMap"); | |
348 | } | |
349 | } | |
350 | sMap[sMapLen].u[sMap[sMapLen].len - 1] += offset; | |
351 | ++sMapLen; | |
352 | } | |
353 | } | |
354 | ||
355 | CharCodeToUnicode::CharCodeToUnicode(GString *tagA) { | |
356 | CharCode i; | |
357 | ||
358 | tag = tagA; | |
359 | mapLen = 256; | |
360 | map = (Unicode *)gmallocn(mapLen, sizeof(Unicode)); | |
361 | for (i = 0; i < mapLen; ++i) { | |
362 | map[i] = 0; | |
363 | } | |
364 | sMap = NULL; | |
365 | sMapLen = sMapSize = 0; | |
366 | refCnt = 1; | |
367 | #if MULTITHREADED | |
368 | gInitMutex(&mutex); | |
369 | #endif | |
370 | } | |
371 | ||
372 | CharCodeToUnicode::CharCodeToUnicode(GString *tagA, Unicode *mapA, | |
373 | CharCode mapLenA, GBool copyMap, | |
374 | CharCodeToUnicodeString *sMapA, | |
375 | int sMapLenA, int sMapSizeA) { | |
376 | tag = tagA; | |
377 | mapLen = mapLenA; | |
378 | if (copyMap) { | |
379 | map = (Unicode *)gmallocn(mapLen, sizeof(Unicode)); | |
380 | memcpy(map, mapA, mapLen * sizeof(Unicode)); | |
381 | } else { | |
382 | map = mapA; | |
383 | } | |
384 | sMap = sMapA; | |
385 | sMapLen = sMapLenA; | |
386 | sMapSize = sMapSizeA; | |
387 | refCnt = 1; | |
388 | #if MULTITHREADED | |
389 | gInitMutex(&mutex); | |
390 | #endif | |
391 | } | |
392 | ||
393 | CharCodeToUnicode::~CharCodeToUnicode() { | |
394 | if (tag) { | |
395 | delete tag; | |
396 | } | |
397 | gfree(map); | |
398 | if (sMap) { | |
399 | gfree(sMap); | |
400 | } | |
401 | #if MULTITHREADED | |
402 | gDestroyMutex(&mutex); | |
403 | #endif | |
404 | } | |
405 | ||
406 | void CharCodeToUnicode::incRefCnt() { | |
407 | #if MULTITHREADED | |
408 | gLockMutex(&mutex); | |
409 | #endif | |
410 | ++refCnt; | |
411 | #if MULTITHREADED | |
412 | gUnlockMutex(&mutex); | |
413 | #endif | |
414 | } | |
415 | ||
416 | void CharCodeToUnicode::decRefCnt() { | |
417 | GBool done; | |
418 | ||
419 | #if MULTITHREADED | |
420 | gLockMutex(&mutex); | |
421 | #endif | |
422 | done = --refCnt == 0; | |
423 | #if MULTITHREADED | |
424 | gUnlockMutex(&mutex); | |
425 | #endif | |
426 | if (done) { | |
427 | delete this; | |
428 | } | |
429 | } | |
430 | ||
431 | GBool CharCodeToUnicode::match(GString *tagA) { | |
432 | return tag && !tag->cmp(tagA); | |
433 | } | |
434 | ||
435 | void CharCodeToUnicode::setMapping(CharCode c, Unicode *u, int len) { | |
436 | int i, j; | |
437 | ||
438 | if (len == 1) { | |
439 | map[c] = u[0]; | |
440 | } else { | |
441 | for (i = 0; i < sMapLen; ++i) { | |
442 | if (sMap[i].c == c) { | |
443 | break; | |
444 | } | |
445 | } | |
446 | if (i == sMapLen) { | |
447 | if (sMapLen == sMapSize) { | |
448 | sMapSize += 8; | |
449 | sMap = (CharCodeToUnicodeString *) | |
450 | greallocn(sMap, sMapSize, sizeof(CharCodeToUnicodeString)); | |
451 | } | |
452 | ++sMapLen; | |
453 | } | |
454 | map[c] = 0; | |
455 | sMap[i].c = c; | |
456 | sMap[i].len = len; | |
457 | for (j = 0; j < len && j < maxUnicodeString; ++j) { | |
458 | sMap[i].u[j] = u[j]; | |
459 | } | |
460 | } | |
461 | } | |
462 | ||
463 | int CharCodeToUnicode::mapToUnicode(CharCode c, Unicode *u, int size) { | |
464 | int i, j; | |
465 | ||
466 | if (c >= mapLen) { | |
467 | return 0; | |
468 | } | |
469 | if (map[c]) { | |
470 | u[0] = map[c]; | |
471 | return 1; | |
472 | } | |
473 | for (i = 0; i < sMapLen; ++i) { | |
474 | if (sMap[i].c == c) { | |
475 | for (j = 0; j < sMap[i].len && j < size; ++j) { | |
476 | u[j] = sMap[i].u[j]; | |
477 | } | |
478 | return j; | |
479 | } | |
480 | } | |
481 | return 0; | |
482 | } | |
483 | ||
484 | //------------------------------------------------------------------------ | |
485 | ||
486 | CharCodeToUnicodeCache::CharCodeToUnicodeCache(int sizeA) { | |
487 | int i; | |
488 | ||
489 | size = sizeA; | |
490 | cache = (CharCodeToUnicode **)gmallocn(size, sizeof(CharCodeToUnicode *)); | |
491 | for (i = 0; i < size; ++i) { | |
492 | cache[i] = NULL; | |
493 | } | |
494 | } | |
495 | ||
496 | CharCodeToUnicodeCache::~CharCodeToUnicodeCache() { | |
497 | int i; | |
498 | ||
499 | for (i = 0; i < size; ++i) { | |
500 | if (cache[i]) { | |
501 | cache[i]->decRefCnt(); | |
502 | } | |
503 | } | |
504 | gfree(cache); | |
505 | } | |
506 | ||
507 | CharCodeToUnicode *CharCodeToUnicodeCache::getCharCodeToUnicode(GString *tag) { | |
508 | CharCodeToUnicode *ctu; | |
509 | int i, j; | |
510 | ||
511 | if (cache[0] && cache[0]->match(tag)) { | |
512 | cache[0]->incRefCnt(); | |
513 | return cache[0]; | |
514 | } | |
515 | for (i = 1; i < size; ++i) { | |
516 | if (cache[i] && cache[i]->match(tag)) { | |
517 | ctu = cache[i]; | |
518 | for (j = i; j >= 1; --j) { | |
519 | cache[j] = cache[j - 1]; | |
520 | } | |
521 | cache[0] = ctu; | |
522 | ctu->incRefCnt(); | |
523 | return ctu; | |
524 | } | |
525 | } | |
526 | return NULL; | |
527 | } | |
528 | ||
529 | void CharCodeToUnicodeCache::add(CharCodeToUnicode *ctu) { | |
530 | int i; | |
531 | ||
532 | if (cache[size - 1]) { | |
533 | cache[size - 1]->decRefCnt(); | |
534 | } | |
535 | for (i = size - 1; i >= 1; --i) { | |
536 | cache[i] = cache[i - 1]; | |
537 | } | |
538 | cache[0] = ctu; | |
539 | ctu->incRefCnt(); | |
540 | } |