]> git.ipfire.org Git - thirdparty/newt.git/blame - eawidth.c
0.52.24
[thirdparty/newt.git] / eawidth.c
CommitLineData
d3661c9a 1/* #define TEST_GET_EAST_ASIA_STR_WIDTH 1 */
2
3#include <assert.h>
4#include <locale.h>
5#include <limits.h>
6#include <stdlib.h>
7#include <string.h>
8
9#include "eawidth.h"
10
/*
 * Column advance produced by a TAB character.
 *
 *   negative: the advance depends on the current cursor column, and the
 *             magnitude is the tab-stop interval (e.g. -8 for tab stops
 *             every eight columns);
 *   positive: every TAB advances by exactly this many columns;
 *   zero:     TABs are ignored (they occupy no columns).
 */
static const int tab_width = -8;
18
19typedef struct {
20 unsigned short start, end;
21 east_asia_type type;
22} eaw_db_type;
23
24static const eaw_db_type eaw_db[] = {
25 { 0x0020,0x007E,narrow },
6f481af2 26 { 0x00A1,0x00A1,ambiguous }, /*INVERTED EXCLAMATION MARK*/
d3661c9a 27 { 0x00A2,0x00A3,narrow },
6f481af2 28 { 0x00A4,0x00A4,ambiguous }, /*CURRENCY SIGN*/
d3661c9a 29 { 0x00A5,0x00A6,narrow },
30 { 0x00A7,0x00A8,ambiguous },
6f481af2 31 { 0x00AA,0x00AA,ambiguous }, /*FEMININE ORDINAL INDICATOR*/
32 { 0x00AC,0x00AC,narrow }, /*NOT SIGN*/
33 { 0x00AD,0x00AD,ambiguous }, /*SOFT HYPHEN*/
34 { 0x00AF,0x00AF,narrow }, /*MACRON*/
d3661c9a 35 { 0x00B0,0x00B4,ambiguous },
36 { 0x00B6,0x00BA,ambiguous },
37 { 0x00BC,0x00BF,ambiguous },
6f481af2 38 { 0x00C6,0x00C6,ambiguous }, /*LATIN CAPITAL LETTER AE*/
39 { 0x00D0,0x00D0,ambiguous }, /*LATIN CAPITAL LETTER ETH*/
d3661c9a 40 { 0x00D7,0x00D8,ambiguous },
41 { 0x00DE,0x00E1,ambiguous },
6f481af2 42 { 0x00E6,0x00E6,ambiguous }, /*LATIN SMALL LETTER AE*/
d3661c9a 43 { 0x00E8,0x00EA,ambiguous },
44 { 0x00EC,0x00ED,ambiguous },
6f481af2 45 { 0x00F0,0x00F0,ambiguous }, /*LATIN SMALL LETTER ETH*/
d3661c9a 46 { 0x00F2,0x00F3,ambiguous },
47 { 0x00F7,0x00FA,ambiguous },
6f481af2 48 { 0x00FC,0x00FC,ambiguous }, /*LATIN SMALL LETTER U WITH DIAERESIS*/
49 { 0x00FE,0x00FE,ambiguous }, /*LATIN SMALL LETTER THORN*/
50 { 0x0101,0x0101,ambiguous }, /*LATIN SMALL LETTER A WITH MACRON*/
51 { 0x0111,0x0111,ambiguous }, /*LATIN SMALL LETTER D WITH STROKE*/
52 { 0x0113,0x0113,ambiguous }, /*LATIN SMALL LETTER E WITH MACRON*/
53 { 0x011B,0x011B,ambiguous }, /*LATIN SMALL LETTER E WITH CARON*/
d3661c9a 54 { 0x0126,0x0127,ambiguous },
6f481af2 55 { 0x012B,0x012B,ambiguous }, /*LATIN SMALL LETTER I WITH MACRON*/
d3661c9a 56 { 0x0131,0x0133,ambiguous },
6f481af2 57 { 0x0138,0x0138,ambiguous }, /*LATIN SMALL LETTER KRA*/
d3661c9a 58 { 0x013F,0x0142,ambiguous },
6f481af2 59 { 0x0144,0x0144,ambiguous }, /*LATIN SMALL LETTER N WITH ACUTE*/
d3661c9a 60 { 0x0148,0x014A,ambiguous },
6f481af2 61 { 0x014D,0x014D,ambiguous }, /*LATIN SMALL LETTER O WITH MACRON*/
d3661c9a 62 { 0x0152,0x0153,ambiguous },
63 { 0x0166,0x0167,ambiguous },
6f481af2 64 { 0x016B,0x016B,ambiguous }, /*LATIN SMALL LETTER U WITH MACRON*/
65 { 0x01CE,0x01CE,ambiguous }, /*LATIN SMALL LETTER A WITH CARON*/
66 { 0x01D0,0x01D0,ambiguous }, /*LATIN SMALL LETTER I WITH CARON*/
67 { 0x01D2,0x01D2,ambiguous }, /*LATIN SMALL LETTER O WITH CARON*/
68 { 0x01D4,0x01D4,ambiguous }, /*LATIN SMALL LETTER U WITH CARON*/
69 { 0x01D6,0x01D6,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+MACRON*/
70 { 0x01D8,0x01D8,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+ACUTE*/
71 { 0x01DA,0x01DA,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+CARON*/
72 { 0x01DC,0x01DC,ambiguous }, /*LATIN SMALL LETTER U W/DIAERESIS+GRAVE*/
73 { 0x0251,0x0251,ambiguous }, /*LATIN SMALL LETTER ALPHA*/
74 { 0x0261,0x0261,ambiguous }, /*LATIN SMALL LETTER SCRIPT G*/
75 { 0x02C7,0x02C7,ambiguous }, /*CARON*/
d3661c9a 76 { 0x02C9,0x02CB,ambiguous },
6f481af2 77 { 0x02CD,0x02CD,ambiguous }, /*MODIFIER LETTER LOW MACRON*/
78 { 0x02D0,0x02D0,ambiguous }, /*MODIFIER LETTER TRIANGULAR COLON*/
d3661c9a 79 { 0x02D8,0x02DB,ambiguous },
6f481af2 80 { 0x02DD,0x02DD,ambiguous }, /*DOUBLE ACUTE ACCENT*/
d3661c9a 81 { 0x0300,0x0362,ambiguous },
82 { 0x0391,0x03A9,ambiguous },
83 { 0x03B1,0x03C1,ambiguous },
84 { 0x03C3,0x03C9,ambiguous },
6f481af2 85 { 0x0401,0x0401,ambiguous }, /*CYRILLIC CAPITAL LETTER IO*/
d3661c9a 86 { 0x0410,0x044F,ambiguous },
6f481af2 87 { 0x0451,0x0451,ambiguous }, /*CYRILLIC SMALL LETTER IO*/
d3661c9a 88 { 0x1100,0x115F,wide },
6f481af2 89 { 0x2010,0x2010,ambiguous }, /*HYPHEN*/
d3661c9a 90 { 0x2013,0x2016,ambiguous },
91 { 0x2018,0x2019,ambiguous },
92 { 0x201C,0x201D,ambiguous },
93 { 0x2020,0x2021,ambiguous },
94 { 0x2025,0x2027,ambiguous },
6f481af2 95 { 0x2030,0x2030,ambiguous }, /*PER MILLE SIGN*/
d3661c9a 96 { 0x2032,0x2033,ambiguous },
6f481af2 97 { 0x2035,0x2035,ambiguous }, /*REVERSED PRIME*/
98 { 0x203B,0x203B,ambiguous }, /*REFERENCE MARK*/
99 { 0x2074,0x2074,ambiguous }, /*SUPERSCRIPT FOUR*/
100 { 0x207F,0x207F,ambiguous }, /*SUPERSCRIPT LATIN SMALL LETTER N*/
d3661c9a 101 { 0x2081,0x2084,ambiguous },
6f481af2 102 { 0x20A9,0x20A9,half_width }, /*WON SIGN*/
103 { 0x20AC,0x20AC,ambiguous }, /*EURO SIGN*/
104 { 0x2103,0x2103,ambiguous }, /*DEGREE CELSIUS*/
105 { 0x2105,0x2105,ambiguous }, /*CARE OF*/
106 { 0x2109,0x2109,ambiguous }, /*DEGREE FAHRENHEIT*/
107 { 0x2113,0x2113,ambiguous }, /*SCRIPT SMALL L*/
d3661c9a 108 { 0x2121,0x2122,ambiguous },
6f481af2 109 { 0x2126,0x2126,ambiguous }, /*OHM SIGN*/
110 { 0x212B,0x212B,ambiguous }, /*ANGSTROM SIGN*/
d3661c9a 111 { 0x2154,0x2155,ambiguous },
6f481af2 112 { 0x215B,0x215B,ambiguous }, /*VULGAR FRACTION ONE EIGHTH*/
113 { 0x215E,0x215E,ambiguous }, /*VULGAR FRACTION SEVEN EIGHTHS*/
d3661c9a 114 { 0x2160,0x216B,ambiguous },
115 { 0x2170,0x2179,ambiguous },
116 { 0x2190,0x2199,ambiguous },
6f481af2 117 { 0x21D2,0x21D2,ambiguous }, /*RIGHTWARDS DOUBLE ARROW*/
118 { 0x21D4,0x21D4,ambiguous }, /*LEFT RIGHT DOUBLE ARROW*/
119 { 0x2200,0x2200,ambiguous }, /*FOR ALL*/
d3661c9a 120 { 0x2202,0x2203,ambiguous },
121 { 0x2207,0x2208,ambiguous },
6f481af2 122 { 0x220B,0x220B,ambiguous }, /*CONTAINS AS MEMBER*/
123 { 0x220F,0x220F,ambiguous }, /*N-ARY PRODUCT*/
124 { 0x2211,0x2211,ambiguous }, /*N-ARY SUMMATION*/
125 { 0x2215,0x2215,ambiguous }, /*DIVISION SLASH*/
126 { 0x221A,0x221A,ambiguous }, /*SQUARE ROOT*/
d3661c9a 127 { 0x221D,0x2220,ambiguous },
6f481af2 128 { 0x2223,0x2223,ambiguous }, /*DIVIDES*/
129 { 0x2225,0x2225,ambiguous }, /*PARALLEL TO*/
d3661c9a 130 { 0x2227,0x222C,ambiguous },
6f481af2 131 { 0x222E,0x222E,ambiguous }, /*CONTOUR INTEGRAL*/
d3661c9a 132 { 0x2234,0x2237,ambiguous },
133 { 0x223C,0x223D,ambiguous },
6f481af2 134 { 0x2248,0x2248,ambiguous }, /*ALMOST EQUAL TO*/
135 { 0x224C,0x224C,ambiguous }, /*ALL EQUAL TO*/
136 { 0x2252,0x2252,ambiguous }, /*APPROXIMATELY EQUAL TO OR THE IMAGE OF*/
d3661c9a 137 { 0x2260,0x2261,ambiguous },
138 { 0x2264,0x2267,ambiguous },
139 { 0x226A,0x226B,ambiguous },
140 { 0x226E,0x226F,ambiguous },
141 { 0x2282,0x2283,ambiguous },
142 { 0x2286,0x2287,ambiguous },
6f481af2 143 { 0x2295,0x2295,ambiguous }, /*CIRCLED PLUS*/
144 { 0x2299,0x2299,ambiguous }, /*CIRCLED DOT OPERATOR*/
145 { 0x22A5,0x22A5,ambiguous }, /*UP TACK*/
146 { 0x22BF,0x22BF,ambiguous }, /*RIGHT TRIANGLE*/
147 { 0x2312,0x2312,ambiguous }, /*ARC*/
d3661c9a 148 { 0x2460,0x24BF,ambiguous },
149 { 0x24D0,0x24E9,ambiguous },
150 { 0x2500,0x254B,ambiguous },
151 { 0x2550,0x2574,ambiguous },
152 { 0x2580,0x258F,ambiguous },
153 { 0x2592,0x25A1,ambiguous },
154 { 0x25A3,0x25A9,ambiguous },
155 { 0x25B2,0x25B3,ambiguous },
156 { 0x25B6,0x25B7,ambiguous },
157 { 0x25BC,0x25BD,ambiguous },
158 { 0x25C0,0x25C1,ambiguous },
159 { 0x25C6,0x25C8,ambiguous },
6f481af2 160 { 0x25CB,0x25CB,ambiguous }, /*WHITE CIRCLE*/
d3661c9a 161 { 0x25CE,0x25D1,ambiguous },
162 { 0x25E2,0x25E5,ambiguous },
6f481af2 163 { 0x25EF,0x25EF,ambiguous }, /*LARGE CIRCLE*/
d3661c9a 164 { 0x2605,0x2606,ambiguous },
6f481af2 165 { 0x2609,0x2609,ambiguous }, /*SUN*/
d3661c9a 166 { 0x260E,0x260F,ambiguous },
6f481af2 167 { 0x261C,0x261C,ambiguous }, /*WHITE LEFT POINTING INDEX*/
168 { 0x261E,0x261E,ambiguous }, /*WHITE RIGHT POINTING INDEX*/
169 { 0x2640,0x2640,ambiguous }, /*FEMALE SIGN*/
170 { 0x2642,0x2642,ambiguous }, /*MALE SIGN*/
d3661c9a 171 { 0x2660,0x2661,ambiguous },
172 { 0x2663,0x2665,ambiguous },
173 { 0x2667,0x266A,ambiguous },
174 { 0x266C,0x266D,ambiguous },
6f481af2 175 { 0x266F,0x266F,ambiguous }, /*MUSIC SHARP SIGN*/
d3661c9a 176 { 0x2E80,0x3009,wide },
177 { 0x300A,0x300B,ambiguous },
178 { 0x300C,0x3019,wide },
179 { 0x301A,0x301B,ambiguous },
180 { 0x301C,0x303E,wide },
181 { 0x3041,0xD7A3,wide },
182 { 0xE000,0xF8FF,ambiguous },
183 { 0xF900,0xFA2D,wide },
184 { 0xFE30,0xFE6B,wide },
185 { 0xFF01,0xFF5E,full_width },
186 { 0xFF61,0xFFDC,half_width },
187 { 0xFFE0,0xFFE6,full_width },
188 { 0xFFE8,0xFFEE,half_width },
189};
190
191static int
192eaw_db_cmp (const void *ck, const void *ce) {
193 const eaw_db_type *key = ck, *element = ce;
194
195 assert(key != NULL);
196 assert(element != NULL);
197 if (key->start < element->start) return -1;
198 else if (key->end > element->end) return 1;
199 return 0;
200}
201
/*
 * Report whether locale_name names a Chinese, Japanese or Korean locale.
 *
 * A locale matches when it starts with one of the two-letter language codes
 * "zh", "ja" or "ko" and the code is followed by the end of the string or by
 * a locale-name delimiter ('_', '-', '.', '@'). The delimiter check keeps
 * unrelated languages that merely share the prefix (e.g. "kok_IN") from
 * matching.
 *
 * Returns 1 for a CJK locale, 0 otherwise (including a NULL locale_name).
 */
static int
is_cjk_locale (const char *locale_name) {
    static const char *const cjk_langs[] = {
        "zh",   /* Chinese */
        "ja",   /* Japanese */
        "ko",   /* Korean */
    };
    size_t i;

    if (NULL == locale_name) return 0;

    for (i = 0; i < sizeof(cjk_langs) / sizeof(cjk_langs[0]); i++) {
        size_t len = strlen(cjk_langs[i]);

        /* Compare the language code only, not its terminating NUL. */
        if (strncmp(locale_name, cjk_langs[i], len) != 0) continue;

        switch (locale_name[len]) {
        case '\0':
        case '_':
        case '-':
        case '.':
        case '@':
            return 1;
        default:
            break;
        }
    }
    return 0;
}
214
215east_asia_type
216get_east_asia_type (wchar_t unicode) {
217 assert(0xFFFF != unicode && 0xFFFE != unicode);
218
219 if (unicode > 0xFFFF) {
220
221 /*
222 * Plane 2 is intended for CJK ideographs
223 */
224 if (unicode >= 0x20000 && unicode <= 0x2FFFD) return wide;
225 return ambiguous;
226 }
227 else {
228 eaw_db_type *pos, key;
229 size_t n;
230
231 n = sizeof(eaw_db) / sizeof(eaw_db_type);
232 key.start = key.end = (unsigned short) unicode;
233 pos = bsearch(&key, eaw_db, n, sizeof(eaw_db_type), eaw_db_cmp);
234 if (NULL != pos) return pos->type;
235 }
236 return neutral;
237}
238
239int
240east_asia_mblen (const char *locale_name, const char *s, size_t n, int x)
241{
6f481af2 242 wchar_t *wcs, *p;
243 int width = 0;
d3661c9a 244
6f481af2 245 if (NULL == s) s = "";
d3661c9a 246
6f481af2 247 /*
248 * Getting the locale name via setlocale() is expensive, so we prefer
249 * to have it passed to us.
250 */
251 if (NULL == locale_name) {
252 locale_name = setlocale(LC_CTYPE, NULL);
253 if (NULL == locale_name) return INT_MAX;
254 }
d3661c9a 255
6f481af2 256 wcs = (wchar_t *) calloc(n, sizeof(wchar_t));
257 if (NULL == wcs) return INT_MAX;
d3661c9a 258
259#if defined __GLIBC__ && !__GLIBC_PREREQ(2,2)
260#warning wide character support is broken. Glibc 2.2 or better needed.
261#endif
262
6f481af2 263 if ((size_t) -1 == mbstowcs(wcs, s, n)) return INT_MAX;
d3661c9a 264
6f481af2 265 switch (get_east_asia_type(*wcs)) {
266 case neutral:
d3661c9a 267
6f481af2 268 /*
269 * Put characters that print nothing here.
270 *
271 * XXX: Yes, I know there are a lot more than this in ISO-10646, but
272 * this function is intended to calculate the width of strings for
273 * fixed width terminals displaying legacy CJK character sets.
274 * State-of-the-art Unicode handling terminals probably won't need
275 * this function anyway.
276 */
277 if (0x0000 == *wcs) break; /* NULL */
278 if (0x0007 == *wcs) break; /* BELL */
279
280 /* FIXME: there will probably be ASCII chars after the escape
281 * code, which will be counted as part of the width even though they
282 * aren't displayed.
283 */
284 if (0x001B == *wcs) break; /* ESC */
285 if (0xFEFF == *wcs) break; /* ZWNBSP aka BOM (magic, signature) */
286
287 /*
288 * Special characters go here
289 */
290 if (0x0008 == *wcs) { /* BACKSPACE */
291 width = -1;
292 break;
293 }
294 if (0x0009 == *wcs) { /* TAB */
295 if (tab_width < 0) width = x % abs(tab_width);
296 else width = tab_width;
297 break;
298 }
299
300 /*FALLTHRU*/
301 case narrow:
302 case half_width:
303 width = 1;
304 break;
305 case wide:
306 case full_width:
307 width = 2;
308 break;
309 case ambiguous:
310 width = is_cjk_locale(locale_name) ? 2 : 1;
311 break;
312 default:
313 width = INT_MAX;
d3661c9a 314 }
6f481af2 315 free(wcs);
316 return width;
d3661c9a 317}
318
319int
320get_east_asia_str_n_width (const char *locale_name, const char *s, size_t n, int x)
321{
322 int total_width = 0;
323 wchar_t *wcs, *p;
324
325 if (NULL == s) s = "";
326
327 /*
328 * Getting the locale name via setlocale() is expensive, so we prefer
329 * to have it passed to us.
330 */
331 if (NULL == locale_name) {
332 locale_name = setlocale(LC_CTYPE, NULL);
333 if (NULL == locale_name) return INT_MAX;
334 }
335
336 wcs = (wchar_t *) calloc(n, sizeof(wchar_t));
337 if (NULL == wcs) return INT_MAX;
338
339#if defined __GLIBC__ && !__GLIBC_PREREQ(2,2)
340#warning wide character support is broken. Glibc 2.2 or better needed.
341#endif
342
343 if ((size_t) -1 == mbstowcs(wcs, s, n)) return INT_MAX;
344
345 for (p = wcs; L'\0' != *p; p++) {
346 int width = 0;
347
348 switch (get_east_asia_type(*p)) {
349 case neutral:
350
351 /*
352 * Put characters that print nothing here.
353 *
354 * XXX: Yes, I know there are a lot more than this in ISO-10646, but
355 * this function is intended to calculate the width of strings for
356 * fixed width terminals displaying legacy CJK character sets.
357 * State-of-the-art Unicode handling terminals probably won't need
358 * this function anyway.
359 */
360 if (0x0000 == *p) break; /* NULL */
361 if (0x0007 == *p) break; /* BELL */
362
363 /* FIXME: there will probably be ASCII chars after the escape
364 * code, which will be counted as part of the width even though they
365 * aren't displayed.
366 */
367 if (0x001B == *p) break; /* ESC */
368 if (0xFEFF == *p) break; /* ZWNBSP aka BOM (magic, signature) */
369
370 /*
371 * Special characters go here
372 */
373 if (0x0008 == *p) { /* BACKSPACE */
374 width = -1;
375 break;
376 }
377 if (0x0009 == *p) { /* TAB */
378 if (tab_width < 0) width = x % abs(tab_width);
379 else width = tab_width;
380 break;
381 }
382
383 /*FALLTHRU*/
384 case narrow:
385 case half_width:
386 width = 1;
387 break;
388 case wide:
389 case full_width:
390 width = 2;
391 break;
392 case ambiguous:
393 width = is_cjk_locale(locale_name) ? 2 : 1;
394 break;
395 default: abort(); /* Doh! */
396 }
397 x += width;
398 total_width += width;
399 }
400 free(wcs);
401 return total_width;
402}
403
/*
 * Return the number of terminal columns occupied by the whole
 * NUL-terminated multibyte string s, starting at cursor column x.
 *
 * locale_name may be NULL (the callee then queries the locale itself) and a
 * NULL s is treated as the empty string. When the width cannot be computed
 * (the callee returns INT_MAX), the byte length of s is returned instead.
 */
int
get_east_asia_str_width (const char *locale_name, const char *s, int x) {
    size_t len;
    int rc;

    /* The callee accepts NULL, but strlen() below would not. */
    if (NULL == s) s = "";

    len = strlen(s);
    rc = get_east_asia_str_n_width (locale_name, s, len + 1, x);
    if (rc == INT_MAX)
        return (len > (size_t) INT_MAX) ? INT_MAX : (int) len;
    return rc;
}
415
416#if TEST_GET_EAST_ASIA_STR_WIDTH
417
418#include <stdio.h>
419
/*
 * Test driver: print the byte length and display width of each word.
 *
 * Words are taken from the command-line arguments when any are given,
 * otherwise they are read whitespace-separated from standard input until
 * end of file. The LC_CTYPE locale is taken from the environment.
 */
int
main (int argc, char *argv[]) {
    int i;
    char *lc;
    const char *fmt = "word #%d ('%s') length is %zu, width is %u\n";

    lc = setlocale(LC_CTYPE, "");
    if (NULL == lc) {
        fputs("couldn't set the default locale for LC_CTYPE\n", stderr);
        exit(EXIT_FAILURE);
    }
    if (printf("character type locale is '%s'\n", lc) < 0) {
        perror(NULL);
        exit(EXIT_FAILURE);
    }
    for (i = 1; argc < 2 || i < argc; i++) {
        char *s = NULL;
        size_t length;
        unsigned width;

        if (argc < 2) {
            /*
             * "%ms" (POSIX.1-2008) allocates the buffer for us. The older
             * GNU "%as" spelling is parsed as a floating-point conversion
             * by C99 and later.
             */
            if (scanf("%ms", &s) < 1) {
                if (ferror(stdin)) {
                    perror(NULL);
                    exit(EXIT_FAILURE);
                }
                break;  /* end of input, nothing was read */
            }
            /*
             * A word was read: process it even if EOF has been reached, so
             * the last word of input without trailing whitespace is not
             * dropped.
             */
        }
        else s = strdup(argv[(size_t) i]);
        if (NULL == s) {
            perror(NULL);
            exit(EXIT_FAILURE);
        }
        length = strlen(s);
        width = get_east_asia_str_width(lc, s, 0);
        if (printf(fmt, i, s, length, width) < 0) {
            perror(NULL);
            exit(EXIT_FAILURE);
        }
        free(s);
    }
    return 0;
}
462
463#endif