]>
git.ipfire.org Git - thirdparty/glibc.git/blob - locale/programs/linereader.c
1 /* Copyright (C) 1996-2023 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published
6 by the Free Software Foundation; version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, see <https://www.gnu.org/licenses/>. */
30 #include "localedef.h"
33 #include "linereader.h"
36 /* Prototypes for local functions. */
37 static struct token
*get_toplvl_escape (struct linereader
*lr
);
38 static struct token
*get_symname (struct linereader
*lr
);
39 static struct token
*get_ident (struct linereader
*lr
);
40 static struct token
*get_string (struct linereader
*lr
,
41 const struct charmap_t
*charmap
,
42 struct localedef_t
*locale
,
43 const struct repertoire_t
*repertoire
,
45 static bool utf8_decode (struct linereader
*lr
, uint8_t ch1
, uint32_t *wch
);
49 lr_open (const char *fname
, kw_hash_fct_t hf
)
53 if (fname
== NULL
|| strcmp (fname
, "-") == 0
54 || strcmp (fname
, "/dev/stdin") == 0)
55 return lr_create (stdin
, "<stdin>", hf
);
58 fp
= fopen (fname
, "rm");
61 return lr_create (fp
, fname
, hf
);
66 lr_create (FILE *fp
, const char *fname
, kw_hash_fct_t hf
)
68 struct linereader
*result
;
71 result
= (struct linereader
*) xmalloc (sizeof (*result
));
74 result
->fname
= xstrdup (fname
);
79 result
->comment_char
= '#';
80 result
->escape_char
= '\\';
81 result
->translate_strings
= 1;
82 result
->return_widestr
= 0;
84 n
= getdelim (&result
->buf
, &result
->bufsize
, '\n', result
->fp
);
89 free ((char *) result
->fname
);
95 if (n
> 1 && result
->buf
[n
- 2] == '\\' && result
->buf
[n
- 1] == '\n')
98 result
->buf
[n
] = '\0';
100 result
->hash_fct
= hf
;
107 lr_eof (struct linereader
*lr
)
109 return lr
->bufact
= 0;
114 lr_ignore_rest (struct linereader
*lr
, int verbose
)
118 while (isspace (lr
->buf
[lr
->idx
]) && lr
->buf
[lr
->idx
] != '\n'
119 && lr
->buf
[lr
->idx
] != lr
->comment_char
)
120 if (lr
->buf
[lr
->idx
] == '\0')
122 if (lr_next (lr
) < 0)
128 if (lr
->buf
[lr
->idx
] != '\n' && ! feof (lr
->fp
)
129 && lr
->buf
[lr
->idx
] != lr
->comment_char
)
130 lr_error (lr
, _("trailing garbage at end of line"));
133 /* Ignore continued line. */
134 while (lr
->bufact
> 0 && lr
->buf
[lr
->bufact
- 1] != '\n')
135 if (lr_next (lr
) < 0)
138 lr
->idx
= lr
->bufact
;
143 lr_close (struct linereader
*lr
)
152 lr_next (struct linereader
*lr
)
156 n
= getdelim (&lr
->buf
, &lr
->bufsize
, '\n', lr
->fp
);
162 if (n
> 1 && lr
->buf
[n
- 2] == lr
->escape_char
&& lr
->buf
[n
- 1] == '\n')
165 /* XXX Is this correct? */
166 /* An escaped newline character is substituted with a single <SP>. */
168 lr
->buf
[n
- 1] = ' ';
182 /* Defined in error.c. */
183 /* This variable is incremented each time `error' is called. */
184 extern unsigned int error_message_count
;
186 /* The calling program should define program_name and set it to the
187 name of the executing program. */
188 extern char *program_name
;
192 lr_token (struct linereader
*lr
, const struct charmap_t
*charmap
,
193 struct localedef_t
*locale
, const struct repertoire_t
*repertoire
,
206 lr
->token
.tok
= tok_eof
;
212 lr
->token
.tok
= tok_eol
;
216 while (isspace (ch
));
218 if (ch
!= lr
->comment_char
)
221 /* Is there a newline at the end of the buffer? */
222 if (lr
->buf
[lr
->bufact
- 1] != '\n')
224 /* No. Some people want this to mean that only the line in
225 the file, not the logical, concatenated line, is ignored.
227 lr
->idx
= lr
->bufact
;
231 /* Ignore rest of line. */
232 lr_ignore_rest (lr
, 0);
233 lr
->token
.tok
= tok_eol
;
237 /* Match escape sequences. */
238 if (ch
== lr
->escape_char
)
239 return get_toplvl_escape (lr
);
241 /* Match ellipsis. */
244 if (strncmp (&lr
->buf
[lr
->idx
], "...(2)....", 10) == 0)
247 for (cnt
= 0; cnt
< 10; ++cnt
)
249 lr
->token
.tok
= tok_ellipsis4_2
;
252 if (strncmp (&lr
->buf
[lr
->idx
], "...", 3) == 0)
257 lr
->token
.tok
= tok_ellipsis4
;
260 if (strncmp (&lr
->buf
[lr
->idx
], "..", 2) == 0)
264 lr
->token
.tok
= tok_ellipsis3
;
267 if (strncmp (&lr
->buf
[lr
->idx
], ".(2)..", 6) == 0)
270 for (cnt
= 0; cnt
< 6; ++cnt
)
272 lr
->token
.tok
= tok_ellipsis2_2
;
275 if (lr
->buf
[lr
->idx
] == '.')
278 lr
->token
.tok
= tok_ellipsis2
;
286 return get_symname (lr
);
289 lr
->token
.tok
= tok_number
;
290 lr
->token
.val
.num
= ch
- '0';
292 while (isdigit (ch
= lr_getc (lr
)))
294 lr
->token
.val
.num
*= 10;
295 lr
->token
.val
.num
+= ch
- '0';
298 lr_error (lr
, _("garbage at end of number"));
304 lr
->token
.tok
= tok_semicolon
;
308 lr
->token
.tok
= tok_comma
;
312 lr
->token
.tok
= tok_open_brace
;
316 lr
->token
.tok
= tok_close_brace
;
320 return get_string (lr
, charmap
, locale
, repertoire
, verbose
);
326 lr
->token
.tok
= tok_minus1
;
332 case 0x80 ... 0xff: /* UTF-8 sequence. */
335 if (!utf8_decode (lr
, ch
, &wch
))
337 lr
->token
.tok
= tok_error
;
340 lr
->token
.tok
= tok_ucs4
;
341 lr
->token
.val
.ucs4
= wch
;
346 return get_ident (lr
);
350 static struct token
*
351 get_toplvl_escape (struct linereader
*lr
)
353 /* This is supposed to be a numeric value. We return the
354 numerical value and the number of bytes. */
355 size_t start_idx
= lr
->idx
- 1;
356 unsigned char *bytes
= lr
->token
.val
.charcode
.bytes
;
362 unsigned int byte
= 0;
363 unsigned int base
= 8;
378 if ((base
== 16 && !isxdigit (ch
))
379 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
382 lr
->token
.val
.str
.startmb
= &lr
->buf
[start_idx
];
384 while (ch
!= EOF
&& !isspace (ch
))
386 lr
->token
.val
.str
.lenmb
= lr
->idx
- start_idx
;
388 lr
->token
.tok
= tok_error
;
395 byte
= tolower (ch
) - 'a' + 10;
398 if ((base
== 16 && !isxdigit (ch
))
399 || (base
!= 16 && (ch
< '0' || ch
>= (int) ('0' + base
))))
406 byte
+= tolower (ch
) - 'a' + 10;
409 if (base
!= 16 && isdigit (ch
))
417 bytes
[nbytes
++] = byte
;
419 while (ch
== lr
->escape_char
420 && nbytes
< (int) sizeof (lr
->token
.val
.charcode
.bytes
));
423 lr_error (lr
, _("garbage at end of character code specification"));
427 lr
->token
.tok
= tok_charcode
;
428 lr
->token
.val
.charcode
.nbytes
= nbytes
;
433 /* Multibyte string buffer. */
441 /* Initialize *LRB with a default-sized buffer. */
443 lr_buffer_init (struct lr_buffer
*lrb
)
447 lrb
->buf
= xmalloc (lrb
->max
);
450 /* Transfers the buffer string from *LRB to LR->token.val.str (startmb and lenmb). */
452 lr_buffer_to_token (struct lr_buffer
*lrb
, struct linereader
*lr
)
454 lr
->token
.val
.str
.startmb
= xrealloc (lrb
->buf
, lrb
->act
+ 1);
455 lr
->token
.val
.str
.startmb
[lrb
->act
] = '\0';
456 lr
->token
.val
.str
.lenmb
= lrb
->act
;
459 /* Adds CH to *LRB. */
461 addc (struct lr_buffer
*lrb
, char ch
)
463 if (lrb
->act
== lrb
->max
)
466 lrb
->buf
= xrealloc (lrb
->buf
, lrb
->max
);
468 lrb
->buf
[lrb
->act
++] = ch
;
471 /* Adds L bytes at S to *LRB. */
473 adds (struct lr_buffer
*lrb
, const unsigned char *s
, size_t l
)
475 if (lrb
->max
- lrb
->act
< l
)
477 size_t required_size
= lrb
->act
+ l
;
478 size_t new_max
= 2 * lrb
->max
;
479 if (new_max
< required_size
)
480 new_max
= required_size
;
481 lrb
->buf
= xrealloc (lrb
->buf
, new_max
);
484 memcpy (lrb
->buf
+ lrb
->act
, s
, l
);
491 if (buf2act == buf2max) \
494 buf2 = xrealloc (buf2, buf2max * 4); \
496 buf2[buf2act++] = (ch); \
501 static struct token
*
502 get_symname (struct linereader
*lr
)
504 /* Symbol in brackets. We must distinguish three kinds:
506 2. ISO 10646 position values
508 const struct keyword_t
*kw
;
510 struct lr_buffer lrb
;
512 lr_buffer_init (&lrb
);
517 if (ch
== lr
->escape_char
)
519 int c2
= lr_getc (lr
);
528 while (ch
!= '>' && ch
!= '\n');
531 lr_error (lr
, _("unterminated symbolic name"));
533 /* Test for ISO 10646 position value. */
534 if (lrb
.buf
[0] == 'U' && (lrb
.act
== 6 || lrb
.act
== 10))
536 char *cp
= lrb
.buf
+ 1;
537 while (cp
< &lrb
.buf
[lrb
.act
- 1] && isxdigit (*cp
))
540 if (cp
== &lrb
.buf
[lrb
.act
- 1])
543 lr
->token
.tok
= tok_ucs4
;
544 lr
->token
.val
.ucs4
= strtoul (lrb
.buf
+ 1, NULL
, 16);
550 /* It is a symbolic name. Test for reserved words. */
551 kw
= lr
->hash_fct (lrb
.buf
, lrb
.act
- 1);
553 if (kw
!= NULL
&& kw
->symname_or_ident
== 1)
555 lr
->token
.tok
= kw
->token
;
560 lr
->token
.tok
= tok_bsymbol
;
561 lr_buffer_to_token (&lrb
, lr
);
562 --lr
->token
.val
.str
.lenmb
; /* Hide the trailing '>'. */
569 static struct token
*
570 get_ident (struct linereader
*lr
)
572 const struct keyword_t
*kw
;
574 struct lr_buffer lrb
;
576 lr_buffer_init (&lrb
);
578 addc (&lrb
, lr
->buf
[lr
->idx
- 1]);
580 while (!isspace ((ch
= lr_getc (lr
))) && ch
!= '"' && ch
!= ';'
581 && ch
!= '<' && ch
!= ',' && ch
!= EOF
)
583 if (ch
== lr
->escape_char
)
586 if (ch
== '\n' || ch
== EOF
)
588 lr_error (lr
, _("invalid escape sequence"));
597 kw
= lr
->hash_fct (lrb
.buf
, lrb
.act
);
599 if (kw
!= NULL
&& kw
->symname_or_ident
== 0)
601 lr
->token
.tok
= kw
->token
;
606 lr
->token
.tok
= tok_ident
;
607 lr_buffer_to_token (&lrb
, lr
);
613 /* Process a decoded Unicode codepoint WCH in a string, placing the
614 multibyte sequence into LRB. Return false if the character is not
615 found in CHARMAP/REPERTOIRE. */
617 translate_unicode_codepoint (struct localedef_t
*locale
,
618 const struct charmap_t
*charmap
,
619 const struct repertoire_t
*repertoire
,
620 uint32_t wch
, struct lr_buffer
*lrb
)
622 /* See whether the charmap contains the Uxxxxxxxx names. */
624 snprintf (utmp
, sizeof (utmp
), "U%08X", wch
);
625 struct charseq
*seq
= charmap_find_value (charmap
, utmp
, 9);
629 /* No, this isn't the case. Now determine from
630 the repertoire the name of the character and
631 find it in the charmap. */
632 if (repertoire
!= NULL
)
634 const char *symbol
= repertoire_find_symbol (repertoire
, wch
);
636 seq
= charmap_find_value (charmap
, symbol
, strlen (symbol
));
641 #ifndef NO_TRANSLITERATION
642 /* Transliterate if possible. */
645 if ((locale
->avail
& CTYPE_LOCALE
) == 0)
647 /* Load the CTYPE data now. */
648 int old_needed
= locale
->needed
;
651 locale
= load_locale (LC_CTYPE
, locale
->name
,
652 locale
->repertoire_name
,
654 locale
->needed
= old_needed
;
658 if ((locale
->avail
& CTYPE_LOCALE
) != 0
659 && ((translit
= find_translit (locale
, charmap
, wch
))
661 /* The CTYPE data contains a matching
664 for (int i
= 0; translit
[i
] != 0; ++i
)
666 snprintf (utmp
, sizeof (utmp
), "U%08X", translit
[i
]);
667 seq
= charmap_find_value (charmap
, utmp
, 9);
668 assert (seq
!= NULL
);
669 adds (lrb
, seq
->bytes
, seq
->nbytes
);
674 #endif /* NO_TRANSLITERATION */
676 /* Not a known name. */
683 adds (lrb
, seq
->bytes
, seq
->nbytes
);
690 /* Returns true if ch is not EOF (that is, non-negative) and a valid
691 UTF-8 trailing byte. */
693 utf8_valid_trailing (int ch
)
695 return ch
>= 0 && (ch
& 0xc0) == 0x80;
698 /* Reports an error for a broken UTF-8 sequence. CH2 to CH4 may be
699 EOF. Always returns false. */
701 utf8_sequence_error (struct linereader
*lr
, uint8_t ch1
, int ch2
, int ch3
,
707 snprintf (buf
, sizeof (buf
), "0x%02x", ch1
);
709 snprintf (buf
, sizeof (buf
), "0x%02x 0x%02x", ch1
, ch2
);
711 snprintf (buf
, sizeof (buf
), "0x%02x 0x%02x 0x%02x", ch1
, ch2
, ch3
);
713 snprintf (buf
, sizeof (buf
), "0x%02x 0x%02x 0x%02x 0x%02x",
716 lr_error (lr
, _("invalid UTF-8 sequence %s"), buf
);
720 /* Reads a UTF-8 sequence from LR, with the leading byte CH1, and
721 stores the decoded codepoint in *WCH. Returns false on failure and
724 utf8_decode (struct linereader
*lr
, uint8_t ch1
, uint32_t *wch
)
726 /* See RFC 3629 section 4 and __gconv_transform_utf8_internal. */
728 return utf8_sequence_error (lr
, ch1
, -1, -1, -1);
730 int ch2
= lr_getc (lr
);
731 if (!utf8_valid_trailing (ch2
))
732 return utf8_sequence_error (lr
, ch1
, ch2
, -1, -1);
736 uint32_t result
= ((ch1
& 0x1f) << 6) | (ch2
& 0x3f);
738 return utf8_sequence_error (lr
, ch1
, ch2
, -1, -1);
743 int ch3
= lr_getc (lr
);
744 if (!utf8_valid_trailing (ch3
) || ch1
< 0xe0)
745 return utf8_sequence_error (lr
, ch1
, ch2
, ch3
, -1);
749 uint32_t result
= (((ch1
& 0x0f) << 12)
750 | ((ch2
& 0x3f) << 6)
753 return utf8_sequence_error (lr
, ch1
, ch2
, ch3
, -1);
758 int ch4
= lr_getc (lr
);
759 if (!utf8_valid_trailing (ch4
) || ch1
< 0xf0 || ch1
> 0xf4)
760 return utf8_sequence_error (lr
, ch1
, ch2
, ch3
, ch4
);
762 uint32_t result
= (((ch1
& 0x07) << 18)
763 | ((ch2
& 0x3f) << 12)
764 | ((ch3
& 0x3f) << 6)
766 if (result
< 0x10000)
767 return utf8_sequence_error (lr
, ch1
, ch2
, ch3
, ch4
);
772 static struct token
*
773 get_string (struct linereader
*lr
, const struct charmap_t
*charmap
,
774 struct localedef_t
*locale
, const struct repertoire_t
*repertoire
,
777 int return_widestr
= lr
->return_widestr
;
778 struct lr_buffer lrb
;
779 wchar_t *buf2
= NULL
;
781 lr_buffer_init (&lrb
);
783 /* We know it'll be a string. */
784 lr
->token
.tok
= tok_string
;
786 /* If we need not translate the strings (i.e., expand <...> parts)
787 we can run a simple loop. */
788 if (!lr
->translate_strings
)
793 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
796 lr_error (lr
, _("illegal 8-bit character in untranslated string"));
800 /* Catch errors with trailing escape character. */
801 if (lrb
.act
> 0 && lrb
.buf
[lrb
.act
- 1] == lr
->escape_char
802 && (lrb
.act
== 1 || lrb
.buf
[lrb
.act
- 2] != lr
->escape_char
))
804 lr_error (lr
, _("illegal escape sequence at end of string"));
807 else if (ch
== '\n' || ch
== EOF
)
808 lr_error (lr
, _("unterminated string"));
814 bool illegal_string
= false;
816 size_t buf2max
= 56 * sizeof (uint32_t);
819 /* We have to provide the wide character result as well. */
821 buf2
= xmalloc (buf2max
);
823 /* Read until the end of the string (or end of the line or file). */
824 while ((ch
= lr_getc (lr
)) != '"' && ch
!= '\n' && ch
!= EOF
)
832 /* The standards leave it up to the implementation to
833 decide what to do with characters which stand for
834 themselves. This implementation treats the input
835 file as encoded in UTF-8. */
836 if (ch
== lr
->escape_char
)
841 lr_error (lr
, _("illegal 8-bit escape sequence"));
842 illegal_string
= true;
845 if (ch
== '\n' || ch
== EOF
)
855 else /* UTF-8 sequence. */
857 if (!utf8_decode (lr
, ch
, &wch
))
859 illegal_string
= true;
862 if (!translate_unicode_codepoint (locale
, charmap
,
863 repertoire
, wch
, &lrb
))
865 /* Ignore the rest of the string. Callers may
866 skip this string because it cannot be encoded
867 in the output character set. */
868 illegal_string
= true;
879 /* Now we have to search for the end of the symbolic name, i.e.,
882 while ((ch
= lr_getc (lr
)) != '>' && ch
!= '\n' && ch
!= EOF
)
884 if (ch
== lr
->escape_char
)
887 if (ch
== '\n' || ch
== EOF
)
892 if (ch
== '\n' || ch
== EOF
)
893 /* Not a correct string. */
895 if (lrb
.act
== startidx
)
897 /* <> is no correct name. Ignore it and also signal an
899 illegal_string
= true;
903 /* It might be a Uxxxx symbol. */
904 if (lrb
.buf
[startidx
] == 'U'
905 && (lrb
.act
- startidx
== 5 || lrb
.act
- startidx
== 9))
907 char *cp
= lrb
.buf
+ startidx
+ 1;
908 while (cp
< &lrb
.buf
[lrb
.act
] && isxdigit (*cp
))
911 if (cp
== &lrb
.buf
[lrb
.act
])
915 wch
= strtoul (lrb
.buf
+ startidx
+ 1, NULL
, 16);
917 /* Now forget about the name we just added. */
923 if (!translate_unicode_codepoint (locale
, charmap
,
924 repertoire
, wch
, &lrb
))
925 illegal_string
= true;
930 /* We now have the symbolic name in lrb.buf[startidx] to
931 lrb.buf[lrb.act-1]. Now find out the value for this character
932 in the charmap as well as in the repertoire map (in this
934 seq
= charmap_find_value (charmap
, &lrb
.buf
[startidx
],
939 /* This name is not in the charmap. */
940 lr_error (lr
, _("symbol `%.*s' not in charmap"),
941 (int) (lrb
.act
- startidx
), &lrb
.buf
[startidx
]);
942 illegal_string
= true;
947 /* Now the same for the multibyte representation. */
948 if (seq
!= NULL
&& seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
952 wch
= repertoire_find_value (repertoire
, &lrb
.buf
[startidx
],
958 if (wch
== ILLEGAL_CHAR_VALUE
)
960 /* This name is not in the repertoire map. */
961 lr_error (lr
, _("symbol `%.*s' not in repertoire map"),
962 (int) (lrb
.act
- startidx
), &lrb
.buf
[startidx
]);
963 illegal_string
= true;
969 /* Now forget about the name we just added. */
972 /* And copy the bytes. */
974 adds (&lrb
, seq
->bytes
, seq
->nbytes
);
977 if (ch
== '\n' || ch
== EOF
)
979 lr_error (lr
, _("unterminated string"));
980 illegal_string
= true;
987 lr
->token
.val
.str
.startmb
= NULL
;
988 lr
->token
.val
.str
.lenmb
= 0;
989 lr
->token
.val
.str
.startwc
= NULL
;
990 lr
->token
.val
.str
.lenwc
= 0;
1000 lr
->token
.val
.str
.startwc
= xrealloc (buf2
,
1001 buf2act
* sizeof (uint32_t));
1002 lr
->token
.val
.str
.lenwc
= buf2act
;
1006 lr_buffer_to_token (&lrb
, lr
);