From fab3c6d31b4913b20d6b6879488fa4de409e5d46 Mon Sep 17 00:00:00 2001 From: =?utf8?q?=D0=A7=D0=B0=D1=81=D0=BB=D0=B0=D0=B2=20=D0=98=D0=BB=D0=B8?= =?utf8?q?=D1=9B=20=28Chusslove=20Illich=29?= Date: Mon, 27 Jul 2009 22:20:41 +0200 Subject: [PATCH] Support accented letters in recode-sr-latin. --- gettext-tools/src/ChangeLog | 7 ++ gettext-tools/src/filter-sr-latin.c | 174 +++++++++++++++++++++++--- gettext-tools/tests/ChangeLog | 5 + gettext-tools/tests/recode-sr-latin-1 | 8 ++ 4 files changed, 180 insertions(+), 14 deletions(-) diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index e917ce604..0d68bdcbd 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,3 +1,10 @@ +2009-07-27 Часлав Илић (Chusslove Illich) + + Extend recode-sr-latin to also transform letters with accents. + * filter-sr-latin.c (table): Add entries for letters with accents. + (IS_UPPERCASE_CYRILLIC): Recognize also U+04E2, U+04EE. + (serbian_to_latin): Update. + 2009-06-06 Bruno Haible * x-perl.c (x_perl_prelex): Recognize the perl 5.10 operator '//'. diff --git a/gettext-tools/src/filter-sr-latin.c b/gettext-tools/src/filter-sr-latin.c index bb7e99c6c..e76fbe70a 100644 --- a/gettext-tools/src/filter-sr-latin.c +++ b/gettext-tools/src/filter-sr-latin.c @@ -1,5 +1,5 @@ /* Recode Serbian text from Cyrillic to Latin script. - Copyright (C) 2006-2007 Free Software Foundation, Inc. + Copyright (C) 2006-2007, 2009 Free Software Foundation, Inc. Written by Danilo Šegan , 2006, and Bruno Haible , 2006. @@ -29,11 +29,11 @@ /* Table for Serbian Cyrillic to Latin transcription. - The table is indexed by the Unicode code point, in the range 0x0400..0x045f. + The table is indexed by the Unicode code point, in the range 0x0400..0x04ef. The longest table entry is three bytes long. */ -static const char table[96][3 + 1] = +static const char table[240][3 + 1] = { - /* U+0400 */ "", + /* U+0400 */ "\xC3\x88", /* "È" */ /* U+0401 */ "", /* U+0402 */ "\xC4\x90", /* "Đ" */ /* U+0403 */ "", @@ -46,7 +46,7 @@ static const char table[96][3 + 1] = /* U+040A */ "Nj", /* U+040B */ "\xC4\x86", /* "Ć" */ /* U+040C */ "", - /* U+040D */ "", + /* U+040D */ "\xC3\x8C", /* "Ì" */ /* U+040E */ "", /* U+040F */ "D\xC5\xBE", /* "Dž" */ /* U+0410 */ "A", @@ -113,7 +113,7 @@ static const char table[96][3 + 1] = /* U+044D */ "", /* U+044E */ "", /* U+044F */ "", - /* U+0450 */ "", + /* U+0450 */ "\xC3\xA8", /* "è" */ /* U+0451 */ "", /* U+0452 */ "\xC4\x91", /* "đ" */ /* U+0453 */ "", @@ -126,9 +126,153 @@ static const char table[96][3 + 1] = /* U+045A */ "nj", /* U+045B */ "\xC4\x87", /* "ć" */ /* U+045C */ "", - /* U+045D */ "", + /* U+045D */ "\xC3\xAC", /* "ì" */ /* U+045E */ "", - /* U+045F */ "d\xC5\xBE" /* "dž" */ + /* U+045F */ "d\xC5\xBE", /* "dž" */ + /* U+0460 */ "", + /* U+0461 */ "", + /* U+0462 */ "", + /* U+0463 */ "", + /* U+0464 */ "", + /* U+0465 */ "", + /* U+0466 */ "", + /* U+0467 */ "", + /* U+0468 */ "", + /* U+0469 */ "", + /* U+046A */ "", + /* U+046B */ "", + /* U+046C */ "", + /* U+046D */ "", + /* U+046E */ "", + /* U+046F */ "", + /* U+0470 */ "", + /* U+0471 */ "", + /* U+0472 */ "", + /* U+0473 */ "", + /* U+0474 */ "", + /* U+0475 */ "", + /* U+0476 */ "", + /* U+0477 */ "", + /* U+0478 */ "", + /* U+0479 */ "", + /* U+047A */ "", + /* U+047B */ "", + /* U+047C */ "", + /* U+047D */ "", + /* U+047E */ "", + /* U+047F */ "", + /* U+0480 */ "", + /* U+0481 */ "", + /* U+0482 */ "", + /* U+0483 */ "", + /* U+0484 */ "", + /* U+0485 */ "", + /* U+0486 */ "", + /* U+0487 */ "", + /* U+0488 */ "", + /* U+0489 */ "", + /* U+048A */ "", + /* U+048B */ "", + /* U+048C */ "", + /* U+048D */ "", + /* U+048E */ "", + /* U+048F */ "", + /* U+0490 */ "", + /* U+0491 */ "", + /* U+0492 */ "", + /* U+0493 */ "", + /* U+0494 */ "", + /* U+0495 */ "", + /* U+0496 */ "", + /* U+0497 */ "", + /* U+0498 */ "", + /* U+0499 */ "", + /* U+049A */ "", + /* U+049B */ "", + /* U+049C */ "", + /* U+049D */ "", + /* U+049E */ "", + /* U+049F */ "", + /* U+04A0 */ "", + /* U+04A1 */ "", + /* U+04A2 */ "", + /* U+04A3 */ "", + /* U+04A4 */ "", + /* U+04A5 */ "", + /* U+04A6 */ "", + /* U+04A7 */ "", + /* U+04A8 */ "", + /* U+04A9 */ "", + /* U+04AA */ "", + /* U+04AB */ "", + /* U+04AC */ "", + /* U+04AD */ "", + /* U+04AE */ "", + /* U+04AF */ "", + /* U+04B0 */ "", + /* U+04B1 */ "", + /* U+04B2 */ "", + /* U+04B3 */ "", + /* U+04B4 */ "", + /* U+04B5 */ "", + /* U+04B6 */ "", + /* U+04B7 */ "", + /* U+04B8 */ "", + /* U+04B9 */ "", + /* U+04BA */ "", + /* U+04BB */ "", + /* U+04BC */ "", + /* U+04BD */ "", + /* U+04BE */ "", + /* U+04BF */ "", + /* U+04C0 */ "", + /* U+04C1 */ "", + /* U+04C2 */ "", + /* U+04C3 */ "", + /* U+04C4 */ "", + /* U+04C5 */ "", + /* U+04C6 */ "", + /* U+04C7 */ "", + /* U+04C8 */ "", + /* U+04C9 */ "", + /* U+04CA */ "", + /* U+04CB */ "", + /* U+04CC */ "", + /* U+04CD */ "", + /* U+04CE */ "", + /* U+04CF */ "", + /* U+04D0 */ "", + /* U+04D1 */ "", + /* U+04D2 */ "", + /* U+04D3 */ "", + /* U+04D4 */ "", + /* U+04D5 */ "", + /* U+04D6 */ "", + /* U+04D7 */ "", + /* U+04D8 */ "", + /* U+04D9 */ "", + /* U+04DA */ "", + /* U+04DB */ "", + /* U+04DC */ "", + /* U+04DD */ "", + /* U+04DE */ "", + /* U+04DF */ "", + /* U+04E0 */ "", + /* U+04E1 */ "", + /* U+04E2 */ "\xC4\xAA", /* "Ī" */ + /* U+04E3 */ "\xC4\xAB", /* "ī" */ + /* U+04E4 */ "", + /* U+04E5 */ "", + /* U+04E6 */ "", + /* U+04E7 */ "", + /* U+04E8 */ "", + /* U+04E9 */ "", + /* U+04EA */ "", + /* U+04EB */ "", + /* U+04EC */ "", + /* U+04ED */ "", + /* U+04EE */ "\xC5\xAA", /* "Ū" */ + /* U+04EF */ "\xC5\xAB" /* "ū" */ }; /* Quick test for an uppercase character in the range U+0041..U+005A. @@ -136,23 +280,25 @@ static const char table[96][3 + 1] = #define IS_UPPERCASE_LATIN(byte) \ ((unsigned char) ((byte) - 'A') <= 'Z' - 'A') -/* Quick test for an uppercase character in the range U+0400..U+042F. +/* Quick test for an uppercase character in the range U+0400..U+042F, + or exactly U+04E2 or U+04EE. The arguments must be bytes in the range 0..UCHAR_MAX. */ #define IS_UPPERCASE_CYRILLIC(byte1,byte2) \ - ((byte1) == 0xd0 && (unsigned char) ((byte2) - 0x80) < 0x30) + (((byte1) == 0xd0 && (unsigned char) ((byte2) - 0x80) < 0x30) \ + || ((byte1) == 0xd3 && ((byte2) == 0xa2 || (byte2) == 0xae))) void serbian_to_latin (const char *input, size_t input_len, char **output_p, size_t *output_len_p) { /* Loop through the input string, producing a replacement for each character. - Only characters in the range U+0400..U+045F (\xD0\x80..\xD1\x9F) need to + Only characters in the range U+0400..U+04EF (\xD0\x80..\xD3\xAF) need to be handled, and more precisely only those for which a replacement exists in the table. Other characters are copied without modification. The characters U+0409, U+040A, U+040F are transliterated to uppercase or mixed-case replacements ("LJ" / "Lj", "NJ" / "Nj", "DŽ" / "Dž"), depending on the case of the surrounding characters. - Since we assume UTF-8 encoding, the bytes \xD0..\xD1 can only occur at the + Since we assume UTF-8 encoding, the bytes \xD0..\xD3 can only occur at the beginning of a character; the second and further bytes of a character are all in the range \x80..\xBF. */ @@ -170,7 +316,7 @@ serbian_to_latin (const char *input, size_t input_len, unsigned char byte = (unsigned char) *ip; /* Test for the first byte of a Cyrillic character. */ - if ((byte >= 0xd0 && byte <= 0xd1) && (ip + 1 < input_end)) + if ((byte >= 0xd0 && byte <= 0xd3) && (ip + 1 < input_end)) { unsigned char second_byte = (unsigned char) ip[1]; @@ -179,7 +325,7 @@ serbian_to_latin (const char *input, size_t input_len, { unsigned int uc = ((byte & 0x1f) << 6) | (second_byte & 0x3f); - if (uc >= 0x0400 && uc <= 0x045f) + if (uc >= 0x0400 && uc <= 0x04ef) { /* Look up replacement from the table. */ const char *repl = table[uc - 0x0400]; diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog index f76744a98..cc6b4d167 100644 --- a/gettext-tools/tests/ChangeLog +++ b/gettext-tools/tests/ChangeLog @@ -1,3 +1,8 @@ +2009-07-27 Часлав Илић (Chusslove Illich) + Bruno Haible + + * gettext-tools/tests/recode-sr-latin-1: Add test cases with accents. + 2009-06-06 Bruno Haible * xgettext-perl-1: Add test of // operator. diff --git a/gettext-tools/tests/recode-sr-latin-1 b/gettext-tools/tests/recode-sr-latin-1 index c71f87982..9565a3689 100755 --- a/gettext-tools/tests/recode-sr-latin-1 +++ b/gettext-tools/tests/recode-sr-latin-1 @@ -28,6 +28,10 @@ tmpfiles="$tmpfiles rec-srl-1.in" cat <<\EOF > rec-srl-1.in Исправни аргументи су неједнозначан аргумент +љубазни фењерџија чађавог лица хоће да ми покаже штос +ЉУБАЗНИ ФЕЊЕРЏИЈА ЧАЂАВОГ ЛИЦА ХОЋЕ ДА МИ ПОКАЖЕ ШТОС +ЉУБЉАНА Љубљана ЏИВЏАН Џивџан ЊУЊОРИ Њуњори +ѐѝӣӯ ЍЀӢӮ ЉӮ ӮЉ EOF tmpfiles="$tmpfiles rec-srl-1.out" @@ -40,6 +44,10 @@ tmpfiles="$tmpfiles rec-srl-1.ok" cat <<\EOF > rec-srl-1.ok Ispravni argumenti su nejednoznačan argument +ljubazni fenjerdžija čađavog lica hoće da mi pokaže štos +LJUBAZNI FENJERDŽIJA ČAĐAVOG LICA HOĆE DA MI POKAŽE ŠTOS +LJUBLJANA Ljubljana DŽIVDŽAN Dživdžan NJUNJORI Njunjori +èìīū ÌÈĪŪ LJŪ ŪLJ EOF : ${DIFF=diff} -- 2.47.2