From fab3c6d31b4913b20d6b6879488fa4de409e5d46 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=D0=A7=D0=B0=D1=81=D0=BB=D0=B0=D0=B2=20=D0=98=D0=BB=D0=B8?=
 =?utf8?q?=D1=9B=20=28Chusslove=20Illich=29?= <caslav.ilic@gmx.net>
Date: Mon, 27 Jul 2009 22:20:41 +0200
Subject: [PATCH] Support accented letters in recode-sr-latin.

---
 gettext-tools/src/ChangeLog           |   7 ++
 gettext-tools/src/filter-sr-latin.c   | 174 +++++++++++++++++++++++---
 gettext-tools/tests/ChangeLog         |   5 +
 gettext-tools/tests/recode-sr-latin-1 |   8 ++
 4 files changed, 180 insertions(+), 14 deletions(-)

diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog
index e917ce604..0d68bdcbd 100644
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,10 @@
+2009-07-27  Часлав Илић (Chusslove Illich)  <caslav.ilic@gmx.net>
+
+	Extend recode-sr-latin to also transform letters with accents.
+	* filter-sr-latin.c (table): Add entries for letters with accents.
+	(IS_UPPERCASE_CYRILLIC): Recognize also U+04E2, U+04EE.
+	(serbian_to_latin): Update.
+
 2009-06-06  Bruno Haible  <bruno@clisp.org>
 
 	* x-perl.c (x_perl_prelex): Recognize the perl 5.10 operator '//'.
diff --git a/gettext-tools/src/filter-sr-latin.c b/gettext-tools/src/filter-sr-latin.c
index bb7e99c6c..e76fbe70a 100644
--- a/gettext-tools/src/filter-sr-latin.c
+++ b/gettext-tools/src/filter-sr-latin.c
@@ -1,5 +1,5 @@
 /* Recode Serbian text from Cyrillic to Latin script.
-   Copyright (C) 2006-2007 Free Software Foundation, Inc.
+   Copyright (C) 2006-2007, 2009 Free Software Foundation, Inc.
    Written by Danilo Šegan <danilo@gnome.org>, 2006,
    and Bruno Haible <bruno@clisp.org>, 2006.
 
@@ -29,11 +29,11 @@
 
 
 /* Table for Serbian Cyrillic to Latin transcription.
-   The table is indexed by the Unicode code point, in the range 0x0400..0x045f.
+   The table is indexed by the Unicode code point, in the range 0x0400..0x04ef.
    The longest table entry is three bytes long.  */
-static const char table[96][3 + 1] =
+static const char table[240][3 + 1] =
 {
-  /* U+0400 */ "",
+  /* U+0400 */ "\xC3\x88", /* "È" */
   /* U+0401 */ "",
   /* U+0402 */ "\xC4\x90", /* "Đ" */
   /* U+0403 */ "",
@@ -46,7 +46,7 @@ static const char table[96][3 + 1] =
   /* U+040A */ "Nj",
   /* U+040B */ "\xC4\x86", /* "Ć" */
   /* U+040C */ "",
-  /* U+040D */ "",
+  /* U+040D */ "\xC3\x8C", /* "Ì" */
   /* U+040E */ "",
   /* U+040F */ "D\xC5\xBE", /* "Dž" */
   /* U+0410 */ "A",
@@ -113,7 +113,7 @@ static const char table[96][3 + 1] =
   /* U+044D */ "",
   /* U+044E */ "",
   /* U+044F */ "",
-  /* U+0450 */ "",
+  /* U+0450 */ "\xC3\xA8", /* "è" */
   /* U+0451 */ "",
   /* U+0452 */ "\xC4\x91", /* "đ" */
   /* U+0453 */ "",
@@ -126,9 +126,153 @@ static const char table[96][3 + 1] =
   /* U+045A */ "nj",
   /* U+045B */ "\xC4\x87", /* "ć" */
   /* U+045C */ "",
-  /* U+045D */ "",
+  /* U+045D */ "\xC3\xAC", /* "ì" */
   /* U+045E */ "",
-  /* U+045F */ "d\xC5\xBE" /* "dž" */
+  /* U+045F */ "d\xC5\xBE", /* "dž" */
+  /* U+0460 */ "",
+  /* U+0461 */ "",
+  /* U+0462 */ "",
+  /* U+0463 */ "",
+  /* U+0464 */ "",
+  /* U+0465 */ "",
+  /* U+0466 */ "",
+  /* U+0467 */ "",
+  /* U+0468 */ "",
+  /* U+0469 */ "",
+  /* U+046A */ "",
+  /* U+046B */ "",
+  /* U+046C */ "",
+  /* U+046D */ "",
+  /* U+046E */ "",
+  /* U+046F */ "",
+  /* U+0470 */ "",
+  /* U+0471 */ "",
+  /* U+0472 */ "",
+  /* U+0473 */ "",
+  /* U+0474 */ "",
+  /* U+0475 */ "",
+  /* U+0476 */ "",
+  /* U+0477 */ "",
+  /* U+0478 */ "",
+  /* U+0479 */ "",
+  /* U+047A */ "",
+  /* U+047B */ "",
+  /* U+047C */ "",
+  /* U+047D */ "",
+  /* U+047E */ "",
+  /* U+047F */ "",
+  /* U+0480 */ "",
+  /* U+0481 */ "",
+  /* U+0482 */ "",
+  /* U+0483 */ "",
+  /* U+0484 */ "",
+  /* U+0485 */ "",
+  /* U+0486 */ "",
+  /* U+0487 */ "",
+  /* U+0488 */ "",
+  /* U+0489 */ "",
+  /* U+048A */ "",
+  /* U+048B */ "",
+  /* U+048C */ "",
+  /* U+048D */ "",
+  /* U+048E */ "",
+  /* U+048F */ "",
+  /* U+0490 */ "",
+  /* U+0491 */ "",
+  /* U+0492 */ "",
+  /* U+0493 */ "",
+  /* U+0494 */ "",
+  /* U+0495 */ "",
+  /* U+0496 */ "",
+  /* U+0497 */ "",
+  /* U+0498 */ "",
+  /* U+0499 */ "",
+  /* U+049A */ "",
+  /* U+049B */ "",
+  /* U+049C */ "",
+  /* U+049D */ "",
+  /* U+049E */ "",
+  /* U+049F */ "",
+  /* U+04A0 */ "",
+  /* U+04A1 */ "",
+  /* U+04A2 */ "",
+  /* U+04A3 */ "",
+  /* U+04A4 */ "",
+  /* U+04A5 */ "",
+  /* U+04A6 */ "",
+  /* U+04A7 */ "",
+  /* U+04A8 */ "",
+  /* U+04A9 */ "",
+  /* U+04AA */ "",
+  /* U+04AB */ "",
+  /* U+04AC */ "",
+  /* U+04AD */ "",
+  /* U+04AE */ "",
+  /* U+04AF */ "",
+  /* U+04B0 */ "",
+  /* U+04B1 */ "",
+  /* U+04B2 */ "",
+  /* U+04B3 */ "",
+  /* U+04B4 */ "",
+  /* U+04B5 */ "",
+  /* U+04B6 */ "",
+  /* U+04B7 */ "",
+  /* U+04B8 */ "",
+  /* U+04B9 */ "",
+  /* U+04BA */ "",
+  /* U+04BB */ "",
+  /* U+04BC */ "",
+  /* U+04BD */ "",
+  /* U+04BE */ "",
+  /* U+04BF */ "",
+  /* U+04C0 */ "",
+  /* U+04C1 */ "",
+  /* U+04C2 */ "",
+  /* U+04C3 */ "",
+  /* U+04C4 */ "",
+  /* U+04C5 */ "",
+  /* U+04C6 */ "",
+  /* U+04C7 */ "",
+  /* U+04C8 */ "",
+  /* U+04C9 */ "",
+  /* U+04CA */ "",
+  /* U+04CB */ "",
+  /* U+04CC */ "",
+  /* U+04CD */ "",
+  /* U+04CE */ "",
+  /* U+04CF */ "",
+  /* U+04D0 */ "",
+  /* U+04D1 */ "",
+  /* U+04D2 */ "",
+  /* U+04D3 */ "",
+  /* U+04D4 */ "",
+  /* U+04D5 */ "",
+  /* U+04D6 */ "",
+  /* U+04D7 */ "",
+  /* U+04D8 */ "",
+  /* U+04D9 */ "",
+  /* U+04DA */ "",
+  /* U+04DB */ "",
+  /* U+04DC */ "",
+  /* U+04DD */ "",
+  /* U+04DE */ "",
+  /* U+04DF */ "",
+  /* U+04E0 */ "",
+  /* U+04E1 */ "",
+  /* U+04E2 */ "\xC4\xAA", /* "Ī" */
+  /* U+04E3 */ "\xC4\xAB", /* "ī" */
+  /* U+04E4 */ "",
+  /* U+04E5 */ "",
+  /* U+04E6 */ "",
+  /* U+04E7 */ "",
+  /* U+04E8 */ "",
+  /* U+04E9 */ "",
+  /* U+04EA */ "",
+  /* U+04EB */ "",
+  /* U+04EC */ "",
+  /* U+04ED */ "",
+  /* U+04EE */ "\xC5\xAA", /* "Ū" */
+  /* U+04EF */ "\xC5\xAB" /* "ū" */
 };
 
 /* Quick test for an uppercase character in the range U+0041..U+005A.
@@ -136,23 +280,25 @@ static const char table[96][3 + 1] =
 #define IS_UPPERCASE_LATIN(byte) \
   ((unsigned char) ((byte) - 'A') <= 'Z' - 'A')
 
-/* Quick test for an uppercase character in the range U+0400..U+042F.
+/* Quick test for an uppercase character in the range U+0400..U+042F,
+   or exactly U+04E2 or U+04EE.
    The arguments must be bytes in the range 0..UCHAR_MAX.  */
 #define IS_UPPERCASE_CYRILLIC(byte1,byte2) \
-  ((byte1) == 0xd0 && (unsigned char) ((byte2) - 0x80) < 0x30)
+  (((byte1) == 0xd0 && (unsigned char) ((byte2) - 0x80) < 0x30) \
+   || ((byte1) == 0xd3 && ((byte2) == 0xa2 || (byte2) == 0xae)))
 
 void
 serbian_to_latin (const char *input, size_t input_len,
 		  char **output_p, size_t *output_len_p)
 {
   /* Loop through the input string, producing a replacement for each character.
-     Only characters in the range U+0400..U+045F (\xD0\x80..\xD1\x9F) need to
+     Only characters in the range U+0400..U+04EF (\xD0\x80..\xD3\xAF) need to
      be handled, and more precisely only those for which a replacement exists
      in the table.  Other characters are copied without modification.
      The characters U+0409, U+040A, U+040F are transliterated to uppercase or
      mixed-case replacements ("LJ" / "Lj", "NJ" / "Nj", "DŽ" / "Dž"), depending
      on the case of the surrounding characters.
-     Since we assume UTF-8 encoding, the bytes \xD0..\xD1 can only occur at the
+     Since we assume UTF-8 encoding, the bytes \xD0..\xD3 can only occur at the
      beginning of a character; the second and further bytes of a character are
      all in the range \x80..\xBF.  */
 
@@ -170,7 +316,7 @@ serbian_to_latin (const char *input, size_t input_len,
       unsigned char byte = (unsigned char) *ip;
 
       /* Test for the first byte of a Cyrillic character.  */
-      if ((byte >= 0xd0 && byte <= 0xd1) && (ip + 1 < input_end))
+      if ((byte >= 0xd0 && byte <= 0xd3) && (ip + 1 < input_end))
 	{
 	  unsigned char second_byte = (unsigned char) ip[1];
 
@@ -179,7 +325,7 @@ serbian_to_latin (const char *input, size_t input_len,
 	    {
 	      unsigned int uc = ((byte & 0x1f) << 6) | (second_byte & 0x3f);
 
-	      if (uc >= 0x0400 && uc <= 0x045f)
+	      if (uc >= 0x0400 && uc <= 0x04ef)
 		{
 		  /* Look up replacement from the table.  */
 		  const char *repl = table[uc - 0x0400];
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog
index f76744a98..cc6b4d167 100644
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,8 @@
+2009-07-27  Часлав Илић (Chusslove Illich)  <caslav.ilic@gmx.net>
+            Bruno Haible  <bruno@clisp.org>
+
+	* recode-sr-latin-1: Add test cases with accents.
+
 2009-06-06  Bruno Haible  <bruno@clisp.org>
 
 	* xgettext-perl-1: Add test of // operator.
diff --git a/gettext-tools/tests/recode-sr-latin-1 b/gettext-tools/tests/recode-sr-latin-1
index c71f87982..9565a3689 100755
--- a/gettext-tools/tests/recode-sr-latin-1
+++ b/gettext-tools/tests/recode-sr-latin-1
@@ -28,6 +28,10 @@ tmpfiles="$tmpfiles rec-srl-1.in"
 cat <<\EOF > rec-srl-1.in
 ÐÑÐ¿ÑÐ°Ð²Ð½Ð¸ Ð°ÑÐ³ÑÐ¼ÐµÐ½ÑÐ¸ ÑÑ
 Ð½ÐµÑÐµÐ´Ð½Ð¾Ð·Ð½Ð°ÑÐ°Ð½ Ð°ÑÐ³ÑÐ¼ÐµÐ½Ñ
+ÑÑÐ±Ð°Ð·Ð½Ð¸ ÑÐµÑÐµÑÑÐ¸ÑÐ° ÑÐ°ÑÐ°Ð²Ð¾Ð³ Ð»Ð¸ÑÐ° ÑÐ¾ÑÐµ Ð´Ð° Ð¼Ð¸ Ð¿Ð¾ÐºÐ°Ð¶Ðµ ÑÑÐ¾Ñ
+ÐÐ£ÐÐÐÐÐ Ð¤ÐÐÐÐ ÐÐÐÐ Ð§ÐÐÐÐÐÐ ÐÐÐ¦Ð Ð¥ÐÐÐ ÐÐ ÐÐ ÐÐÐÐÐÐ Ð¨Ð¢ÐÐ¡
+ÐÐ£ÐÐÐÐÐ ÐÑÐ±ÑÐ°Ð½Ð° ÐÐÐÐÐÐ ÐÐ¸Ð²ÑÐ°Ð½ ÐÐ£ÐÐÐ Ð ÐÑÑÐ¾ÑÐ¸
+ÑÑÓ£Ó¯ ÐÐÓ¢Ó® ÐÓ® Ó®Ð
 EOF
 
 tmpfiles="$tmpfiles rec-srl-1.out"
@@ -40,6 +44,10 @@ tmpfiles="$tmpfiles rec-srl-1.ok"
 cat <<\EOF > rec-srl-1.ok
 Ispravni argumenti su
 nejednoznačan argument
+ljubazni fenjerdžija čađavog lica hoće da mi pokaže štos
+LJUBAZNI FENJERDŽIJA ČAĐAVOG LICA HOĆE DA MI POKAŽE ŠTOS
+LJUBLJANA Ljubljana DŽIVDŽAN Dživdžan NJUNJORI Njunjori
+èìīū ÈÌĪŪ LJŪ ŪLJ
 EOF
 
 : ${DIFF=diff}
-- 
2.47.2