catgets/gencat.c

   1 /* Copyright (C) 1996-1999, 2000, 2001, 2002 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 1996.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <argp.h>
  25 #include <assert.h>
  26 #include <ctype.h>
  27 #include <endian.h>
  28 #include <errno.h>
  29 #include <error.h>
  30 #include <fcntl.h>
  31 #include <iconv.h>
  32 #include <langinfo.h>
  33 #include <locale.h>
  34 #include <libintl.h>
  35 #include <limits.h>
  36 #include <nl_types.h>
  37 #include <obstack.h>
  38 #include <stdint.h>
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <string.h>
  42 #include <unistd.h>
  43 #include <wchar.h>
  44
  45 #include "version.h"
  46
  47 #include "catgetsinfo.h"
  48
  49
  50 #define SWAPU32(w) \
  51   (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
  52
  53 struct message_list
  54 {
  55   int number;
  56   const char *message;
  57
  58   const char *fname;
  59   size_t line;
  60   const char *symbol;
  61
  62   struct message_list *next;
  63 };
  64
  65
  66 struct set_list
  67 {
  68   int number;
  69   int deleted;
  70   struct message_list *messages;
  71   int last_message;
  72
  73   const char *fname;
  74   size_t line;
  75   const char *symbol;
  76
  77   struct set_list *next;
  78 };
  79
  80
  81 struct catalog
  82 {
  83   struct set_list *all_sets;
  84   struct set_list *current_set;
  85   size_t total_messages;
  86   wint_t quote_char;
  87   int last_set;
  88
  89   struct obstack mem_pool;
  90 };
  91
  92
  93 /* If non-zero force creation of new file, not using existing one.  */
  94 static int force_new;
  95
  96 /* Name of output file.  */
  97 static const char *output_name;
  98
  99 /* Name of generated C header file.  */
 100 static const char *header_name;
 101
 102 /* Name and version of program.  */
 103 static void print_version (FILE *stream, struct argp_state *state);
 104 void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;
 105
 106 #define OPT_NEW 1
 107
 108 /* Definitions of arguments for argp functions.  */
 109 static const struct argp_option options[] =
 110 {
 111   { "header", 'H', N_("NAME"), 0,
 112     N_("Create C header file NAME containing symbol definitions") },
 113   { "new", OPT_NEW, NULL, 0,
 114     N_("Do not use existing catalog, force new output file") },
 115   { "output", 'o', N_("NAME"), 0, N_("Write output to file NAME") },
 116   { NULL, 0, NULL, 0, NULL }
 117 };
 118
 119 /* Short description of program.  */
 120 static const char doc[] = N_("Generate message catalog.\
 121 \vIf INPUT-FILE is -, input is read from standard input.  If OUTPUT-FILE\n\
 122 is -, output is written to standard output.\n");
 123
 124 /* Strings for arguments in help texts.  */
 125 static const char args_doc[] = N_("\
 126 -o OUTPUT-FILE [INPUT-FILE]...\n[OUTPUT-FILE [INPUT-FILE]...]");
 127
 128 /* Prototype for option handler.  */
 129 static error_t parse_opt (int key, char *arg, struct argp_state *state);
 130
 131 /* Function to print some extra text in the help message.  */
 132 static char *more_help (int key, const char *text, void *input);
 133
 134 /* Data structure to communicate with argp functions.  */
 135 static struct argp argp =
 136 {
 137   options, parse_opt, args_doc, doc, NULL, more_help
 138 };
 139
 140
 141 /* Wrapper functions with error checking for standard functions.  */
 142 extern void *xmalloc (size_t n);
 143 extern void *xcalloc (size_t n, size_t s);
 144 extern void *xrealloc (void *o, size_t n);
 145 extern char *xstrdup (const char *);
 146
 147 /* Prototypes for local functions.  */
 148 static void error_print (void);
 149 static struct catalog *read_input_file (struct catalog *current,
 150                                         const char *fname);
 151 static void write_out (struct catalog *result, const char *output_name,
 152                        const char *header_name);
 153 static struct set_list *find_set (struct catalog *current, int number);
 154 static void normalize_line (const char *fname, size_t line, iconv_t cd,
 155                             wchar_t *string, wchar_t quote_char,
 156                             wchar_t escape_char);
 157 static void read_old (struct catalog *catalog, const char *file_name);
 158 static int open_conversion (const char *codesetp, iconv_t *cd_towcp,
 159                             iconv_t *cd_tombp, wchar_t *escape_charp);
 160
 161
 162 int
 163 main (int argc, char *argv[])
 164 {
 165   struct catalog *result;
 166   int remaining;
 167
 168   /* Set program name for messages.  */
 169   error_print_progname = error_print;
 170
 171   /* Set locale via LC_ALL.  */
 172   setlocale (LC_ALL, "");
 173
 174   /* Set the text message domain.  */
 175   textdomain (PACKAGE);
 176
 177   /* Initialize local variables.  */
 178   result = NULL;
 179
 180   /* Parse and process arguments.  */
 181   argp_parse (&argp, argc, argv, 0, &remaining, NULL);
 182
 183   /* Determine output file.  */
 184   if (output_name == NULL)
 185     output_name = remaining < argc ? argv[remaining++] : "-";
 186
 187   /* Process all input files.  */
 188   setlocale (LC_CTYPE, "C");
 189   if (remaining < argc)
 190     do
 191       result = read_input_file (result, argv[remaining]);
 192     while (++remaining < argc);
 193   else
 194     result = read_input_file (NULL, "-");
 195
 196   /* Write out the result.  */
 197   if (result != NULL)
 198     write_out (result, output_name, header_name);
 199
 200   return EXIT_SUCCESS;
 201 }
 202
 203
 204 /* Handle program arguments.  */
 205 static error_t
 206 parse_opt (int key, char *arg, struct argp_state *state)
 207 {
 208   switch (key)
 209     {
 210     case 'H':
 211       header_name = arg;
 212       break;
 213     case OPT_NEW:
 214       force_new = 1;
 215       break;
 216     case 'o':
 217       output_name = arg;
 218       break;
 219     default:
 220       return ARGP_ERR_UNKNOWN;
 221     }
 222   return 0;
 223 }
 224
 225
 226 static char *
 227 more_help (int key, const char *text, void *input)
 228 {
 229   switch (key)
 230     {
 231     case ARGP_KEY_HELP_EXTRA:
 232       /* We print some extra information.  */
 233       return strdup (gettext ("\
 234 Report bugs using the `glibcbug' script to <bugs@gnu.org>.\n"));
 235     default:
 236       break;
 237     }
 238   return (char *) text;
 239 }
 240
 241 /* Print the version information.  */
 242 static void
 243 print_version (FILE *stream, struct argp_state *state)
 244 {
 245   fprintf (stream, "gencat (GNU %s) %s\n", PACKAGE, VERSION);
 246   fprintf (stream, gettext ("\
 247 Copyright (C) %s Free Software Foundation, Inc.\n\
 248 This is free software; see the source for copying conditions.  There is NO\n\
 249 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
 250 "), "2002");
 251   fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
 252 }
 253
 254
 255 /* The address of this function will be assigned to the hook in the
 256    error functions.  */
 257 static void
 258 error_print ()
 259 {
 260   /* We don't want the program name to be printed in messages.  Emacs'
 261      compile.el does not like this.  */
 262 }
 263
 264
 265 static struct catalog *
 266 read_input_file (struct catalog *current, const char *fname)
 267 {
 268   FILE *fp;
 269   char *buf;
 270   size_t len;
 271   size_t line_number;
 272   wchar_t *wbuf;
 273   size_t wbufsize;
 274   iconv_t cd_towc = (iconv_t) -1;
 275   iconv_t cd_tomb = (iconv_t) -1;
 276   wchar_t escape_char = L'\\';
 277   char *codeset = NULL;
 278
 279   if (strcmp (fname, "-") == 0 || strcmp (fname, "/dev/stdin") == 0)
 280     {
 281       fp = stdin;
 282       fname = gettext ("*standard input*");
 283     }
 284   else
 285     fp = fopen (fname, "r");
 286   if (fp == NULL)
 287     {
 288       error (0, errno, gettext ("cannot open input file `%s'"), fname);
 289       return current;
 290     }
 291
 292   /* If we haven't seen anything yet, allocate result structure.  */
 293   if (current == NULL)
 294     {
 295       current = (struct catalog *) xcalloc (1, sizeof (*current));
 296
 297 #define obstack_chunk_alloc malloc
 298 #define obstack_chunk_free free
 299       obstack_init (&current->mem_pool);
 300
 301       current->current_set = find_set (current, NL_SETD);
 302     }
 303
 304   buf = NULL;
 305   len = 0;
 306   line_number = 0;
 307
 308   wbufsize = 1024;
 309   wbuf = (wchar_t *) xmalloc (wbufsize);
 310
 311   while (!feof (fp))
 312     {
 313       int continued;
 314       int used;
 315       size_t start_line = line_number + 1;
 316       char *this_line;
 317
 318       do
 319         {
 320           int act_len;
 321
 322           act_len = getline (&buf, &len, fp);
 323           if (act_len <= 0)
 324             break;
 325           ++line_number;
 326
 327           /* It the line continued?  */
 328           continued = 0;
 329           if (buf[act_len - 1] == '\n')
 330             {
 331               --act_len;
 332
 333               /* There might be more than one backslash at the end of
 334                  the line.  Only if there is an odd number of them is
 335                  the line continued.  */
 336               if (buf[act_len - 1] == '\\')
 337                 {
 338                   int temp_act_len = act_len;
 339
 340                   do
 341                     {
 342                       --temp_act_len;
 343                       continued = !continued;
 344                     }
 345                   while (temp_act_len > 0 && buf[temp_act_len - 1] == '\\');
 346                 }
 347
 348               if (continued)
 349                 --act_len;
 350             }
 351
 352           /* Append to currently selected line.  */
 353           obstack_grow (&current->mem_pool, buf, act_len);
 354         }
 355       while (continued);
 356
 357       obstack_1grow (&current->mem_pool, '\0');
 358       this_line = (char *) obstack_finish (&current->mem_pool);
 359
 360       used = 0;
 361       if (this_line[0] == '$')
 362         {
 363           if (isblank (this_line[1]))
 364             {
 365               int cnt = 1;
 366               while (isblank (this_line[cnt]))
 367                 ++cnt;
 368               if (strncmp (&this_line[cnt], "codeset=", 8) != 0)
 369                 /* This is a comment line. Do nothing.  */;
 370               else if (codeset != NULL)
 371                 /* Ignore multiple codeset. */;
 372               else
 373                 {
 374                   int start = cnt + 8;
 375                   cnt = start;
 376                   while (this_line[cnt] != '\0' && !isspace (this_line[cnt]))
 377                     ++cnt;
 378                   if (cnt != start)
 379                     {
 380                       int len = cnt - start;
 381                       codeset = xmalloc (len + 1);
 382                       *((char *) mempcpy (codeset, &this_line[start], len))
 383                         = '\0';
 384                     }
 385                 }
 386             }
 387           else if (strncmp (&this_line[1], "set", 3) == 0)
 388             {
 389               int cnt = sizeof ("set");
 390               int set_number;
 391               const char *symbol = NULL;
 392               while (isspace (this_line[cnt]))
 393                 ++cnt;
 394
 395               if (isdigit (this_line[cnt]))
 396                 {
 397                   set_number = atol (&this_line[cnt]);
 398
 399                   /* If the given number for the character set is
 400                      higher than any we used for symbolic set names
 401                      avoid clashing by using only higher numbers for
 402                      the following symbolic definitions.  */
 403                   if (set_number > current->last_set)
 404                     current->last_set = set_number;
 405                 }
 406               else
 407                 {
 408                   /* See whether it is a reasonable identifier.  */
 409                   int start = cnt;
 410                   while (isalnum (this_line[cnt]) || this_line[cnt] == '_')
 411                     ++cnt;
 412
 413                   if (cnt == start)
 414                     {
 415                       /* No correct character found.  */
 416                       error_at_line (0, 0, fname, start_line,
 417                                      gettext ("illegal set number"));
 418                       set_number = 0;
 419                     }
 420                   else
 421                     {
 422                       /* We have found seomthing that looks like a
 423                          correct identifier.  */
 424                       struct set_list *runp;
 425
 426                       this_line[cnt] = '\0';
 427                       used = 1;
 428                       symbol = &this_line[start];
 429
 430                       /* Test whether the identifier was already used.  */
 431                       runp = current->all_sets;
 432                       while (runp != 0)
 433                         if (runp->symbol != NULL
 434                             && strcmp (runp->symbol, symbol) == 0)
 435                           break;
 436                         else
 437                           runp = runp->next;
 438
 439                       if (runp != NULL)
 440                         {
 441                           /* We cannot allow duplicate identifiers for
 442                              message sets.  */
 443                           error_at_line (0, 0, fname, start_line,
 444                                          gettext ("duplicate set definition"));
 445                           error_at_line (0, 0, runp->fname, runp->line,
 446                                          gettext ("\
 447 this is the first definition"));
 448                           set_number = 0;
 449                         }
 450                       else
 451                         /* Allocate next free message set for identifier.  */
 452                         set_number = ++current->last_set;
 453                     }
 454                 }
 455
 456               if (set_number != 0)
 457                 {
 458                   /* We found a legal set number.  */
 459                   current->current_set = find_set (current, set_number);
 460                   if (symbol != NULL)
 461                       used = 1;
 462                   current->current_set->symbol = symbol;
 463                   current->current_set->fname = fname;
 464                   current->current_set->line = start_line;
 465                 }
 466             }
 467           else if (strncmp (&this_line[1], "delset", 6) == 0)
 468             {
 469               int cnt = sizeof ("delset");
 470               size_t set_number;
 471               while (isspace (this_line[cnt]))
 472                 ++cnt;
 473
 474               if (isdigit (this_line[cnt]))
 475                 {
 476                   size_t set_number = atol (&this_line[cnt]);
 477                   struct set_list *set;
 478
 479                   /* Mark the message set with the given number as
 480                      deleted.  */
 481                   set = find_set (current, set_number);
 482                   set->deleted = 1;
 483                 }
 484               else
 485                 {
 486                   /* See whether it is a reasonable identifier.  */
 487                   int start = cnt;
 488                   while (isalnum (this_line[cnt]) || this_line[cnt] == '_')
 489                     ++cnt;
 490
 491                   if (cnt == start)
 492                     {
 493                       error_at_line (0, 0, fname, start_line,
 494                                      gettext ("illegal set number"));
 495                       set_number = 0;
 496                     }
 497                   else
 498                     {
 499                       const char *symbol;
 500                       struct set_list *runp;
 501
 502                       this_line[cnt] = '\0';
 503                       used = 1;
 504                       symbol = &this_line[start];
 505
 506                       /* We have a symbolic set name.  This name must
 507                          appear somewhere else in the catalogs read so
 508                          far.  */
 509                       set_number = 0;
 510                       for (runp = current->all_sets; runp != NULL;
 511                            runp = runp->next)
 512                         {
 513                           if (strcmp (runp->symbol, symbol) == 0)
 514                             {
 515                               runp->deleted = 1;
 516                               break;
 517                             }
 518                         }
 519                       if (runp == NULL)
 520                         /* Name does not exist before.  */
 521                         error_at_line (0, 0, fname, start_line,
 522                                        gettext ("unknown set `%s'"), symbol);
 523                     }
 524                 }
 525             }
 526           else if (strncmp (&this_line[1], "quote", 5) == 0)
 527             {
 528               char buf[2];
 529               char *bufptr;
 530               size_t buflen;
 531               char *wbufptr;
 532               size_t wbuflen;
 533               int cnt;
 534
 535               cnt = sizeof ("quote");
 536               while (isspace (this_line[cnt]))
 537                 ++cnt;
 538
 539               /* We need the conversion.  */
 540               if (cd_towc == (iconv_t) -1
 541                   && open_conversion (codeset, &cd_towc, &cd_tomb,
 542                                       &escape_char) != 0)
 543                 /* Something is wrong.  */
 544                 goto out;
 545
 546               /* Yes, the quote char can be '\0'; this means no quote
 547                  char.  The function using the information works on
 548                  wide characters so we have to convert it here.  */
 549               buf[0] = this_line[cnt];
 550               buf[1] = '\0';
 551               bufptr = buf;
 552               buflen = 2;
 553
 554               wbufptr = (char *) wbuf;
 555               wbuflen = wbufsize;
 556
 557               /* Flush the state.  */
 558               iconv (cd_towc, NULL, NULL, NULL, NULL);
 559
 560               iconv (cd_towc, &bufptr, &buflen, &wbufptr, &wbuflen);
 561               if (buflen != 0 || (wchar_t *) wbufptr != &wbuf[2])
 562                 error_at_line (0, 0, fname, start_line,
 563                                gettext ("invalid quote character"));
 564               else
 565                 /* Use the converted wide character.  */
 566                 current->quote_char = wbuf[0];
 567             }
 568           else
 569             {
 570               int cnt;
 571               cnt = 2;
 572               while (this_line[cnt] != '\0' && !isspace (this_line[cnt]))
 573                 ++cnt;
 574               this_line[cnt] = '\0';
 575               error_at_line (0, 0, fname, start_line,
 576                              gettext ("unknown directive `%s': line ignored"),
 577                              &this_line[1]);
 578             }
 579         }
 580       else if (isalnum (this_line[0]) || this_line[0] == '_')
 581         {
 582           const char *ident = this_line;
 583           char *line = this_line;
 584           int message_number;
 585
 586           do
 587             ++line;
 588           while (line[0] != '\0' && !isspace (line[0]));
 589           if (line[0] != '\0')
 590             *line++ = '\0';     /* Terminate the identifier.  */
 591
 592           /* Now we found the beginning of the message itself.  */
 593
 594           if (isdigit (ident[0]))
 595             {
 596               struct message_list *runp;
 597               struct message_list *lastp;
 598
 599               message_number = atoi (ident);
 600
 601               /* Find location to insert the new message.  */
 602               runp = current->current_set->messages;
 603               lastp = NULL;
 604               while (runp != NULL)
 605                 if (runp->number == message_number)
 606                   break;
 607                 else
 608                   {
 609                     lastp = runp;
 610                     runp = runp->next;
 611                   }
 612               if (runp != NULL)
 613                 {
 614                   /* Oh, oh.  There is already a message with this
 615                      number in the message set.  */
 616                   error_at_line (0, 0, fname, start_line,
 617                                  gettext ("duplicated message number"));
 618                   error_at_line (0, 0, runp->fname, runp->line,
 619                                  gettext ("this is the first definition"));
 620                   message_number = 0;
 621                 }
 622               ident = NULL;     /* We don't have a symbol.  */
 623
 624               if (message_number != 0
 625                   && message_number > current->current_set->last_message)
 626                 current->current_set->last_message = message_number;
 627             }
 628           else if (ident[0] != '\0')
 629             {
 630               struct message_list *runp;
 631               struct message_list *lastp;
 632
 633               /* Test whether the symbolic name was not used for
 634                  another message in this message set.  */
 635               runp = current->current_set->messages;
 636               lastp = NULL;
 637               while (runp != NULL)
 638                 if (runp->symbol != NULL && strcmp (ident, runp->symbol) == 0)
 639                   break;
 640                 else
 641                   runp = runp->next;
 642               if (runp != NULL)
 643                 {
 644                   /* The name is already used.  */
 645                   error_at_line (0, 0, fname, start_line, gettext ("\
 646 duplicated message identifier"));
 647                   error_at_line (0, 0, runp->fname, runp->line,
 648                                  gettext ("this is the first definition"));
 649                   message_number = 0;
 650                 }
 651               else
 652                 /* Give the message the next unused number.  */
 653                 message_number = ++current->current_set->last_message;
 654             }
 655           else
 656             message_number = 0;
 657
 658           if (message_number != 0)
 659             {
 660               char *inbuf;
 661               size_t inlen;
 662               char *outbuf;
 663               size_t outlen;
 664               struct message_list *newp;
 665               size_t line_len = strlen (line) + 1;
 666               size_t ident_len = 0;
 667
 668               /* We need the conversion.  */
 669               if (cd_towc == (iconv_t) -1
 670                   && open_conversion (codeset, &cd_towc, &cd_tomb,
 671                                       &escape_char) != 0)
 672                 /* Something is wrong.  */
 673                 goto out;
 674
 675               /* Convert to a wide character string.  We have to
 676                  interpret escape sequences which will be impossible
 677                  without doing the conversion if the codeset of the
 678                  message is stateful.  */
 679               while (1)
 680                 {
 681                   inbuf = line;
 682                   inlen = line_len;
 683                   outbuf = (char *) wbuf;
 684                   outlen = wbufsize;
 685
 686                   /* Flush the state.  */
 687                   iconv (cd_towc, NULL, NULL, NULL, NULL);
 688
 689                   iconv (cd_towc, &inbuf, &inlen, &outbuf, &outlen);
 690                   if (inlen == 0)
 691                     {
 692                       /* The string is converted.  */
 693                       assert (outlen < wbufsize);
 694                       assert (wbuf[(wbufsize - outlen) / sizeof (wchar_t) - 1]
 695                               == L'\0');
 696                       break;
 697                     }
 698
 699                   if (outlen != 0)
 700                     {
 701                       /* Something is wrong with this string, we ignore it.  */
 702                       error_at_line (0, 0, fname, start_line, gettext ("\
 703 invalid character: message ignored"));
 704                       goto ignore;
 705                     }
 706
 707                   /* The output buffer is too small.  */
 708                   wbufsize *= 2;
 709                   wbuf = (wchar_t *) xrealloc (wbuf, wbufsize);
 710                 }
 711
 712               /* Strip quote characters, change escape sequences into
 713                  correct characters etc.  */
 714               normalize_line (fname, start_line, cd_towc, wbuf,
 715                               current->quote_char, escape_char);
 716
 717               if (ident)
 718                 ident_len = line - this_line;
 719
 720               /* Now the string is free of escape sequences.  Convert it
 721                  back into a multibyte character string.  First free the
 722                  memory allocated for the original string.  */
 723               obstack_free (&current->mem_pool, this_line);
 724
 725               used = 1; /* Yes, we use the line.  */
 726
 727               /* Now fill in the new string.  It should never happen that
 728                  the replaced string is longer than the original.  */
 729               inbuf = (char *) wbuf;
 730               inlen = (wcslen (wbuf) + 1) * sizeof (wchar_t);
 731
 732               outlen = obstack_room (&current->mem_pool);
 733               obstack_blank (&current->mem_pool, outlen);
 734               this_line = (char *) obstack_base (&current->mem_pool);
 735               outbuf = this_line + ident_len;
 736               outlen -= ident_len;
 737
 738               /* Flush the state.  */
 739               iconv (cd_tomb, NULL, NULL, NULL, NULL);
 740
 741               iconv (cd_tomb, &inbuf, &inlen, &outbuf, &outlen);
 742               if (inlen != 0)
 743                 {
 744                   error_at_line (0, 0, fname, start_line,
 745                                  gettext ("invalid line"));
 746                   goto ignore;
 747                 }
 748               assert (outbuf[-1] == '\0');
 749
 750               /* Free the memory in the obstack we don't use.  */
 751               obstack_blank (&current->mem_pool, -(int) outlen);
 752               line = obstack_finish (&current->mem_pool);
 753
 754               newp = (struct message_list *) xmalloc (sizeof (*newp));
 755               newp->number = message_number;
 756               newp->message = line + ident_len;
 757               /* Remember symbolic name; is NULL if no is given.  */
 758               newp->symbol = ident ? line : NULL;
 759               /* Remember where we found the character.  */
 760               newp->fname = fname;
 761               newp->line = start_line;
 762
 763               /* Find place to insert to message.  We keep them in a
 764                  sorted single linked list.  */
 765               if (current->current_set->messages == NULL
 766                   || current->current_set->messages->number > message_number)
 767                 {
 768                   newp->next = current->current_set->messages;
 769                   current->current_set->messages = newp;
 770                 }
 771               else
 772                 {
 773                   struct message_list *runp;
 774                   runp = current->current_set->messages;
 775                   while (runp->next != NULL)
 776                     if (runp->next->number > message_number)
 777                       break;
 778                     else
 779                       runp = runp->next;
 780                   newp->next = runp->next;
 781                   runp->next = newp;
 782                 }
 783             }
 784           ++current->total_messages;
 785         }
 786       else
 787         {
 788           size_t cnt;
 789
 790           cnt = 0;
 791           /* See whether we have any non-white space character in this
 792              line.  */
 793           while (this_line[cnt] != '\0' && isspace (this_line[cnt]))
 794             ++cnt;
 795
 796           if (this_line[cnt] != '\0')
 797             /* Yes, some unknown characters found.  */
 798             error_at_line (0, 0, fname, start_line,
 799                            gettext ("malformed line ignored"));
 800         }
 801
 802     ignore:
 803       /* We can save the memory for the line if it was not used.  */
 804       if (!used)
 805         obstack_free (&current->mem_pool, this_line);
 806     }
 807
 808   /* Close the conversion modules.  */
 809   iconv_close (cd_towc);
 810   iconv_close (cd_tomb);
 811   free (codeset);
 812
 813  out:
 814   free (wbuf);
 815
 816   if (fp != stdin)
 817     fclose (fp);
 818   return current;
 819 }
 820
 821
 822 static void
 823 write_out (struct catalog *catalog, const char *output_name,
 824            const char *header_name)
 825 {
 826   /* Computing the "optimal" size.  */
 827   struct set_list *set_run;
 828   size_t best_total, best_size, best_depth;
 829   size_t act_size, act_depth;
 830   struct catalog_obj obj;
 831   struct obstack string_pool;
 832   const char *strings;
 833   size_t strings_size;
 834   uint32_t *array1, *array2;
 835   size_t cnt;
 836   int fd;
 837
 838   /* If not otherwise told try to read file with existing
 839      translations.  */
 840   if (!force_new)
 841     read_old (catalog, output_name);
 842
 843   /* Initialize best_size with a very high value.  */
 844   best_total = best_size = best_depth = UINT_MAX;
 845
 846   /* We need some start size for testing.  Let's start with
 847      TOTAL_MESSAGES / 5, which theoretically provides a mean depth of
 848      5.  */
 849   act_size = 1 + catalog->total_messages / 5;
 850
 851   /* We determine the size of a hash table here.  Because the message
 852      numbers can be chosen arbitrary by the programmer we cannot use
 853      the simple method of accessing the array using the message
 854      number.  The algorithm is based on the trivial hash function
 855      NUMBER % TABLE_SIZE, where collisions are stored in a second
 856      dimension up to TABLE_DEPTH.  We here compute TABLE_SIZE so that
 857      the needed space (= TABLE_SIZE * TABLE_DEPTH) is minimal.  */
 858   while (act_size <= best_total)
 859     {
 860       size_t deep[act_size];
 861
 862       act_depth = 1;
 863       memset (deep, '\0', act_size * sizeof (size_t));
 864       set_run = catalog->all_sets;
 865       while (set_run != NULL)
 866         {
 867           struct message_list *message_run;
 868
 869           message_run = set_run->messages;
 870           while (message_run != NULL)
 871             {
 872               size_t idx = (message_run->number * set_run->number) % act_size;
 873
 874               ++deep[idx];
 875               if (deep[idx] > act_depth)
 876                 {
 877                   act_depth = deep[idx];
 878                   if (act_depth * act_size > best_total)
 879                     break;
 880                 }
 881               message_run = message_run->next;
 882             }
 883           set_run = set_run->next;
 884         }
 885
 886       if (act_depth * act_size <= best_total)
 887         {
 888           /* We have found a better solution.  */
 889           best_total = act_depth * act_size;
 890           best_size = act_size;
 891           best_depth = act_depth;
 892         }
 893
 894       ++act_size;
 895     }
 896
 897   /* let's be prepared for an empty message file.  */
 898   if (best_size == UINT_MAX)
 899     {
 900       best_size = 1;
 901       best_depth = 1;
 902     }
 903
 904   /* OK, now we have the size we will use.  Fill in the header, build
 905      the table and the second one with swapped byte order.  */
 906   obj.magic = CATGETS_MAGIC;
 907   obj.plane_size = best_size;
 908   obj.plane_depth = best_depth;
 909
 910   /* Allocate room for all needed arrays.  */
 911   array1 =
 912     (uint32_t *) alloca (best_size * best_depth * sizeof (uint32_t) * 3);
 913   memset (array1, '\0', best_size * best_depth * sizeof (uint32_t) * 3);
 914   array2
 915     = (uint32_t *) alloca (best_size * best_depth * sizeof (uint32_t) * 3);
 916   obstack_init (&string_pool);
 917
 918   set_run = catalog->all_sets;
 919   while (set_run != NULL)
 920     {
 921       struct message_list *message_run;
 922
 923       message_run = set_run->messages;
 924       while (message_run != NULL)
 925         {
 926           size_t idx = (((message_run->number * set_run->number) % best_size)
 927                         * 3);
 928           /* Determine collision depth.  */
 929           while (array1[idx] != 0)
 930             idx += best_size * 3;
 931
 932           /* Store set number, message number and pointer into string
 933              space, relative to the first string.  */
 934           array1[idx + 0] = set_run->number;
 935           array1[idx + 1] = message_run->number;
 936           array1[idx + 2] = obstack_object_size (&string_pool);
 937
 938           /* Add current string to the continuous space containing all
 939              strings.  */
 940           obstack_grow0 (&string_pool, message_run->message,
 941                          strlen (message_run->message));
 942
 943           message_run = message_run->next;
 944         }
 945
 946       set_run = set_run->next;
 947     }
 948   strings_size = obstack_object_size (&string_pool);
 949   strings = obstack_finish (&string_pool);
 950
 951   /* Compute ARRAY2 by changing the byte order.  */
 952   for (cnt = 0; cnt < best_size * best_depth * 3; ++cnt)
 953     array2[cnt] = SWAPU32 (array1[cnt]);
 954
 955   /* Now we can write out the whole data.  */
 956   if (strcmp (output_name, "-") == 0
 957       || strcmp (output_name, "/dev/stdout") == 0)
 958     fd = STDOUT_FILENO;
 959   else
 960     {
 961       fd = creat (output_name, 0666);
 962       if (fd < 0)
 963         error (EXIT_FAILURE, errno, gettext ("cannot open output file `%s'"),
 964                output_name);
 965     }
 966
 967   /* Write out header.  */
 968   write (fd, &obj, sizeof (obj));
 969
 970   /* We always write out the little endian version of the index
 971      arrays.  */
 972 #if __BYTE_ORDER == __LITTLE_ENDIAN
 973   write (fd, array1, best_size * best_depth * sizeof (uint32_t) * 3);
 974   write (fd, array2, best_size * best_depth * sizeof (uint32_t) * 3);
 975 #elif __BYTE_ORDER == __BIG_ENDIAN
 976   write (fd, array2, best_size * best_depth * sizeof (uint32_t) * 3);
 977   write (fd, array1, best_size * best_depth * sizeof (uint32_t) * 3);
 978 #else
 979 # error Cannot handle __BYTE_ORDER byte order
 980 #endif
 981
 982   /* Finally write the strings.  */
 983   write (fd, strings, strings_size);
 984
 985   if (fd != STDOUT_FILENO)
 986     close (fd);
 987
 988   /* If requested now write out the header file.  */
 989   if (header_name != NULL)
 990     {
 991       int first = 1;
 992       FILE *fp;
 993
 994       /* Open output file.  "-" or "/dev/stdout" means write to
 995          standard output.  */
 996       if (strcmp (header_name, "-") == 0
 997           || strcmp (header_name, "/dev/stdout") == 0)
 998         fp = stdout;
 999       else
1000         {
1001           fp = fopen (header_name, "w");
1002           if (fp == NULL)
1003             error (EXIT_FAILURE, errno,
1004                    gettext ("cannot open output file `%s'"), header_name);
1005         }
1006
1007       /* Iterate over all sets and all messages.  */
1008       set_run = catalog->all_sets;
1009       while (set_run != NULL)
1010         {
1011           struct message_list *message_run;
1012
1013           /* If the current message set has a symbolic name write this
1014              out first.  */
1015           if (set_run->symbol != NULL)
1016             fprintf (fp, "%s#define %sSet %#x\t/* %s:%Zu */\n",
1017                      first ? "" : "\n", set_run->symbol, set_run->number - 1,
1018                      set_run->fname, set_run->line);
1019           first = 0;
1020
1021           message_run = set_run->messages;
1022           while (message_run != NULL)
1023             {
1024               /* If the current message has a symbolic name write
1025                  #define out.  But we have to take care for the set
1026                  not having a symbolic name.  */
1027               if (message_run->symbol != NULL)
1028                 {
1029                   if (set_run->symbol == NULL)
1030                     fprintf (fp, "#define AutomaticSet%d%s %#x\t/* %s:%Zu */\n",
1031                              set_run->number, message_run->symbol,
1032                              message_run->number, message_run->fname,
1033                              message_run->line);
1034                   else
1035                     fprintf (fp, "#define %s%s %#x\t/* %s:%Zu */\n",
1036                              set_run->symbol, message_run->symbol,
1037                              message_run->number, message_run->fname,
1038                              message_run->line);
1039                 }
1040
1041               message_run = message_run->next;
1042             }
1043
1044           set_run = set_run->next;
1045         }
1046
1047       if (fp != stdout)
1048         fclose (fp);
1049     }
1050 }
1051
1052
1053 static struct set_list *
1054 find_set (struct catalog *current, int number)
1055 {
1056   struct set_list *result = current->all_sets;
1057
1058   /* We must avoid set number 0 because a set of this number signals
1059      in the tables that the entry is not occupied.  */
1060   ++number;
1061
1062   while (result != NULL)
1063     if (result->number == number)
1064       return result;
1065     else
1066       result = result->next;
1067
1068   /* Prepare new message set.  */
1069   result = (struct set_list *) xcalloc (1, sizeof (*result));
1070   result->number = number;
1071   result->next = current->all_sets;
1072   current->all_sets = result;
1073
1074   return result;
1075 }
1076
1077
/* Rewrite STRING *in*place* with all escape sequences resolved and the
   surrounding quote characters removed.

   FNAME and LINE are only used to locate diagnostics.  CD is the
   conversion descriptor used to turn the byte given by an octal
   escape into a wide character.  QUOTE_CHAR is the active quote
   character or L'\0' if quoting is disabled.  ESCAPE_CHAR introduces
   an escape sequence.  Problems are reported through error_at_line
   without terminating the program.  */
static void
normalize_line (const char *fname, size_t line, iconv_t cd, wchar_t *string,
                wchar_t quote_char, wchar_t escape_char)
{
  const int quoting_enabled = quote_char != L'\0';
  wchar_t *src = string;
  wchar_t *dst = string;
  int opened_by_quote = 0;

  /* A leading quote character is not part of the message.  */
  if (quoting_enabled && *src == quote_char)
    {
      opened_by_quote = 1;
      ++src;
    }

  /* The first quote character which is not escaped ends the string.  */
  while (*src != L'\0' && *src != quote_char)
    {
      wchar_t replacement;

      if (*src != escape_char)
        {
          /* Ordinary character.  */
          *dst++ = *src++;
          continue;
        }

      /* Step over the escape character itself.  */
      ++src;

      if (quoting_enabled && *src == quote_char)
        {
          /* An escaped quote character.  This is an extension to XPG.  */
          *dst++ = *src++;
          continue;
        }

      /* Single letter escape sequences.  */
      switch (*src)
        {
        case L'n':
          replacement = L'\n';
          break;
        case L't':
          replacement = L'\t';
          break;
        case L'v':
          replacement = L'\v';
          break;
        case L'b':
          replacement = L'\b';
          break;
        case L'r':
          replacement = L'\r';
          break;
        case L'f':
          replacement = L'\f';
          break;
        default:
          replacement = L'\0';
          break;
        }

      if (replacement != L'\0')
        {
          *dst++ = replacement;
          ++src;
        }
      else if (*src >= L'0' && *src <= L'7')
        {
          /* Octal escape: the value names one byte of the input
             charset which has to be converted to a wide character.  */
          char mb[2];
          wchar_t wc[2];
          char *in = mb;
          char *out = (char *) wc;
          size_t in_left = sizeof (mb);
          size_t out_left = sizeof (wc);
          int value = *src++ - L'0';

          /* Take further digits only while the result fits a byte.  */
          while (value <= (255 / 8) && *src >= L'0' && *src <= L'7')
            value = value * 8 + (*src++ - L'0');

          mb[0] = (char) value;
          mb[1] = '\0';

          /* Flush the state.  */
          iconv (cd, NULL, NULL, NULL, NULL);

          /* Both the byte and its terminator must have been consumed
             and produced exactly two wide characters.  */
          iconv (cd, &in, &in_left, &out, &out_left);
          if (in == mb + 2 && out == (char *) (wc + 2))
            *dst++ = wc[0];
          else
            error_at_line (0, 0, fname, line,
                           gettext ("invalid escape sequence"));
        }
      else if (*src == escape_char)
        {
          /* A doubled escape character stands for itself.  */
          *dst++ = escape_char;
          ++src;
        }
      /* Otherwise simply ignore the escape character; what follows it
         is handled like ordinary text in the next round.  */
    }

  /* If we saw a quote character at the beginning we expect another
     one at the end.  */
  if (opened_by_quote && *src != quote_char)
    error_at_line (0, 0, fname, line, gettext ("unterminated message"));

  /* Terminate string.  */
  *dst = L'\0';
}
1194
1195
1196 static void
1197 read_old (struct catalog *catalog, const char *file_name)
1198 {
1199   struct catalog_info old_cat_obj;
1200   struct set_list *set = NULL;
1201   int last_set = -1;
1202   size_t cnt;
1203
1204   /* Try to open catalog, but don't look through the NLSPATH.  */
1205   if (__open_catalog (file_name, NULL, NULL, &old_cat_obj) != 0)
1206     {
1207       if (errno == ENOENT)
1208         /* No problem, the catalog simply does not exist.  */
1209         return;
1210       else
1211         error (EXIT_FAILURE, errno,
1212                gettext ("while opening old catalog file"));
1213     }
1214
1215   /* OK, we have the catalog loaded.  Now read all messages and merge
1216      them.  When set and message number clash for any message the new
1217      one is used.  If the new one is empty it indicates that the
1218      message should be deleted.  */
1219   for (cnt = 0; cnt < old_cat_obj.plane_size * old_cat_obj.plane_depth; ++cnt)
1220     {
1221       struct message_list *message, *last;
1222
1223       if (old_cat_obj.name_ptr[cnt * 3 + 0] == 0)
1224         /* No message in this slot.  */
1225         continue;
1226
1227       if (old_cat_obj.name_ptr[cnt * 3 + 0] - 1 != (uint32_t) last_set)
1228         {
1229           last_set = old_cat_obj.name_ptr[cnt * 3 + 0] - 1;
1230           set = find_set (catalog, old_cat_obj.name_ptr[cnt * 3 + 0] - 1);
1231         }
1232
1233       last = NULL;
1234       message = set->messages;
1235       while (message != NULL)
1236         {
1237           if ((uint32_t) message->number >= old_cat_obj.name_ptr[cnt * 3 + 1])
1238             break;
1239           last = message;
1240           message = message->next;
1241         }
1242
1243       if (message == NULL
1244           || (uint32_t) message->number > old_cat_obj.name_ptr[cnt * 3 + 1])
1245         {
1246           /* We have found a message which is not yet in the catalog.
1247              Insert it at the right position.  */
1248           struct message_list *newp;
1249
1250           newp = (struct message_list *) xmalloc (sizeof(*newp));
1251           newp->number = old_cat_obj.name_ptr[cnt * 3 + 1];
1252           newp->message =
1253             &old_cat_obj.strings[old_cat_obj.name_ptr[cnt * 3 + 2]];
1254           newp->fname = NULL;
1255           newp->line = 0;
1256           newp->symbol = NULL;
1257           newp->next = message;
1258
1259           if (last == NULL)
1260             set->messages = newp;
1261           else
1262             last->next = newp;
1263
1264           ++catalog->total_messages;
1265         }
1266       else if (*message->message == '\0')
1267         {
1268           /* The new empty message has overridden the old one thus
1269              "deleting" it as required.  Now remove the empty remains. */
1270           if (last == NULL)
1271             set->messages = message->next;
1272           else
1273             last->next = message->next;
1274         }
1275     }
1276 }
1277
1278
/* Open the conversion descriptors between CODESET and wide characters
   and determine the escape character.

   CODESET names the charset of the input file; if it is NULL the
   codeset of the locale selected by the environment is used.  Note
   that in this case the locale of the process is "C" afterwards.
   *CD_TOWCP receives the descriptor converting from CODESET to
   WCHAR_T, *CD_TOMBP the one for the opposite direction, and
   *ESCAPE_CHARP the wide character the byte 0x5c is converted to.

   Returns 0 on success.  If a conversion cannot be opened a message is
   printed, no descriptor is left open and 1 is returned.  */
static int
open_conversion (const char *codeset, iconv_t *cd_towcp, iconv_t *cd_tombp,
                 wchar_t *escape_charp)
{
  char buf[2];
  char *bufptr;
  size_t bufsize;
  wchar_t wbuf[2];
  char *wbufptr;
  size_t wbufsize;

  /* If the input file does not specify the codeset use the locale's.  */
  if (codeset == NULL)
    {
      const char *locale_codeset;
      size_t len;
      char *copy;

      setlocale (LC_ALL, "");
      locale_codeset = nl_langinfo (CODESET);

      /* The string returned by nl_langinfo may be invalidated by the
         next setlocale call, so copy the (short) name before the
         locale is switched back.  */
      len = strlen (locale_codeset) + 1;
      copy = (char *) alloca (len);
      memcpy (copy, locale_codeset, len);
      codeset = copy;

      setlocale (LC_ALL, "C");
    }

  /* Get the conversion modules.  */
  *cd_towcp = iconv_open ("WCHAR_T", codeset);
  *cd_tombp = iconv_open (codeset, "WCHAR_T");
  if (*cd_towcp == (iconv_t) -1 || *cd_tombp == (iconv_t) -1)
    {
      error (0, 0, gettext ("conversion modules not available"));

      /* Either direction may be the one which was opened; do not leak
         it.  */
      if (*cd_towcp != (iconv_t) -1)
        iconv_close (*cd_towcp);
      if (*cd_tombp != (iconv_t) -1)
        iconv_close (*cd_tombp);

      return 1;
    }

  /* One special case for historical reasons is the backslash
     character.  In some codesets the byte value 0x5c is not mapped to
     U005c in Unicode.  These charsets then don't have a backslash
     character at all.  Therefore we have to live with whatever the
     codeset provides and recognize, instead of the U005c, the character
     the byte value 0x5c is mapped to.  */
  buf[0] = '\\';
  buf[1] = '\0';
  bufptr = buf;
  bufsize = 2;

  wbufptr = (char *) wbuf;
  wbufsize = sizeof (wbuf);

  /* Both input bytes must be consumed and fill WBUF exactly.  */
  iconv (*cd_towcp, &bufptr, &bufsize, &wbufptr, &wbufsize);
  if (bufsize != 0 || wbufsize != 0)
    {
      /* Something went wrong, we couldn't convert the byte 0x5c.  Go
         on with using U005c.  */
      error (0, 0, gettext ("cannot determine escape character"));
      *escape_charp = L'\\';
    }
  else
    *escape_charp = wbuf[0];

  return 0;
}