catgets/gencat.c

   1 /* Copyright (C) 1996-2008, 2009, 2010, 2011 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 1996.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published
   7    by the Free Software Foundation; version 2 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software Foundation,
  17    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 # include "config.h"
  21 #endif
  22
  23 #include <argp.h>
  24 #include <assert.h>
  25 #include <ctype.h>
  26 #include <endian.h>
  27 #include <errno.h>
  28 #include <error.h>
  29 #include <fcntl.h>
  30 #include <iconv.h>
  31 #include <langinfo.h>
  32 #include <locale.h>
  33 #include <libintl.h>
  34 #include <limits.h>
  35 #include <nl_types.h>
  36 #include <obstack.h>
  37 #include <stdint.h>
  38 #include <stdio.h>
  39 #include <stdlib.h>
  40 #include <string.h>
  41 #include <unistd.h>
  42 #include <wchar.h>
  43
  44 #include "version.h"
  45
  46 #include "catgetsinfo.h"
  47
  48
  49 #define SWAPU32(w) \
  50   (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
  51
  52 struct message_list
  53 {
  54   int number;
  55   const char *message;
  56
  57   const char *fname;
  58   size_t line;
  59   const char *symbol;
  60
  61   struct message_list *next;
  62 };
  63
  64
  65 struct set_list
  66 {
  67   int number;
  68   int deleted;
  69   struct message_list *messages;
  70   int last_message;
  71
  72   const char *fname;
  73   size_t line;
  74   const char *symbol;
  75
  76   struct set_list *next;
  77 };
  78
  79
  80 struct catalog
  81 {
  82   struct set_list *all_sets;
  83   struct set_list *current_set;
  84   size_t total_messages;
  85   wint_t quote_char;
  86   int last_set;
  87
  88   struct obstack mem_pool;
  89 };
  90
  91
  92 /* If non-zero force creation of new file, not using existing one.  */
  93 static int force_new;
  94
  95 /* Name of output file.  */
  96 static const char *output_name;
  97
  98 /* Name of generated C header file.  */
  99 static const char *header_name;
 100
 101 /* Name and version of program.  */
 102 static void print_version (FILE *stream, struct argp_state *state);
 103 void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;
 104
 105 #define OPT_NEW 1
 106
 107 /* Definitions of arguments for argp functions.  */
 108 static const struct argp_option options[] =
 109 {
 110   { "header", 'H', N_("NAME"), 0,
 111     N_("Create C header file NAME containing symbol definitions") },
 112   { "new", OPT_NEW, NULL, 0,
 113     N_("Do not use existing catalog, force new output file") },
 114   { "output", 'o', N_("NAME"), 0, N_("Write output to file NAME") },
 115   { NULL, 0, NULL, 0, NULL }
 116 };
 117
 118 /* Short description of program.  */
 119 static const char doc[] = N_("Generate message catalog.\
 120 \vIf INPUT-FILE is -, input is read from standard input.  If OUTPUT-FILE\n\
 121 is -, output is written to standard output.\n");
 122
 123 /* Strings for arguments in help texts.  */
 124 static const char args_doc[] = N_("\
 125 -o OUTPUT-FILE [INPUT-FILE]...\n[OUTPUT-FILE [INPUT-FILE]...]");
 126
 127 /* Prototype for option handler.  */
 128 static error_t parse_opt (int key, char *arg, struct argp_state *state);
 129
 130 /* Function to print some extra text in the help message.  */
 131 static char *more_help (int key, const char *text, void *input);
 132
 133 /* Data structure to communicate with argp functions.  */
 134 static struct argp argp =
 135 {
 136   options, parse_opt, args_doc, doc, NULL, more_help
 137 };
 138
 139
 140 /* Wrapper functions with error checking for standard functions.  */
 141 extern void *xmalloc (size_t n);
 142 extern void *xcalloc (size_t n, size_t s);
 143 extern void *xrealloc (void *o, size_t n);
 144 extern char *xstrdup (const char *);
 145
 146 /* Prototypes for local functions.  */
 147 static void error_print (void);
 148 static struct catalog *read_input_file (struct catalog *current,
 149                                         const char *fname);
 150 static void write_out (struct catalog *result, const char *output_name,
 151                        const char *header_name);
 152 static struct set_list *find_set (struct catalog *current, int number);
 153 static void normalize_line (const char *fname, size_t line, iconv_t cd,
 154                             wchar_t *string, wchar_t quote_char,
 155                             wchar_t escape_char);
 156 static void read_old (struct catalog *catalog, const char *file_name);
 157 static int open_conversion (const char *codesetp, iconv_t *cd_towcp,
 158                             iconv_t *cd_tombp, wchar_t *escape_charp);
 159
 160
 161 int
 162 main (int argc, char *argv[])
 163 {
 164   struct catalog *result;
 165   int remaining;
 166
 167   /* Set program name for messages.  */
 168   error_print_progname = error_print;
 169
 170   /* Set locale via LC_ALL.  */
 171   setlocale (LC_ALL, "");
 172
 173   /* Set the text message domain.  */
 174   textdomain (PACKAGE);
 175
 176   /* Initialize local variables.  */
 177   result = NULL;
 178
 179   /* Parse and process arguments.  */
 180   argp_parse (&argp, argc, argv, 0, &remaining, NULL);
 181
 182   /* Determine output file.  */
 183   if (output_name == NULL)
 184     output_name = remaining < argc ? argv[remaining++] : "-";
 185
 186   /* Process all input files.  */
 187   setlocale (LC_CTYPE, "C");
 188   if (remaining < argc)
 189     do
 190       result = read_input_file (result, argv[remaining]);
 191     while (++remaining < argc);
 192   else
 193     result = read_input_file (NULL, "-");
 194
 195   /* Write out the result.  */
 196   if (result != NULL)
 197     write_out (result, output_name, header_name);
 198
 199   return error_message_count != 0;
 200 }
 201
 202
 203 /* Handle program arguments.  */
 204 static error_t
 205 parse_opt (int key, char *arg, struct argp_state *state)
 206 {
 207   switch (key)
 208     {
 209     case 'H':
 210       header_name = arg;
 211       break;
 212     case OPT_NEW:
 213       force_new = 1;
 214       break;
 215     case 'o':
 216       output_name = arg;
 217       break;
 218     default:
 219       return ARGP_ERR_UNKNOWN;
 220     }
 221   return 0;
 222 }
 223
 224
 225 static char *
 226 more_help (int key, const char *text, void *input)
 227 {
 228   switch (key)
 229     {
 230     case ARGP_KEY_HELP_EXTRA:
 231       /* We print some extra information.  */
 232       return strdup (gettext ("\
 233 For bug reporting instructions, please see:\n\
 234 <http://www.gnu.org/software/libc/bugs.html>.\n"));
 235     default:
 236       break;
 237     }
 238   return (char *) text;
 239 }
 240
 241 /* Print the version information.  */
 242 static void
 243 print_version (FILE *stream, struct argp_state *state)
 244 {
 245   fprintf (stream, "gencat (GNU %s) %s\n", PACKAGE, VERSION);
 246   fprintf (stream, gettext ("\
 247 Copyright (C) %s Free Software Foundation, Inc.\n\
 248 This is free software; see the source for copying conditions.  There is NO\n\
 249 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
 250 "), "2011");
 251   fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
 252 }
 253
 254
 255 /* The address of this function will be assigned to the hook in the
 256    error functions.  */
 257 static void
 258 error_print ()
 259 {
 260   /* We don't want the program name to be printed in messages.  Emacs'
 261      compile.el does not like this.  */
 262 }
 263
 264
 265 static struct catalog *
 266 read_input_file (struct catalog *current, const char *fname)
 267 {
 268   FILE *fp;
 269   char *buf;
 270   size_t len;
 271   size_t line_number;
 272   wchar_t *wbuf;
 273   size_t wbufsize;
 274   iconv_t cd_towc = (iconv_t) -1;
 275   iconv_t cd_tomb = (iconv_t) -1;
 276   wchar_t escape_char = L'\\';
 277   char *codeset = NULL;
 278
 279   if (strcmp (fname, "-") == 0 || strcmp (fname, "/dev/stdin") == 0)
 280     {
 281       fp = stdin;
 282       fname = gettext ("*standard input*");
 283     }
 284   else
 285     fp = fopen (fname, "r");
 286   if (fp == NULL)
 287     {
 288       error (0, errno, gettext ("cannot open input file `%s'"), fname);
 289       return current;
 290     }
 291
 292   /* If we haven't seen anything yet, allocate result structure.  */
 293   if (current == NULL)
 294     {
 295       current = (struct catalog *) xcalloc (1, sizeof (*current));
 296
 297 #define obstack_chunk_alloc malloc
 298 #define obstack_chunk_free free
 299       obstack_init (&current->mem_pool);
 300
 301       current->current_set = find_set (current, NL_SETD);
 302     }
 303
 304   buf = NULL;
 305   len = 0;
 306   line_number = 0;
 307
 308   wbufsize = 1024;
 309   wbuf = (wchar_t *) xmalloc (wbufsize);
 310
 311   while (!feof (fp))
 312     {
 313       int continued;
 314       int used;
 315       size_t start_line = line_number + 1;
 316       char *this_line;
 317
 318       do
 319         {
 320           int act_len;
 321
 322           act_len = getline (&buf, &len, fp);
 323           if (act_len <= 0)
 324             break;
 325           ++line_number;
 326
 327           /* It the line continued?  */
 328           continued = 0;
 329           if (buf[act_len - 1] == '\n')
 330             {
 331               --act_len;
 332
 333               /* There might be more than one backslash at the end of
 334                  the line.  Only if there is an odd number of them is
 335                  the line continued.  */
 336               if (act_len > 0 && buf[act_len - 1] == '\\')
 337                 {
 338                   int temp_act_len = act_len;
 339
 340                   do
 341                     {
 342                       --temp_act_len;
 343                       continued = !continued;
 344                     }
 345                   while (temp_act_len > 0 && buf[temp_act_len - 1] == '\\');
 346
 347                   if (continued)
 348                     --act_len;
 349                 }
 350             }
 351
 352           /* Append to currently selected line.  */
 353           obstack_grow (&current->mem_pool, buf, act_len);
 354         }
 355       while (continued);
 356
 357       obstack_1grow (&current->mem_pool, '\0');
 358       this_line = (char *) obstack_finish (&current->mem_pool);
 359
 360       used = 0;
 361       if (this_line[0] == '$')
 362         {
 363           if (isblank (this_line[1]))
 364             {
 365               int cnt = 1;
 366               while (isblank (this_line[cnt]))
 367                 ++cnt;
 368               if (strncmp (&this_line[cnt], "codeset=", 8) != 0)
 369                 /* This is a comment line. Do nothing.  */;
 370               else if (codeset != NULL)
 371                 /* Ignore multiple codeset. */;
 372               else
 373                 {
 374                   int start = cnt + 8;
 375                   cnt = start;
 376                   while (this_line[cnt] != '\0' && !isspace (this_line[cnt]))
 377                     ++cnt;
 378                   if (cnt != start)
 379                     {
 380                       int len = cnt - start;
 381                       codeset = xmalloc (len + 1);
 382                       *((char *) mempcpy (codeset, &this_line[start], len))
 383                         = '\0';
 384                     }
 385                 }
 386             }
 387           else if (strncmp (&this_line[1], "set", 3) == 0)
 388             {
 389               int cnt = sizeof ("set");
 390               int set_number;
 391               const char *symbol = NULL;
 392               while (isspace (this_line[cnt]))
 393                 ++cnt;
 394
 395               if (isdigit (this_line[cnt]))
 396                 {
 397                   set_number = atol (&this_line[cnt]);
 398
 399                   /* If the given number for the character set is
 400                      higher than any we used for symbolic set names
 401                      avoid clashing by using only higher numbers for
 402                      the following symbolic definitions.  */
 403                   if (set_number > current->last_set)
 404                     current->last_set = set_number;
 405                 }
 406               else
 407                 {
 408                   /* See whether it is a reasonable identifier.  */
 409                   int start = cnt;
 410                   while (isalnum (this_line[cnt]) || this_line[cnt] == '_')
 411                     ++cnt;
 412
 413                   if (cnt == start)
 414                     {
 415                       /* No correct character found.  */
 416                       error_at_line (0, 0, fname, start_line,
 417                                      gettext ("illegal set number"));
 418                       set_number = 0;
 419                     }
 420                   else
 421                     {
 422                       /* We have found seomthing that looks like a
 423                          correct identifier.  */
 424                       struct set_list *runp;
 425
 426                       this_line[cnt] = '\0';
 427                       used = 1;
 428                       symbol = &this_line[start];
 429
 430                       /* Test whether the identifier was already used.  */
 431                       runp = current->all_sets;
 432                       while (runp != 0)
 433                         if (runp->symbol != NULL
 434                             && strcmp (runp->symbol, symbol) == 0)
 435                           break;
 436                         else
 437                           runp = runp->next;
 438
 439                       if (runp != NULL)
 440                         {
 441                           /* We cannot allow duplicate identifiers for
 442                              message sets.  */
 443                           error_at_line (0, 0, fname, start_line,
 444                                          gettext ("duplicate set definition"));
 445                           error_at_line (0, 0, runp->fname, runp->line,
 446                                          gettext ("\
 447 this is the first definition"));
 448                           set_number = 0;
 449                         }
 450                       else
 451                         /* Allocate next free message set for identifier.  */
 452                         set_number = ++current->last_set;
 453                     }
 454                 }
 455
 456               if (set_number != 0)
 457                 {
 458                   /* We found a legal set number.  */
 459                   current->current_set = find_set (current, set_number);
 460                   if (symbol != NULL)
 461                       used = 1;
 462                   current->current_set->symbol = symbol;
 463                   current->current_set->fname = fname;
 464                   current->current_set->line = start_line;
 465                 }
 466             }
 467           else if (strncmp (&this_line[1], "delset", 6) == 0)
 468             {
 469               int cnt = sizeof ("delset");
 470               size_t set_number;
 471               while (isspace (this_line[cnt]))
 472                 ++cnt;
 473
 474               if (isdigit (this_line[cnt]))
 475                 {
 476                   size_t set_number = atol (&this_line[cnt]);
 477                   struct set_list *set;
 478
 479                   /* Mark the message set with the given number as
 480                      deleted.  */
 481                   set = find_set (current, set_number);
 482                   set->deleted = 1;
 483                 }
 484               else
 485                 {
 486                   /* See whether it is a reasonable identifier.  */
 487                   int start = cnt;
 488                   while (isalnum (this_line[cnt]) || this_line[cnt] == '_')
 489                     ++cnt;
 490
 491                   if (cnt == start)
 492                     {
 493                       error_at_line (0, 0, fname, start_line,
 494                                      gettext ("illegal set number"));
 495                       set_number = 0;
 496                     }
 497                   else
 498                     {
 499                       const char *symbol;
 500                       struct set_list *runp;
 501
 502                       this_line[cnt] = '\0';
 503                       used = 1;
 504                       symbol = &this_line[start];
 505
 506                       /* We have a symbolic set name.  This name must
 507                          appear somewhere else in the catalogs read so
 508                          far.  */
 509                       set_number = 0;
 510                       for (runp = current->all_sets; runp != NULL;
 511                            runp = runp->next)
 512                         {
 513                           if (strcmp (runp->symbol, symbol) == 0)
 514                             {
 515                               runp->deleted = 1;
 516                               break;
 517                             }
 518                         }
 519                       if (runp == NULL)
 520                         /* Name does not exist before.  */
 521                         error_at_line (0, 0, fname, start_line,
 522                                        gettext ("unknown set `%s'"), symbol);
 523                     }
 524                 }
 525             }
 526           else if (strncmp (&this_line[1], "quote", 5) == 0)
 527             {
 528               char buf[2];
 529               char *bufptr;
 530               size_t buflen;
 531               char *wbufptr;
 532               size_t wbuflen;
 533               int cnt;
 534
 535               cnt = sizeof ("quote");
 536               while (isspace (this_line[cnt]))
 537                 ++cnt;
 538
 539               /* We need the conversion.  */
 540               if (cd_towc == (iconv_t) -1
 541                   && open_conversion (codeset, &cd_towc, &cd_tomb,
 542                                       &escape_char) != 0)
 543                 /* Something is wrong.  */
 544                 goto out;
 545
 546               /* Yes, the quote char can be '\0'; this means no quote
 547                  char.  The function using the information works on
 548                  wide characters so we have to convert it here.  */
 549               buf[0] = this_line[cnt];
 550               buf[1] = '\0';
 551               bufptr = buf;
 552               buflen = 2;
 553
 554               wbufptr = (char *) wbuf;
 555               wbuflen = wbufsize;
 556
 557               /* Flush the state.  */
 558               iconv (cd_towc, NULL, NULL, NULL, NULL);
 559
 560               iconv (cd_towc, &bufptr, &buflen, &wbufptr, &wbuflen);
 561               if (buflen != 0 || (wchar_t *) wbufptr != &wbuf[2])
 562                 error_at_line (0, 0, fname, start_line,
 563                                gettext ("invalid quote character"));
 564               else
 565                 /* Use the converted wide character.  */
 566                 current->quote_char = wbuf[0];
 567             }
 568           else
 569             {
 570               int cnt;
 571               cnt = 2;
 572               while (this_line[cnt] != '\0' && !isspace (this_line[cnt]))
 573                 ++cnt;
 574               this_line[cnt] = '\0';
 575               error_at_line (0, 0, fname, start_line,
 576                              gettext ("unknown directive `%s': line ignored"),
 577                              &this_line[1]);
 578             }
 579         }
 580       else if (isalnum (this_line[0]) || this_line[0] == '_')
 581         {
 582           const char *ident = this_line;
 583           char *line = this_line;
 584           int message_number;
 585
 586           do
 587             ++line;
 588           while (line[0] != '\0' && !isspace (line[0]));
 589           if (line[0] != '\0')
 590             *line++ = '\0';     /* Terminate the identifier.  */
 591
 592           /* Now we found the beginning of the message itself.  */
 593
 594           if (isdigit (ident[0]))
 595             {
 596               struct message_list *runp;
 597               struct message_list *lastp;
 598
 599               message_number = atoi (ident);
 600
 601               /* Find location to insert the new message.  */
 602               runp = current->current_set->messages;
 603               lastp = NULL;
 604               while (runp != NULL)
 605                 if (runp->number == message_number)
 606                   break;
 607                 else
 608                   {
 609                     lastp = runp;
 610                     runp = runp->next;
 611                   }
 612               if (runp != NULL)
 613                 {
 614                   /* Oh, oh.  There is already a message with this
 615                      number in the message set.  */
 616                   if (runp->symbol == NULL)
 617                     {
 618                       /* The existing message had its number specified
 619                          by the user.  Fatal collision type uh, oh.  */
 620                       error_at_line (0, 0, fname, start_line,
 621                                      gettext ("duplicated message number"));
 622                       error_at_line (0, 0, runp->fname, runp->line,
 623                                      gettext ("this is the first definition"));
 624                       message_number = 0;
 625                     }
 626                   else
 627                     {
 628                       /* Collision was with number auto-assigned to a
 629                          symbolic.  Change existing symbolic number
 630                          and move to end the list (if not already there).  */
 631                       runp->number = ++current->current_set->last_message;
 632
 633                       if (runp->next != NULL)
 634                         {
 635                           struct message_list *endp;
 636
 637                           if (lastp == NULL)
 638                             current->current_set->messages=runp->next;
 639                           else
 640                             lastp->next=runp->next;
 641
 642                           endp = runp->next;
 643                           while (endp->next != NULL)
 644                             endp = endp->next;
 645
 646                           endp->next = runp;
 647                           runp->next = NULL;
 648                         }
 649                     }
 650                 }
 651               ident = NULL;     /* We don't have a symbol.  */
 652
 653               if (message_number != 0
 654                   && message_number > current->current_set->last_message)
 655                 current->current_set->last_message = message_number;
 656             }
 657           else if (ident[0] != '\0')
 658             {
 659               struct message_list *runp;
 660               struct message_list *lastp;
 661
 662               /* Test whether the symbolic name was not used for
 663                  another message in this message set.  */
 664               runp = current->current_set->messages;
 665               lastp = NULL;
 666               while (runp != NULL)
 667                 if (runp->symbol != NULL && strcmp (ident, runp->symbol) == 0)
 668                   break;
 669                 else
 670                   runp = runp->next;
 671               if (runp != NULL)
 672                 {
 673                   /* The name is already used.  */
 674                   error_at_line (0, 0, fname, start_line, gettext ("\
 675 duplicated message identifier"));
 676                   error_at_line (0, 0, runp->fname, runp->line,
 677                                  gettext ("this is the first definition"));
 678                   message_number = 0;
 679                 }
 680               else
 681                 /* Give the message the next unused number.  */
 682                 message_number = ++current->current_set->last_message;
 683             }
 684           else
 685             message_number = 0;
 686
 687           if (message_number != 0)
 688             {
 689               char *inbuf;
 690               size_t inlen;
 691               char *outbuf;
 692               size_t outlen;
 693               struct message_list *newp;
 694               size_t line_len = strlen (line) + 1;
 695               size_t ident_len = 0;
 696
 697               /* We need the conversion.  */
 698               if (cd_towc == (iconv_t) -1
 699                   && open_conversion (codeset, &cd_towc, &cd_tomb,
 700                                       &escape_char) != 0)
 701                 /* Something is wrong.  */
 702                 goto out;
 703
 704               /* Convert to a wide character string.  We have to
 705                  interpret escape sequences which will be impossible
 706                  without doing the conversion if the codeset of the
 707                  message is stateful.  */
 708               while (1)
 709                 {
 710                   inbuf = line;
 711                   inlen = line_len;
 712                   outbuf = (char *) wbuf;
 713                   outlen = wbufsize;
 714
 715                   /* Flush the state.  */
 716                   iconv (cd_towc, NULL, NULL, NULL, NULL);
 717
 718                   iconv (cd_towc, &inbuf, &inlen, &outbuf, &outlen);
 719                   if (inlen == 0)
 720                     {
 721                       /* The string is converted.  */
 722                       assert (outlen < wbufsize);
 723                       assert (wbuf[(wbufsize - outlen) / sizeof (wchar_t) - 1]
 724                               == L'\0');
 725                       break;
 726                     }
 727
 728                   if (outlen != 0)
 729                     {
 730                       /* Something is wrong with this string, we ignore it.  */
 731                       error_at_line (0, 0, fname, start_line, gettext ("\
 732 invalid character: message ignored"));
 733                       goto ignore;
 734                     }
 735
 736                   /* The output buffer is too small.  */
 737                   wbufsize *= 2;
 738                   wbuf = (wchar_t *) xrealloc (wbuf, wbufsize);
 739                 }
 740
 741               /* Strip quote characters, change escape sequences into
 742                  correct characters etc.  */
 743               normalize_line (fname, start_line, cd_towc, wbuf,
 744                               current->quote_char, escape_char);
 745
 746               if (ident)
 747                 ident_len = line - this_line;
 748
 749               /* Now the string is free of escape sequences.  Convert it
 750                  back into a multibyte character string.  First free the
 751                  memory allocated for the original string.  */
 752               obstack_free (&current->mem_pool, this_line);
 753
 754               used = 1; /* Yes, we use the line.  */
 755
 756               /* Now fill in the new string.  It should never happen that
 757                  the replaced string is longer than the original.  */
 758               inbuf = (char *) wbuf;
 759               inlen = (wcslen (wbuf) + 1) * sizeof (wchar_t);
 760
 761               outlen = obstack_room (&current->mem_pool);
 762               obstack_blank (&current->mem_pool, outlen);
 763               this_line = (char *) obstack_base (&current->mem_pool);
 764               outbuf = this_line + ident_len;
 765               outlen -= ident_len;
 766
 767               /* Flush the state.  */
 768               iconv (cd_tomb, NULL, NULL, NULL, NULL);
 769
 770               iconv (cd_tomb, &inbuf, &inlen, &outbuf, &outlen);
 771               if (inlen != 0)
 772                 {
 773                   error_at_line (0, 0, fname, start_line,
 774                                  gettext ("invalid line"));
 775                   goto ignore;
 776                 }
 777               assert (outbuf[-1] == '\0');
 778
 779               /* Free the memory in the obstack we don't use.  */
 780               obstack_blank (&current->mem_pool, -(int) outlen);
 781               line = obstack_finish (&current->mem_pool);
 782
 783               newp = (struct message_list *) xmalloc (sizeof (*newp));
 784               newp->number = message_number;
 785               newp->message = line + ident_len;
 786               /* Remember symbolic name; is NULL if no is given.  */
 787               newp->symbol = ident ? line : NULL;
 788               /* Remember where we found the character.  */
 789               newp->fname = fname;
 790               newp->line = start_line;
 791
 792               /* Find place to insert to message.  We keep them in a
 793                  sorted single linked list.  */
 794               if (current->current_set->messages == NULL
 795                   || current->current_set->messages->number > message_number)
 796                 {
 797                   newp->next = current->current_set->messages;
 798                   current->current_set->messages = newp;
 799                 }
 800               else
 801                 {
 802                   struct message_list *runp;
 803                   runp = current->current_set->messages;
 804                   while (runp->next != NULL)
 805                     if (runp->next->number > message_number)
 806                       break;
 807                     else
 808                       runp = runp->next;
 809                   newp->next = runp->next;
 810                   runp->next = newp;
 811                 }
 812             }
 813           ++current->total_messages;
 814         }
 815       else
 816         {
 817           size_t cnt;
 818
 819           cnt = 0;
 820           /* See whether we have any non-white space character in this
 821              line.  */
 822           while (this_line[cnt] != '\0' && isspace (this_line[cnt]))
 823             ++cnt;
 824
 825           if (this_line[cnt] != '\0')
 826             /* Yes, some unknown characters found.  */
 827             error_at_line (0, 0, fname, start_line,
 828                            gettext ("malformed line ignored"));
 829         }
 830
 831     ignore:
 832       /* We can save the memory for the line if it was not used.  */
 833       if (!used)
 834         obstack_free (&current->mem_pool, this_line);
 835     }
 836
 837   /* Close the conversion modules.  */
 838   iconv_close (cd_towc);
 839   iconv_close (cd_tomb);
 840   free (codeset);
 841
 842  out:
 843   free (wbuf);
 844
 845   if (fp != stdin)
 846     fclose (fp);
 847   return current;
 848 }
 849
 850
 851 static void
 852 write_out (struct catalog *catalog, const char *output_name,
 853            const char *header_name)
 854 {
 855   /* Computing the "optimal" size.  */
 856   struct set_list *set_run;
 857   size_t best_total, best_size, best_depth;
 858   size_t act_size, act_depth;
 859   struct catalog_obj obj;
 860   struct obstack string_pool;
 861   const char *strings;
 862   size_t strings_size;
 863   uint32_t *array1, *array2;
 864   size_t cnt;
 865   int fd;
 866
 867   /* If not otherwise told try to read file with existing
 868      translations.  */
 869   if (!force_new)
 870     read_old (catalog, output_name);
 871
 872   /* Initialize best_size with a very high value.  */
 873   best_total = best_size = best_depth = UINT_MAX;
 874
 875   /* We need some start size for testing.  Let's start with
 876      TOTAL_MESSAGES / 5, which theoretically provides a mean depth of
 877      5.  */
 878   act_size = 1 + catalog->total_messages / 5;
 879
 880   /* We determine the size of a hash table here.  Because the message
 881      numbers can be chosen arbitrary by the programmer we cannot use
 882      the simple method of accessing the array using the message
 883      number.  The algorithm is based on the trivial hash function
 884      NUMBER % TABLE_SIZE, where collisions are stored in a second
 885      dimension up to TABLE_DEPTH.  We here compute TABLE_SIZE so that
 886      the needed space (= TABLE_SIZE * TABLE_DEPTH) is minimal.  */
 887   while (act_size <= best_total)
 888     {
 889       size_t deep[act_size];
 890
 891       act_depth = 1;
 892       memset (deep, '\0', act_size * sizeof (size_t));
 893       set_run = catalog->all_sets;
 894       while (set_run != NULL)
 895         {
 896           struct message_list *message_run;
 897
 898           message_run = set_run->messages;
 899           while (message_run != NULL)
 900             {
 901               size_t idx = (message_run->number * set_run->number) % act_size;
 902
 903               ++deep[idx];
 904               if (deep[idx] > act_depth)
 905                 {
 906                   act_depth = deep[idx];
 907                   if (act_depth * act_size > best_total)
 908                     break;
 909                 }
 910               message_run = message_run->next;
 911             }
 912           set_run = set_run->next;
 913         }
 914
 915       if (act_depth * act_size <= best_total)
 916         {
 917           /* We have found a better solution.  */
 918           best_total = act_depth * act_size;
 919           best_size = act_size;
 920           best_depth = act_depth;
 921         }
 922
 923       ++act_size;
 924     }
 925
 926   /* let's be prepared for an empty message file.  */
 927   if (best_size == UINT_MAX)
 928     {
 929       best_size = 1;
 930       best_depth = 1;
 931     }
 932
 933   /* OK, now we have the size we will use.  Fill in the header, build
 934      the table and the second one with swapped byte order.  */
 935   obj.magic = CATGETS_MAGIC;
 936   obj.plane_size = best_size;
 937   obj.plane_depth = best_depth;
 938
 939   /* Allocate room for all needed arrays.  */
 940   array1 =
 941     (uint32_t *) alloca (best_size * best_depth * sizeof (uint32_t) * 3);
 942   memset (array1, '\0', best_size * best_depth * sizeof (uint32_t) * 3);
 943   array2
 944     = (uint32_t *) alloca (best_size * best_depth * sizeof (uint32_t) * 3);
 945   obstack_init (&string_pool);
 946
 947   set_run = catalog->all_sets;
 948   while (set_run != NULL)
 949     {
 950       struct message_list *message_run;
 951
 952       message_run = set_run->messages;
 953       while (message_run != NULL)
 954         {
 955           size_t idx = (((message_run->number * set_run->number) % best_size)
 956                         * 3);
 957           /* Determine collision depth.  */
 958           while (array1[idx] != 0)
 959             idx += best_size * 3;
 960
 961           /* Store set number, message number and pointer into string
 962              space, relative to the first string.  */
 963           array1[idx + 0] = set_run->number;
 964           array1[idx + 1] = message_run->number;
 965           array1[idx + 2] = obstack_object_size (&string_pool);
 966
 967           /* Add current string to the continuous space containing all
 968              strings.  */
 969           obstack_grow0 (&string_pool, message_run->message,
 970                          strlen (message_run->message));
 971
 972           message_run = message_run->next;
 973         }
 974
 975       set_run = set_run->next;
 976     }
 977   strings_size = obstack_object_size (&string_pool);
 978   strings = obstack_finish (&string_pool);
 979
 980   /* Compute ARRAY2 by changing the byte order.  */
 981   for (cnt = 0; cnt < best_size * best_depth * 3; ++cnt)
 982     array2[cnt] = SWAPU32 (array1[cnt]);
 983
 984   /* Now we can write out the whole data.  */
 985   if (strcmp (output_name, "-") == 0
 986       || strcmp (output_name, "/dev/stdout") == 0)
 987     fd = STDOUT_FILENO;
 988   else
 989     {
 990       fd = creat (output_name, 0666);
 991       if (fd < 0)
 992         error (EXIT_FAILURE, errno, gettext ("cannot open output file `%s'"),
 993                output_name);
 994     }
 995
 996   /* Write out header.  */
 997   write (fd, &obj, sizeof (obj));
 998
 999   /* We always write out the little endian version of the index
1000      arrays.  */
1001 #if __BYTE_ORDER == __LITTLE_ENDIAN
1002   write (fd, array1, best_size * best_depth * sizeof (uint32_t) * 3);
1003   write (fd, array2, best_size * best_depth * sizeof (uint32_t) * 3);
1004 #elif __BYTE_ORDER == __BIG_ENDIAN
1005   write (fd, array2, best_size * best_depth * sizeof (uint32_t) * 3);
1006   write (fd, array1, best_size * best_depth * sizeof (uint32_t) * 3);
1007 #else
1008 # error Cannot handle __BYTE_ORDER byte order
1009 #endif
1010
1011   /* Finally write the strings.  */
1012   write (fd, strings, strings_size);
1013
1014   if (fd != STDOUT_FILENO)
1015     close (fd);
1016
1017   /* If requested now write out the header file.  */
1018   if (header_name != NULL)
1019     {
1020       int first = 1;
1021       FILE *fp;
1022
1023       /* Open output file.  "-" or "/dev/stdout" means write to
1024          standard output.  */
1025       if (strcmp (header_name, "-") == 0
1026           || strcmp (header_name, "/dev/stdout") == 0)
1027         fp = stdout;
1028       else
1029         {
1030           fp = fopen (header_name, "w");
1031           if (fp == NULL)
1032             error (EXIT_FAILURE, errno,
1033                    gettext ("cannot open output file `%s'"), header_name);
1034         }
1035
1036       /* Iterate over all sets and all messages.  */
1037       set_run = catalog->all_sets;
1038       while (set_run != NULL)
1039         {
1040           struct message_list *message_run;
1041
1042           /* If the current message set has a symbolic name write this
1043              out first.  */
1044           if (set_run->symbol != NULL)
1045             fprintf (fp, "%s#define %sSet %#x\t/* %s:%Zu */\n",
1046                      first ? "" : "\n", set_run->symbol, set_run->number - 1,
1047                      set_run->fname, set_run->line);
1048           first = 0;
1049
1050           message_run = set_run->messages;
1051           while (message_run != NULL)
1052             {
1053               /* If the current message has a symbolic name write
1054                  #define out.  But we have to take care for the set
1055                  not having a symbolic name.  */
1056               if (message_run->symbol != NULL)
1057                 {
1058                   if (set_run->symbol == NULL)
1059                     fprintf (fp, "#define AutomaticSet%d%s %#x\t/* %s:%Zu */\n",
1060                              set_run->number, message_run->symbol,
1061                              message_run->number, message_run->fname,
1062                              message_run->line);
1063                   else
1064                     fprintf (fp, "#define %s%s %#x\t/* %s:%Zu */\n",
1065                              set_run->symbol, message_run->symbol,
1066                              message_run->number, message_run->fname,
1067                              message_run->line);
1068                 }
1069
1070               message_run = message_run->next;
1071             }
1072
1073           set_run = set_run->next;
1074         }
1075
1076       if (fp != stdout)
1077         fclose (fp);
1078     }
1079 }
1080
1081
1082 static struct set_list *
1083 find_set (struct catalog *current, int number)
1084 {
1085   struct set_list *result = current->all_sets;
1086
1087   /* We must avoid set number 0 because a set of this number signals
1088      in the tables that the entry is not occupied.  */
1089   ++number;
1090
1091   while (result != NULL)
1092     if (result->number == number)
1093       return result;
1094     else
1095       result = result->next;
1096
1097   /* Prepare new message set.  */
1098   result = (struct set_list *) xcalloc (1, sizeof (*result));
1099   result->number = number;
1100   result->next = current->all_sets;
1101   current->all_sets = result;
1102
1103   return result;
1104 }
1105
1106
/* Rewrite STRING in place: strip an enclosing pair of QUOTE_CHAR
   characters and replace the escape sequences introduced by
   ESCAPE_CHAR.  A QUOTE_CHAR of L'\0' disables quote handling.

   Recognized after ESCAPE_CHAR are QUOTE_CHAR, the letters n, t, v, b,
   r and f, ESCAPE_CHAR itself and octal numbers.  The byte given by an
   octal number is converted to a wide character using CD.  Before any
   other character the escape character is simply dropped.

   FNAME and LINE are only used to locate the diagnostics printed for
   invalid octal sequences and for a missing closing quote.  */
static void
normalize_line (const char *fname, size_t line, iconv_t cd, wchar_t *string,
                wchar_t quote_char, wchar_t escape_char)
{
  const int have_quote = quote_char != L'\0';
  wchar_t *src = string;
  wchar_t *dst = string;
  int is_quoted = 0;

  if (have_quote && *src == quote_char)
    {
      is_quoted = 1;
      ++src;
    }

  /* The first not-escaped quote character ends the message, whether
     or not the message started with one.  */
  while (*src != L'\0' && *src != quote_char)
    {
      wchar_t mapped;
      int known;

      if (*src != escape_char)
        {
          /* Ordinary character.  */
          *dst++ = *src++;
          continue;
        }

      /* Step over the escape character and look at what follows.  */
      ++src;

      if (have_quote && *src == quote_char)
        {
          /* An escaped quote character; this is an extension to XPG.  */
          *dst++ = *src++;
          continue;
        }

      if (*src >= L'0' && *src <= L'7')
        {
          /* Octal number.  Collect digits as long as the value still
             fits into one byte.  */
          int value = *src++ - L'0';
          char mbs[2];
          wchar_t wcs[2];
          char *in;
          char *out;
          size_t in_left;
          size_t out_left;

          while (value <= (255 / 8) && *src >= L'0' && *src <= L'7')
            value = value * 8 + (*src++ - L'0');

          /* Convert the byte together with the terminating NUL.  */
          mbs[0] = (char) value;
          mbs[1] = '\0';
          in = mbs;
          in_left = 2;
          out = (char *) wcs;
          out_left = sizeof (wcs);

          /* Flush the state.  */
          iconv (cd, NULL, NULL, NULL, NULL);

          iconv (cd, &in, &in_left, &out, &out_left);
          if (in == &mbs[2] && (wchar_t *) out == &wcs[2])
            *dst++ = wcs[0];
          else
            error_at_line (0, 0, fname, line,
                           gettext ("invalid escape sequence"));
          continue;
        }

      /* Single character escape sequences.  */
      known = 1;
      switch (*src)
        {
        case L'n':
          mapped = L'\n';
          break;
        case L't':
          mapped = L'\t';
          break;
        case L'v':
          mapped = L'\v';
          break;
        case L'b':
          mapped = L'\b';
          break;
        case L'r':
          mapped = L'\r';
          break;
        case L'f':
          mapped = L'\f';
          break;
        default:
          /* Either a doubled escape character or something unknown.
             In the latter case only the escape character is dropped
             and the following character is handled by the next
             iteration.  */
          mapped = escape_char;
          known = *src == escape_char;
          break;
        }

      if (known)
        {
          *dst++ = mapped;
          ++src;
        }
    }

  /* A message starting with a quote character has to end with one.  */
  if (is_quoted && *src != quote_char)
    error_at_line (0, 0, fname, line, gettext ("unterminated message"));

  /* Terminate string.  */
  *dst = L'\0';
}
1223
1224
1225 static void
1226 read_old (struct catalog *catalog, const char *file_name)
1227 {
1228   struct catalog_info old_cat_obj;
1229   struct set_list *set = NULL;
1230   int last_set = -1;
1231   size_t cnt;
1232
1233   /* Try to open catalog, but don't look through the NLSPATH.  */
1234   if (__open_catalog (file_name, NULL, NULL, &old_cat_obj) != 0)
1235     {
1236       if (errno == ENOENT)
1237         /* No problem, the catalog simply does not exist.  */
1238         return;
1239       else
1240         error (EXIT_FAILURE, errno,
1241                gettext ("while opening old catalog file"));
1242     }
1243
1244   /* OK, we have the catalog loaded.  Now read all messages and merge
1245      them.  When set and message number clash for any message the new
1246      one is used.  If the new one is empty it indicates that the
1247      message should be deleted.  */
1248   for (cnt = 0; cnt < old_cat_obj.plane_size * old_cat_obj.plane_depth; ++cnt)
1249     {
1250       struct message_list *message, *last;
1251
1252       if (old_cat_obj.name_ptr[cnt * 3 + 0] == 0)
1253         /* No message in this slot.  */
1254         continue;
1255
1256       if (old_cat_obj.name_ptr[cnt * 3 + 0] - 1 != (uint32_t) last_set)
1257         {
1258           last_set = old_cat_obj.name_ptr[cnt * 3 + 0] - 1;
1259           set = find_set (catalog, old_cat_obj.name_ptr[cnt * 3 + 0] - 1);
1260         }
1261
1262       last = NULL;
1263       message = set->messages;
1264       while (message != NULL)
1265         {
1266           if ((uint32_t) message->number >= old_cat_obj.name_ptr[cnt * 3 + 1])
1267             break;
1268           last = message;
1269           message = message->next;
1270         }
1271
1272       if (message == NULL
1273           || (uint32_t) message->number > old_cat_obj.name_ptr[cnt * 3 + 1])
1274         {
1275           /* We have found a message which is not yet in the catalog.
1276              Insert it at the right position.  */
1277           struct message_list *newp;
1278
1279           newp = (struct message_list *) xmalloc (sizeof(*newp));
1280           newp->number = old_cat_obj.name_ptr[cnt * 3 + 1];
1281           newp->message =
1282             &old_cat_obj.strings[old_cat_obj.name_ptr[cnt * 3 + 2]];
1283           newp->fname = NULL;
1284           newp->line = 0;
1285           newp->symbol = NULL;
1286           newp->next = message;
1287
1288           if (last == NULL)
1289             set->messages = newp;
1290           else
1291             last->next = newp;
1292
1293           ++catalog->total_messages;
1294         }
1295       else if (*message->message == '\0')
1296         {
1297           /* The new empty message has overridden the old one thus
1298              "deleting" it as required.  Now remove the empty remains. */
1299           if (last == NULL)
1300             set->messages = message->next;
1301           else
1302             last->next = message->next;
1303         }
1304     }
1305 }
1306
1307
/* Open the conversion descriptors needed to process a message file
   and determine the escape character to use.

   CODESET names the character set of the input.  If it is NULL the
   codeset of the locale selected through the environment is used.

   On success 0 is returned, *CD_TOWCP converts from CODESET to
   wchar_t, *CD_TOMBP converts from wchar_t to CODESET and
   *ESCAPE_CHARP is the wide character the byte 0x5c is mapped to
   (L'\\' plus a diagnostic if this cannot be determined).

   If one of the descriptors cannot be opened a diagnostic is printed
   and 1 is returned.  In this case no descriptor is left open and
   both *CD_TOWCP and *CD_TOMBP are (iconv_t) -1.  */
static int
open_conversion (const char *codeset, iconv_t *cd_towcp, iconv_t *cd_tombp,
                 wchar_t *escape_charp)
{
  char buf[2];
  char *bufptr;
  size_t bufsize;
  wchar_t wbuf[2];
  char *wbufptr;
  size_t wbufsize;

  /* If the input file does not specify the codeset use the locale's.  */
  if (codeset == NULL)
    {
      setlocale (LC_ALL, "");
      codeset = nl_langinfo (CODESET);
      setlocale (LC_ALL, "C");
    }

  /* Get the conversion modules.  */
  *cd_towcp = iconv_open ("WCHAR_T", codeset);
  *cd_tombp = iconv_open (codeset, "WCHAR_T");
  if (*cd_towcp == (iconv_t) -1 || *cd_tombp == (iconv_t) -1)
    {
      error (0, 0, gettext ("conversion modules not available"));

      /* Either direction may be the one which could be opened.
         Release it; the caller only gets the failure status and must
         not be left with a stray descriptor.  */
      if (*cd_towcp != (iconv_t) -1)
        {
          iconv_close (*cd_towcp);
          *cd_towcp = (iconv_t) -1;
        }
      if (*cd_tombp != (iconv_t) -1)
        {
          iconv_close (*cd_tombp);
          *cd_tombp = (iconv_t) -1;
        }

      return 1;
    }

  /* One special case for historical reasons is the backslash
     character.  In some codesets the byte value 0x5c is not mapped to
     U005c in Unicode.  These charsets then don't have a backslash
     character at all.  Therefore we have to live with whatever the
     codeset provides and recognize, instead of the U005c, the character
     the byte value 0x5c is mapped to.  */
  buf[0] = '\\';
  buf[1] = '\0';
  bufptr = buf;
  bufsize = 2;

  wbufptr = (char *) wbuf;
  wbufsize = sizeof (wbuf);

  /* The byte and the terminating NUL must yield exactly two wide
     characters; anything else means 0x5c is not a character of its
     own in this codeset.  */
  iconv (*cd_towcp, &bufptr, &bufsize, &wbufptr, &wbufsize);
  if (bufsize != 0 || wbufsize != 0)
    {
      /* Something went wrong, we couldn't convert the byte 0x5c.  Go
         on with using U005c.  */
      error (0, 0, gettext ("cannot determine escape character"));
      *escape_charp = L'\\';
    }
  else
    *escape_charp = wbuf[0];

  return 0;
}