catgets/gencat.c

   1 /* Copyright (C) 1996-1999, 2000, 2001 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 1996.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Library General Public License as
   7    published by the Free Software Foundation; either version 2 of the
   8    License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Library General Public License for more details.
  14
  15    You should have received a copy of the GNU Library General Public
  16    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18    Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include <config.h>
  22 #endif
  23
  24 #include <argp.h>
  25 #include <assert.h>
  26 #include <ctype.h>
  27 #include <endian.h>
  28 #include <errno.h>
  29 #include <error.h>
  30 #include <fcntl.h>
  31 #include <iconv.h>
  32 #include <langinfo.h>
  33 #include <locale.h>
  34 #include <libintl.h>
  35 #include <limits.h>
  36 #include <nl_types.h>
  37 #include <obstack.h>
  38 #include <stdint.h>
  39 #include <stdio.h>
  40 #include <stdlib.h>
  41 #include <string.h>
  42 #include <unistd.h>
  43 #include <wchar.h>
  44
  45 #include "version.h"
  46
  47 #include "catgetsinfo.h"
  48
  49
  /* Reverse the byte order of the 32-bit value W.  The argument is
     converted to uint32_t before any shift so that a signed argument
     cannot be shifted into its sign bit and an argument wider than 32
     bits cannot leak bits above bit 31 into the result.  W is evaluated
     more than once and therefore must not have side effects.  */
  #define SWAPU32(w) \
    ((uint32_t) (((uint32_t) (w) << 24)                                       \
                 | (((uint32_t) (w) & 0xff00) << 8)                           \
                 | (((uint32_t) (w) >> 8) & 0xff00)                           \
                 | ((uint32_t) (w) >> 24)))
  52
  53 struct message_list  /* One message of a message set; node of a singly linked list.  */
  54 {
  55   int number;  /* Message number inside its set.  */
  56   const char *message;  /* Message text as stored by read_input_file.  */
  57
  58   const char *fname;  /* Input file in which the message was defined.  */
  59   size_t line;  /* Line of that definition; quoted in "first definition" diagnostics.  */
  60   const char *symbol;  /* Symbolic name of the message, or NULL if none was given.  */
  61
  62   struct message_list *next;  /* Next message; read_input_file keeps the list sorted by NUMBER.  */
  63 };
  64
  65
  66 struct set_list  /* One message set of a catalog; node of a singly linked list.  */
  67 {
  68   int number;  /* Set number.  */
  69   int deleted;  /* Set to 1 when a `$delset' directive names this set.  */
  70   struct message_list *messages;  /* Messages of the set, sorted by number.  */
  71   int last_message;  /* Highest message number used so far; symbolic messages get the next one.  */
  72
  73   const char *fname;  /* Input file of the `$set' directive that selected the set.  */
  74   size_t line;  /* Line of that directive.  */
  75   const char *symbol;  /* Symbolic set name, or NULL.  */
  76
  77   struct set_list *next;  /* Next set in the catalog.  */
  78 };
  79
  80
  81 struct catalog  /* Everything collected from the input files.  */
  82 {
  83   struct set_list *all_sets;  /* Head of the list of all known sets.  */
  84   struct set_list *current_set;  /* Set that receives newly read messages.  */
  85   size_t total_messages;  /* Incremented for each message line read; write_out derives its first table size from it.  */
  86   wint_t quote_char;  /* Quote character chosen by `$quote' (0 means none); handed to normalize_line.  */
  87   int last_set;  /* Highest set number used so far; symbolic sets get the next one.  */
  88
  89   struct obstack mem_pool;  /* Obstack holding the input lines and message strings.  */
  90 };
  91
  92
  93 /* If non-zero force creation of new file, not using existing one.  Set by the --new option; when zero, write_out first calls read_old on the output file.  */
  94 static int force_new;
  95
  96 /* Name of output file: the argument of -o/--output or, failing that, the first non-option argument ("-" when there is none; see main).  */
  97 static const char *output_name;
  98
  99 /* Name of generated C header file (argument of -H/--header); stays NULL when the option is not given.  */
 100 static const char *header_name;
 101
 102 /* Hook through which argp prints the name and version of the program.  */
 103 static void print_version (FILE *stream, struct argp_state *state);
 104 void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;
 105
 106 #define OPT_NEW 1  /* Key handed to parse_opt for the --new option.  */
 107
 108 /* Definitions of arguments for argp functions.  Each key listed here is handled in parse_opt.  */
 109 static const struct argp_option options[] =
 110 {
 111   { "header", 'H', N_("NAME"), 0,
 112     N_("Create C header file NAME containing symbol definitions") },
 113   { "new", OPT_NEW, NULL, 0,
 114     N_("Do not use existing catalog, force new output file") },
 115   { "output", 'o', N_("NAME"), 0, N_("Write output to file NAME") },
 116   { NULL, 0, NULL, 0, NULL }  /* Terminating entry.  */
 117 };
 118
 119 /* Short description of program; handed to argp as the doc member of the argp structure below.  */
 120 static const char doc[] = N_("Generate message catalog.\
 121 \vIf INPUT-FILE is -, input is read from standard input.  If OUTPUT-FILE\n\
 122 is -, output is written to standard output.\n");
 123
 124 /* Synopsis of the non-option arguments shown in help texts; handed to argp as args_doc.  */
 125 static const char args_doc[] = N_("\
 126 -o OUTPUT-FILE [INPUT-FILE]...\n[OUTPUT-FILE [INPUT-FILE]...]");
 127
 128 /* Prototype for option handler; it stores option values in force_new, output_name and header_name.  */
 129 static error_t parse_opt (int key, char *arg, struct argp_state *state);
 130
 131 /* Function to print some extra text (the bug-reporting note) in the help message.  */
 132 static char *more_help (int key, const char *text, void *input);
 133
 134 /* Data structure to communicate with argp functions; passed to argp_parse in main.  */
 135 static struct argp argp =
 136 {
 137   options, parse_opt, args_doc, doc, NULL, more_help
 138 };
 139
 140
 141 /* Wrapper functions with error checking for standard functions.  They are only declared here (extern); no definition appears in this part of the file.  */
 142 extern void *xmalloc (size_t n);
 143 extern void *xcalloc (size_t n, size_t s);
 144 extern void *xrealloc (void *o, size_t n);
 145 extern char *xstrdup (const char *);
 146
 147 /* Prototypes for local functions.  read_input_file merges one input file into a catalog and write_out emits the result; both are called from main.  */
 148 static void error_print (void);
 149 static struct catalog *read_input_file (struct catalog *current,
 150                                         const char *fname);
 151 static void write_out (struct catalog *result, const char *output_name,
 152                        const char *header_name);
 153 static struct set_list *find_set (struct catalog *current, int number);
 154 static void normalize_line (const char *fname, size_t line, iconv_t cd,
 155                             wchar_t *string, wchar_t quote_char,
 156                             wchar_t escape_char);
 157 static void read_old (struct catalog *catalog, const char *file_name);
 158 static int open_conversion (const char *codesetp, iconv_t *cd_towcp,
 159                             iconv_t *cd_tombp, wchar_t *escape_charp);
 160
 161
 162 int
 163 main (int argc, char *argv[])
 164 {
 165   struct catalog *result;
 166   int remaining;
 167
 168   /* Set program name for messages.  */
 169   error_print_progname = error_print;
 170
 171   /* Set locale via LC_ALL.  */
 172   setlocale (LC_ALL, "");
 173
 174   /* Set the text message domain.  */
 175   textdomain (PACKAGE);
 176
 177   /* Initialize local variables.  */
 178   result = NULL;
 179
 180   /* Parse and process arguments.  */
 181   argp_parse (&argp, argc, argv, 0, &remaining, NULL);
 182
 183   /* Determine output file.  */
 184   if (output_name == NULL)
 185     output_name = remaining < argc ? argv[remaining++] : "-";
 186
 187   /* Process all input files.  */
 188   setlocale (LC_CTYPE, "C");
 189   if (remaining < argc)
 190     do
 191       result = read_input_file (result, argv[remaining]);
 192     while (++remaining < argc);
 193   else
 194     result = read_input_file (NULL, "-");
 195
 196   /* Write out the result.  */
 197   if (result != NULL)
 198     write_out (result, output_name, header_name);
 199
 200   return EXIT_SUCCESS;
 201 }
 202
 203
 204 /* Handle program arguments.  */
 205 static error_t
 206 parse_opt (int key, char *arg, struct argp_state *state)
 207 {
 208   switch (key)
 209     {
 210     case 'H':
 211       header_name = arg;
 212       break;
 213     case OPT_NEW:
 214       force_new = 1;
 215       break;
 216     case 'o':
 217       output_name = arg;
 218       break;
 219     default:
 220       return ARGP_ERR_UNKNOWN;
 221     }
 222   return 0;
 223 }
 224
 225
 /* Help filter for argp.  For ARGP_KEY_HELP_EXTRA a newly allocated copy
    of the translated bug-reporting note is returned; for every other KEY
    the TEXT supplied by argp is returned unchanged.  INPUT is not used.  */
 static char *
 more_help (int key, const char *text, void *input)
 {
   if (key != ARGP_KEY_HELP_EXTRA)
     return (char *) text;

   /* We print some extra information.  */
   return strdup (gettext ("Report bugs using the `glibcbug' script "
                           "to <bugs@gnu.org>.\n"));
 }
 240
 241 /* Print the version information.  */
 242 static void
 243 print_version (FILE *stream, struct argp_state *state)
 244 {
 245   fprintf (stream, "gencat (GNU %s) %s\n", PACKAGE, VERSION);
 246   fprintf (stream, gettext ("\
 247 Copyright (C) %s Free Software Foundation, Inc.\n\
 248 This is free software; see the source for copying conditions.  There is NO\n\
 249 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
 250 "), "2001");
 251   fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
 252 }
 253
 254
 /* Replacement for the program-name printer of the error functions; main
    stores its address in error_print_progname.  */
 static void
 error_print (void)
 {
   /* Deliberately empty: messages must not start with the program name
      because Emacs' compile.el does not like that.  */
 }
 263
 264
 265 static struct catalog *
 266 read_input_file (struct catalog *current, const char *fname)
 267 {
 268   FILE *fp;
 269   char *buf;
 270   size_t len;
 271   size_t line_number;
 272   wchar_t *wbuf;
 273   size_t wbufsize;
 274   iconv_t cd_towc = (iconv_t) -1;
 275   iconv_t cd_tomb = (iconv_t) -1;
 276   wchar_t escape_char = L'\\';
 277   char *codeset = NULL;
 278
 279   if (strcmp (fname, "-") == 0 || strcmp (fname, "/dev/stdin") == 0)
 280     {
 281       fp = stdin;
 282       fname = gettext ("*standard input*");
 283     }
 284   else
 285     fp = fopen (fname, "r");
 286   if (fp == NULL)
 287     {
 288       error (0, errno, gettext ("cannot open input file `%s'"), fname);
 289       return current;
 290     }
 291
 292   /* If we haven't seen anything yet, allocate result structure.  */
 293   if (current == NULL)
 294     {
 295       current = (struct catalog *) xcalloc (1, sizeof (*current));
 296
 297 #define obstack_chunk_alloc malloc
 298 #define obstack_chunk_free free
 299       obstack_init (&current->mem_pool);
 300
 301       current->current_set = find_set (current, NL_SETD);
 302     }
 303
 304   buf = NULL;
 305   len = 0;
 306   line_number = 0;
 307
 308   wbufsize = 1024;
 309   wbuf = (wchar_t *) xmalloc (wbufsize);
 310
 311   while (!feof (fp))
 312     {
 313       int continued;
 314       int used;
 315       size_t start_line = line_number + 1;
 316       char *this_line;
 317
 318       do
 319         {
 320           int act_len;
 321
 322           act_len = getline (&buf, &len, fp);
 323           if (act_len <= 0)
 324             break;
 325           ++line_number;
 326
 327           /* It the line continued?  */
 328           if (buf[act_len - 1] == '\n')
 329             {
 330               --act_len;
 331               continued = buf[act_len - 1] == '\\';
 332               if (continued)
 333                 --act_len;
 334             }
 335           else
 336             continued = 0;
 337
 338           /* Append to currently selected line.  */
 339           obstack_grow (&current->mem_pool, buf, act_len);
 340         }
 341       while (continued);
 342
 343       obstack_1grow (&current->mem_pool, '\0');
 344       this_line = (char *) obstack_finish (&current->mem_pool);
 345
 346       used = 0;
 347       if (this_line[0] == '$')
 348         {
 349           if (isblank (this_line[1]))
 350             {
 351               int cnt = 1;
 352               while (isblank (this_line[cnt]))
 353                 ++cnt;
 354               if (strncmp (&this_line[cnt], "codeset=", 8) != 0)
 355                 /* This is a comment line. Do nothing.  */;
 356               else if (codeset != NULL)
 357                 /* Ignore multiple codeset. */;
 358               else
 359                 {
 360                   int start = cnt + 8;
 361                   cnt = start;
 362                   while (this_line[cnt] != '\0' && !isspace (this_line[cnt]))
 363                     ++cnt;
 364                   if (cnt != start)
 365                     {
 366                       int len = cnt - start;
 367                       codeset = xmalloc (len + 1);
 368                       *((char *) mempcpy (codeset, &this_line[start], len))
 369                         = '\0';
 370                     }
 371                 }
 372             }
 373           else if (strncmp (&this_line[1], "set", 3) == 0)
 374             {
 375               int cnt = sizeof ("set");
 376               int set_number;
 377               const char *symbol = NULL;
 378               while (isspace (this_line[cnt]))
 379                 ++cnt;
 380
 381               if (isdigit (this_line[cnt]))
 382                 {
 383                   set_number = atol (&this_line[cnt]);
 384
 385                   /* If the given number for the character set is
 386                      higher than any we used for symbolic set names
 387                      avoid clashing by using only higher numbers for
 388                      the following symbolic definitions.  */
 389                   if (set_number > current->last_set)
 390                     current->last_set = set_number;
 391                 }
 392               else
 393                 {
 394                   /* See whether it is a reasonable identifier.  */
 395                   int start = cnt;
 396                   while (isalnum (this_line[cnt]) || this_line[cnt] == '_')
 397                     ++cnt;
 398
 399                   if (cnt == start)
 400                     {
 401                       /* No correct character found.  */
 402                       error_at_line (0, 0, fname, start_line,
 403                                      gettext ("illegal set number"));
 404                       set_number = 0;
 405                     }
 406                   else
 407                     {
 408                       /* We have found seomthing that looks like a
 409                          correct identifier.  */
 410                       struct set_list *runp;
 411
 412                       this_line[cnt] = '\0';
 413                       used = 1;
 414                       symbol = &this_line[start];
 415
 416                       /* Test whether the identifier was already used.  */
 417                       runp = current->all_sets;
 418                       while (runp != 0)
 419                         if (runp->symbol != NULL
 420                             && strcmp (runp->symbol, symbol) == 0)
 421                           break;
 422                         else
 423                           runp = runp->next;
 424
 425                       if (runp != NULL)
 426                         {
 427                           /* We cannot allow duplicate identifiers for
 428                              message sets.  */
 429                           error_at_line (0, 0, fname, start_line,
 430                                          gettext ("duplicate set definition"));
 431                           error_at_line (0, 0, runp->fname, runp->line,
 432                                          gettext ("\
 433 this is the first definition"));
 434                           set_number = 0;
 435                         }
 436                       else
 437                         /* Allocate next free message set for identifier.  */
 438                         set_number = ++current->last_set;
 439                     }
 440                 }
 441
 442               if (set_number != 0)
 443                 {
 444                   /* We found a legal set number.  */
 445                   current->current_set = find_set (current, set_number);
 446                   if (symbol != NULL)
 447                       used = 1;
 448                   current->current_set->symbol = symbol;
 449                   current->current_set->fname = fname;
 450                   current->current_set->line = start_line;
 451                 }
 452             }
 453           else if (strncmp (&this_line[1], "delset", 6) == 0)
 454             {
 455               int cnt = sizeof ("delset");
 456               size_t set_number;
 457               while (isspace (this_line[cnt]))
 458                 ++cnt;
 459
 460               if (isdigit (this_line[cnt]))
 461                 {
 462                   size_t set_number = atol (&this_line[cnt]);
 463                   struct set_list *set;
 464
 465                   /* Mark the message set with the given number as
 466                      deleted.  */
 467                   set = find_set (current, set_number);
 468                   set->deleted = 1;
 469                 }
 470               else
 471                 {
 472                   /* See whether it is a reasonable identifier.  */
 473                   int start = cnt;
 474                   while (isalnum (this_line[cnt]) || this_line[cnt] == '_')
 475                     ++cnt;
 476
 477                   if (cnt == start)
 478                     {
 479                       error_at_line (0, 0, fname, start_line,
 480                                      gettext ("illegal set number"));
 481                       set_number = 0;
 482                     }
 483                   else
 484                     {
 485                       const char *symbol;
 486                       struct set_list *runp;
 487
 488                       this_line[cnt] = '\0';
 489                       used = 1;
 490                       symbol = &this_line[start];
 491
 492                       /* We have a symbolic set name.  This name must
 493                          appear somewhere else in the catalogs read so
 494                          far.  */
 495                       set_number = 0;
 496                       for (runp = current->all_sets; runp != NULL;
 497                            runp = runp->next)
 498                         {
 499                           if (strcmp (runp->symbol, symbol) == 0)
 500                             {
 501                               runp->deleted = 1;
 502                               break;
 503                             }
 504                         }
 505                       if (runp == NULL)
 506                         /* Name does not exist before.  */
 507                         error_at_line (0, 0, fname, start_line,
 508                                        gettext ("unknown set `%s'"), symbol);
 509                     }
 510                 }
 511             }
 512           else if (strncmp (&this_line[1], "quote", 5) == 0)
 513             {
 514               char buf[2];
 515               char *bufptr;
 516               size_t buflen;
 517               char *wbufptr;
 518               size_t wbuflen;
 519               int cnt;
 520
 521               cnt = sizeof ("quote");
 522               while (isspace (this_line[cnt]))
 523                 ++cnt;
 524
 525               /* We need the conversion.  */
 526               if (cd_towc == (iconv_t) -1
 527                   && open_conversion (codeset, &cd_towc, &cd_tomb,
 528                                       &escape_char) != 0)
 529                 /* Something is wrong.  */
 530                 goto out;
 531
 532               /* Yes, the quote char can be '\0'; this means no quote
 533                  char.  The function using the information works on
 534                  wide characters so we have to convert it here.  */
 535               buf[0] = this_line[cnt];
 536               buf[1] = '\0';
 537               bufptr = buf;
 538               buflen = 2;
 539
 540               wbufptr = (char *) wbuf;
 541               wbuflen = wbufsize;
 542
 543               /* Flush the state.  */
 544               iconv (cd_towc, NULL, NULL, NULL, NULL);
 545
 546               iconv (cd_towc, &bufptr, &buflen, &wbufptr, &wbuflen);
 547               if (buflen != 0 || (wchar_t *) wbufptr != &wbuf[2])
 548                 error_at_line (0, 0, fname, start_line,
 549                                gettext ("invalid quote character"));
 550               else
 551                 /* Use the converted wide character.  */
 552                 current->quote_char = wbuf[0];
 553             }
 554           else
 555             {
 556               int cnt;
 557               cnt = 2;
 558               while (this_line[cnt] != '\0' && !isspace (this_line[cnt]))
 559                 ++cnt;
 560               this_line[cnt] = '\0';
 561               error_at_line (0, 0, fname, start_line,
 562                              gettext ("unknown directive `%s': line ignored"),
 563                              &this_line[1]);
 564             }
 565         }
 566       else if (isalnum (this_line[0]) || this_line[0] == '_')
 567         {
 568           const char *ident = this_line;
 569           char *line = this_line;
 570           int message_number;
 571
 572           do
 573             ++line;
 574           while (line[0] != '\0' && !isspace (line[0]));
 575           if (line[0] != '\0')
 576             *line++ = '\0';     /* Terminate the identifier.  */
 577
 578           /* Now we found the beginning of the message itself.  */
 579
 580           if (isdigit (ident[0]))
 581             {
 582               struct message_list *runp;
 583               struct message_list *lastp;
 584
 585               message_number = atoi (ident);
 586
 587               /* Find location to insert the new message.  */
 588               runp = current->current_set->messages;
 589               lastp = NULL;
 590               while (runp != NULL)
 591                 if (runp->number == message_number)
 592                   break;
 593                 else
 594                   {
 595                     lastp = runp;
 596                     runp = runp->next;
 597                   }
 598               if (runp != NULL)
 599                 {
 600                   /* Oh, oh.  There is already a message with this
 601                      number in the message set.  */
 602                   error_at_line (0, 0, fname, start_line,
 603                                  gettext ("duplicated message number"));
 604                   error_at_line (0, 0, runp->fname, runp->line,
 605                                  gettext ("this is the first definition"));
 606                   message_number = 0;
 607                 }
 608               ident = NULL;     /* We don't have a symbol.  */
 609
 610               if (message_number != 0
 611                   && message_number > current->current_set->last_message)
 612                 current->current_set->last_message = message_number;
 613             }
 614           else if (ident[0] != '\0')
 615             {
 616               struct message_list *runp;
 617               struct message_list *lastp;
 618
 619               /* Test whether the symbolic name was not used for
 620                  another message in this message set.  */
 621               runp = current->current_set->messages;
 622               lastp = NULL;
 623               while (runp != NULL)
 624                 if (runp->symbol != NULL && strcmp (ident, runp->symbol) == 0)
 625                   break;
 626                 else
 627                   runp = runp->next;
 628               if (runp != NULL)
 629                 {
 630                   /* The name is already used.  */
 631                   error_at_line (0, 0, fname, start_line, gettext ("\
 632 duplicated message identifier"));
 633                   error_at_line (0, 0, runp->fname, runp->line,
 634                                  gettext ("this is the first definition"));
 635                   message_number = 0;
 636                 }
 637               else
 638                 /* Give the message the next unused number.  */
 639                 message_number = ++current->current_set->last_message;
 640             }
 641           else
 642             message_number = 0;
 643
 644           if (message_number != 0)
 645             {
 646               char *inbuf;
 647               size_t inlen;
 648               char *outbuf;
 649               size_t outlen;
 650               struct message_list *newp;
 651               size_t line_len = strlen (line) + 1;
 652
 653               /* We need the conversion.  */
 654               if (cd_towc == (iconv_t) -1
 655                   && open_conversion (codeset, &cd_towc, &cd_tomb,
 656                                       &escape_char) != 0)
 657                 /* Something is wrong.  */
 658                 goto out;
 659
 660               /* Convert to a wide character string.  We have to
 661                  interpret escape sequences which will be impossible
 662                  without doing the conversion if the codeset of the
 663                  message is stateful.  */
 664               while (1)
 665                 {
 666                   inbuf = line;
 667                   inlen = line_len;
 668                   outbuf = (char *) wbuf;
 669                   outlen = wbufsize;
 670
 671                   /* Flush the state.  */
 672                   iconv (cd_towc, NULL, NULL, NULL, NULL);
 673
 674                   iconv (cd_towc, &inbuf, &inlen, &outbuf, &outlen);
 675                   if (inlen == 0)
 676                     {
 677                       /* The string is converted.  */
 678                       assert (outlen < wbufsize);
 679                       assert (wbuf[(wbufsize - outlen) / sizeof (wchar_t) - 1]
 680                               == L'\0');
 681                       break;
 682                     }
 683
 684                   if (outlen != 0)
 685                     {
 686                       /* Something is wrong with this string, we ignore it.  */
 687                       error_at_line (0, 0, fname, start_line, gettext ("\
 688 invalid character: message ignored"));
 689                       goto ignore;
 690                     }
 691
 692                   /* The output buffer is too small.  */
 693                   wbufsize *= 2;
 694                   wbuf = (wchar_t *) xrealloc (wbuf, wbufsize);
 695                 }
 696
 697               /* Strip quote characters, change escape sequences into
 698                  correct characters etc.  */
 699               normalize_line (fname, start_line, cd_towc, wbuf,
 700                               current->quote_char, escape_char);
 701
 702               /* Now the string is free of escape sequences.  Convert it
 703                  back into a multibyte character string.  First free the
 704                  memory allocated for the original string.  */
 705               obstack_free (&current->mem_pool, this_line);
 706
 707               used = 1; /* Yes, we use the line.  */
 708
 709               /* Now fill in the new string.  It should never happen that
 710                  the replaced string is longer than the original.  */
 711               inbuf = (char *) wbuf;
 712               inlen = (wcslen (wbuf) + 1) * sizeof (wchar_t);
 713
 714               outlen = obstack_room (&current->mem_pool);
 715               obstack_blank (&current->mem_pool, outlen);
 716               this_line = (char *) obstack_base (&current->mem_pool);
 717               outbuf = this_line;
 718
 719               /* Flush the state.  */
 720               iconv (cd_tomb, NULL, NULL, NULL, NULL);
 721
 722               iconv (cd_tomb, &inbuf, &inlen, &outbuf, &outlen);
 723               if (inlen != 0)
 724                 {
 725                   error_at_line (0, 0, fname, start_line,
 726                                  gettext ("invalid line"));
 727                   goto ignore;
 728                 }
 729               assert (outbuf[-1] == '\0');
 730
 731               /* Free the memory in the obstack we don't use.  */
 732               obstack_blank (&current->mem_pool, -(int) outlen);
 733               line = obstack_finish (&current->mem_pool);
 734
 735               newp = (struct message_list *) xmalloc (sizeof (*newp));
 736               newp->number = message_number;
 737               newp->message = line;
 738               /* Remember symbolic name; is NULL if no is given.  */
 739               newp->symbol = ident;
 740               /* Remember where we found the character.  */
 741               newp->fname = fname;
 742               newp->line = start_line;
 743
 744               /* Find place to insert to message.  We keep them in a
 745                  sorted single linked list.  */
 746               if (current->current_set->messages == NULL
 747                   || current->current_set->messages->number > message_number)
 748                 {
 749                   newp->next = current->current_set->messages;
 750                   current->current_set->messages = newp;
 751                 }
 752               else
 753                 {
 754                   struct message_list *runp;
 755                   runp = current->current_set->messages;
 756                   while (runp->next != NULL)
 757                     if (runp->next->number > message_number)
 758                       break;
 759                     else
 760                       runp = runp->next;
 761                   newp->next = runp->next;
 762                   runp->next = newp;
 763                 }
 764             }
 765           ++current->total_messages;
 766         }
 767       else
 768         {
 769           size_t cnt;
 770
 771           cnt = 0;
 772           /* See whether we have any non-white space character in this
 773              line.  */
 774           while (this_line[cnt] != '\0' && isspace (this_line[cnt]))
 775             ++cnt;
 776
 777           if (this_line[cnt] != '\0')
 778             /* Yes, some unknown characters found.  */
 779             error_at_line (0, 0, fname, start_line,
 780                            gettext ("malformed line ignored"));
 781         }
 782
 783     ignore:
 784       /* We can save the memory for the line if it was not used.  */
 785       if (!used)
 786         obstack_free (&current->mem_pool, this_line);
 787     }
 788
 789   /* Close the conversion modules.  */
 790   iconv_close (cd_towc);
 791   iconv_close (cd_tomb);
 792   free (codeset);
 793
 794  out:
 795   free (wbuf);
 796
 797   if (fp != stdin)
 798     fclose (fp);
 799   return current;
 800 }
 801
 802
 803 static void
 804 write_out (struct catalog *catalog, const char *output_name,
 805            const char *header_name)
 806 {
 807   /* Computing the "optimal" size.  */
 808   struct set_list *set_run;
 809   size_t best_total, best_size, best_depth;
 810   size_t act_size, act_depth;
 811   struct catalog_obj obj;
 812   struct obstack string_pool;
 813   const char *strings;
 814   size_t strings_size;
 815   uint32_t *array1, *array2;
 816   size_t cnt;
 817   int fd;
 818
 819   /* If not otherwise told try to read file with existing
 820      translations.  */
 821   if (!force_new)
 822     read_old (catalog, output_name);
 823
 824   /* Initialize best_size with a very high value.  */
 825   best_total = best_size = best_depth = UINT_MAX;
 826
 827   /* We need some start size for testing.  Let's start with
 828      TOTAL_MESSAGES / 5, which theoretically provides a mean depth of
 829      5.  */
 830   act_size = 1 + catalog->total_messages / 5;
 831
 832   /* We determine the size of a hash table here.  Because the message
 833      numbers can be chosen arbitrary by the programmer we cannot use
 834      the simple method of accessing the array using the message
 835      number.  The algorithm is based on the trivial hash function
 836      NUMBER % TABLE_SIZE, where collisions are stored in a second
 837      dimension up to TABLE_DEPTH.  We here compute TABLE_SIZE so that
 838      the needed space (= TABLE_SIZE * TABLE_DEPTH) is minimal.  */
 839   while (act_size <= best_total)
 840     {
 841       size_t deep[act_size];
 842
 843       act_depth = 1;
 844       memset (deep, '\0', act_size * sizeof (size_t));
 845       set_run = catalog->all_sets;
 846       while (set_run != NULL)
 847         {
 848           struct message_list *message_run;
 849
 850           message_run = set_run->messages;
 851           while (message_run != NULL)
 852             {
 853               size_t idx = (message_run->number * set_run->number) % act_size;
 854
 855               ++deep[idx];
 856               if (deep[idx] > act_depth)
 857                 {
 858                   act_depth = deep[idx];
 859                   if (act_depth * act_size > best_total)
 860                     break;
 861                 }
 862               message_run = message_run->next;
 863             }
 864           set_run = set_run->next;
 865         }
 866
 867       if (act_depth * act_size <= best_total)
 868         {
 869           /* We have found a better solution.  */
 870           best_total = act_depth * act_size;
 871           best_size = act_size;
 872           best_depth = act_depth;
 873         }
 874
 875       ++act_size;
 876     }
 877
 878   /* let's be prepared for an empty message file.  */
 879   if (best_size == UINT_MAX)
 880     {
 881       best_size = 1;
 882       best_depth = 1;
 883     }
 884
 885   /* OK, now we have the size we will use.  Fill in the header, build
 886      the table and the second one with swapped byte order.  */
 887   obj.magic = CATGETS_MAGIC;
 888   obj.plane_size = best_size;
 889   obj.plane_depth = best_depth;
 890
 891   /* Allocate room for all needed arrays.  */
 892   array1 =
 893     (uint32_t *) alloca (best_size * best_depth * sizeof (uint32_t) * 3);
 894   memset (array1, '\0', best_size * best_depth * sizeof (uint32_t) * 3);
 895   array2
 896     = (uint32_t *) alloca (best_size * best_depth * sizeof (uint32_t) * 3);
 897   obstack_init (&string_pool);
 898
 899   set_run = catalog->all_sets;
 900   while (set_run != NULL)
 901     {
 902       struct message_list *message_run;
 903
 904       message_run = set_run->messages;
 905       while (message_run != NULL)
 906         {
 907           size_t idx = (((message_run->number * set_run->number) % best_size)
 908                         * 3);
 909           /* Determine collision depth.  */
 910           while (array1[idx] != 0)
 911             idx += best_size * 3;
 912
 913           /* Store set number, message number and pointer into string
 914              space, relative to the first string.  */
 915           array1[idx + 0] = set_run->number;
 916           array1[idx + 1] = message_run->number;
 917           array1[idx + 2] = obstack_object_size (&string_pool);
 918
 919           /* Add current string to the continuous space containing all
 920              strings.  */
 921           obstack_grow0 (&string_pool, message_run->message,
 922                          strlen (message_run->message));
 923
 924           message_run = message_run->next;
 925         }
 926
 927       set_run = set_run->next;
 928     }
 929   strings_size = obstack_object_size (&string_pool);
 930   strings = obstack_finish (&string_pool);
 931
 932   /* Compute ARRAY2 by changing the byte order.  */
 933   for (cnt = 0; cnt < best_size * best_depth * 3; ++cnt)
 934     array2[cnt] = SWAPU32 (array1[cnt]);
 935
 936   /* Now we can write out the whole data.  */
 937   if (strcmp (output_name, "-") == 0
 938       || strcmp (output_name, "/dev/stdout") == 0)
 939     fd = STDOUT_FILENO;
 940   else
 941     {
 942       fd = creat (output_name, 0666);
 943       if (fd < 0)
 944         error (EXIT_FAILURE, errno, gettext ("cannot open output file `%s'"),
 945                output_name);
 946     }
 947
 948   /* Write out header.  */
 949   write (fd, &obj, sizeof (obj));
 950
 951   /* We always write out the little endian version of the index
 952      arrays.  */
 953 #if __BYTE_ORDER == __LITTLE_ENDIAN
 954   write (fd, array1, best_size * best_depth * sizeof (uint32_t) * 3);
 955   write (fd, array2, best_size * best_depth * sizeof (uint32_t) * 3);
 956 #elif __BYTE_ORDER == __BIG_ENDIAN
 957   write (fd, array2, best_size * best_depth * sizeof (uint32_t) * 3);
 958   write (fd, array1, best_size * best_depth * sizeof (uint32_t) * 3);
 959 #else
 960 # error Cannot handle __BYTE_ORDER byte order
 961 #endif
 962
 963   /* Finally write the strings.  */
 964   write (fd, strings, strings_size);
 965
 966   if (fd != STDOUT_FILENO)
 967     close (fd);
 968
 969   /* If requested now write out the header file.  */
 970   if (header_name != NULL)
 971     {
 972       int first = 1;
 973       FILE *fp;
 974
 975       /* Open output file.  "-" or "/dev/stdout" means write to
 976          standard output.  */
 977       if (strcmp (header_name, "-") == 0
 978           || strcmp (header_name, "/dev/stdout") == 0)
 979         fp = stdout;
 980       else
 981         {
 982           fp = fopen (header_name, "w");
 983           if (fp == NULL)
 984             error (EXIT_FAILURE, errno,
 985                    gettext ("cannot open output file `%s'"), header_name);
 986         }
 987
 988       /* Iterate over all sets and all messages.  */
 989       set_run = catalog->all_sets;
 990       while (set_run != NULL)
 991         {
 992           struct message_list *message_run;
 993
 994           /* If the current message set has a symbolic name write this
 995              out first.  */
 996           if (set_run->symbol != NULL)
 997             fprintf (fp, "%s#define %sSet %#x\t/* %s:%Zu */\n",
 998                      first ? "" : "\n", set_run->symbol, set_run->number - 1,
 999                      set_run->fname, set_run->line);
1000           first = 0;
1001
1002           message_run = set_run->messages;
1003           while (message_run != NULL)
1004             {
1005               /* If the current message has a symbolic name write
1006                  #define out.  But we have to take care for the set
1007                  not having a symbolic name.  */
1008               if (message_run->symbol != NULL)
1009                 {
1010                   if (set_run->symbol == NULL)
1011                     fprintf (fp, "#define AutomaticSet%d%s %#x\t/* %s:%Zu */\n",
1012                              set_run->number, message_run->symbol,
1013                              message_run->number, message_run->fname,
1014                              message_run->line);
1015                   else
1016                     fprintf (fp, "#define %s%s %#x\t/* %s:%Zu */\n",
1017                              set_run->symbol, message_run->symbol,
1018                              message_run->number, message_run->fname,
1019                              message_run->line);
1020                 }
1021
1022               message_run = message_run->next;
1023             }
1024
1025           set_run = set_run->next;
1026         }
1027
1028       if (fp != stdout)
1029         fclose (fp);
1030     }
1031 }
1032
1033
1034 static struct set_list *
1035 find_set (struct catalog *current, int number)
1036 {
1037   struct set_list *result = current->all_sets;
1038
1039   /* We must avoid set number 0 because a set of this number signals
1040      in the tables that the entry is not occupied.  */
1041   ++number;
1042
1043   while (result != NULL)
1044     if (result->number == number)
1045       return result;
1046     else
1047       result = result->next;
1048
1049   /* Prepare new message set.  */
1050   result = (struct set_list *) xcalloc (1, sizeof (*result));
1051   result->number = number;
1052   result->next = current->all_sets;
1053   current->all_sets = result;
1054
1055   return result;
1056 }
1057
1058
/* Rewrite STRING *in*place*: remove an optional leading QUOTE_CHAR, cut
   the text at the first QUOTE_CHAR which is not escaped and replace the
   escape sequences introduced by ESCAPE_CHAR.

   Recognized sequences are n, t, v, b, r, f, the doubled escape
   character, an escaped quote character (an extension to XPG) and
   octal numbers.  The byte denoted by an octal number is converted to
   a wide character through CD.  For any other sequence only the escape
   character is dropped.  FNAME and LINE are used for the diagnostics
   about invalid octal sequences and unterminated quoted messages.  */
static void
normalize_line (const char *fname, size_t line, iconv_t cd, wchar_t *string,
                wchar_t quote_char, wchar_t escape_char)
{
  const wchar_t *src = string;
  wchar_t *dst = string;
  int quoted = 0;

  /* A message which starts with the quote character must also end
     with one.  */
  if (quote_char != L'\0' && *src == quote_char)
    {
      quoted = 1;
      ++src;
    }

  /* The first quote character which is not escaped ends the string.  */
  while (*src != L'\0' && *src != quote_char)
    {
      wchar_t simple;

      if (*src != escape_char)
        {
          /* Ordinary character.  */
          *dst++ = *src++;
          continue;
        }

      /* Step over the escape character itself.  */
      ++src;

      if (quote_char != L'\0' && *src == quote_char)
        {
          /* Escaped quote character.  This is an extension to XPG.  */
          *dst++ = *src++;
          continue;
        }

      if (*src >= L'0' && *src <= L'7')
        {
          /* Octal number.  Collect digits as long as the value still
             fits into one byte, then let iconv translate this byte
             (plus the terminating NUL) into a wide character.  */
          char mb[2];
          wchar_t wc[2];
          char *inptr = mb;
          char *outptr = (char *) wc;
          size_t inleft = sizeof (mb);
          size_t outleft = sizeof (wc);
          int value = *src++ - L'0';

          while (value <= (255 / 8) && *src >= L'0' && *src <= L'7')
            value = value * 8 + (*src++ - L'0');

          mb[0] = (char) value;
          mb[1] = '\0';

          /* Bring the descriptor into the initial state before using
             it for the conversion.  */
          iconv (cd, NULL, NULL, NULL, NULL);
          iconv (cd, &inptr, &inleft, &outptr, &outleft);

          /* Only if both bytes were consumed and exactly two wide
             characters were produced the conversion was successful.  */
          if (inptr == &mb[2] && (wchar_t *) outptr == &wc[2])
            *dst++ = wc[0];
          else
            error_at_line (0, 0, fname, line,
                           gettext ("invalid escape sequence"));
          continue;
        }

      /* The remaining sequences each produce a single character.  */
      switch (*src)
        {
        case L'n':
          simple = L'\n';
          break;
        case L't':
          simple = L'\t';
          break;
        case L'v':
          simple = L'\v';
          break;
        case L'b':
          simple = L'\b';
          break;
        case L'r':
          simple = L'\r';
          break;
        case L'f':
          simple = L'\f';
          break;
        default:
          if (*src != escape_char)
            /* Unknown sequence: only the escape character is ignored,
               the following character is handled in the next round.  */
            continue;
          simple = escape_char;
          break;
        }

      *dst++ = simple;
      ++src;
    }

  /* This test must come before the string gets terminated since SRC
     and DST might point to the same element.  */
  if (quoted && *src != quote_char)
    error_at_line (0, 0, fname, line, gettext ("unterminated message"));

  *dst = L'\0';
}
1175
1176
1177 static void
1178 read_old (struct catalog *catalog, const char *file_name)
1179 {
1180   struct catalog_info old_cat_obj;
1181   struct set_list *set = NULL;
1182   int last_set = -1;
1183   size_t cnt;
1184
1185   old_cat_obj.status = closed;
1186   old_cat_obj.cat_name = file_name;
1187   old_cat_obj.nlspath = NULL;
1188   __libc_lock_init (old_cat_obj.lock);
1189
1190   /* Try to open catalog, but don't look through the NLSPATH.  */
1191   __open_catalog (&old_cat_obj);
1192
1193   if (old_cat_obj.status != mmapped && old_cat_obj.status != malloced)
1194     {
1195       if (errno == ENOENT)
1196         /* No problem, the catalog simply does not exist.  */
1197         return;
1198       else
1199         error (EXIT_FAILURE, errno, gettext ("while opening old catalog file"));
1200     }
1201
1202   /* OK, we have the catalog loaded.  Now read all messages and merge
1203      them.  When set and message number clash for any message the new
1204      one is used.  If the new one is empty it indicates that the
1205      message should be deleted.  */
1206   for (cnt = 0; cnt < old_cat_obj.plane_size * old_cat_obj.plane_depth; ++cnt)
1207     {
1208       struct message_list *message, *last;
1209
1210       if (old_cat_obj.name_ptr[cnt * 3 + 0] == 0)
1211         /* No message in this slot.  */
1212         continue;
1213
1214       if (old_cat_obj.name_ptr[cnt * 3 + 0] - 1 != (uint32_t) last_set)
1215         {
1216           last_set = old_cat_obj.name_ptr[cnt * 3 + 0] - 1;
1217           set = find_set (catalog, old_cat_obj.name_ptr[cnt * 3 + 0] - 1);
1218         }
1219
1220       last = NULL;
1221       message = set->messages;
1222       while (message != NULL)
1223         {
1224           if ((uint32_t) message->number >= old_cat_obj.name_ptr[cnt * 3 + 1])
1225             break;
1226           last = message;
1227           message = message->next;
1228         }
1229
1230       if (message == NULL
1231           || (uint32_t) message->number > old_cat_obj.name_ptr[cnt * 3 + 1])
1232         {
1233           /* We have found a message which is not yet in the catalog.
1234              Insert it at the right position.  */
1235           struct message_list *newp;
1236
1237           newp = (struct message_list *) xmalloc (sizeof(*newp));
1238           newp->number = old_cat_obj.name_ptr[cnt * 3 + 1];
1239           newp->message =
1240             &old_cat_obj.strings[old_cat_obj.name_ptr[cnt * 3 + 2]];
1241           newp->fname = NULL;
1242           newp->line = 0;
1243           newp->symbol = NULL;
1244           newp->next = message;
1245
1246           if (last == NULL)
1247             set->messages = newp;
1248           else
1249             last->next = newp;
1250
1251           ++catalog->total_messages;
1252         }
1253       else if (*message->message == '\0')
1254         {
1255           /* The new empty message has overridden the old one thus
1256              "deleting" it as required.  Now remove the empty remains. */
1257           if (last == NULL)
1258             set->messages = message->next;
1259           else
1260             last->next = message->next;
1261         }
1262     }
1263 }
1264
1265
/* Open the conversion descriptors needed to process an input file
   written in CODESET and determine the escape character to use.

   If CODESET is NULL the codeset of the locale selected through the
   environment is used.  On success 0 is returned, *CD_TOWCP converts
   from CODESET to "WCHAR_T", *CD_TOMBP converts from "WCHAR_T" back to
   CODESET and *ESCAPE_CHARP is the wide character the byte 0x5c is
   mapped to (L'\\' if this cannot be determined).  If one of the
   descriptors cannot be opened a message is printed, a descriptor
   which could be opened is closed again and 1 is returned.  */
static int
open_conversion (const char *codeset, iconv_t *cd_towcp, iconv_t *cd_tombp,
                 wchar_t *escape_charp)
{
  char buf[2];
  char *bufptr;
  size_t bufsize;
  wchar_t wbuf[2];
  char *wbufptr;
  size_t wbufsize;
  char *codeset_copy = NULL;

  /* If the input file does not specify the codeset use the locale's.  */
  if (codeset == NULL)
    {
      const char *locale_codeset;
      size_t len;

      setlocale (LC_ALL, "");
      locale_codeset = nl_langinfo (CODESET);

      /* The string returned by nl_langinfo may be invalidated or
         overwritten by the following setlocale call.  Therefore keep a
         private copy of the name.  */
      len = strlen (locale_codeset) + 1;
      codeset_copy = (char *) malloc (len);
      if (codeset_copy == NULL)
        error (EXIT_FAILURE, errno, gettext ("memory exhausted"));
      memcpy (codeset_copy, locale_codeset, len);

      setlocale (LC_ALL, "C");
      codeset = codeset_copy;
    }

  /* Get the conversion modules.  */
  *cd_towcp = iconv_open ("WCHAR_T", codeset);
  *cd_tombp = iconv_open (codeset, "WCHAR_T");

  /* The name is not needed anymore.  */
  free (codeset_copy);

  if (*cd_towcp == (iconv_t) -1 || *cd_tombp == (iconv_t) -1)
    {
      error (0, 0, gettext ("conversion modules not available"));

      /* Release whichever of the two descriptors is usable.  */
      if (*cd_towcp != (iconv_t) -1)
        iconv_close (*cd_towcp);
      if (*cd_tombp != (iconv_t) -1)
        iconv_close (*cd_tombp);

      return 1;
    }

  /* One special case for historical reasons is the backslash
     character.  In some codesets the byte value 0x5c is not mapped to
     U005c in Unicode.  These charsets then don't have a backslash
     character at all.  Therefore we have to live with whatever the
     codeset provides and recognize, instead of the U005c, the character
     the byte value 0x5c is mapped to.  */
  buf[0] = '\\';
  buf[1] = '\0';
  bufptr = buf;
  bufsize = 2;

  wbufptr = (char *) wbuf;
  wbufsize = sizeof (wbuf);

  /* Convert the byte together with the terminating NUL.  The
     conversion worked if the input is used up and the output buffer is
     completely filled.  */
  iconv (*cd_towcp, &bufptr, &bufsize, &wbufptr, &wbufsize);
  if (bufsize != 0 || wbufsize != 0)
    {
      /* Something went wrong, we couldn't convert the byte 0x5c.  Go
         on with using U005c.  */
      error (0, 0, gettext ("cannot determine escape character"));
      *escape_charp = L'\\';
    }
  else
    *escape_charp = wbuf[0];

  return 0;
}