ptx: remove use of diacrit module

author Paul Eggert <eggert@cs.ucla.edu>

Sun, 21 Mar 2021 21:00:26 +0000 (14:00 -0700)

committer Paul Eggert <eggert@cs.ucla.edu>

Sun, 21 Mar 2021 21:01:15 +0000 (14:01 -0700)
author Paul Eggert <eggert@cs.ucla.edu>
Sun, 21 Mar 2021 21:00:26 +0000 (14:00 -0700)
committer Paul Eggert <eggert@cs.ucla.edu>
Sun, 21 Mar 2021 21:01:15 +0000 (14:01 -0700)
diff --git a/NEWS b/NEWS

index 89f60d28fed0625a0bf29bd5ffc3fa835ce2cd0a..97cb4bd64706475aa62d1b2c93e8184cbd915755 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -56,6 +56,10 @@ GNU coreutils NEWS                                    -*- outline -*-
    directory merely because it was removed.  This reverts a change
    that was made in release 8.32.
  
+  ptx -T no longer attempts to substitute old-fashioned TeX escapes
+  for 8-bit non-ASCII alphabetic characters.  TeX indexes should
+  instead use '\usepackage[latin1]{inputenc}' or equivalent.
+
  ** New Features
  
    expr and factor now support bignums on all platforms.
diff --git a/bootstrap.conf b/bootstrap.conf

index 63435fc641a8b781f3d53a341d8c0ceb71a63e23..ab6b3ef0c8e8a88e3e76602596aa8ef453af67f9 100644 (file)
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -64,7 +64,6 @@ gnulib_modules="
    d-ino
    d-type
    di-set
-  diacrit
    dirfd
    dirname
    do-release-commit-and-tag
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index af8a02eaa46dfc632fd5d84ea7b3fa5e3b1ece7f..ac0b4467d84e0308d209d0e15a401f5f95419be1 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -5481,18 +5481,19 @@ processing.
  @node Charset selection in ptx
  @subsection Charset selection
  
-@c FIXME:  People don't necessarily know what an IBM-PC was these days.
-As it is set up now, the program assumes that the input file is coded
-using 8-bit ISO 8859-1 code, also known as Latin-1 character set,
-@emph{unless} it is compiled for MS-DOS, in which case it uses the
-character set of the IBM-PC@.  (GNU @command{ptx} is not known to work on
-smaller MS-DOS machines anymore.)  Compared to 7-bit ASCII, the set
-of characters which are letters is different; this alters the behavior
-of regular expression matching.  Thus, the default regular expression
+As it is set up now, @command{ptx} assumes that the input file is coded
+using 8-bit characters, and it may not work well in multibyte locales.
+In a single-byte locale, the default regular expression
  for a keyword allows foreign or diacriticized letters.  Keyword sorting,
  however, is still crude; it obeys the underlying character set ordering
  quite blindly.
  
+The output of @command{ptx} assumes the locale's character encoding.
+For example, with @command{ptx}'s @option{-T} option, if the locale
+uses the Latin-1 encoding you may need a LaTeX directive like
+@samp{\usepackage[latin1]@{inputenc@}} to render non-ASCII characters
+correctly.
+
  @table @samp
  
  @item -f
diff --git a/src/ptx.c b/src/ptx.c

index 4864fdc75137b5d25310fc39d0793e8bafa6032f..5d3c2c087b95815adefc8bed7ffab4a2b4138c4a 100644 (file)
--- a/src/ptx.c
+++ b/src/ptx.c
@@ -25,7 +25,6 @@
  #include "die.h"
  #include <regex.h>
  #include "argmatch.h"
-#include "diacrit.h"
  #include "error.h"
  #include "fadvise.h"
  #include "quote.h"
@@ -1033,8 +1032,6 @@ static void
  print_field (BLOCK field)
  {
    char *cursor;                        /* Cursor in field to print */
-  int base;                    /* Base character, without diacritic */
-  int diacritic;               /* Diacritic code for the character */
  
    /* Whitespace is not really compressed.  Instead, each white space
       character (tab, vt, ht etc.) is printed as one single space.  */
@@ -1044,140 +1041,44 @@ print_field (BLOCK field)
        unsigned char character = *cursor;
        if (edited_flag[character])
          {
+          /* Handle cases which are specific to 'roff' or TeX.  All
+             white space processing is done as the default case of
+             this switch.  */
  
-          /* First check if this is a diacriticized character.
-
-             This works only for TeX.  I do not know how diacriticized
-             letters work with 'roff'.  Please someone explain it to me!  */
-
-          diacritic = todiac (character);
-          if (diacritic != 0 && output_format == TEX_FORMAT)
+          switch (character)
              {
-              base = tobase (character);
-              switch (diacritic)
-                {
-
-                case 1:                /* Latin diphthongs */
-                  switch (base)
-                    {
-                    case 'o':
-                      fputs ("\\oe{}", stdout);
-                      break;
-
-                    case 'O':
-                      fputs ("\\OE{}", stdout);
-                      break;
-
-                    case 'a':
-                      fputs ("\\ae{}", stdout);
-                      break;
-
-                    case 'A':
-                      fputs ("\\AE{}", stdout);
-                      break;
-
-                    default:
-                      putchar (' ');
-                    }
-                  break;
-
-                case 2:                /* Acute accent */
-                  printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
-                  break;
-
-                case 3:                /* Grave accent */
-                  printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
-                  break;
-
-                case 4:                /* Circumflex accent */
-                  printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
-                  break;
-
-                case 5:                /* Diaeresis */
-                  printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
-                  break;
-
-                case 6:                /* Tilde accent */
-                  printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
-                  break;
-
-                case 7:                /* Cedilla */
-                  printf ("\\c{%c}", base);
-                  break;
-
-                case 8:                /* Small circle beneath */
-                  switch (base)
-                    {
-                    case 'a':
-                      fputs ("\\aa{}", stdout);
-                      break;
-
-                    case 'A':
-                      fputs ("\\AA{}", stdout);
-                      break;
-
-                    default:
-                      putchar (' ');
-                    }
-                  break;
-
-                case 9:                /* Strike through */
-                  switch (base)
-                    {
-                    case 'o':
-                      fputs ("\\o{}", stdout);
-                      break;
-
-                    case 'O':
-                      fputs ("\\O{}", stdout);
-                      break;
-
-                    default:
-                      putchar (' ');
-                    }
-                  break;
-                }
-            }
-          else
-
-            /* This is not a diacritic character, so handle cases which are
-               really specific to 'roff' or TeX.  All white space processing
-               is done as the default case of this switch.  */
-
-            switch (character)
-              {
-              case '"':
-                /* In roff output format, double any quote.  */
-                putchar ('"');
-                putchar ('"');
-                break;
+            case '"':
+              /* In roff output format, double any quote.  */
+              putchar ('"');
+              putchar ('"');
+              break;
  
-              case '$':
-              case '%':
-              case '&':
-              case '#':
-              case '_':
-                /* In TeX output format, precede these with a backslash.  */
-                putchar ('\\');
-                putchar (character);
-                break;
+            case '$':
+            case '%':
+            case '&':
+            case '#':
+            case '_':
+              /* In TeX output format, precede these with a backslash.  */
+              putchar ('\\');
+              putchar (character);
+              break;
  
-              case '{':
-              case '}':
-                /* In TeX output format, precede these with a backslash and
-                   force mathematical mode.  */
-                printf ("$\\%c$", character);
-                break;
+            case '{':
+            case '}':
+              /* In TeX output format, precede these with a backslash and
+                 force mathematical mode.  */
+              printf ("$\\%c$", character);
+              break;
  
-              case '\\':
-                /* In TeX output mode, request production of a backslash.  */
-                fputs ("\\backslash{}", stdout);
-                break;
+            case '\\':
+              /* In TeX output mode, request production of a backslash.  */
+              fputs ("\\backslash{}", stdout);
+              break;
  
-              default:
-                /* Any other flagged character produces a single space.  */
-                putchar (' ');
-              }
+            default:
+              /* Any other flagged character produces a single space.  */
+              putchar (' ');
+            }
          }
        else
          putchar (*cursor);
@@ -1331,11 +1232,6 @@ fix_output_parameters (void)
        for (cursor = "$%&#_{}\\"; *cursor; cursor++)
          edited_flag[to_uchar (*cursor)] = 1;
  
-      /* Any character with 8th bit set will print to a single space, unless
-         it is diacriticized.  */
-
-      for (character = 0200; character < CHAR_SET_SIZE; character++)
-        edited_flag[character] = todiac (character) != 0;
        break;
      }
  }
author	Paul Eggert <eggert@cs.ucla.edu>
	Sun, 21 Mar 2021 21:00:26 +0000 (14:00 -0700)
committer	Paul Eggert <eggert@cs.ucla.edu>
	Sun, 21 Mar 2021 21:01:15 +0000 (14:01 -0700)
NEWS		patch | blob | blame | history
bootstrap.conf		patch | blob | blame | history
doc/coreutils.texi		patch | blob | blame | history
src/ptx.c		patch | blob | blame | history