From a71539e62830927957ddea3ddb7615f68ffcc55b Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Sun, 21 Mar 2021 14:00:26 -0700
Subject: [PATCH] ptx: remove use of diacrit module
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The diacrit module is obsolete, and ptx's use of it is obsolete
too; it assumes an 8-bit locale (not that common these days) and
that TeX cannot process the 8-bit characters (nowadays, it can).
* NEWS, doc/coreutils.texi (Charset selection in ptx): Document this.
* bootstrap.conf (gnulib_modules): Remove diacrit.
* src/ptx.c: Do not include diacrit.h.
(print_field, fix_output_parameters): Remove obsolete support
for 8-bit diacritics.
---
 NEWS               |   4 ++
 bootstrap.conf     |   1 -
 doc/coreutils.texi |  17 ++---
 src/ptx.c          | 168 +++++++++------------------------------------
 4 files changed, 45 insertions(+), 145 deletions(-)

diff --git a/NEWS b/NEWS
index 89f60d28fe..97cb4bd647 100644
--- a/NEWS
+++ b/NEWS
@@ -56,6 +56,10 @@ GNU coreutils NEWS                                    -*- outline -*-
   directory merely because it was removed.  This reverts a change
   that was made in release 8.32.
 
+  ptx -T no longer attempts to substitute old-fashioned TeX escapes
+  for 8-bit non-ASCII alphabetic characters.  TeX indexes should
+  instead use '\usepackage[latin1]{inputenc}' or equivalent.
+
 ** New Features
 
   expr and factor now support bignums on all platforms.
diff --git a/bootstrap.conf b/bootstrap.conf
index 63435fc641..ab6b3ef0c8 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -64,7 +64,6 @@ gnulib_modules="
   d-ino
   d-type
   di-set
-  diacrit
   dirfd
   dirname
   do-release-commit-and-tag
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index af8a02eaa4..ac0b4467d8 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -5481,18 +5481,19 @@ processing.
 @node Charset selection in ptx
 @subsection Charset selection
 
-@c FIXME:  People don't necessarily know what an IBM-PC was these days.
-As it is set up now, the program assumes that the input file is coded
-using 8-bit ISO 8859-1 code, also known as Latin-1 character set,
-@emph{unless} it is compiled for MS-DOS, in which case it uses the
-character set of the IBM-PC@.  (GNU @command{ptx} is not known to work on
-smaller MS-DOS machines anymore.)  Compared to 7-bit ASCII, the set
-of characters which are letters is different; this alters the behavior
-of regular expression matching.  Thus, the default regular expression
+As it is set up now, @command{ptx} assumes that the input file is coded
+using 8-bit characters, and it may not work well in multibyte locales.
+In a single-byte locale, the default regular expression
 for a keyword allows foreign or diacriticized letters.  Keyword sorting,
 however, is still crude; it obeys the underlying character set ordering
 quite blindly.
 
+The output of @command{ptx} assumes the locale's character encoding.
+For example, with @command{ptx}'s @option{-T} option, if the locale
+uses the Latin-1 encoding you may need a LaTeX directive like
+@samp{\usepackage[latin1]@{inputenc@}} to render non-ASCII characters
+correctly.
+
 @table @samp
 
 @item -f
diff --git a/src/ptx.c b/src/ptx.c
index 4864fdc751..5d3c2c087b 100644
--- a/src/ptx.c
+++ b/src/ptx.c
@@ -25,7 +25,6 @@
 #include "die.h"
 #include <regex.h>
 #include "argmatch.h"
-#include "diacrit.h"
 #include "error.h"
 #include "fadvise.h"
 #include "quote.h"
@@ -1033,8 +1032,6 @@ static void
 print_field (BLOCK field)
 {
   char *cursor;			/* Cursor in field to print */
-  int base;			/* Base character, without diacritic */
-  int diacritic;		/* Diacritic code for the character */
 
   /* Whitespace is not really compressed.  Instead, each white space
      character (tab, vt, ht etc.) is printed as one single space.  */
@@ -1044,140 +1041,44 @@ print_field (BLOCK field)
       unsigned char character = *cursor;
       if (edited_flag[character])
         {
+          /* Handle cases which are specific to 'roff' or TeX.  All
+             white space processing is done as the default case of
+             this switch.  */
 
-          /* First check if this is a diacriticized character.
-
-             This works only for TeX.  I do not know how diacriticized
-             letters work with 'roff'.  Please someone explain it to me!  */
-
-          diacritic = todiac (character);
-          if (diacritic != 0 && output_format == TEX_FORMAT)
+          switch (character)
             {
-              base = tobase (character);
-              switch (diacritic)
-                {
-
-                case 1:		/* Latin diphthongs */
-                  switch (base)
-                    {
-                    case 'o':
-                      fputs ("\\oe{}", stdout);
-                      break;
-
-                    case 'O':
-                      fputs ("\\OE{}", stdout);
-                      break;
-
-                    case 'a':
-                      fputs ("\\ae{}", stdout);
-                      break;
-
-                    case 'A':
-                      fputs ("\\AE{}", stdout);
-                      break;
-
-                    default:
-                      putchar (' ');
-                    }
-                  break;
-
-                case 2:		/* Acute accent */
-                  printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
-                  break;
-
-                case 3:		/* Grave accent */
-                  printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
-                  break;
-
-                case 4:		/* Circumflex accent */
-                  printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
-                  break;
-
-                case 5:		/* Diaeresis */
-                  printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
-                  break;
-
-                case 6:		/* Tilde accent */
-                  printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
-                  break;
-
-                case 7:		/* Cedilla */
-                  printf ("\\c{%c}", base);
-                  break;
-
-                case 8:		/* Small circle beneath */
-                  switch (base)
-                    {
-                    case 'a':
-                      fputs ("\\aa{}", stdout);
-                      break;
-
-                    case 'A':
-                      fputs ("\\AA{}", stdout);
-                      break;
-
-                    default:
-                      putchar (' ');
-                    }
-                  break;
-
-                case 9:		/* Strike through */
-                  switch (base)
-                    {
-                    case 'o':
-                      fputs ("\\o{}", stdout);
-                      break;
-
-                    case 'O':
-                      fputs ("\\O{}", stdout);
-                      break;
-
-                    default:
-                      putchar (' ');
-                    }
-                  break;
-                }
-            }
-          else
-
-            /* This is not a diacritic character, so handle cases which are
-               really specific to 'roff' or TeX.  All white space processing
-               is done as the default case of this switch.  */
-
-            switch (character)
-              {
-              case '"':
-                /* In roff output format, double any quote.  */
-                putchar ('"');
-                putchar ('"');
-                break;
+            case '"':
+              /* In roff output format, double any quote.  */
+              putchar ('"');
+              putchar ('"');
+              break;
 
-              case '$':
-              case '%':
-              case '&':
-              case '#':
-              case '_':
-                /* In TeX output format, precede these with a backslash.  */
-                putchar ('\\');
-                putchar (character);
-                break;
+            case '$':
+            case '%':
+            case '&':
+            case '#':
+            case '_':
+              /* In TeX output format, precede these with a backslash.  */
+              putchar ('\\');
+              putchar (character);
+              break;
 
-              case '{':
-              case '}':
-                /* In TeX output format, precede these with a backslash and
-                   force mathematical mode.  */
-                printf ("$\\%c$", character);
-                break;
+            case '{':
+            case '}':
+              /* In TeX output format, precede these with a backslash and
+                 force mathematical mode.  */
+              printf ("$\\%c$", character);
+              break;
 
-              case '\\':
-                /* In TeX output mode, request production of a backslash.  */
-                fputs ("\\backslash{}", stdout);
-                break;
+            case '\\':
+              /* In TeX output mode, request production of a backslash.  */
+              fputs ("\\backslash{}", stdout);
+              break;
 
-              default:
-                /* Any other flagged character produces a single space.  */
-                putchar (' ');
-              }
+            default:
+              /* Any other flagged character produces a single space.  */
+              putchar (' ');
+            }
         }
       else
         putchar (*cursor);
@@ -1331,11 +1232,6 @@ fix_output_parameters (void)
       for (cursor = "$%&#_{}\\"; *cursor; cursor++)
         edited_flag[to_uchar (*cursor)] = 1;
 
-      /* Any character with 8th bit set will print to a single space, unless
-         it is diacriticized.  */
-
-      for (character = 0200; character < CHAR_SET_SIZE; character++)
-        edited_flag[character] = todiac (character) != 0;
       break;
     }
 }
-- 
2.47.2