From a71539e62830927957ddea3ddb7615f68ffcc55b Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Sun, 21 Mar 2021 14:00:26 -0700 Subject: [PATCH] ptx: remove use of diacrit module MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The diacrit module is obsolete, and ptx’s use of it is obsolete too; it assumes an 8-bit locale (not that common these days) and that TeX cannot process the 8-bit characters (nowadays, it can). * NEWS, doc/coreutils.texi (Charset selection in ptx): Document this. * bootstrap.conf (gnulib_modules): Remove diacrit. * src/ptx.c: Do not include diacrit.h. (print_field, fix_output_parameters): Remove obsolete support for 8-bit diacritics. --- NEWS | 4 ++ bootstrap.conf | 1 - doc/coreutils.texi | 17 ++--- src/ptx.c | 168 +++++++++------------------------------------ 4 files changed, 45 insertions(+), 145 deletions(-) diff --git a/NEWS b/NEWS index 89f60d28fe..97cb4bd647 100644 --- a/NEWS +++ b/NEWS @@ -56,6 +56,10 @@ GNU coreutils NEWS -*- outline -*- directory merely because it was removed. This reverts a change that was made in release 8.32. + ptx -T no longer attempts to substitute old-fashioned TeX escapes + for 8-bit non-ASCII alphabetic characters. TeX indexes should + instead use '\usepackage[latin1]{inputenc}' or equivalent. + ** New Features expr and factor now support bignums on all platforms. diff --git a/bootstrap.conf b/bootstrap.conf index 63435fc641..ab6b3ef0c8 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -64,7 +64,6 @@ gnulib_modules=" d-ino d-type di-set - diacrit dirfd dirname do-release-commit-and-tag diff --git a/doc/coreutils.texi b/doc/coreutils.texi index af8a02eaa4..ac0b4467d8 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -5481,18 +5481,19 @@ processing. @node Charset selection in ptx @subsection Charset selection -@c FIXME: People don't necessarily know what an IBM-PC was these days. 
-As it is set up now, the program assumes that the input file is coded -using 8-bit ISO 8859-1 code, also known as Latin-1 character set, -@emph{unless} it is compiled for MS-DOS, in which case it uses the -character set of the IBM-PC@. (GNU @command{ptx} is not known to work on -smaller MS-DOS machines anymore.) Compared to 7-bit ASCII, the set -of characters which are letters is different; this alters the behavior -of regular expression matching. Thus, the default regular expression +As it is set up now, @command{ptx} assumes that the input file is coded +using 8-bit characters, and it may not work well in multibyte locales. +In a single-byte locale, the default regular expression for a keyword allows foreign or diacriticized letters. Keyword sorting, however, is still crude; it obeys the underlying character set ordering quite blindly. +The output of @command{ptx} assumes the locale's character encoding. +For example, with @command{ptx}'s @option{-T} option, if the locale +uses the Latin-1 encoding you may need a LaTeX directive like +@samp{\usepackage[latin1]@{inputenc@}} to render non-ASCII characters +correctly. + @table @samp @item -f diff --git a/src/ptx.c b/src/ptx.c index 4864fdc751..5d3c2c087b 100644 --- a/src/ptx.c +++ b/src/ptx.c @@ -25,7 +25,6 @@ #include "die.h" #include <regex.h> #include "argmatch.h" -#include "diacrit.h" #include "error.h" #include "fadvise.h" #include "quote.h" @@ -1033,8 +1032,6 @@ static void print_field (BLOCK field) { char *cursor; /* Cursor in field to print */ - int base; /* Base character, without diacritic */ - int diacritic; /* Diacritic code for the character */ /* Whitespace is not really compressed. Instead, each white space character (tab, vt, ht etc.) is printed as one single space. */ @@ -1044,140 +1041,44 @@ print_field (BLOCK field) unsigned char character = *cursor; if (edited_flag[character]) { + /* Handle cases which are specific to 'roff' or TeX. 
All + white space processing is done as the default case of + this switch. */ - /* First check if this is a diacriticized character. - - This works only for TeX. I do not know how diacriticized - letters work with 'roff'. Please someone explain it to me! */ - - diacritic = todiac (character); - if (diacritic != 0 && output_format == TEX_FORMAT) + switch (character) { - base = tobase (character); - switch (diacritic) - { - - case 1: /* Latin diphthongs */ - switch (base) - { - case 'o': - fputs ("\\oe{}", stdout); - break; - - case 'O': - fputs ("\\OE{}", stdout); - break; - - case 'a': - fputs ("\\ae{}", stdout); - break; - - case 'A': - fputs ("\\AE{}", stdout); - break; - - default: - putchar (' '); - } - break; - - case 2: /* Acute accent */ - printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base); - break; - - case 3: /* Grave accent */ - printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base); - break; - - case 4: /* Circumflex accent */ - printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base); - break; - - case 5: /* Diaeresis */ - printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base); - break; - - case 6: /* Tilde accent */ - printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base); - break; - - case 7: /* Cedilla */ - printf ("\\c{%c}", base); - break; - - case 8: /* Small circle beneath */ - switch (base) - { - case 'a': - fputs ("\\aa{}", stdout); - break; - - case 'A': - fputs ("\\AA{}", stdout); - break; - - default: - putchar (' '); - } - break; - - case 9: /* Strike through */ - switch (base) - { - case 'o': - fputs ("\\o{}", stdout); - break; - - case 'O': - fputs ("\\O{}", stdout); - break; - - default: - putchar (' '); - } - break; - } - } - else - - /* This is not a diacritic character, so handle cases which are - really specific to 'roff' or TeX. All white space processing - is done as the default case of this switch. */ - - switch (character) - { - case '"': - /* In roff output format, double any quote. 
*/ - putchar ('"'); - putchar ('"'); - break; + case '"': + /* In roff output format, double any quote. */ + putchar ('"'); + putchar ('"'); + break; - case '$': - case '%': - case '&': - case '#': - case '_': - /* In TeX output format, precede these with a backslash. */ - putchar ('\\'); - putchar (character); - break; + case '$': + case '%': + case '&': + case '#': + case '_': + /* In TeX output format, precede these with a backslash. */ + putchar ('\\'); + putchar (character); + break; - case '{': - case '}': - /* In TeX output format, precede these with a backslash and - force mathematical mode. */ - printf ("$\\%c$", character); - break; + case '{': + case '}': + /* In TeX output format, precede these with a backslash and + force mathematical mode. */ + printf ("$\\%c$", character); + break; - case '\\': - /* In TeX output mode, request production of a backslash. */ - fputs ("\\backslash{}", stdout); - break; + case '\\': + /* In TeX output mode, request production of a backslash. */ + fputs ("\\backslash{}", stdout); + break; - default: - /* Any other flagged character produces a single space. */ - putchar (' '); - } + default: + /* Any other flagged character produces a single space. */ + putchar (' '); + } } else putchar (*cursor); @@ -1331,11 +1232,6 @@ fix_output_parameters (void) for (cursor = "$%&#_{}\\"; *cursor; cursor++) edited_flag[to_uchar (*cursor)] = 1; - /* Any character with 8th bit set will print to a single space, unless - it is diacriticized. */ - - for (character = 0200; character < CHAR_SET_SIZE; character++) - edited_flag[character] = todiac (character) != 0; break; } } -- 2.47.2