@node Charset selection in ptx
@subsection Charset selection
-@c FIXME: People don't necessarily know what an IBM-PC was these days.
-As it is set up now, the program assumes that the input file is coded
-using 8-bit ISO 8859-1 code, also known as Latin-1 character set,
-@emph{unless} it is compiled for MS-DOS, in which case it uses the
-character set of the IBM-PC@. (GNU @command{ptx} is not known to work on
-smaller MS-DOS machines anymore.) Compared to 7-bit ASCII, the set
-of characters which are letters is different; this alters the behavior
-of regular expression matching. Thus, the default regular expression
+As it is set up now, @command{ptx} assumes that the input file is coded
+using 8-bit characters, and it may not work well in multibyte locales.
+In a single-byte locale, the default regular expression
for a keyword allows foreign or diacriticized letters. Keyword sorting,
however, is still crude; it obeys the underlying character set ordering
quite blindly.
+The output of @command{ptx} uses the locale's character encoding.
+For example, with @command{ptx}'s @option{-T} option, if the locale
+uses the Latin-1 encoding you may need a LaTeX directive like
+@samp{\usepackage[latin1]@{inputenc@}} to render non-ASCII characters
+correctly.
+
@table @samp
@item -f
#include "die.h"
#include <regex.h>
#include "argmatch.h"
-#include "diacrit.h"
#include "error.h"
#include "fadvise.h"
#include "quote.h"
print_field (BLOCK field)
{
char *cursor; /* Cursor in field to print */
- int base; /* Base character, without diacritic */
- int diacritic; /* Diacritic code for the character */
/* Whitespace is not really compressed. Instead, each white space
character (tab, vt, ht etc.) is printed as one single space. */
unsigned char character = *cursor;
if (edited_flag[character])
{
+ /* Handle cases which are specific to 'roff' or TeX. All
+ white space processing is done as the default case of
+ this switch. */
- /* First check if this is a diacriticized character.
-
- This works only for TeX. I do not know how diacriticized
- letters work with 'roff'. Please someone explain it to me! */
-
- diacritic = todiac (character);
- if (diacritic != 0 && output_format == TEX_FORMAT)
+ switch (character)
{
- base = tobase (character);
- switch (diacritic)
- {
-
- case 1: /* Latin diphthongs */
- switch (base)
- {
- case 'o':
- fputs ("\\oe{}", stdout);
- break;
-
- case 'O':
- fputs ("\\OE{}", stdout);
- break;
-
- case 'a':
- fputs ("\\ae{}", stdout);
- break;
-
- case 'A':
- fputs ("\\AE{}", stdout);
- break;
-
- default:
- putchar (' ');
- }
- break;
-
- case 2: /* Acute accent */
- printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
- break;
-
- case 3: /* Grave accent */
- printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
- break;
-
- case 4: /* Circumflex accent */
- printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
- break;
-
- case 5: /* Diaeresis */
- printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
- break;
-
- case 6: /* Tilde accent */
- printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
- break;
-
- case 7: /* Cedilla */
- printf ("\\c{%c}", base);
- break;
-
- case 8: /* Small circle beneath */
- switch (base)
- {
- case 'a':
- fputs ("\\aa{}", stdout);
- break;
-
- case 'A':
- fputs ("\\AA{}", stdout);
- break;
-
- default:
- putchar (' ');
- }
- break;
-
- case 9: /* Strike through */
- switch (base)
- {
- case 'o':
- fputs ("\\o{}", stdout);
- break;
-
- case 'O':
- fputs ("\\O{}", stdout);
- break;
-
- default:
- putchar (' ');
- }
- break;
- }
- }
- else
-
- /* This is not a diacritic character, so handle cases which are
- really specific to 'roff' or TeX. All white space processing
- is done as the default case of this switch. */
-
- switch (character)
- {
- case '"':
- /* In roff output format, double any quote. */
- putchar ('"');
- putchar ('"');
- break;
+ case '"':
+ /* In roff output format, double any quote. */
+ putchar ('"');
+ putchar ('"');
+ break;
- case '$':
- case '%':
- case '&':
- case '#':
- case '_':
- /* In TeX output format, precede these with a backslash. */
- putchar ('\\');
- putchar (character);
- break;
+ case '$':
+ case '%':
+ case '&':
+ case '#':
+ case '_':
+ /* In TeX output format, precede these with a backslash. */
+ putchar ('\\');
+ putchar (character);
+ break;
- case '{':
- case '}':
- /* In TeX output format, precede these with a backslash and
- force mathematical mode. */
- printf ("$\\%c$", character);
- break;
+ case '{':
+ case '}':
+ /* In TeX output format, precede these with a backslash and
+ force mathematical mode. */
+ printf ("$\\%c$", character);
+ break;
- case '\\':
- /* In TeX output mode, request production of a backslash. */
- fputs ("\\backslash{}", stdout);
- break;
+ case '\\':
+ /* In TeX output mode, request production of a backslash. */
+ fputs ("\\backslash{}", stdout);
+ break;
- default:
- /* Any other flagged character produces a single space. */
- putchar (' ');
- }
+ default:
+ /* Any other flagged character produces a single space. */
+ putchar (' ');
+ }
}
else
putchar (*cursor);
for (cursor = "$%&#_{}\\"; *cursor; cursor++)
edited_flag[to_uchar (*cursor)] = 1;
- /* Any character with 8th bit set will print to a single space, unless
- it is diacriticized. */
-
- for (character = 0200; character < CHAR_SET_SIZE; character++)
- edited_flag[character] = todiac (character) != 0;
break;
}
}