readline/mbutil.c

   1 /* mbutil.c -- readline multibyte character utility functions */
   2
   3 /* Copyright (C) 2001-2017 Free Software Foundation, Inc.
   4
   5    This file is part of the GNU Readline Library (Readline), a library
   6    for reading lines of text with interactive input and history editing.
   7
   8    Readline is free software: you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation, either version 3 of the License, or
  11    (at your option) any later version.
  12
  13    Readline is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with Readline.  If not, see <http://www.gnu.org/licenses/>.
  20 */
  21
  22 #define READLINE_LIBRARY
  23
  24 #if defined (HAVE_CONFIG_H)
  25 #  include <config.h>
  26 #endif
  27
  28 #include <sys/types.h>
  29 #include <fcntl.h>
  30 #include "posixjmp.h"
  31
  32 #if defined (HAVE_UNISTD_H)
  33 #  include <unistd.h>      /* for _POSIX_VERSION */
  34 #endif /* HAVE_UNISTD_H */
  35
  36 #if defined (HAVE_STDLIB_H)
  37 #  include <stdlib.h>
  38 #else
  39 #  include "ansi_stdlib.h"
  40 #endif /* HAVE_STDLIB_H */
  41
  42 #include <stdio.h>
  43 #include <ctype.h>
  44
  45 /* System-specific feature definitions and include files. */
  46 #include "rldefs.h"
  47 #include "rlmbutil.h"
  48
  49 #if defined (TIOCSTAT_IN_SYS_IOCTL)
  50 #  include <sys/ioctl.h>
  51 #endif /* TIOCSTAT_IN_SYS_IOCTL */
  52
  53 /* Some standard library routines. */
  54 #include "readline.h"
  55
  56 #include "rlprivate.h"
  57 #include "xmalloc.h"
  58
  59 /* Defined here so it can be shared between the readline and history
  60    libraries.  Starts at 0 when HANDLE_MULTIBYTE is defined, 1 otherwise. */
  61 #if defined (HANDLE_MULTIBYTE)
  62 int rl_byte_oriented = 0;
  63 #else
  64 int rl_byte_oriented = 1;
  65 #endif
  66
  67 /* Also defined here for sharing; starts at 0.  When non-zero, routines in this file skip the mbrtowc/mbrlen call for bytes accepted by UTF8_SINGLEBYTE. */
  68 int _rl_utf8locale = 0;
  69
  70 /* **************************************************************** */
  71 /*                                                                  */
  72 /*              Multibyte Character Utility Functions               */
  73 /*                                                                  */
  74 /* **************************************************************** */
  75
  76 #if defined(HANDLE_MULTIBYTE)
  77
  78 /* **************************************************************** */
  79 /*                                                                  */
  80 /*              UTF-8 specific Character Utility Functions          */
  81 /*                                                                  */
  82 /* **************************************************************** */
  83
  84 /* Return the length in bytes of the possibly-multibyte character beginning
  85    at S. Encoding is UTF-8. */
  86 static int
  87 _rl_utf8_mblen (const char *s, size_t n)
  88 {
  89   unsigned char c, c1;
  90
  91   if (s == 0)
  92     return (0); /* no shift states */
  93   if (n <= 0)
  94     return (-1);
  95
  96   c = (unsigned char)*s;
  97   if (c < 0x80)
  98     return (c != 0);
  99   if (c >= 0xc2)
 100     {
 101       c1 = (unsigned char)s[1];
 102       if (c < 0xe0)
 103         {
 104           if (n >= 2 && (s[1] ^ 0x80) < 0x40)
 105             return 2;
 106         }
 107       else if (c < 0xf0)
 108         {
 109           if (n >= 3
 110                 && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
 111                 && (c >= 0xe1 || c1 >= 0xa0)
 112                 && (c != 0xed || c1 < 0xa0))
 113             return 3;
 114         }
 115       else if (c < 0xf8)
 116         {
 117           if (n >= 4
 118                 && (s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
 119                 && (s[3] ^ 0x80) < 0x40
 120                 && (c >= 0xf1 || c1 >= 0x90)
 121                 && (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
 122             return 4;
 123         }
 124     }
 125   /* invalid or incomplete multibyte character */
 126   return -1;
 127 }
 128
 129 static int
 130 _rl_find_next_mbchar_internal (char *string, int seed, int count, int find_non_zero)
 131 {
 132   size_t tmp, len;
 133   mbstate_t ps;
 134   int point;
 135   wchar_t wc;
 136
 137   tmp = 0;
 138
 139   memset(&ps, 0, sizeof (mbstate_t));
 140   if (seed < 0)
 141     seed = 0;
 142   if (count <= 0)
 143     return seed;
 144
 145   point = seed + _rl_adjust_point (string, seed, &ps);
 146   /* if _rl_adjust_point returns -1, the character or string is invalid.
 147      treat as a byte. */
 148   if (point == seed - 1)        /* invalid */
 149     return seed + 1;
 150
 151   /* if this is true, means that seed was not pointing to a byte indicating
 152      the beginning of a multibyte character.  Correct the point and consume
 153      one char. */
 154   if (seed < point)
 155     count--;
 156
 157   while (count > 0)
 158     {
 159       len = strlen (string + point);
 160       if (len == 0)
 161         break;
 162       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
 163         {
 164           tmp = 1;
 165           wc = (wchar_t) string[point];
 166           memset(&ps, 0, sizeof(mbstate_t));
 167         }
 168       else
 169         tmp = mbrtowc (&wc, string+point, len, &ps);
 170       if (MB_INVALIDCH ((size_t)tmp))
 171         {
 172           /* invalid bytes. assume a byte represents a character */
 173           point++;
 174           count--;
 175           /* reset states. */
 176           memset(&ps, 0, sizeof(mbstate_t));
 177         }
 178       else if (MB_NULLWCH (tmp))
 179         break;                  /* found wide '\0' */
 180       else
 181         {
 182           /* valid bytes */
 183           point += tmp;
 184           if (find_non_zero)
 185             {
 186               if (WCWIDTH (wc) == 0)
 187                 continue;
 188               else
 189                 count--;
 190             }
 191           else
 192             count--;
 193         }
 194     }
 195
 196   if (find_non_zero)
 197     {
 198       tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
 199       while (MB_NULLWCH (tmp) == 0 && MB_INVALIDCH (tmp) == 0 && WCWIDTH (wc) == 0)
 200         {
 201           point += tmp;
 202           tmp = mbrtowc (&wc, string + point, strlen (string + point), &ps);
 203         }
 204     }
 205
 206   return point;
 207 }
 208
 209 /*static*/ int
 210 _rl_find_prev_mbchar_internal (char *string, int seed, int find_non_zero)
 211 {
 212   mbstate_t ps;
 213   int prev, non_zero_prev, point, length;
 214   size_t tmp;
 215   wchar_t wc;
 216
 217   memset(&ps, 0, sizeof(mbstate_t));
 218   length = strlen(string);
 219
 220   if (seed < 0)
 221     return 0;
 222   else if (length < seed)
 223     return length;
 224
 225   prev = non_zero_prev = point = 0;
 226   while (point < seed)
 227     {
 228       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[point]))
 229         {
 230           tmp = 1;
 231           wc = (wchar_t) string[point];
 232           memset(&ps, 0, sizeof(mbstate_t));
 233         }
 234       else
 235         tmp = mbrtowc (&wc, string + point, length - point, &ps);
 236       if (MB_INVALIDCH ((size_t)tmp))
 237         {
 238           /* in this case, bytes are invalid or too short to compose
 239              multibyte char, so assume that the first byte represents
 240              a single character anyway. */
 241           tmp = 1;
 242           /* clear the state of the byte sequence, because
 243              in this case effect of mbstate is undefined  */
 244           memset(&ps, 0, sizeof (mbstate_t));
 245
 246           /* Since we're assuming that this byte represents a single
 247              non-zero-width character, don't forget about it. */
 248           prev = point;
 249         }
 250       else if (MB_NULLWCH (tmp))
 251         break;                  /* Found '\0' char.  Can this happen? */
 252       else
 253         {
 254           if (find_non_zero)
 255             {
 256               if (WCWIDTH (wc) != 0)
 257                 prev = point;
 258             }
 259           else
 260             prev = point;
 261         }
 262
 263       point += tmp;
 264     }
 265
 266   return prev;
 267 }
 268
 269 /* return the number of bytes parsed from the multibyte sequence starting
 270    at src, if a non-L'\0' wide character was recognized. It returns 0,
 271    if a L'\0' wide character was recognized. It  returns (size_t)(-1),
 272    if an invalid multibyte sequence was encountered. It returns (size_t)(-2)
 273    if it couldn't parse a complete  multibyte character.  */
 274 int
 275 _rl_get_char_len (char *src, mbstate_t *ps)
 276 {
 277   size_t tmp, l;
 278   int mb_cur_max;
 279
 280   /* Look at no more than MB_CUR_MAX characters */
 281   l = (size_t)strlen (src);
 282   if (_rl_utf8locale && l > 0 && UTF8_SINGLEBYTE(*src))
 283     tmp = (*src != 0) ? 1 : 0;
 284   else
 285     {
 286       mb_cur_max = MB_CUR_MAX;
 287       tmp = mbrlen((const char *)src, (l < mb_cur_max) ? l : mb_cur_max, ps);
 288     }
 289   if (tmp == (size_t)(-2))
 290     {
 291       /* too short to compose multibyte char */
 292       if (ps)
 293         memset (ps, 0, sizeof(mbstate_t));
 294       return -2;
 295     }
 296   else if (tmp == (size_t)(-1))
 297     {
 298       /* invalid to compose multibyte char */
 299       /* initialize the conversion state */
 300       if (ps)
 301         memset (ps, 0, sizeof(mbstate_t));
 302       return -1;
 303     }
 304   else if (tmp == (size_t)0)
 305     return 0;
 306   else
 307     return (int)tmp;
 308 }
 309
 310 /* compare the specified two characters. If the characters matched,
 311    return 1. Otherwise return 0. */
 312 int
 313 _rl_compare_chars (char *buf1, int pos1, mbstate_t *ps1, char *buf2, int pos2, mbstate_t *ps2)
 314 {
 315   int i, w1, w2;
 316
 317   if ((w1 = _rl_get_char_len (&buf1[pos1], ps1)) <= 0 ||
 318         (w2 = _rl_get_char_len (&buf2[pos2], ps2)) <= 0 ||
 319         (w1 != w2) ||
 320         (buf1[pos1] != buf2[pos2]))
 321     return 0;
 322
 323   for (i = 1; i < w1; i++)
 324     if (buf1[pos1+i] != buf2[pos2+i])
 325       return 0;
 326
 327   return 1;
 328 }
 329
 330 /* adjust pointed byte and find mbstate of the point of string.
 331    adjusted point will be point <= adjusted_point, and returns
 332    differences of the byte(adjusted_point - point).
 333    if point is invalid (point < 0 || more than string length),
 334    it returns -1 */
 335 int
 336 _rl_adjust_point (char *string, int point, mbstate_t *ps)
 337 {
 338   size_t tmp;
 339   int length, pos;
 340
 341   tmp = 0;
 342   pos = 0;
 343   length = strlen(string);
 344   if (point < 0)
 345     return -1;
 346   if (length < point)
 347     return -1;
 348
 349   while (pos < point)
 350     {
 351       if (_rl_utf8locale && UTF8_SINGLEBYTE(string[pos]))
 352         tmp = 1;
 353       else
 354         tmp = mbrlen (string + pos, length - pos, ps);
 355       if (MB_INVALIDCH ((size_t)tmp))
 356         {
 357           /* in this case, bytes are invalid or too short to compose
 358              multibyte char, so assume that the first byte represents
 359              a single character anyway. */
 360           pos++;
 361           /* clear the state of the byte sequence, because
 362              in this case effect of mbstate is undefined  */
 363           if (ps)
 364             memset (ps, 0, sizeof (mbstate_t));
 365         }
 366       else if (MB_NULLWCH (tmp))
 367         pos++;
 368       else
 369         pos += tmp;
 370     }
 371
 372   return (pos - point);
 373 }
 374
 375 int
 376 _rl_is_mbchar_matched (char *string, int seed, int end, char *mbchar, int length)
 377 {
 378   int i;
 379
 380   if ((end - seed) < length)
 381     return 0;
 382
 383   for (i = 0; i < length; i++)
 384     if (string[seed + i] != mbchar[i])
 385       return 0;
 386   return 1;
 387 }
 388
 389 wchar_t
 390 _rl_char_value (char *buf, int ind)
 391 {
 392   size_t tmp;
 393   wchar_t wc;
 394   mbstate_t ps;
 395   int l;
 396
 397   if (MB_LEN_MAX == 1 || rl_byte_oriented)
 398     return ((wchar_t) buf[ind]);
 399   if (_rl_utf8locale && UTF8_SINGLEBYTE(buf[ind]))
 400     return ((wchar_t) buf[ind]);
 401   l = strlen (buf);
 402   if (ind >= l - 1)
 403     return ((wchar_t) buf[ind]);
 404   if (l < ind)                  /* Sanity check */
 405     l = strlen (buf+ind);
 406   memset (&ps, 0, sizeof (mbstate_t));
 407   tmp = mbrtowc (&wc, buf + ind, l - ind, &ps);
 408   if (MB_INVALIDCH (tmp) || MB_NULLWCH (tmp))
 409     return ((wchar_t) buf[ind]);
 410   return wc;
 411 }
 412 #endif /* HANDLE_MULTIBYTE */
 413
 414 /* Find next `count' characters started byte point of the specified seed.
 415    If flags is MB_FIND_NONZERO, we look for non-zero-width multibyte
 416    characters. */
 417 #undef _rl_find_next_mbchar
 418 int
 419 _rl_find_next_mbchar (char *string, int seed, int count, int flags)
 420 {
 421 #if defined (HANDLE_MULTIBYTE)
 422   return _rl_find_next_mbchar_internal (string, seed, count, flags);
 423 #else
 424   return (seed + count);
 425 #endif
 426 }
 427
 428 /* Find previous character started byte point of the specified seed.
 429    Returned point will be point <= seed.  If flags is MB_FIND_NONZERO,
 430    we look for non-zero-width multibyte characters. */
 431 #undef _rl_find_prev_mbchar
 432 int
 433 _rl_find_prev_mbchar (char *string, int seed, int flags)
 434 {
 435 #if defined (HANDLE_MULTIBYTE)
 436   return _rl_find_prev_mbchar_internal (string, seed, flags);
 437 #else
 438   return ((seed == 0) ? seed : seed - 1);
 439 #endif
 440 }