libblkid/src/encode.c

   1
   2 /*
   3  * encode.c - string conversion routines (mostly for compatibility with
   4  *            udev/volume_id)
   5  *
   6  * Copyright (C) 2008 Kay Sievers <kay.sievers@vrfy.org>
   7  * Copyright (C) 2009 Karel Zak <kzak@redhat.com>
   8  *
   9  * This file may be redistributed under the terms of the
  10  * GNU Lesser General Public License.
  11  */
  12 #include <stdio.h>
  13 #include <stdlib.h>
  14 #include <stddef.h>
  15 #include <unistd.h>
  16 #include <errno.h>
  17 #include <string.h>
  18 #include <ctype.h>
  19
  20 #include "blkidP.h"
  21
  22 #define UDEV_ALLOWED_CHARS_INPUT               "/ $%?,"
  23
  24 /**
  25  * SECTION: encode
  26  * @title: Encoding utils
  27  * @short_description: encode strings to safe udev-compatible formats
  28  *
  29  */
  30
  31 /* count of characters used to encode one unicode char */
  32 static int utf8_encoded_expected_len(const char *str)
  33 {
  34         unsigned char c = (unsigned char)str[0];
  35
  36         if (c < 0x80)
  37                 return 1;
  38         if ((c & 0xe0) == 0xc0)
  39                 return 2;
  40         if ((c & 0xf0) == 0xe0)
  41                 return 3;
  42         if ((c & 0xf8) == 0xf0)
  43                 return 4;
  44         if ((c & 0xfc) == 0xf8)
  45                 return 5;
  46         if ((c & 0xfe) == 0xfc)
  47                 return 6;
  48         return 0;
  49 }
  50
  51 /* decode one unicode char */
  52 static int utf8_encoded_to_unichar(const char *str)
  53 {
  54         int unichar;
  55         int len;
  56         int i;
  57
  58         len = utf8_encoded_expected_len(str);
  59         switch (len) {
  60         case 1:
  61                 return (int)str[0];
  62         case 2:
  63                 unichar = str[0] & 0x1f;
  64                 break;
  65         case 3:
  66                 unichar = (int)str[0] & 0x0f;
  67                 break;
  68         case 4:
  69                 unichar = (int)str[0] & 0x07;
  70                 break;
  71         case 5:
  72                 unichar = (int)str[0] & 0x03;
  73                 break;
  74         case 6:
  75                 unichar = (int)str[0] & 0x01;
  76                 break;
  77         default:
  78                 return -1;
  79         }
  80
  81         for (i = 1; i < len; i++) {
  82                 if (((int)str[i] & 0xc0) != 0x80)
  83                         return -1;
  84                 unichar <<= 6;
  85                 unichar |= (int)str[i] & 0x3f;
  86         }
  87
  88         return unichar;
  89 }
  90
  91 /* expected size used to encode one unicode char */
  92 static int utf8_unichar_to_encoded_len(int unichar)
  93 {
  94         if (unichar < 0x80)
  95                 return 1;
  96         if (unichar < 0x800)
  97                 return 2;
  98         if (unichar < 0x10000)
  99                 return 3;
 100         if (unichar < 0x200000)
 101                 return 4;
 102         if (unichar < 0x4000000)
 103                 return 5;
 104         return 6;
 105 }
 106
 107 /* check if unicode char has a valid numeric range */
 108 static int utf8_unichar_valid_range(int unichar)
 109 {
 110         if (unichar > 0x10ffff)
 111                 return 0;
 112         if ((unichar & 0xfffff800) == 0xd800)
 113                 return 0;
 114         if ((unichar > 0xfdcf) && (unichar < 0xfdf0))
 115                 return 0;
 116         if ((unichar & 0xffff) == 0xffff)
 117                 return 0;
 118         return 1;
 119 }
 120
 121 /* validate one encoded unicode char and return its length */
 122 static int utf8_encoded_valid_unichar(const char *str)
 123 {
 124         int len;
 125         int unichar;
 126         int i;
 127
 128         len = utf8_encoded_expected_len(str);
 129         if (len == 0)
 130                 return -1;
 131
 132         /* ascii is valid */
 133         if (len == 1)
 134                 return 1;
 135
 136         /* check if expected encoded chars are available */
 137         for (i = 0; i < len; i++)
 138                 if ((str[i] & 0x80) != 0x80)
 139                         return -1;
 140
 141         unichar = utf8_encoded_to_unichar(str);
 142
 143         /* check if encoded length matches encoded value */
 144         if (utf8_unichar_to_encoded_len(unichar) != len)
 145                 return -1;
 146
 147         /* check if value has valid range */
 148         if (!utf8_unichar_valid_range(unichar))
 149                 return -1;
 150
 151         return len;
 152 }
 153
 154 static int replace_whitespace(const char *str, char *to, size_t len)
 155 {
 156         size_t i, j;
 157
 158         /* strip trailing whitespace */
 159         len = strnlen(str, len);
 160         while (len && isspace(str[len-1]))
 161                 len--;
 162
 163         /* strip leading whitespace */
 164         i = 0;
 165         while ((i < len) && isspace(str[i]))
 166                 i++;
 167
 168         j = 0;
 169         while (i < len) {
 170                 /* substitute multiple whitespace with a single '_' */
 171                 if (isspace(str[i])) {
 172                         while (isspace(str[i]))
 173                                 i++;
 174                         to[j++] = '_';
 175                 }
 176                 to[j++] = str[i++];
 177         }
 178         to[j] = '\0';
 179         return 0;
 180 }
 181
 182 static int is_whitelisted(char c, const char *white)
 183 {
 184         if ((c >= '0' && c <= '9') ||
 185             (c >= 'A' && c <= 'Z') ||
 186             (c >= 'a' && c <= 'z') ||
 187             strchr("#+-.:=@_", c) != NULL ||
 188             (white != NULL && strchr(white, c) != NULL))
 189                 return 1;
 190         return 0;
 191 }
 192
 193 /* allow chars in whitelist, plain ascii, hex-escaping and valid utf8 */
 194 static int replace_chars(char *str, const char *white)
 195 {
 196         size_t i = 0;
 197         int replaced = 0;
 198
 199         while (str[i] != '\0') {
 200                 int len;
 201
 202                 if (is_whitelisted(str[i], white)) {
 203                         i++;
 204                         continue;
 205                 }
 206
 207                 /* accept hex encoding */
 208                 if (str[i] == '\\' && str[i+1] == 'x') {
 209                         i += 2;
 210                         continue;
 211                 }
 212
 213                 /* accept valid utf8 */
 214                 len = utf8_encoded_valid_unichar(&str[i]);
 215                 if (len > 1) {
 216                         i += len;
 217                         continue;
 218                 }
 219
 220                 /* if space is allowed, replace whitespace with ordinary space */
 221                 if (isspace(str[i]) && white != NULL && strchr(white, ' ') != NULL) {
 222                         str[i] = ' ';
 223                         i++;
 224                         replaced++;
 225                         continue;
 226                 }
 227
 228                 /* everything else is replaced with '_' */
 229                 str[i] = '_';
 230                 i++;
 231                 replaced++;
 232         }
 233         return replaced;
 234 }
 235
 236 /**
 237  * blkid_encode_string:
 238  * @str: input string to be encoded
 239  * @str_enc: output string to store the encoded input string
 240  * @len: maximum size of the output string, which may be
 241  *       four times as long as the input string
 242  *
 243  * Encode all potentially unsafe characters of a string to the
 244  * corresponding hex value prefixed by '\x'.
 245  *
 246  * Returns: 0 if the entire string was copied, non-zero otherwise.
 247  **/
 248 int blkid_encode_string(const char *str, char *str_enc, size_t len)
 249 {
 250         size_t i, j;
 251
 252         if (!str || !str_enc || !len)
 253                 return -1;
 254
 255         for (i = 0, j = 0; str[i] != '\0'; i++) {
 256                 int seqlen;
 257
 258                 seqlen = utf8_encoded_valid_unichar(&str[i]);
 259                 if (seqlen > 1) {
 260                         if (len-j < (size_t)seqlen)
 261                                 goto err;
 262                         memcpy(&str_enc[j], &str[i], seqlen);
 263                         j += seqlen;
 264                         i += (seqlen-1);
 265                 } else if (str[i] == '\\' || !is_whitelisted(str[i], NULL)) {
 266                         if (len-j < 4)
 267                                 goto err;
 268                         sprintf(&str_enc[j], "\\x%02x", (unsigned char) str[i]);
 269                         j += 4;
 270                 } else {
 271                         if (len-j < 1)
 272                                 goto err;
 273                         str_enc[j] = str[i];
 274                         j++;
 275                 }
 276                 if (j+3 >= len)
 277                         goto err;
 278         }
 279         if (len-j < 1)
 280                 goto err;
 281         str_enc[j] = '\0';
 282         return 0;
 283 err:
 284         return -1;
 285 }
 286
 287 /**
 288  * blkid_safe_string:
 289  * @str: input string
 290  * @str_safe: output string
 291  * @len: size of output string
 292  *
 293  * Allows plain ascii, hex-escaping and valid utf8. Replaces all whitespaces
 294  * with '_'.
 295  *
 296  * Returns: 0 on success or -1 in case of error.
 297  */
 298 int blkid_safe_string(const char *str, char *str_safe, size_t len)
 299 {
 300         if (!str || !str_safe || !len)
 301                 return -1;
 302         replace_whitespace(str, str_safe, len);
 303         replace_chars(str_safe, UDEV_ALLOWED_CHARS_INPUT);
 304         return 0;
 305 }