convert.c

   1 #include "cache.h"
   2 #include "config.h"
   3 #include "hex.h"
   4 #include "object-store.h"
   5 #include "attr.h"
   6 #include "run-command.h"
   7 #include "quote.h"
   8 #include "sigchain.h"
   9 #include "pkt-line.h"
  10 #include "sub-process.h"
  11 #include "utf8.h"
  12 #include "ll-merge.h"
  13
  14 /*
  15  * convert.c - convert a file when checking it out and checking it in.
  16  *
  17  * This should use the pathname to decide on whether it wants to do some
  18  * more interesting conversions (automatic gzip/unzip, general format
  19  * conversions etc etc), but by default it just does automatic CRLF<->LF
  20  * translation when the "text" attribute or "auto_crlf" option is set.
  21  */
  22
  23 /* Stat bits: When BIN is set, the txt bits are unset */
  24 #define CONVERT_STAT_BITS_TXT_LF    0x1
  25 #define CONVERT_STAT_BITS_TXT_CRLF  0x2
  26 #define CONVERT_STAT_BITS_BIN       0x4
  27
  28 struct text_stat {
  29         /* NUL, CR, LF and CRLF counts */
  30         unsigned nul, lonecr, lonelf, crlf;
  31
  32         /* These are just approximations! */
  33         unsigned printable, nonprintable;
  34 };
  35
  36 static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
  37 {
  38         unsigned long i;
  39
  40         memset(stats, 0, sizeof(*stats));
  41
  42         for (i = 0; i < size; i++) {
  43                 unsigned char c = buf[i];
  44                 if (c == '\r') {
  45                         if (i+1 < size && buf[i+1] == '\n') {
  46                                 stats->crlf++;
  47                                 i++;
  48                         } else
  49                                 stats->lonecr++;
  50                         continue;
  51                 }
  52                 if (c == '\n') {
  53                         stats->lonelf++;
  54                         continue;
  55                 }
  56                 if (c == 127)
  57                         /* DEL */
  58                         stats->nonprintable++;
  59                 else if (c < 32) {
  60                         switch (c) {
  61                                 /* BS, HT, ESC and FF */
  62                         case '\b': case '\t': case '\033': case '\014':
  63                                 stats->printable++;
  64                                 break;
  65                         case 0:
  66                                 stats->nul++;
  67                                 /* fall through */
  68                         default:
  69                                 stats->nonprintable++;
  70                         }
  71                 }
  72                 else
  73                         stats->printable++;
  74         }
  75
  76         /* If file ends with EOF then don't count this EOF as non-printable. */
  77         if (size >= 1 && buf[size-1] == '\032')
  78                 stats->nonprintable--;
  79 }
  80
  81 /*
  82  * The same heuristics as diff.c::mmfile_is_binary()
  83  * We treat files with bare CR as binary
  84  */
  85 static int convert_is_binary(const struct text_stat *stats)
  86 {
  87         if (stats->lonecr)
  88                 return 1;
  89         if (stats->nul)
  90                 return 1;
  91         if ((stats->printable >> 7) < stats->nonprintable)
  92                 return 1;
  93         return 0;
  94 }
  95
  96 static unsigned int gather_convert_stats(const char *data, unsigned long size)
  97 {
  98         struct text_stat stats;
  99         int ret = 0;
 100         if (!data || !size)
 101                 return 0;
 102         gather_stats(data, size, &stats);
 103         if (convert_is_binary(&stats))
 104                 ret |= CONVERT_STAT_BITS_BIN;
 105         if (stats.crlf)
 106                 ret |= CONVERT_STAT_BITS_TXT_CRLF;
 107         if (stats.lonelf)
 108                 ret |=  CONVERT_STAT_BITS_TXT_LF;
 109
 110         return ret;
 111 }
 112
 113 static const char *gather_convert_stats_ascii(const char *data, unsigned long size)
 114 {
 115         unsigned int convert_stats = gather_convert_stats(data, size);
 116
 117         if (convert_stats & CONVERT_STAT_BITS_BIN)
 118                 return "-text";
 119         switch (convert_stats) {
 120         case CONVERT_STAT_BITS_TXT_LF:
 121                 return "lf";
 122         case CONVERT_STAT_BITS_TXT_CRLF:
 123                 return "crlf";
 124         case CONVERT_STAT_BITS_TXT_LF | CONVERT_STAT_BITS_TXT_CRLF:
 125                 return "mixed";
 126         default:
 127                 return "none";
 128         }
 129 }
 130
 131 const char *get_cached_convert_stats_ascii(struct index_state *istate,
 132                                            const char *path)
 133 {
 134         const char *ret;
 135         unsigned long sz;
 136         void *data = read_blob_data_from_index(istate, path, &sz);
 137         ret = gather_convert_stats_ascii(data, sz);
 138         free(data);
 139         return ret;
 140 }
 141
 142 const char *get_wt_convert_stats_ascii(const char *path)
 143 {
 144         const char *ret = "";
 145         struct strbuf sb = STRBUF_INIT;
 146         if (strbuf_read_file(&sb, path, 0) >= 0)
 147                 ret = gather_convert_stats_ascii(sb.buf, sb.len);
 148         strbuf_release(&sb);
 149         return ret;
 150 }
 151
 152 static int text_eol_is_crlf(void)
 153 {
 154         if (auto_crlf == AUTO_CRLF_TRUE)
 155                 return 1;
 156         else if (auto_crlf == AUTO_CRLF_INPUT)
 157                 return 0;
 158         if (core_eol == EOL_CRLF)
 159                 return 1;
 160         if (core_eol == EOL_UNSET && EOL_NATIVE == EOL_CRLF)
 161                 return 1;
 162         return 0;
 163 }
 164
 165 static enum eol output_eol(enum convert_crlf_action crlf_action)
 166 {
 167         switch (crlf_action) {
 168         case CRLF_BINARY:
 169                 return EOL_UNSET;
 170         case CRLF_TEXT_CRLF:
 171                 return EOL_CRLF;
 172         case CRLF_TEXT_INPUT:
 173                 return EOL_LF;
 174         case CRLF_UNDEFINED:
 175         case CRLF_AUTO_CRLF:
 176                 return EOL_CRLF;
 177         case CRLF_AUTO_INPUT:
 178                 return EOL_LF;
 179         case CRLF_TEXT:
 180         case CRLF_AUTO:
 181                 /* fall through */
 182                 return text_eol_is_crlf() ? EOL_CRLF : EOL_LF;
 183         }
 184         warning(_("illegal crlf_action %d"), (int)crlf_action);
 185         return core_eol;
 186 }
 187
 188 static void check_global_conv_flags_eol(const char *path,
 189                             struct text_stat *old_stats, struct text_stat *new_stats,
 190                             int conv_flags)
 191 {
 192         if (old_stats->crlf && !new_stats->crlf ) {
 193                 /*
 194                  * CRLFs would not be restored by checkout
 195                  */
 196                 if (conv_flags & CONV_EOL_RNDTRP_DIE)
 197                         die(_("CRLF would be replaced by LF in %s"), path);
 198                 else if (conv_flags & CONV_EOL_RNDTRP_WARN)
 199                         warning(_("in the working copy of '%s', CRLF will be"
 200                                   " replaced by LF the next time Git touches"
 201                                   " it"), path);
 202         } else if (old_stats->lonelf && !new_stats->lonelf ) {
 203                 /*
 204                  * CRLFs would be added by checkout
 205                  */
 206                 if (conv_flags & CONV_EOL_RNDTRP_DIE)
 207                         die(_("LF would be replaced by CRLF in %s"), path);
 208                 else if (conv_flags & CONV_EOL_RNDTRP_WARN)
 209                         warning(_("in the working copy of '%s', LF will be"
 210                                   " replaced by CRLF the next time Git touches"
 211                                   " it"), path);
 212         }
 213 }
 214
 215 static int has_crlf_in_index(struct index_state *istate, const char *path)
 216 {
 217         unsigned long sz;
 218         void *data;
 219         const char *crp;
 220         int has_crlf = 0;
 221
 222         data = read_blob_data_from_index(istate, path, &sz);
 223         if (!data)
 224                 return 0;
 225
 226         crp = memchr(data, '\r', sz);
 227         if (crp) {
 228                 unsigned int ret_stats;
 229                 ret_stats = gather_convert_stats(data, sz);
 230                 if (!(ret_stats & CONVERT_STAT_BITS_BIN) &&
 231                     (ret_stats & CONVERT_STAT_BITS_TXT_CRLF))
 232                         has_crlf = 1;
 233         }
 234         free(data);
 235         return has_crlf;
 236 }
 237
 238 static int will_convert_lf_to_crlf(struct text_stat *stats,
 239                                    enum convert_crlf_action crlf_action)
 240 {
 241         if (output_eol(crlf_action) != EOL_CRLF)
 242                 return 0;
 243         /* No "naked" LF? Nothing to convert, regardless. */
 244         if (!stats->lonelf)
 245                 return 0;
 246
 247         if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
 248                 /* If we have any CR or CRLF line endings, we do not touch it */
 249                 /* This is the new safer autocrlf-handling */
 250                 if (stats->lonecr || stats->crlf)
 251                         return 0;
 252
 253                 if (convert_is_binary(stats))
 254                         return 0;
 255         }
 256         return 1;
 257
 258 }
 259
 260 static int validate_encoding(const char *path, const char *enc,
 261                       const char *data, size_t len, int die_on_error)
 262 {
 263         const char *stripped;
 264
 265         /* We only check for UTF here as UTF?? can be an alias for UTF-?? */
 266         if (skip_iprefix(enc, "UTF", &stripped)) {
 267                 skip_prefix(stripped, "-", &stripped);
 268
 269                 /*
 270                  * Check for detectable errors in UTF encodings
 271                  */
 272                 if (has_prohibited_utf_bom(enc, data, len)) {
 273                         const char *error_msg = _(
 274                                 "BOM is prohibited in '%s' if encoded as %s");
 275                         /*
 276                          * This advice is shown for UTF-??BE and UTF-??LE encodings.
 277                          * We cut off the last two characters of the encoding name
 278                          * to generate the encoding name suitable for BOMs.
 279                          */
 280                         const char *advise_msg = _(
 281                                 "The file '%s' contains a byte order "
 282                                 "mark (BOM). Please use UTF-%.*s as "
 283                                 "working-tree-encoding.");
 284                         int stripped_len = strlen(stripped) - strlen("BE");
 285                         advise(advise_msg, path, stripped_len, stripped);
 286                         if (die_on_error)
 287                                 die(error_msg, path, enc);
 288                         else {
 289                                 return error(error_msg, path, enc);
 290                         }
 291
 292                 } else if (is_missing_required_utf_bom(enc, data, len)) {
 293                         const char *error_msg = _(
 294                                 "BOM is required in '%s' if encoded as %s");
 295                         const char *advise_msg = _(
 296                                 "The file '%s' is missing a byte order "
 297                                 "mark (BOM). Please use UTF-%sBE or UTF-%sLE "
 298                                 "(depending on the byte order) as "
 299                                 "working-tree-encoding.");
 300                         advise(advise_msg, path, stripped, stripped);
 301                         if (die_on_error)
 302                                 die(error_msg, path, enc);
 303                         else {
 304                                 return error(error_msg, path, enc);
 305                         }
 306                 }
 307
 308         }
 309         return 0;
 310 }
 311
 312 static void trace_encoding(const char *context, const char *path,
 313                            const char *encoding, const char *buf, size_t len)
 314 {
 315         static struct trace_key coe = TRACE_KEY_INIT(WORKING_TREE_ENCODING);
 316         struct strbuf trace = STRBUF_INIT;
 317         int i;
 318
 319         strbuf_addf(&trace, "%s (%s, considered %s):\n", context, path, encoding);
 320         for (i = 0; i < len && buf; ++i) {
 321                 strbuf_addf(
 322                         &trace, "| \033[2m%2i:\033[0m %2x \033[2m%c\033[0m%c",
 323                         i,
 324                         (unsigned char) buf[i],
 325                         (buf[i] > 32 && buf[i] < 127 ? buf[i] : ' '),
 326                         ((i+1) % 8 && (i+1) < len ? ' ' : '\n')
 327                 );
 328         }
 329         strbuf_addchars(&trace, '\n', 1);
 330
 331         trace_strbuf(&coe, &trace);
 332         strbuf_release(&trace);
 333 }
 334
 335 static int check_roundtrip(const char *enc_name)
 336 {
 337         /*
 338          * check_roundtrip_encoding contains a string of comma and/or
 339          * space separated encodings (eg. "UTF-16, ASCII, CP1125").
 340          * Search for the given encoding in that string.
 341          */
 342         const char *found = strcasestr(check_roundtrip_encoding, enc_name);
 343         const char *next;
 344         int len;
 345         if (!found)
 346                 return 0;
 347         next = found + strlen(enc_name);
 348         len = strlen(check_roundtrip_encoding);
 349         return (found && (
 350                         /*
 351                          * check that the found encoding is at the
 352                          * beginning of check_roundtrip_encoding or
 353                          * that it is prefixed with a space or comma
 354                          */
 355                         found == check_roundtrip_encoding || (
 356                                 (isspace(found[-1]) || found[-1] == ',')
 357                         )
 358                 ) && (
 359                         /*
 360                          * check that the found encoding is at the
 361                          * end of check_roundtrip_encoding or
 362                          * that it is suffixed with a space or comma
 363                          */
 364                         next == check_roundtrip_encoding + len || (
 365                                 next < check_roundtrip_encoding + len &&
 366                                 (isspace(next[0]) || next[0] == ',')
 367                         )
 368                 ));
 369 }
 370
 371 static const char *default_encoding = "UTF-8";
 372
 373 static int encode_to_git(const char *path, const char *src, size_t src_len,
 374                          struct strbuf *buf, const char *enc, int conv_flags)
 375 {
 376         char *dst;
 377         size_t dst_len;
 378         int die_on_error = conv_flags & CONV_WRITE_OBJECT;
 379
 380         /*
 381          * No encoding is specified or there is nothing to encode.
 382          * Tell the caller that the content was not modified.
 383          */
 384         if (!enc || (src && !src_len))
 385                 return 0;
 386
 387         /*
 388          * Looks like we got called from "would_convert_to_git()".
 389          * This means Git wants to know if it would encode (= modify!)
 390          * the content. Let's answer with "yes", since an encoding was
 391          * specified.
 392          */
 393         if (!buf && !src)
 394                 return 1;
 395
 396         if (validate_encoding(path, enc, src, src_len, die_on_error))
 397                 return 0;
 398
 399         trace_encoding("source", path, enc, src, src_len);
 400         dst = reencode_string_len(src, src_len, default_encoding, enc,
 401                                   &dst_len);
 402         if (!dst) {
 403                 /*
 404                  * We could add the blob "as-is" to Git. However, on checkout
 405                  * we would try to re-encode to the original encoding. This
 406                  * would fail and we would leave the user with a messed-up
 407                  * working tree. Let's try to avoid this by screaming loud.
 408                  */
 409                 const char* msg = _("failed to encode '%s' from %s to %s");
 410                 if (die_on_error)
 411                         die(msg, path, enc, default_encoding);
 412                 else {
 413                         error(msg, path, enc, default_encoding);
 414                         return 0;
 415                 }
 416         }
 417         trace_encoding("destination", path, default_encoding, dst, dst_len);
 418
 419         /*
 420          * UTF supports lossless conversion round tripping [1] and conversions
 421          * between UTF and other encodings are mostly round trip safe as
 422          * Unicode aims to be a superset of all other character encodings.
 423          * However, certain encodings (e.g. SHIFT-JIS) are known to have round
 424          * trip issues [2]. Check the round trip conversion for all encodings
 425          * listed in core.checkRoundtripEncoding.
 426          *
 427          * The round trip check is only performed if content is written to Git.
 428          * This ensures that no information is lost during conversion to/from
 429          * the internal UTF-8 representation.
 430          *
 431          * Please note, the code below is not tested because I was not able to
 432          * generate a faulty round trip without an iconv error. Iconv errors
 433          * are already caught above.
 434          *
 435          * [1] http://unicode.org/faq/utf_bom.html#gen2
 436          * [2] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode
 437          */
 438         if (die_on_error && check_roundtrip(enc)) {
 439                 char *re_src;
 440                 size_t re_src_len;
 441
 442                 re_src = reencode_string_len(dst, dst_len,
 443                                              enc, default_encoding,
 444                                              &re_src_len);
 445
 446                 trace_printf("Checking roundtrip encoding for %s...\n", enc);
 447                 trace_encoding("reencoded source", path, enc,
 448                                re_src, re_src_len);
 449
 450                 if (!re_src || src_len != re_src_len ||
 451                     memcmp(src, re_src, src_len)) {
 452                         const char* msg = _("encoding '%s' from %s to %s and "
 453                                             "back is not the same");
 454                         die(msg, path, enc, default_encoding);
 455                 }
 456
 457                 free(re_src);
 458         }
 459
 460         strbuf_attach(buf, dst, dst_len, dst_len + 1);
 461         return 1;
 462 }
 463
 464 static int encode_to_worktree(const char *path, const char *src, size_t src_len,
 465                               struct strbuf *buf, const char *enc)
 466 {
 467         char *dst;
 468         size_t dst_len;
 469
 470         /*
 471          * No encoding is specified or there is nothing to encode.
 472          * Tell the caller that the content was not modified.
 473          */
 474         if (!enc || (src && !src_len))
 475                 return 0;
 476
 477         dst = reencode_string_len(src, src_len, enc, default_encoding,
 478                                   &dst_len);
 479         if (!dst) {
 480                 error(_("failed to encode '%s' from %s to %s"),
 481                       path, default_encoding, enc);
 482                 return 0;
 483         }
 484
 485         strbuf_attach(buf, dst, dst_len, dst_len + 1);
 486         return 1;
 487 }
 488
 489 static int crlf_to_git(struct index_state *istate,
 490                        const char *path, const char *src, size_t len,
 491                        struct strbuf *buf,
 492                        enum convert_crlf_action crlf_action, int conv_flags)
 493 {
 494         struct text_stat stats;
 495         char *dst;
 496         int convert_crlf_into_lf;
 497
 498         if (crlf_action == CRLF_BINARY ||
 499             (src && !len))
 500                 return 0;
 501
 502         /*
 503          * If we are doing a dry-run and have no source buffer, there is
 504          * nothing to analyze; we must assume we would convert.
 505          */
 506         if (!buf && !src)
 507                 return 1;
 508
 509         gather_stats(src, len, &stats);
 510         /* Optimization: No CRLF? Nothing to convert, regardless. */
 511         convert_crlf_into_lf = !!stats.crlf;
 512
 513         if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
 514                 if (convert_is_binary(&stats))
 515                         return 0;
 516                 /*
 517                  * If the file in the index has any CR in it, do not
 518                  * convert.  This is the new safer autocrlf handling,
 519                  * unless we want to renormalize in a merge or
 520                  * cherry-pick.
 521                  */
 522                 if ((!(conv_flags & CONV_EOL_RENORMALIZE)) &&
 523                     has_crlf_in_index(istate, path))
 524                         convert_crlf_into_lf = 0;
 525         }
 526         if (((conv_flags & CONV_EOL_RNDTRP_WARN) ||
 527              ((conv_flags & CONV_EOL_RNDTRP_DIE) && len))) {
 528                 struct text_stat new_stats;
 529                 memcpy(&new_stats, &stats, sizeof(new_stats));
 530                 /* simulate "git add" */
 531                 if (convert_crlf_into_lf) {
 532                         new_stats.lonelf += new_stats.crlf;
 533                         new_stats.crlf = 0;
 534                 }
 535                 /* simulate "git checkout" */
 536                 if (will_convert_lf_to_crlf(&new_stats, crlf_action)) {
 537                         new_stats.crlf += new_stats.lonelf;
 538                         new_stats.lonelf = 0;
 539                 }
 540                 check_global_conv_flags_eol(path, &stats, &new_stats, conv_flags);
 541         }
 542         if (!convert_crlf_into_lf)
 543                 return 0;
 544
 545         /*
 546          * At this point all of our source analysis is done, and we are sure we
 547          * would convert. If we are in dry-run mode, we can give an answer.
 548          */
 549         if (!buf)
 550                 return 1;
 551
 552         /* only grow if not in place */
 553         if (strbuf_avail(buf) + buf->len < len)
 554                 strbuf_grow(buf, len - buf->len);
 555         dst = buf->buf;
 556         if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) {
 557                 /*
 558                  * If we guessed, we already know we rejected a file with
 559                  * lone CR, and we can strip a CR without looking at what
 560                  * follow it.
 561                  */
 562                 do {
 563                         unsigned char c = *src++;
 564                         if (c != '\r')
 565                                 *dst++ = c;
 566                 } while (--len);
 567         } else {
 568                 do {
 569                         unsigned char c = *src++;
 570                         if (! (c == '\r' && (1 < len && *src == '\n')))
 571                                 *dst++ = c;
 572                 } while (--len);
 573         }
 574         strbuf_setlen(buf, dst - buf->buf);
 575         return 1;
 576 }
 577
 578 static int crlf_to_worktree(const char *src, size_t len, struct strbuf *buf,
 579                             enum convert_crlf_action crlf_action)
 580 {
 581         char *to_free = NULL;
 582         struct text_stat stats;
 583
 584         if (!len || output_eol(crlf_action) != EOL_CRLF)
 585                 return 0;
 586
 587         gather_stats(src, len, &stats);
 588         if (!will_convert_lf_to_crlf(&stats, crlf_action))
 589                 return 0;
 590
 591         /* are we "faking" in place editing ? */
 592         if (src == buf->buf)
 593                 to_free = strbuf_detach(buf, NULL);
 594
 595         strbuf_grow(buf, len + stats.lonelf);
 596         for (;;) {
 597                 const char *nl = memchr(src, '\n', len);
 598                 if (!nl)
 599                         break;
 600                 if (nl > src && nl[-1] == '\r') {
 601                         strbuf_add(buf, src, nl + 1 - src);
 602                 } else {
 603                         strbuf_add(buf, src, nl - src);
 604                         strbuf_addstr(buf, "\r\n");
 605                 }
 606                 len -= nl + 1 - src;
 607                 src  = nl + 1;
 608         }
 609         strbuf_add(buf, src, len);
 610
 611         free(to_free);
 612         return 1;
 613 }
 614
 615 struct filter_params {
 616         const char *src;
 617         size_t size;
 618         int fd;
 619         const char *cmd;
 620         const char *path;
 621 };
 622
 623 static int filter_buffer_or_fd(int in UNUSED, int out, void *data)
 624 {
 625         /*
 626          * Spawn cmd and feed the buffer contents through its stdin.
 627          */
 628         struct child_process child_process = CHILD_PROCESS_INIT;
 629         struct filter_params *params = (struct filter_params *)data;
 630         int write_err, status;
 631
 632         /* apply % substitution to cmd */
 633         struct strbuf cmd = STRBUF_INIT;
 634         struct strbuf path = STRBUF_INIT;
 635         struct strbuf_expand_dict_entry dict[] = {
 636                 { "f", NULL, },
 637                 { NULL, NULL, },
 638         };
 639
 640         /* quote the path to preserve spaces, etc. */
 641         sq_quote_buf(&path, params->path);
 642         dict[0].value = path.buf;
 643
 644         /* expand all %f with the quoted path */
 645         strbuf_expand(&cmd, params->cmd, strbuf_expand_dict_cb, &dict);
 646         strbuf_release(&path);
 647
 648         strvec_push(&child_process.args, cmd.buf);
 649         child_process.use_shell = 1;
 650         child_process.in = -1;
 651         child_process.out = out;
 652
 653         if (start_command(&child_process)) {
 654                 strbuf_release(&cmd);
 655                 return error(_("cannot fork to run external filter '%s'"),
 656                              params->cmd);
 657         }
 658
 659         sigchain_push(SIGPIPE, SIG_IGN);
 660
 661         if (params->src) {
 662                 write_err = (write_in_full(child_process.in,
 663                                            params->src, params->size) < 0);
 664                 if (errno == EPIPE)
 665                         write_err = 0;
 666         } else {
 667                 write_err = copy_fd(params->fd, child_process.in);
 668                 if (write_err == COPY_WRITE_ERROR && errno == EPIPE)
 669                         write_err = 0;
 670         }
 671
 672         if (close(child_process.in))
 673                 write_err = 1;
 674         if (write_err)
 675                 error(_("cannot feed the input to external filter '%s'"),
 676                       params->cmd);
 677
 678         sigchain_pop(SIGPIPE);
 679
 680         status = finish_command(&child_process);
 681         if (status)
 682                 error(_("external filter '%s' failed %d"), params->cmd, status);
 683
 684         strbuf_release(&cmd);
 685         return (write_err || status);
 686 }
 687
 688 static int apply_single_file_filter(const char *path, const char *src, size_t len, int fd,
 689                                     struct strbuf *dst, const char *cmd)
 690 {
 691         /*
 692          * Create a pipeline to have the command filter the buffer's
 693          * contents.
 694          *
 695          * (child --> cmd) --> us
 696          */
 697         int err = 0;
 698         struct strbuf nbuf = STRBUF_INIT;
 699         struct async async;
 700         struct filter_params params;
 701
 702         memset(&async, 0, sizeof(async));
 703         async.proc = filter_buffer_or_fd;
 704         async.data = &params;
 705         async.out = -1;
 706         params.src = src;
 707         params.size = len;
 708         params.fd = fd;
 709         params.cmd = cmd;
 710         params.path = path;
 711
 712         fflush(NULL);
 713         if (start_async(&async))
 714                 return 0;       /* error was already reported */
 715
 716         if (strbuf_read(&nbuf, async.out, 0) < 0) {
 717                 err = error(_("read from external filter '%s' failed"), cmd);
 718         }
 719         if (close(async.out)) {
 720                 err = error(_("read from external filter '%s' failed"), cmd);
 721         }
 722         if (finish_async(&async)) {
 723                 err = error(_("external filter '%s' failed"), cmd);
 724         }
 725
 726         if (!err) {
 727                 strbuf_swap(dst, &nbuf);
 728         }
 729         strbuf_release(&nbuf);
 730         return !err;
 731 }
 732
 733 #define CAP_CLEAN    (1u<<0)
 734 #define CAP_SMUDGE   (1u<<1)
 735 #define CAP_DELAY    (1u<<2)
 736
 737 struct cmd2process {
 738         struct subprocess_entry subprocess; /* must be the first member! */
 739         unsigned int supported_capabilities;
 740 };
 741
 742 static int subprocess_map_initialized;
 743 static struct hashmap subprocess_map;
 744
 745 static int start_multi_file_filter_fn(struct subprocess_entry *subprocess)
 746 {
 747         static int versions[] = {2, 0};
 748         static struct subprocess_capability capabilities[] = {
 749                 { "clean",  CAP_CLEAN  },
 750                 { "smudge", CAP_SMUDGE },
 751                 { "delay",  CAP_DELAY  },
 752                 { NULL, 0 }
 753         };
 754         struct cmd2process *entry = (struct cmd2process *)subprocess;
 755         return subprocess_handshake(subprocess, "git-filter", versions, NULL,
 756                                     capabilities,
 757                                     &entry->supported_capabilities);
 758 }
 759
 760 static void handle_filter_error(const struct strbuf *filter_status,
 761                                 struct cmd2process *entry,
 762                                 const unsigned int wanted_capability)
 763 {
 764         if (!strcmp(filter_status->buf, "error"))
 765                 ; /* The filter signaled a problem with the file. */
 766         else if (!strcmp(filter_status->buf, "abort") && wanted_capability) {
 767                 /*
 768                  * The filter signaled a permanent problem. Don't try to filter
 769                  * files with the same command for the lifetime of the current
 770                  * Git process.
 771                  */
 772                  entry->supported_capabilities &= ~wanted_capability;
 773         } else {
 774                 /*
 775                  * Something went wrong with the protocol filter.
 776                  * Force shutdown and restart if another blob requires filtering.
 777                  */
 778                 error(_("external filter '%s' failed"), entry->subprocess.cmd);
 779                 subprocess_stop(&subprocess_map, &entry->subprocess);
 780                 free(entry);
 781         }
 782 }
 783
 784 static int apply_multi_file_filter(const char *path, const char *src, size_t len,
 785                                    int fd, struct strbuf *dst, const char *cmd,
 786                                    const unsigned int wanted_capability,
 787                                    const struct checkout_metadata *meta,
 788                                    struct delayed_checkout *dco)
 789 {
 790         int err;
 791         int can_delay = 0;
 792         struct cmd2process *entry;
 793         struct child_process *process;
 794         struct strbuf nbuf = STRBUF_INIT;
 795         struct strbuf filter_status = STRBUF_INIT;
 796         const char *filter_type;
 797
 798         if (!subprocess_map_initialized) {
 799                 subprocess_map_initialized = 1;
 800                 hashmap_init(&subprocess_map, cmd2process_cmp, NULL, 0);
 801                 entry = NULL;
 802         } else {
 803                 entry = (struct cmd2process *)subprocess_find_entry(&subprocess_map, cmd);
 804         }
 805
 806         fflush(NULL);
 807
 808         if (!entry) {
 809                 entry = xmalloc(sizeof(*entry));
 810                 entry->supported_capabilities = 0;
 811
 812                 if (subprocess_start(&subprocess_map, &entry->subprocess, cmd, start_multi_file_filter_fn)) {
 813                         free(entry);
 814                         return 0;
 815                 }
 816         }
 817         process = &entry->subprocess.process;
 818
 819         if (!(entry->supported_capabilities & wanted_capability))
 820                 return 0;
 821
 822         if (wanted_capability & CAP_CLEAN)
 823                 filter_type = "clean";
 824         else if (wanted_capability & CAP_SMUDGE)
 825                 filter_type = "smudge";
 826         else
 827                 die(_("unexpected filter type"));
 828
 829         sigchain_push(SIGPIPE, SIG_IGN);
 830
 831         assert(strlen(filter_type) < LARGE_PACKET_DATA_MAX - strlen("command=\n"));
 832         err = packet_write_fmt_gently(process->in, "command=%s\n", filter_type);
 833         if (err)
 834                 goto done;
 835
 836         err = strlen(path) > LARGE_PACKET_DATA_MAX - strlen("pathname=\n");
 837         if (err) {
 838                 error(_("path name too long for external filter"));
 839                 goto done;
 840         }
 841
 842         err = packet_write_fmt_gently(process->in, "pathname=%s\n", path);
 843         if (err)
 844                 goto done;
 845
 846         if (meta && meta->refname) {
 847                 err = packet_write_fmt_gently(process->in, "ref=%s\n", meta->refname);
 848                 if (err)
 849                         goto done;
 850         }
 851
 852         if (meta && !is_null_oid(&meta->treeish)) {
 853                 err = packet_write_fmt_gently(process->in, "treeish=%s\n", oid_to_hex(&meta->treeish));
 854                 if (err)
 855                         goto done;
 856         }
 857
 858         if (meta && !is_null_oid(&meta->blob)) {
 859                 err = packet_write_fmt_gently(process->in, "blob=%s\n", oid_to_hex(&meta->blob));
 860                 if (err)
 861                         goto done;
 862         }
 863
 864         if ((entry->supported_capabilities & CAP_DELAY) &&
 865             dco && dco->state == CE_CAN_DELAY) {
 866                 can_delay = 1;
 867                 err = packet_write_fmt_gently(process->in, "can-delay=1\n");
 868                 if (err)
 869                         goto done;
 870         }
 871
 872         err = packet_flush_gently(process->in);
 873         if (err)
 874                 goto done;
 875
 876         if (fd >= 0)
 877                 err = write_packetized_from_fd_no_flush(fd, process->in);
 878         else
 879                 err = write_packetized_from_buf_no_flush(src, len, process->in);
 880         if (err)
 881                 goto done;
 882
 883         err = packet_flush_gently(process->in);
 884         if (err)
 885                 goto done;
 886
 887         err = subprocess_read_status(process->out, &filter_status);
 888         if (err)
 889                 goto done;
 890
 891         if (can_delay && !strcmp(filter_status.buf, "delayed")) {
 892                 string_list_insert(&dco->filters, cmd);
 893                 string_list_insert(&dco->paths, path);
 894         } else {
 895                 /* The filter got the blob and wants to send us a response. */
 896                 err = strcmp(filter_status.buf, "success");
 897                 if (err)
 898                         goto done;
 899
 900                 err = read_packetized_to_strbuf(process->out, &nbuf,
 901                                                 PACKET_READ_GENTLE_ON_EOF) < 0;
 902                 if (err)
 903                         goto done;
 904
 905                 err = subprocess_read_status(process->out, &filter_status);
 906                 if (err)
 907                         goto done;
 908
 909                 err = strcmp(filter_status.buf, "success");
 910         }
 911
 912 done:
 913         sigchain_pop(SIGPIPE);
 914
 915         if (err)
 916                 handle_filter_error(&filter_status, entry, wanted_capability);
 917         else
 918                 strbuf_swap(dst, &nbuf);
 919         strbuf_release(&nbuf);
 920         strbuf_release(&filter_status);
 921         return !err;
 922 }
 923
 924
 925 int async_query_available_blobs(const char *cmd, struct string_list *available_paths)
 926 {
 927         int err;
 928         char *line;
 929         struct cmd2process *entry;
 930         struct child_process *process;
 931         struct strbuf filter_status = STRBUF_INIT;
 932
 933         assert(subprocess_map_initialized);
 934         entry = (struct cmd2process *)subprocess_find_entry(&subprocess_map, cmd);
 935         if (!entry) {
 936                 error(_("external filter '%s' is not available anymore although "
 937                         "not all paths have been filtered"), cmd);
 938                 return 0;
 939         }
 940         process = &entry->subprocess.process;
 941         sigchain_push(SIGPIPE, SIG_IGN);
 942
 943         err = packet_write_fmt_gently(
 944                 process->in, "command=list_available_blobs\n");
 945         if (err)
 946                 goto done;
 947
 948         err = packet_flush_gently(process->in);
 949         if (err)
 950                 goto done;
 951
 952         while ((line = packet_read_line(process->out, NULL))) {
 953                 const char *path;
 954                 if (skip_prefix(line, "pathname=", &path))
 955                         string_list_insert(available_paths, xstrdup(path));
 956                 else
 957                         ; /* ignore unknown keys */
 958         }
 959
 960         err = subprocess_read_status(process->out, &filter_status);
 961         if (err)
 962                 goto done;
 963
 964         err = strcmp(filter_status.buf, "success");
 965
 966 done:
 967         sigchain_pop(SIGPIPE);
 968
 969         if (err)
 970                 handle_filter_error(&filter_status, entry, 0);
 971         strbuf_release(&filter_status);
 972         return !err;
 973 }
 974
 975 static struct convert_driver { /* one "filter.<name>" driver read from the configuration */
 976         const char *name; /* the <name> part of the configuration key */
 977         struct convert_driver *next; /* next driver in the singly linked list */
 978         const char *smudge; /* value of filter.<name>.smudge, or NULL */
 979         const char *clean; /* value of filter.<name>.clean, or NULL */
 980         const char *process; /* value of filter.<name>.process, or NULL */
 981         int required; /* boolean from filter.<name>.required */
 982 } *user_convert, **user_convert_tail; /* list head, and the link where the next new driver is appended */
 983
 984 static int apply_filter(const char *path, const char *src, size_t len,
 985                         int fd, struct strbuf *dst, struct convert_driver *drv,
 986                         const unsigned int wanted_capability,
 987                         const struct checkout_metadata *meta,
 988                         struct delayed_checkout *dco)
 989 {
 990         const char *cmd = NULL;
 991
 992         if (!drv)
 993                 return 0;
 994
 995         if (!dst)
 996                 return 1;
 997
 998         if ((wanted_capability & CAP_CLEAN) && !drv->process && drv->clean)
 999                 cmd = drv->clean;
1000         else if ((wanted_capability & CAP_SMUDGE) && !drv->process && drv->smudge)
1001                 cmd = drv->smudge;
1002
1003         if (cmd && *cmd)
1004                 return apply_single_file_filter(path, src, len, fd, dst, cmd);
1005         else if (drv->process && *drv->process)
1006                 return apply_multi_file_filter(path, src, len, fd, dst,
1007                         drv->process, wanted_capability, meta, dco);
1008
1009         return 0;
1010 }
1011
1012 static int read_convert_config(const char *var, const char *value, void *cb UNUSED)
1013 {
1014         const char *key, *name;
1015         size_t namelen;
1016         struct convert_driver *drv;
1017
1018         /*
1019          * External conversion drivers are configured using
1020          * "filter.<name>.variable".
1021          */
1022         if (parse_config_key(var, "filter", &name, &namelen, &key) < 0 || !name)
1023                 return 0;
1024         for (drv = user_convert; drv; drv = drv->next)
1025                 if (!strncmp(drv->name, name, namelen) && !drv->name[namelen])
1026                         break;
1027         if (!drv) {
1028                 CALLOC_ARRAY(drv, 1);
1029                 drv->name = xmemdupz(name, namelen);
1030                 *user_convert_tail = drv;
1031                 user_convert_tail = &(drv->next);
1032         }
1033
1034         /*
1035          * filter.<name>.smudge and filter.<name>.clean specifies
1036          * the command line:
1037          *
1038          *      command-line
1039          *
1040          * The command-line will not be interpolated in any way.
1041          */
1042
1043         if (!strcmp("smudge", key))
1044                 return git_config_string(&drv->smudge, var, value);
1045
1046         if (!strcmp("clean", key))
1047                 return git_config_string(&drv->clean, var, value);
1048
1049         if (!strcmp("process", key))
1050                 return git_config_string(&drv->process, var, value);
1051
1052         if (!strcmp("required", key)) {
1053                 drv->required = git_config_bool(var, value);
1054                 return 0;
1055         }
1056
1057         return 0;
1058 }
1059
/*
 * Count the ident keywords in the `size` bytes at `cp`.
 *
 * Both the collapsed form "$Id$" and the expanded form "$Id: ... $"
 * count, e.g.
 *   "$Id: 0000000000000000000000000000000000000000 $" <=> "$Id$"
 * An expanded keyword must be closed by a '$' before the end of the
 * line, otherwise it is not counted.
 */
static int count_ident(const char *cp, unsigned long size)
{
	int found = 0;

	while (size) {
		char tail;

		size--;
		if (*cp++ != '$')
			continue;
		/* need room for "Id" plus the byte that follows it */
		if (size < 3)
			break;
		if (cp[0] != 'I' || cp[1] != 'd')
			continue;

		tail = cp[2];
		cp += 3;
		size -= 3;

		if (tail == '$') {
			found++; /* $Id$ */
			continue;
		}
		if (tail != ':')
			continue;

		/* "$Id: ... "; scan up to the closing dollar sign and discard. */
		while (size) {
			char c = *cp++;

			size--;
			if (c == '$') {
				found++;
				break;
			}
			if (c == '\n')
				break;
		}
	}
	return found;
}
1101
1102 static int ident_to_git(const char *src, size_t len,
1103                         struct strbuf *buf, int ident)
1104 {
1105         char *dst, *dollar;
1106
1107         if (!ident || (src && !count_ident(src, len)))
1108                 return 0;
1109
1110         if (!buf)
1111                 return 1;
1112
1113         /* only grow if not in place */
1114         if (strbuf_avail(buf) + buf->len < len)
1115                 strbuf_grow(buf, len - buf->len);
1116         dst = buf->buf;
1117         for (;;) {
1118                 dollar = memchr(src, '$', len);
1119                 if (!dollar)
1120                         break;
1121                 memmove(dst, src, dollar + 1 - src);
1122                 dst += dollar + 1 - src;
1123                 len -= dollar + 1 - src;
1124                 src  = dollar + 1;
1125
1126                 if (len > 3 && !memcmp(src, "Id:", 3)) {
1127                         dollar = memchr(src + 3, '$', len - 3);
1128                         if (!dollar)
1129                                 break;
1130                         if (memchr(src + 3, '\n', dollar - src - 3)) {
1131                                 /* Line break before the next dollar. */
1132                                 continue;
1133                         }
1134
1135                         memcpy(dst, "Id$", 3);
1136                         dst += 3;
1137                         len -= dollar + 1 - src;
1138                         src  = dollar + 1;
1139                 }
1140         }
1141         memmove(dst, src, len);
1142         strbuf_setlen(buf, dst + len - buf->buf);
1143         return 1;
1144 }
1145
1146 static int ident_to_worktree(const char *src, size_t len,
1147                              struct strbuf *buf, int ident)
1148 {
1149         struct object_id oid;
1150         char *to_free = NULL, *dollar, *spc;
1151         int cnt;
1152
1153         if (!ident)
1154                 return 0;
1155
1156         cnt = count_ident(src, len);
1157         if (!cnt)
1158                 return 0;
1159
1160         /* are we "faking" in place editing ? */
1161         if (src == buf->buf)
1162                 to_free = strbuf_detach(buf, NULL);
1163         hash_object_file(the_hash_algo, src, len, OBJ_BLOB, &oid);
1164
1165         strbuf_grow(buf, len + cnt * (the_hash_algo->hexsz + 3));
1166         for (;;) {
1167                 /* step 1: run to the next '$' */
1168                 dollar = memchr(src, '$', len);
1169                 if (!dollar)
1170                         break;
1171                 strbuf_add(buf, src, dollar + 1 - src);
1172                 len -= dollar + 1 - src;
1173                 src  = dollar + 1;
1174
1175                 /* step 2: does it looks like a bit like Id:xxx$ or Id$ ? */
1176                 if (len < 3 || memcmp("Id", src, 2))
1177                         continue;
1178
1179                 /* step 3: skip over Id$ or Id:xxxxx$ */
1180                 if (src[2] == '$') {
1181                         src += 3;
1182                         len -= 3;
1183                 } else if (src[2] == ':') {
1184                         /*
1185                          * It's possible that an expanded Id has crept its way into the
1186                          * repository, we cope with that by stripping the expansion out.
1187                          * This is probably not a good idea, since it will cause changes
1188                          * on checkout, which won't go away by stash, but let's keep it
1189                          * for git-style ids.
1190                          */
1191                         dollar = memchr(src + 3, '$', len - 3);
1192                         if (!dollar) {
1193                                 /* incomplete keyword, no more '$', so just quit the loop */
1194                                 break;
1195                         }
1196
1197                         if (memchr(src + 3, '\n', dollar - src - 3)) {
1198                                 /* Line break before the next dollar. */
1199                                 continue;
1200                         }
1201
1202                         spc = memchr(src + 4, ' ', dollar - src - 4);
1203                         if (spc && spc < dollar-1) {
1204                                 /* There are spaces in unexpected places.
1205                                  * This is probably an id from some other
1206                                  * versioning system. Keep it for now.
1207                                  */
1208                                 continue;
1209                         }
1210
1211                         len -= dollar + 1 - src;
1212                         src  = dollar + 1;
1213                 } else {
1214                         /* it wasn't a "Id$" or "Id:xxxx$" */
1215                         continue;
1216                 }
1217
1218                 /* step 4: substitute */
1219                 strbuf_addstr(buf, "Id: ");
1220                 strbuf_addstr(buf, oid_to_hex(&oid));
1221                 strbuf_addstr(buf, " $");
1222         }
1223         strbuf_add(buf, src, len);
1224
1225         free(to_free);
1226         return 1;
1227 }
1228
1229 static const char *git_path_check_encoding(struct attr_check_item *check)
1230 {
1231         const char *value = check->value;
1232
1233         if (ATTR_UNSET(value) || !strlen(value))
1234                 return NULL;
1235
1236         if (ATTR_TRUE(value) || ATTR_FALSE(value)) {
1237                 die(_("true/false are no valid working-tree-encodings"));
1238         }
1239
1240         /* Don't encode to the default encoding */
1241         if (same_encoding(value, default_encoding))
1242                 return NULL;
1243
1244         return value;
1245 }
1246
1247 static enum convert_crlf_action git_path_check_crlf(struct attr_check_item *check)
1248 {
1249         const char *value = check->value;
1250
1251         if (ATTR_TRUE(value))
1252                 return CRLF_TEXT;
1253         else if (ATTR_FALSE(value))
1254                 return CRLF_BINARY;
1255         else if (ATTR_UNSET(value))
1256                 ;
1257         else if (!strcmp(value, "input"))
1258                 return CRLF_TEXT_INPUT;
1259         else if (!strcmp(value, "auto"))
1260                 return CRLF_AUTO;
1261         return CRLF_UNDEFINED;
1262 }
1263
1264 static enum eol git_path_check_eol(struct attr_check_item *check)
1265 {
1266         const char *value = check->value;
1267
1268         if (ATTR_UNSET(value))
1269                 ;
1270         else if (!strcmp(value, "lf"))
1271                 return EOL_LF;
1272         else if (!strcmp(value, "crlf"))
1273                 return EOL_CRLF;
1274         return EOL_UNSET;
1275 }
1276
1277 static struct convert_driver *git_path_check_convert(struct attr_check_item *check)
1278 {
1279         const char *value = check->value;
1280         struct convert_driver *drv;
1281
1282         if (ATTR_TRUE(value) || ATTR_FALSE(value) || ATTR_UNSET(value))
1283                 return NULL;
1284         for (drv = user_convert; drv; drv = drv->next)
1285                 if (!strcmp(value, drv->name))
1286                         return drv;
1287         return NULL;
1288 }
1289
1290 static int git_path_check_ident(struct attr_check_item *check)
1291 {
1292         const char *value = check->value;
1293
1294         return !!ATTR_TRUE(value);
1295 }
1296
1297 static struct attr_check *check; /* attribute query reused by convert_attrs(); NULL until first use and after reset_parsed_attributes() */
1298
1299 void convert_attrs(struct index_state *istate,
1300                    struct conv_attrs *ca, const char *path)
1301 {
1302         struct attr_check_item *ccheck = NULL;
1303
1304         if (!check) {
1305                 check = attr_check_initl("crlf", "ident", "filter",
1306                                          "eol", "text", "working-tree-encoding",
1307                                          NULL);
1308                 user_convert_tail = &user_convert;
1309                 git_config(read_convert_config, NULL);
1310         }
1311
1312         git_check_attr(istate, NULL, path, check);
1313         ccheck = check->items;
1314         ca->crlf_action = git_path_check_crlf(ccheck + 4);
1315         if (ca->crlf_action == CRLF_UNDEFINED)
1316                 ca->crlf_action = git_path_check_crlf(ccheck + 0);
1317         ca->ident = git_path_check_ident(ccheck + 1);
1318         ca->drv = git_path_check_convert(ccheck + 2);
1319         if (ca->crlf_action != CRLF_BINARY) {
1320                 enum eol eol_attr = git_path_check_eol(ccheck + 3);
1321                 if (ca->crlf_action == CRLF_AUTO && eol_attr == EOL_LF)
1322                         ca->crlf_action = CRLF_AUTO_INPUT;
1323                 else if (ca->crlf_action == CRLF_AUTO && eol_attr == EOL_CRLF)
1324                         ca->crlf_action = CRLF_AUTO_CRLF;
1325                 else if (eol_attr == EOL_LF)
1326                         ca->crlf_action = CRLF_TEXT_INPUT;
1327                 else if (eol_attr == EOL_CRLF)
1328                         ca->crlf_action = CRLF_TEXT_CRLF;
1329         }
1330         ca->working_tree_encoding = git_path_check_encoding(ccheck + 5);
1331
1332         /* Save attr and make a decision for action */
1333         ca->attr_action = ca->crlf_action;
1334         if (ca->crlf_action == CRLF_TEXT)
1335                 ca->crlf_action = text_eol_is_crlf() ? CRLF_TEXT_CRLF : CRLF_TEXT_INPUT;
1336         if (ca->crlf_action == CRLF_UNDEFINED && auto_crlf == AUTO_CRLF_FALSE)
1337                 ca->crlf_action = CRLF_BINARY;
1338         if (ca->crlf_action == CRLF_UNDEFINED && auto_crlf == AUTO_CRLF_TRUE)
1339                 ca->crlf_action = CRLF_AUTO_CRLF;
1340         if (ca->crlf_action == CRLF_UNDEFINED && auto_crlf == AUTO_CRLF_INPUT)
1341                 ca->crlf_action = CRLF_AUTO_INPUT;
1342 }
1343
1344 void reset_parsed_attributes(void)
1345 {
1346         struct convert_driver *drv, *next;
1347
1348         attr_check_free(check);
1349         check = NULL;
1350         reset_merge_attributes();
1351
1352         for (drv = user_convert; drv; drv = next) {
1353                 next = drv->next;
1354                 free((void *)drv->name);
1355                 free(drv);
1356         }
1357         user_convert = NULL;
1358         user_convert_tail = NULL;
1359 }
1360
1361 int would_convert_to_git_filter_fd(struct index_state *istate, const char *path)
1362 {
1363         struct conv_attrs ca;
1364
1365         convert_attrs(istate, &ca, path);
1366         if (!ca.drv)
1367                 return 0;
1368
1369         /*
1370          * Apply a filter to an fd only if the filter is required to succeed.
1371          * We must die if the filter fails, because the original data before
1372          * filtering is not available.
1373          */
1374         if (!ca.drv->required)
1375                 return 0;
1376
1377         return apply_filter(path, NULL, 0, -1, NULL, ca.drv, CAP_CLEAN, NULL, NULL);
1378 }
1379
1380 const char *get_convert_attr_ascii(struct index_state *istate, const char *path)
1381 {
1382         struct conv_attrs ca;
1383
1384         convert_attrs(istate, &ca, path);
1385         switch (ca.attr_action) {
1386         case CRLF_UNDEFINED:
1387                 return "";
1388         case CRLF_BINARY:
1389                 return "-text";
1390         case CRLF_TEXT:
1391                 return "text";
1392         case CRLF_TEXT_INPUT:
1393                 return "text eol=lf";
1394         case CRLF_TEXT_CRLF:
1395                 return "text eol=crlf";
1396         case CRLF_AUTO:
1397                 return "text=auto";
1398         case CRLF_AUTO_CRLF:
1399                 return "text=auto eol=crlf";
1400         case CRLF_AUTO_INPUT:
1401                 return "text=auto eol=lf";
1402         }
1403         return "";
1404 }
1405
1406 int convert_to_git(struct index_state *istate,
1407                    const char *path, const char *src, size_t len,
1408                    struct strbuf *dst, int conv_flags)
1409 {
1410         int ret = 0;
1411         struct conv_attrs ca;
1412
1413         convert_attrs(istate, &ca, path);
1414
1415         ret |= apply_filter(path, src, len, -1, dst, ca.drv, CAP_CLEAN, NULL, NULL);
1416         if (!ret && ca.drv && ca.drv->required)
1417                 die(_("%s: clean filter '%s' failed"), path, ca.drv->name);
1418
1419         if (ret && dst) {
1420                 src = dst->buf;
1421                 len = dst->len;
1422         }
1423
1424         ret |= encode_to_git(path, src, len, dst, ca.working_tree_encoding, conv_flags);
1425         if (ret && dst) {
1426                 src = dst->buf;
1427                 len = dst->len;
1428         }
1429
1430         if (!(conv_flags & CONV_EOL_KEEP_CRLF)) {
1431                 ret |= crlf_to_git(istate, path, src, len, dst, ca.crlf_action, conv_flags);
1432                 if (ret && dst) {
1433                         src = dst->buf;
1434                         len = dst->len;
1435                 }
1436         }
1437         return ret | ident_to_git(src, len, dst, ca.ident);
1438 }
1439
1440 void convert_to_git_filter_fd(struct index_state *istate,
1441                               const char *path, int fd, struct strbuf *dst,
1442                               int conv_flags)
1443 {
1444         struct conv_attrs ca;
1445         convert_attrs(istate, &ca, path);
1446
1447         assert(ca.drv);
1448
1449         if (!apply_filter(path, NULL, 0, fd, dst, ca.drv, CAP_CLEAN, NULL, NULL))
1450                 die(_("%s: clean filter '%s' failed"), path, ca.drv->name);
1451
1452         encode_to_git(path, dst->buf, dst->len, dst, ca.working_tree_encoding, conv_flags);
1453         crlf_to_git(istate, path, dst->buf, dst->len, dst, ca.crlf_action, conv_flags);
1454         ident_to_git(dst->buf, dst->len, dst, ca.ident);
1455 }
1456
1457 static int convert_to_working_tree_ca_internal(const struct conv_attrs *ca,
1458                                                const char *path, const char *src,
1459                                                size_t len, struct strbuf *dst,
1460                                                int normalizing,
1461                                                const struct checkout_metadata *meta,
1462                                                struct delayed_checkout *dco)
1463 {
1464         int ret = 0, ret_filter = 0;
1465
1466         ret |= ident_to_worktree(src, len, dst, ca->ident);
1467         if (ret) {
1468                 src = dst->buf;
1469                 len = dst->len;
1470         }
1471         /*
1472          * CRLF conversion can be skipped if normalizing, unless there
1473          * is a smudge or process filter (even if the process filter doesn't
1474          * support smudge).  The filters might expect CRLFs.
1475          */
1476         if ((ca->drv && (ca->drv->smudge || ca->drv->process)) || !normalizing) {
1477                 ret |= crlf_to_worktree(src, len, dst, ca->crlf_action);
1478                 if (ret) {
1479                         src = dst->buf;
1480                         len = dst->len;
1481                 }
1482         }
1483
1484         ret |= encode_to_worktree(path, src, len, dst, ca->working_tree_encoding);
1485         if (ret) {
1486                 src = dst->buf;
1487                 len = dst->len;
1488         }
1489
1490         ret_filter = apply_filter(
1491                 path, src, len, -1, dst, ca->drv, CAP_SMUDGE, meta, dco);
1492         if (!ret_filter && ca->drv && ca->drv->required)
1493                 die(_("%s: smudge filter %s failed"), path, ca->drv->name);
1494
1495         return ret | ret_filter;
1496 }
1497
/*
 * Convert content for the working tree (not normalizing), passing the
 * delayed-checkout state in `dco` on to the smudge filter.  The return
 * value is that of convert_to_working_tree_ca_internal().
 */
int async_convert_to_working_tree_ca(const struct conv_attrs *ca,
				     const char *path, const char *src,
				     size_t len, struct strbuf *dst,
				     const struct checkout_metadata *meta,
				     void *dco)
{
	struct delayed_checkout *delayed = dco;
	const int normalizing = 0;

	return convert_to_working_tree_ca_internal(ca, path, src, len, dst,
						   normalizing, meta, delayed);
}
1507
/*
 * Convert content for the working tree (not normalizing) without any
 * delayed-checkout state.  The return value is that of
 * convert_to_working_tree_ca_internal().
 */
int convert_to_working_tree_ca(const struct conv_attrs *ca,
			       const char *path, const char *src,
			       size_t len, struct strbuf *dst,
			       const struct checkout_metadata *meta)
{
	const int normalizing = 0;

	return convert_to_working_tree_ca_internal(ca, path, src, len, dst,
						   normalizing, meta, NULL);
}
1516
1517 int renormalize_buffer(struct index_state *istate, const char *path,
1518                        const char *src, size_t len, struct strbuf *dst)
1519 {
1520         struct conv_attrs ca;
1521         int ret;
1522
1523         convert_attrs(istate, &ca, path);
1524         ret = convert_to_working_tree_ca_internal(&ca, path, src, len, dst, 1,
1525                                                   NULL, NULL);
1526         if (ret) {
1527                 src = dst->buf;
1528                 len = dst->len;
1529         }
1530         return ret | convert_to_git(istate, path, src, len, dst, CONV_EOL_RENORMALIZE);
1531 }
1532
1533 /*****************************************************************
1534  *
1535  * Streaming conversion support
1536  *
1537  *****************************************************************/
1538
1539 typedef int (*filter_fn)(struct stream_filter *, /* filter callback: reads from input, writes to output */
1540                          const char *input, size_t *isize_p, /* the filters in this file treat input == NULL as a request to drain held state */
1541                          char *output, size_t *osize_p); /* null_filter_fn lowers *isize_p and *osize_p by the bytes it copied */
1542 typedef void (*free_fn)(struct stream_filter *); /* callback that disposes of a filter instance */
1543
1544 struct stream_filter_vtbl { /* the operations one kind of stream filter provides */
1545         filter_fn filter; /* processes a chunk of input into output */
1546         free_fn free; /* disposes of the filter instance */
1547 };
1548
1549 struct stream_filter { /* base object; concrete filters such as lf_to_crlf_filter embed it as their first member */
1550         struct stream_filter_vtbl *vtbl; /* operations for this instance */
1551 };
1552
1553 static int null_filter_fn(struct stream_filter *filter UNUSED,
1554                           const char *input, size_t *isize_p,
1555                           char *output, size_t *osize_p)
1556 {
1557         size_t count;
1558
1559         if (!input)
1560                 return 0; /* we do not keep any states */
1561         count = *isize_p;
1562         if (*osize_p < count)
1563                 count = *osize_p;
1564         if (count) {
1565                 memmove(output, input, count);
1566                 *isize_p -= count;
1567                 *osize_p -= count;
1568         }
1569         return 0;
1570 }
1571
1572 static void null_free_fn(struct stream_filter *filter UNUSED)
1573 {
1574         ; /* nothing -- null instances are shared */
1575 }
1576
1577 static struct stream_filter_vtbl null_vtbl = { /* operations of the pass-through filter */
1578         .filter = null_filter_fn,
1579         .free = null_free_fn,
1580 };
1581
1582 static struct stream_filter null_filter_singleton = { /* the pass-through filter instance that is_null_stream_filter() compares against */
1583         .vtbl = &null_vtbl,
1584 };
1585
1586 int is_null_stream_filter(struct stream_filter *filter)
1587 {
1588         return filter == &null_filter_singleton;
1589 }
1590
1591
1592 /*
1593  * LF-to-CRLF filter
1594  */
1595
1596 struct lf_to_crlf_filter { /* state of the LF-to-CRLF stream filter */
1597         struct stream_filter filter; /* base object; lf_to_crlf_filter_fn casts the struct stream_filter pointer back to this type */
1598         unsigned has_held:1; /* set while `held` carries a byte that has not been written out yet */
1599         char held; /* the byte being held back between calls */
1600 };
1601
1602 static int lf_to_crlf_filter_fn(struct stream_filter *filter,
1603                                 const char *input, size_t *isize_p,
1604                                 char *output, size_t *osize_p)
1605 {
1606         size_t count, o = 0;
1607         struct lf_to_crlf_filter *lf_to_crlf = (struct lf_to_crlf_filter *)filter;
1608
1609         /*
1610          * We may be holding onto the CR to see if it is followed by a
1611          * LF, in which case we would need to go to the main loop.
1612          * Otherwise, just emit it to the output stream.
1613          */
1614         if (lf_to_crlf->has_held && (lf_to_crlf->held != '\r' || !input)) {
1615                 output[o++] = lf_to_crlf->held;
1616                 lf_to_crlf->has_held = 0;
1617         }
1618
1619         /* We are told to drain */
1620         if (!input) {
1621                 *osize_p -= o;
1622                 return 0;
1623         }
1624
1625         count = *isize_p;
1626         if (count || lf_to_crlf->has_held) {
1627                 size_t i;
1628                 int was_cr = 0;
1629
1630                 if (lf_to_crlf->has_held) {
1631                         was_cr = 1;
1632                         lf_to_crlf->has_held = 0;
1633                 }
1634
1635                 for (i = 0; o < *osize_p && i < count; i++) {
1636                         char ch = input[i];
1637
1638                         if (ch == '\n') {
1639                                 output[o++] = '\r';
1640                         } else if (was_cr) {
1641                                 /*
1642                                  * Previous round saw CR and it is not followed
1643                                  * by a LF; emit the CR before processing the
1644                                  * current character.
1645                                  */
1646                                 output[o++] = '\r';
1647                         }
1648
1649                         /*
1650                          * We may have consumed the last output slot,
1651                          * in which case we need to break out of this
1652                          * loop; hold the current character before
1653                          * returning.
1654                          */
1655                         if (*osize_p <= o) {
1656                                 lf_to_crlf->has_held = 1;
1657                                 lf_to_crlf->held = ch;
1658                                 continue; /* break but increment i */
1659                         }
1660
1661                         if (ch == '\r') {
1662                                 was_cr = 1;
1663                                 continue;
1664                         }
1665
1666                         was_cr = 0;
1667                         output[o++] = ch;
1668                 }
1669
1670                 *osize_p -= o;
1671                 *isize_p -= i;
1672
1673                 if (!lf_to_crlf->has_held && was_cr) {
1674                         lf_to_crlf->has_held = 1;
1675                         lf_to_crlf->held = '\r';
1676                 }
1677         }
1678         return 0;
1679 }
1680
/* Release an LF-to-CRLF filter object. */
static void lf_to_crlf_free_fn(struct stream_filter *filter)
{
        struct lf_to_crlf_filter *self = (struct lf_to_crlf_filter *)filter;

        free(self);
}
1685
/* Operations table of the LF-to-CRLF filter: its filter and release callbacks. */
static struct stream_filter_vtbl lf_to_crlf_vtbl = {
        .filter = lf_to_crlf_filter_fn,
        .free = lf_to_crlf_free_fn,
};
1690
1691 static struct stream_filter *lf_to_crlf_filter(void)
1692 {
1693         struct lf_to_crlf_filter *lf_to_crlf = xcalloc(1, sizeof(*lf_to_crlf));
1694
1695         lf_to_crlf->filter.vtbl = &lf_to_crlf_vtbl;
1696         return (struct stream_filter *)lf_to_crlf;
1697 }
1698
1699 /*
1700  * Cascade filter
1701  */
/* Size in bytes of the intermediate buffer between the two cascaded filters. */
#define FILTER_BUFFER 1024
/* A filter that pipes the output of filter "one" into filter "two". */
struct cascade_filter {
        /* Must stay first: the generic filter pointer is cast to this struct. */
        struct stream_filter filter;
        /* First stage; reads the caller's input and writes into "buf". */
        struct stream_filter *one;
        /* Second stage; reads from "buf" and writes to the caller's output. */
        struct stream_filter *two;
        /* Output of "one" that is waiting to be fed to "two". */
        char buf[FILTER_BUFFER];
        /* buf[ptr..end) is the part of "buf" not yet consumed by "two". */
        int end, ptr;
};
1710
1711 static int cascade_filter_fn(struct stream_filter *filter,
1712                              const char *input, size_t *isize_p,
1713                              char *output, size_t *osize_p)
1714 {
1715         struct cascade_filter *cas = (struct cascade_filter *) filter;
1716         size_t filled = 0;
1717         size_t sz = *osize_p;
1718         size_t to_feed, remaining;
1719
1720         /*
1721          * input -- (one) --> buf -- (two) --> output
1722          */
1723         while (filled < sz) {
1724                 remaining = sz - filled;
1725
1726                 /* do we already have something to feed two with? */
1727                 if (cas->ptr < cas->end) {
1728                         to_feed = cas->end - cas->ptr;
1729                         if (stream_filter(cas->two,
1730                                           cas->buf + cas->ptr, &to_feed,
1731                                           output + filled, &remaining))
1732                                 return -1;
1733                         cas->ptr += (cas->end - cas->ptr) - to_feed;
1734                         filled = sz - remaining;
1735                         continue;
1736                 }
1737
1738                 /* feed one from upstream and have it emit into our buffer */
1739                 to_feed = input ? *isize_p : 0;
1740                 if (input && !to_feed)
1741                         break;
1742                 remaining = sizeof(cas->buf);
1743                 if (stream_filter(cas->one,
1744                                   input, &to_feed,
1745                                   cas->buf, &remaining))
1746                         return -1;
1747                 cas->end = sizeof(cas->buf) - remaining;
1748                 cas->ptr = 0;
1749                 if (input) {
1750                         size_t fed = *isize_p - to_feed;
1751                         *isize_p -= fed;
1752                         input += fed;
1753                 }
1754
1755                 /* do we know that we drained one completely? */
1756                 if (input || cas->end)
1757                         continue;
1758
1759                 /* tell two to drain; we have nothing more to give it */
1760                 to_feed = 0;
1761                 remaining = sz - filled;
1762                 if (stream_filter(cas->two,
1763                                   NULL, &to_feed,
1764                                   output + filled, &remaining))
1765                         return -1;
1766                 if (remaining == (sz - filled))
1767                         break; /* completely drained two */
1768                 filled = sz - remaining;
1769         }
1770         *osize_p -= filled;
1771         return 0;
1772 }
1773
1774 static void cascade_free_fn(struct stream_filter *filter)
1775 {
1776         struct cascade_filter *cas = (struct cascade_filter *)filter;
1777         free_stream_filter(cas->one);
1778         free_stream_filter(cas->two);
1779         free(filter);
1780 }
1781
/* Operations table of the cascade filter: its filter and release callbacks. */
static struct stream_filter_vtbl cascade_vtbl = {
        .filter = cascade_filter_fn,
        .free = cascade_free_fn,
};
1786
1787 static struct stream_filter *cascade_filter(struct stream_filter *one,
1788                                             struct stream_filter *two)
1789 {
1790         struct cascade_filter *cascade;
1791
1792         if (!one || is_null_stream_filter(one))
1793                 return two;
1794         if (!two || is_null_stream_filter(two))
1795                 return one;
1796
1797         cascade = xmalloc(sizeof(*cascade));
1798         cascade->one = one;
1799         cascade->two = two;
1800         cascade->end = cascade->ptr = 0;
1801         cascade->filter.vtbl = &cascade_vtbl;
1802         return (struct stream_filter *)cascade;
1803 }
1804
1805 /*
1806  * ident filter
1807  */
/* state: bytes queued in "left" are being written to the output */
#define IDENT_DRAINING (-1)
/* state: "$Id:" was seen; collecting bytes up to the next '$' or LF */
#define IDENT_SKIPPING (-2)
/* State an ident filter carries from one call to the next. */
struct ident_filter {
        /* Must stay first: the generic filter pointer is cast to this struct. */
        struct stream_filter filter;
        /* Bytes that have been consumed from the input but not yet written. */
        struct strbuf left;
        /*
         * One of the IDENT_* values above, or a non-negative count of
         * how many leading bytes of "$Id" have been matched so far.
         */
        int state;
        /* Expansion text ": <hex object name> $", NUL-terminated. */
        char ident[GIT_MAX_HEXSZ + 5];
};
1816
/*
 * Return 1 if "str" starts with "$Id: " and the text after that prefix
 * contains a whitespace character that is not immediately followed by
 * '$'; return 0 otherwise.
 */
static int is_foreign_ident(const char *str)
{
        const char *p;

        if (!skip_prefix(str, "$Id: ", &str))
                return 0;

        for (p = str; *p; p++) {
                if (!isspace(*p))
                        continue;
                if (p[1] != '$')
                        return 1;
        }
        return 0;
}
1829
1830 static void ident_drain(struct ident_filter *ident, char **output_p, size_t *osize_p)
1831 {
1832         size_t to_drain = ident->left.len;
1833
1834         if (*osize_p < to_drain)
1835                 to_drain = *osize_p;
1836         if (to_drain) {
1837                 memcpy(*output_p, ident->left.buf, to_drain);
1838                 strbuf_remove(&ident->left, 0, to_drain);
1839                 *output_p += to_drain;
1840                 *osize_p -= to_drain;
1841         }
1842         if (!ident->left.len)
1843                 ident->state = 0;
1844 }
1845
1846 static int ident_filter_fn(struct stream_filter *filter,
1847                            const char *input, size_t *isize_p,
1848                            char *output, size_t *osize_p)
1849 {
1850         struct ident_filter *ident = (struct ident_filter *)filter;
1851         static const char head[] = "$Id";
1852
1853         if (!input) {
1854                 /* drain upon eof */
1855                 switch (ident->state) {
1856                 default:
1857                         strbuf_add(&ident->left, head, ident->state);
1858                         /* fallthrough */
1859                 case IDENT_SKIPPING:
1860                         /* fallthrough */
1861                 case IDENT_DRAINING:
1862                         ident_drain(ident, &output, osize_p);
1863                 }
1864                 return 0;
1865         }
1866
1867         while (*isize_p || (ident->state == IDENT_DRAINING)) {
1868                 int ch;
1869
1870                 if (ident->state == IDENT_DRAINING) {
1871                         ident_drain(ident, &output, osize_p);
1872                         if (!*osize_p)
1873                                 break;
1874                         continue;
1875                 }
1876
1877                 ch = *(input++);
1878                 (*isize_p)--;
1879
1880                 if (ident->state == IDENT_SKIPPING) {
1881                         /*
1882                          * Skipping until '$' or LF, but keeping them
1883                          * in case it is a foreign ident.
1884                          */
1885                         strbuf_addch(&ident->left, ch);
1886                         if (ch != '\n' && ch != '$')
1887                                 continue;
1888                         if (ch == '$' && !is_foreign_ident(ident->left.buf)) {
1889                                 strbuf_setlen(&ident->left, sizeof(head) - 1);
1890                                 strbuf_addstr(&ident->left, ident->ident);
1891                         }
1892                         ident->state = IDENT_DRAINING;
1893                         continue;
1894                 }
1895
1896                 if (ident->state < sizeof(head) &&
1897                     head[ident->state] == ch) {
1898                         ident->state++;
1899                         continue;
1900                 }
1901
1902                 if (ident->state)
1903                         strbuf_add(&ident->left, head, ident->state);
1904                 if (ident->state == sizeof(head) - 1) {
1905                         if (ch != ':' && ch != '$') {
1906                                 strbuf_addch(&ident->left, ch);
1907                                 ident->state = 0;
1908                                 continue;
1909                         }
1910
1911                         if (ch == ':') {
1912                                 strbuf_addch(&ident->left, ch);
1913                                 ident->state = IDENT_SKIPPING;
1914                         } else {
1915                                 strbuf_addstr(&ident->left, ident->ident);
1916                                 ident->state = IDENT_DRAINING;
1917                         }
1918                         continue;
1919                 }
1920
1921                 strbuf_addch(&ident->left, ch);
1922                 ident->state = IDENT_DRAINING;
1923         }
1924         return 0;
1925 }
1926
1927 static void ident_free_fn(struct stream_filter *filter)
1928 {
1929         struct ident_filter *ident = (struct ident_filter *)filter;
1930         strbuf_release(&ident->left);
1931         free(filter);
1932 }
1933
/* Operations table of the ident filter: its filter and release callbacks. */
static struct stream_filter_vtbl ident_vtbl = {
        .filter = ident_filter_fn,
        .free = ident_free_fn,
};
1938
1939 static struct stream_filter *ident_filter(const struct object_id *oid)
1940 {
1941         struct ident_filter *ident = xmalloc(sizeof(*ident));
1942
1943         xsnprintf(ident->ident, sizeof(ident->ident),
1944                   ": %s $", oid_to_hex(oid));
1945         strbuf_init(&ident->left, 0);
1946         ident->filter.vtbl = &ident_vtbl;
1947         ident->state = 0;
1948         return (struct stream_filter *)ident;
1949 }
1950
1951 /*
1952  * Return an appropriately constructed filter for the given ca, or NULL if
1953  * the contents cannot be filtered without reading the whole thing
1954  * in-core.
1955  *
1956  * Note that you would be crazy to set CRLF, smudge/clean or ident to a
1957  * large binary blob you would want us not to slurp into the memory!
1958  */
1959 struct stream_filter *get_stream_filter_ca(const struct conv_attrs *ca,
1960                                            const struct object_id *oid)
1961 {
1962         struct stream_filter *filter = NULL;
1963
1964         if (classify_conv_attrs(ca) != CA_CLASS_STREAMABLE)
1965                 return NULL;
1966
1967         if (ca->ident)
1968                 filter = ident_filter(oid);
1969
1970         if (output_eol(ca->crlf_action) == EOL_CRLF)
1971                 filter = cascade_filter(filter, lf_to_crlf_filter());
1972         else
1973                 filter = cascade_filter(filter, &null_filter_singleton);
1974
1975         return filter;
1976 }
1977
1978 struct stream_filter *get_stream_filter(struct index_state *istate,
1979                                         const char *path,
1980                                         const struct object_id *oid)
1981 {
1982         struct conv_attrs ca;
1983         convert_attrs(istate, &ca, path);
1984         return get_stream_filter_ca(&ca, oid);
1985 }
1986
1987 void free_stream_filter(struct stream_filter *filter)
1988 {
1989         filter->vtbl->free(filter);
1990 }
1991
1992 int stream_filter(struct stream_filter *filter,
1993                   const char *input, size_t *isize_p,
1994                   char *output, size_t *osize_p)
1995 {
1996         return filter->vtbl->filter(filter, input, isize_p, output, osize_p);
1997 }
1998
1999 void init_checkout_metadata(struct checkout_metadata *meta, const char *refname,
2000                             const struct object_id *treeish,
2001                             const struct object_id *blob)
2002 {
2003         memset(meta, 0, sizeof(*meta));
2004         if (refname)
2005                 meta->refname = refname;
2006         if (treeish)
2007                 oidcpy(&meta->treeish, treeish);
2008         if (blob)
2009                 oidcpy(&meta->blob, blob);
2010 }
2011
2012 void clone_checkout_metadata(struct checkout_metadata *dst,
2013                              const struct checkout_metadata *src,
2014                              const struct object_id *blob)
2015 {
2016         memcpy(dst, src, sizeof(*dst));
2017         if (blob)
2018                 oidcpy(&dst->blob, blob);
2019 }
2020
2021 enum conv_attrs_classification classify_conv_attrs(const struct conv_attrs *ca)
2022 {
2023         if (ca->drv) {
2024                 if (ca->drv->process)
2025                         return CA_CLASS_INCORE_PROCESS;
2026                 if (ca->drv->smudge || ca->drv->clean)
2027                         return CA_CLASS_INCORE_FILTER;
2028         }
2029
2030         if (ca->working_tree_encoding)
2031                 return CA_CLASS_INCORE;
2032
2033         if (ca->crlf_action == CRLF_AUTO || ca->crlf_action == CRLF_AUTO_CRLF)
2034                 return CA_CLASS_INCORE;
2035
2036         return CA_CLASS_STREAMABLE;
2037 }