builtin/unpack-objects.c

   1 #include "builtin.h"
   2 #include "bulk-checkin.h"
   3 #include "config.h"
   4 #include "environment.h"
   5 #include "gettext.h"
   6 #include "git-zlib.h"
   7 #include "hex.h"
   8 #include "object-store-ll.h"
   9 #include "object.h"
  10 #include "delta.h"
  11 #include "pack.h"
  12 #include "blob.h"
  13 #include "commit.h"
  14 #include "replace-object.h"
  15 #include "strbuf.h"
  16 #include "tag.h"
  17 #include "tree.h"
  18 #include "tree-walk.h"
  19 #include "progress.h"
  20 #include "decorate.h"
  21 #include "fsck.h"
  22
  23 static int dry_run, quiet, recover, has_errors, strict;
  24 static const char unpack_usage[] = "git unpack-objects [-n] [-q] [-r] [--strict]";
  25
  26 /* We always read in 4kB chunks. */
  27 static unsigned char buffer[4096];
  28 static unsigned int offset, len;
  29 static off_t consumed_bytes;
  30 static off_t max_input_size;
  31 static git_hash_ctx ctx;
  32 static struct fsck_options fsck_options = FSCK_OPTIONS_STRICT;
  33 static struct progress *progress;
  34
  35 /*
  36  * When running under --strict mode, objects whose reachability are
  37  * suspect are kept in core without getting written in the object
  38  * store.
  39  */
  40 struct obj_buffer {
  41         char *buffer;
  42         unsigned long size;
  43 };
  44
  45 static struct decoration obj_decorate;
  46
  47 static struct obj_buffer *lookup_object_buffer(struct object *base)
  48 {
  49         return lookup_decoration(&obj_decorate, base);
  50 }
  51
  52 static void add_object_buffer(struct object *object, char *buffer, unsigned long size)
  53 {
  54         struct obj_buffer *obj;
  55         CALLOC_ARRAY(obj, 1);
  56         obj->buffer = buffer;
  57         obj->size = size;
  58         if (add_decoration(&obj_decorate, object, obj))
  59                 die("object %s tried to add buffer twice!", oid_to_hex(&object->oid));
  60 }
  61
  62 /*
  63  * Make sure at least "min" bytes are available in the buffer, and
  64  * return the pointer to the buffer.
  65  */
  66 static void *fill(int min)
  67 {
  68         if (min <= len)
  69                 return buffer + offset;
  70         if (min > sizeof(buffer))
  71                 die("cannot fill %d bytes", min);
  72         if (offset) {
  73                 the_hash_algo->update_fn(&ctx, buffer, offset);
  74                 memmove(buffer, buffer + offset, len);
  75                 offset = 0;
  76         }
  77         do {
  78                 ssize_t ret = xread(0, buffer + len, sizeof(buffer) - len);
  79                 if (ret <= 0) {
  80                         if (!ret)
  81                                 die("early EOF");
  82                         die_errno("read error on input");
  83                 }
  84                 len += ret;
  85         } while (len < min);
  86         return buffer;
  87 }
  88
  89 static void use(int bytes)
  90 {
  91         if (bytes > len)
  92                 die("used more bytes than were available");
  93         len -= bytes;
  94         offset += bytes;
  95
  96         /* make sure off_t is sufficiently large not to wrap */
  97         if (signed_add_overflows(consumed_bytes, bytes))
  98                 die("pack too large for current definition of off_t");
  99         consumed_bytes += bytes;
 100         if (max_input_size && consumed_bytes > max_input_size)
 101                 die(_("pack exceeds maximum allowed size"));
 102         display_throughput(progress, consumed_bytes);
 103 }
 104
 105 /*
 106  * Decompress zstream from the standard input into a newly
 107  * allocated buffer of specified size and return the buffer.
 108  * The caller is responsible to free the returned buffer.
 109  *
 110  * But for dry_run mode, "get_data()" is only used to check the
 111  * integrity of data, and the returned buffer is not used at all.
 112  * Therefore, in dry_run mode, "get_data()" will release the small
 113  * allocated buffer which is reused to hold temporary zstream output
 114  * and return NULL instead of returning garbage data.
 115  */
 116 static void *get_data(unsigned long size)
 117 {
 118         git_zstream stream;
 119         unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
 120         void *buf = xmallocz(bufsize);
 121
 122         memset(&stream, 0, sizeof(stream));
 123
 124         stream.next_out = buf;
 125         stream.avail_out = bufsize;
 126         stream.next_in = fill(1);
 127         stream.avail_in = len;
 128         git_inflate_init(&stream);
 129
 130         for (;;) {
 131                 int ret = git_inflate(&stream, 0);
 132                 use(len - stream.avail_in);
 133                 if (stream.total_out == size && ret == Z_STREAM_END)
 134                         break;
 135                 if (ret != Z_OK) {
 136                         error("inflate returned %d", ret);
 137                         FREE_AND_NULL(buf);
 138                         if (!recover)
 139                                 exit(1);
 140                         has_errors = 1;
 141                         break;
 142                 }
 143                 stream.next_in = fill(1);
 144                 stream.avail_in = len;
 145                 if (dry_run) {
 146                         /* reuse the buffer in dry_run mode */
 147                         stream.next_out = buf;
 148                         stream.avail_out = bufsize > size - stream.total_out ?
 149                                                    size - stream.total_out :
 150                                                    bufsize;
 151                 }
 152         }
 153         git_inflate_end(&stream);
 154         if (dry_run)
 155                 FREE_AND_NULL(buf);
 156         return buf;
 157 }
 158
 159 struct delta_info {
 160         struct object_id base_oid;
 161         unsigned nr;
 162         off_t base_offset;
 163         unsigned long size;
 164         void *delta;
 165         struct delta_info *next;
 166 };
 167
 168 static struct delta_info *delta_list;
 169
 170 static void add_delta_to_list(unsigned nr, const struct object_id *base_oid,
 171                               off_t base_offset,
 172                               void *delta, unsigned long size)
 173 {
 174         struct delta_info *info = xmalloc(sizeof(*info));
 175
 176         oidcpy(&info->base_oid, base_oid);
 177         info->base_offset = base_offset;
 178         info->size = size;
 179         info->delta = delta;
 180         info->nr = nr;
 181         info->next = delta_list;
 182         delta_list = info;
 183 }
 184
 185 struct obj_info {
 186         off_t offset;
 187         struct object_id oid;
 188         struct object *obj;
 189 };
 190
 191 /* Remember to update object flag allocation in object.h */
 192 #define FLAG_OPEN (1u<<20)
 193 #define FLAG_WRITTEN (1u<<21)
 194
 195 static struct obj_info *obj_list;
 196 static unsigned nr_objects;
 197
 198 /*
 199  * Called only from check_object() after it verified this object
 200  * is Ok.
 201  */
 202 static void write_cached_object(struct object *obj, struct obj_buffer *obj_buf)
 203 {
 204         struct object_id oid;
 205
 206         if (write_object_file(obj_buf->buffer, obj_buf->size,
 207                               obj->type, &oid) < 0)
 208                 die("failed to write object %s", oid_to_hex(&obj->oid));
 209         obj->flags |= FLAG_WRITTEN;
 210 }
 211
 212 /*
 213  * At the very end of the processing, write_rest() scans the objects
 214  * that have reachability requirements and calls this function.
 215  * Verify its reachability and validity recursively and write it out.
 216  */
 217 static int check_object(struct object *obj, enum object_type type,
 218                         void *data UNUSED,
 219                         struct fsck_options *options UNUSED)
 220 {
 221         struct obj_buffer *obj_buf;
 222
 223         if (!obj)
 224                 return 1;
 225
 226         if (obj->flags & FLAG_WRITTEN)
 227                 return 0;
 228
 229         if (type != OBJ_ANY && obj->type != type)
 230                 die("object type mismatch");
 231
 232         if (!(obj->flags & FLAG_OPEN)) {
 233                 unsigned long size;
 234                 int type = oid_object_info(the_repository, &obj->oid, &size);
 235                 if (type != obj->type || type <= 0)
 236                         die("object of unexpected type");
 237                 obj->flags |= FLAG_WRITTEN;
 238                 return 0;
 239         }
 240
 241         obj_buf = lookup_object_buffer(obj);
 242         if (!obj_buf)
 243                 die("Whoops! Cannot find object '%s'", oid_to_hex(&obj->oid));
 244         if (fsck_object(obj, obj_buf->buffer, obj_buf->size, &fsck_options))
 245                 die("fsck error in packed object");
 246         fsck_options.walk = check_object;
 247         if (fsck_walk(obj, NULL, &fsck_options))
 248                 die("Error on reachable objects of %s", oid_to_hex(&obj->oid));
 249         write_cached_object(obj, obj_buf);
 250         return 0;
 251 }
 252
 253 static void write_rest(void)
 254 {
 255         unsigned i;
 256         for (i = 0; i < nr_objects; i++) {
 257                 if (obj_list[i].obj)
 258                         check_object(obj_list[i].obj, OBJ_ANY, NULL, NULL);
 259         }
 260 }
 261
 262 static void added_object(unsigned nr, enum object_type type,
 263                          void *data, unsigned long size);
 264
 265 /*
 266  * Write out nr-th object from the list, now we know the contents
 267  * of it.  Under --strict, this buffers structured objects in-core,
 268  * to be checked at the end.
 269  */
 270 static void write_object(unsigned nr, enum object_type type,
 271                          void *buf, unsigned long size)
 272 {
 273         if (!strict) {
 274                 if (write_object_file(buf, size, type,
 275                                       &obj_list[nr].oid) < 0)
 276                         die("failed to write object");
 277                 added_object(nr, type, buf, size);
 278                 free(buf);
 279                 obj_list[nr].obj = NULL;
 280         } else if (type == OBJ_BLOB) {
 281                 struct blob *blob;
 282                 if (write_object_file(buf, size, type,
 283                                       &obj_list[nr].oid) < 0)
 284                         die("failed to write object");
 285                 added_object(nr, type, buf, size);
 286                 free(buf);
 287
 288                 blob = lookup_blob(the_repository, &obj_list[nr].oid);
 289                 if (blob)
 290                         blob->object.flags |= FLAG_WRITTEN;
 291                 else
 292                         die("invalid blob object");
 293                 obj_list[nr].obj = NULL;
 294         } else {
 295                 struct object *obj;
 296                 int eaten;
 297                 hash_object_file(the_hash_algo, buf, size, type,
 298                                  &obj_list[nr].oid);
 299                 added_object(nr, type, buf, size);
 300                 obj = parse_object_buffer(the_repository, &obj_list[nr].oid,
 301                                           type, size, buf,
 302                                           &eaten);
 303                 if (!obj)
 304                         die("invalid %s", type_name(type));
 305                 add_object_buffer(obj, buf, size);
 306                 obj->flags |= FLAG_OPEN;
 307                 obj_list[nr].obj = obj;
 308         }
 309 }
 310
 311 static void resolve_delta(unsigned nr, enum object_type type,
 312                           void *base, unsigned long base_size,
 313                           void *delta, unsigned long delta_size)
 314 {
 315         void *result;
 316         unsigned long result_size;
 317
 318         result = patch_delta(base, base_size,
 319                              delta, delta_size,
 320                              &result_size);
 321         if (!result)
 322                 die("failed to apply delta");
 323         free(delta);
 324         write_object(nr, type, result, result_size);
 325 }
 326
 327 /*
 328  * We now know the contents of an object (which is nr-th in the pack);
 329  * resolve all the deltified objects that are based on it.
 330  */
 331 static void added_object(unsigned nr, enum object_type type,
 332                          void *data, unsigned long size)
 333 {
 334         struct delta_info **p = &delta_list;
 335         struct delta_info *info;
 336
 337         while ((info = *p) != NULL) {
 338                 if (oideq(&info->base_oid, &obj_list[nr].oid) ||
 339                     info->base_offset == obj_list[nr].offset) {
 340                         *p = info->next;
 341                         p = &delta_list;
 342                         resolve_delta(info->nr, type, data, size,
 343                                       info->delta, info->size);
 344                         free(info);
 345                         continue;
 346                 }
 347                 p = &info->next;
 348         }
 349 }
 350
 351 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 352                                    unsigned nr)
 353 {
 354         void *buf = get_data(size);
 355
 356         if (buf)
 357                 write_object(nr, type, buf, size);
 358 }
 359
 360 struct input_zstream_data {
 361         git_zstream *zstream;
 362         unsigned char buf[8192];
 363         int status;
 364 };
 365
 366 static const void *feed_input_zstream(struct input_stream *in_stream,
 367                                       unsigned long *readlen)
 368 {
 369         struct input_zstream_data *data = in_stream->data;
 370         git_zstream *zstream = data->zstream;
 371         void *in = fill(1);
 372
 373         if (in_stream->is_finished) {
 374                 *readlen = 0;
 375                 return NULL;
 376         }
 377
 378         zstream->next_out = data->buf;
 379         zstream->avail_out = sizeof(data->buf);
 380         zstream->next_in = in;
 381         zstream->avail_in = len;
 382
 383         data->status = git_inflate(zstream, 0);
 384
 385         in_stream->is_finished = data->status != Z_OK;
 386         use(len - zstream->avail_in);
 387         *readlen = sizeof(data->buf) - zstream->avail_out;
 388
 389         return data->buf;
 390 }
 391
 392 static void stream_blob(unsigned long size, unsigned nr)
 393 {
 394         git_zstream zstream = { 0 };
 395         struct input_zstream_data data = { 0 };
 396         struct input_stream in_stream = {
 397                 .read = feed_input_zstream,
 398                 .data = &data,
 399         };
 400         struct obj_info *info = &obj_list[nr];
 401
 402         data.zstream = &zstream;
 403         git_inflate_init(&zstream);
 404
 405         if (stream_loose_object(&in_stream, size, &info->oid))
 406                 die(_("failed to write object in stream"));
 407
 408         if (data.status != Z_STREAM_END)
 409                 die(_("inflate returned (%d)"), data.status);
 410         git_inflate_end(&zstream);
 411
 412         if (strict) {
 413                 struct blob *blob = lookup_blob(the_repository, &info->oid);
 414
 415                 if (!blob)
 416                         die(_("invalid blob object from stream"));
 417                 blob->object.flags |= FLAG_WRITTEN;
 418         }
 419         info->obj = NULL;
 420 }
 421
 422 static int resolve_against_held(unsigned nr, const struct object_id *base,
 423                                 void *delta_data, unsigned long delta_size)
 424 {
 425         struct object *obj;
 426         struct obj_buffer *obj_buffer;
 427         obj = lookup_object(the_repository, base);
 428         if (!obj)
 429                 return 0;
 430         obj_buffer = lookup_object_buffer(obj);
 431         if (!obj_buffer)
 432                 return 0;
 433         resolve_delta(nr, obj->type, obj_buffer->buffer,
 434                       obj_buffer->size, delta_data, delta_size);
 435         return 1;
 436 }
 437
 438 static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 439                                unsigned nr)
 440 {
 441         void *delta_data, *base;
 442         unsigned long base_size;
 443         struct object_id base_oid;
 444
 445         if (type == OBJ_REF_DELTA) {
 446                 oidread(&base_oid, fill(the_hash_algo->rawsz));
 447                 use(the_hash_algo->rawsz);
 448                 delta_data = get_data(delta_size);
 449                 if (!delta_data)
 450                         return;
 451                 if (repo_has_object_file(the_repository, &base_oid))
 452                         ; /* Ok we have this one */
 453                 else if (resolve_against_held(nr, &base_oid,
 454                                               delta_data, delta_size))
 455                         return; /* we are done */
 456                 else {
 457                         /* cannot resolve yet --- queue it */
 458                         oidclr(&obj_list[nr].oid);
 459                         add_delta_to_list(nr, &base_oid, 0, delta_data, delta_size);
 460                         return;
 461                 }
 462         } else {
 463                 unsigned base_found = 0;
 464                 unsigned char *pack, c;
 465                 off_t base_offset;
 466                 unsigned lo, mid, hi;
 467
 468                 pack = fill(1);
 469                 c = *pack;
 470                 use(1);
 471                 base_offset = c & 127;
 472                 while (c & 128) {
 473                         base_offset += 1;
 474                         if (!base_offset || MSB(base_offset, 7))
 475                                 die("offset value overflow for delta base object");
 476                         pack = fill(1);
 477                         c = *pack;
 478                         use(1);
 479                         base_offset = (base_offset << 7) + (c & 127);
 480                 }
 481                 base_offset = obj_list[nr].offset - base_offset;
 482                 if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 483                         die("offset value out of bound for delta base object");
 484
 485                 delta_data = get_data(delta_size);
 486                 if (!delta_data)
 487                         return;
 488                 lo = 0;
 489                 hi = nr;
 490                 while (lo < hi) {
 491                         mid = lo + (hi - lo) / 2;
 492                         if (base_offset < obj_list[mid].offset) {
 493                                 hi = mid;
 494                         } else if (base_offset > obj_list[mid].offset) {
 495                                 lo = mid + 1;
 496                         } else {
 497                                 oidcpy(&base_oid, &obj_list[mid].oid);
 498                                 base_found = !is_null_oid(&base_oid);
 499                                 break;
 500                         }
 501                 }
 502                 if (!base_found) {
 503                         /*
 504                          * The delta base object is itself a delta that
 505                          * has not been resolved yet.
 506                          */
 507                         oidclr(&obj_list[nr].oid);
 508                         add_delta_to_list(nr, null_oid(), base_offset,
 509                                           delta_data, delta_size);
 510                         return;
 511                 }
 512         }
 513
 514         if (resolve_against_held(nr, &base_oid, delta_data, delta_size))
 515                 return;
 516
 517         base = repo_read_object_file(the_repository, &base_oid, &type,
 518                                      &base_size);
 519         if (!base) {
 520                 error("failed to read delta-pack base object %s",
 521                       oid_to_hex(&base_oid));
 522                 if (!recover)
 523                         exit(1);
 524                 has_errors = 1;
 525                 return;
 526         }
 527         resolve_delta(nr, type, base, base_size, delta_data, delta_size);
 528         free(base);
 529 }
 530
 531 static void unpack_one(unsigned nr)
 532 {
 533         unsigned shift;
 534         unsigned char *pack;
 535         unsigned long size, c;
 536         enum object_type type;
 537
 538         obj_list[nr].offset = consumed_bytes;
 539
 540         pack = fill(1);
 541         c = *pack;
 542         use(1);
 543         type = (c >> 4) & 7;
 544         size = (c & 15);
 545         shift = 4;
 546         while (c & 0x80) {
 547                 pack = fill(1);
 548                 c = *pack;
 549                 use(1);
 550                 size += (c & 0x7f) << shift;
 551                 shift += 7;
 552         }
 553
 554         switch (type) {
 555         case OBJ_BLOB:
 556                 if (!dry_run && size > big_file_threshold) {
 557                         stream_blob(size, nr);
 558                         return;
 559                 }
 560                 /* fallthrough */
 561         case OBJ_COMMIT:
 562         case OBJ_TREE:
 563         case OBJ_TAG:
 564                 unpack_non_delta_entry(type, size, nr);
 565                 return;
 566         case OBJ_REF_DELTA:
 567         case OBJ_OFS_DELTA:
 568                 unpack_delta_entry(type, size, nr);
 569                 return;
 570         default:
 571                 error("bad object type %d", type);
 572                 has_errors = 1;
 573                 if (recover)
 574                         return;
 575                 exit(1);
 576         }
 577 }
 578
 579 static void unpack_all(void)
 580 {
 581         int i;
 582         struct pack_header *hdr = fill(sizeof(struct pack_header));
 583
 584         nr_objects = ntohl(hdr->hdr_entries);
 585
 586         if (ntohl(hdr->hdr_signature) != PACK_SIGNATURE)
 587                 die("bad pack file");
 588         if (!pack_version_ok(hdr->hdr_version))
 589                 die("unknown pack file version %"PRIu32,
 590                         ntohl(hdr->hdr_version));
 591         use(sizeof(struct pack_header));
 592
 593         if (!quiet)
 594                 progress = start_progress(_("Unpacking objects"), nr_objects);
 595         CALLOC_ARRAY(obj_list, nr_objects);
 596         begin_odb_transaction();
 597         for (i = 0; i < nr_objects; i++) {
 598                 unpack_one(i);
 599                 display_progress(progress, i + 1);
 600         }
 601         end_odb_transaction();
 602         stop_progress(&progress);
 603
 604         if (delta_list)
 605                 die("unresolved deltas left after unpacking");
 606 }
 607
 608 int cmd_unpack_objects(int argc, const char **argv, const char *prefix UNUSED)
 609 {
 610         int i;
 611         struct object_id oid;
 612         git_hash_ctx tmp_ctx;
 613
 614         disable_replace_refs();
 615
 616         git_config(git_default_config, NULL);
 617
 618         quiet = !isatty(2);
 619
 620         for (i = 1 ; i < argc; i++) {
 621                 const char *arg = argv[i];
 622
 623                 if (*arg == '-') {
 624                         if (!strcmp(arg, "-n")) {
 625                                 dry_run = 1;
 626                                 continue;
 627                         }
 628                         if (!strcmp(arg, "-q")) {
 629                                 quiet = 1;
 630                                 continue;
 631                         }
 632                         if (!strcmp(arg, "-r")) {
 633                                 recover = 1;
 634                                 continue;
 635                         }
 636                         if (!strcmp(arg, "--strict")) {
 637                                 strict = 1;
 638                                 continue;
 639                         }
 640                         if (skip_prefix(arg, "--strict=", &arg)) {
 641                                 strict = 1;
 642                                 fsck_set_msg_types(&fsck_options, arg);
 643                                 continue;
 644                         }
 645                         if (starts_with(arg, "--pack_header=")) {
 646                                 struct pack_header *hdr;
 647                                 char *c;
 648
 649                                 hdr = (struct pack_header *)buffer;
 650                                 hdr->hdr_signature = htonl(PACK_SIGNATURE);
 651                                 hdr->hdr_version = htonl(strtoul(arg + 14, &c, 10));
 652                                 if (*c != ',')
 653                                         die("bad %s", arg);
 654                                 hdr->hdr_entries = htonl(strtoul(c + 1, &c, 10));
 655                                 if (*c)
 656                                         die("bad %s", arg);
 657                                 len = sizeof(*hdr);
 658                                 continue;
 659                         }
 660                         if (skip_prefix(arg, "--max-input-size=", &arg)) {
 661                                 max_input_size = strtoumax(arg, NULL, 10);
 662                                 continue;
 663                         }
 664                         usage(unpack_usage);
 665                 }
 666
 667                 /* We don't take any non-flag arguments now.. Maybe some day */
 668                 usage(unpack_usage);
 669         }
 670         the_hash_algo->init_fn(&ctx);
 671         unpack_all();
 672         the_hash_algo->update_fn(&ctx, buffer, offset);
 673         the_hash_algo->init_fn(&tmp_ctx);
 674         the_hash_algo->clone_fn(&tmp_ctx, &ctx);
 675         the_hash_algo->final_oid_fn(&oid, &tmp_ctx);
 676         if (strict) {
 677                 write_rest();
 678                 if (fsck_finish(&fsck_options))
 679                         die(_("fsck error in pack objects"));
 680         }
 681         if (!hasheq(fill(the_hash_algo->rawsz), oid.hash))
 682                 die("final sha1 did not match");
 683         use(the_hash_algo->rawsz);
 684
 685         /* Write the last part of the buffer to stdout */
 686         while (len) {
 687                 int ret = xwrite(1, buffer + offset, len);
 688                 if (ret <= 0)
 689                         break;
 690                 len -= ret;
 691                 offset += ret;
 692         }
 693
 694         /* All done */
 695         return has_errors;
 696 }