builtin/unpack-objects.c
#include "builtin.h"
#include "cache.h"
#include "bulk-checkin.h"
#include "config.h"
#include "object-store.h"
#include "object.h"
#include "delta.h"
#include "pack.h"
#include "blob.h"
#include "commit.h"
#include "tag.h"
#include "tree.h"
#include "tree-walk.h"
#include "progress.h"
#include "decorate.h"
#include "fsck.h"

static int dry_run, quiet, recover, has_errors, strict;
static const char unpack_usage[] = "git unpack-objects [-n] [-q] [-r] [--strict]";

/* We always read in 4kB chunks. */
static unsigned char buffer[4096];
static unsigned int offset, len;
static off_t consumed_bytes;
static off_t max_input_size;
static git_hash_ctx ctx;
static struct fsck_options fsck_options = FSCK_OPTIONS_STRICT;
static struct progress *progress;

/*
 * When running under --strict mode, objects whose reachability is
 * suspect are kept in core without getting written to the object
 * store.
 */
struct obj_buffer {
	char *buffer;
	unsigned long size;
};

static struct decoration obj_decorate;

static struct obj_buffer *lookup_object_buffer(struct object *base)
{
	return lookup_decoration(&obj_decorate, base);
}

static void add_object_buffer(struct object *object, char *buffer, unsigned long size)
{
	struct obj_buffer *obj;
	CALLOC_ARRAY(obj, 1);
	obj->buffer = buffer;
	obj->size = size;
	if (add_decoration(&obj_decorate, object, obj))
		die("object %s tried to add buffer twice!", oid_to_hex(&object->oid));
}

/*
 * Make sure at least "min" bytes are available in the buffer, and
 * return the pointer to the buffer.
 */
static void *fill(int min)
{
	if (min <= len)
		return buffer + offset;
	if (min > sizeof(buffer))
		die("cannot fill %d bytes", min);
	if (offset) {
		the_hash_algo->update_fn(&ctx, buffer, offset);
		memmove(buffer, buffer + offset, len);
		offset = 0;
	}
	do {
		ssize_t ret = xread(0, buffer + len, sizeof(buffer) - len);
		if (ret <= 0) {
			if (!ret)
				die("early EOF");
			die_errno("read error on input");
		}
		len += ret;
	} while (len < min);
	return buffer;
}

static void use(int bytes)
{
	if (bytes > len)
		die("used more bytes than were available");
	len -= bytes;
	offset += bytes;

	/* make sure off_t is sufficiently large not to wrap */
	if (signed_add_overflows(consumed_bytes, bytes))
		die("pack too large for current definition of off_t");
	consumed_bytes += bytes;
	if (max_input_size && consumed_bytes > max_input_size)
		die(_("pack exceeds maximum allowed size"));
	display_throughput(progress, consumed_bytes);
}
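
/*
 * fill() and use() work as a pair: a caller peeks at least "min"
 * bytes through fill(), decides how much it actually consumed, and
 * then advances past exactly that much with use().  The object
 * header parser below, for example, goes fill(1) / use(1) one byte
 * at a time.
 */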

/*
 * Decompress a zstream from the standard input into a newly
 * allocated buffer of the specified size and return the buffer.
 * The caller is responsible for freeing the returned buffer.
 *
 * In dry_run mode, however, get_data() is only used to check the
 * integrity of the data, and the returned buffer is not used at all.
 * Therefore, in dry_run mode, get_data() releases the small buffer
 * that is reused to hold temporary zstream output and returns NULL
 * instead of returning garbage data.
 */
static void *get_data(unsigned long size)
{
	git_zstream stream;
	unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
	void *buf = xmallocz(bufsize);

	memset(&stream, 0, sizeof(stream));

	stream.next_out = buf;
	stream.avail_out = bufsize;
	stream.next_in = fill(1);
	stream.avail_in = len;
	git_inflate_init(&stream);

	for (;;) {
		int ret = git_inflate(&stream, 0);
		use(len - stream.avail_in);
		if (stream.total_out == size && ret == Z_STREAM_END)
			break;
		if (ret != Z_OK) {
			error("inflate returned %d", ret);
			FREE_AND_NULL(buf);
			if (!recover)
				exit(1);
			has_errors = 1;
			break;
		}
		stream.next_in = fill(1);
		stream.avail_in = len;
		if (dry_run) {
			/* reuse the buffer in dry_run mode */
			stream.next_out = buf;
			stream.avail_out = bufsize > size - stream.total_out ?
				size - stream.total_out :
				bufsize;
		}
	}
	git_inflate_end(&stream);
	if (dry_run)
		FREE_AND_NULL(buf);
	return buf;
}
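
/*
 * A typical caller (see unpack_non_delta_entry() below) does
 *
 *	void *buf = get_data(size);
 *	if (buf)
 *		write_object(nr, type, buf, size);
 *
 * where the NULL check covers both dry-run mode and an inflate
 * failure that was recovered from with -r.
 */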

struct delta_info {
	struct object_id base_oid;
	unsigned nr;
	off_t base_offset;
	unsigned long size;
	void *delta;
	struct delta_info *next;
};

static struct delta_info *delta_list;

static void add_delta_to_list(unsigned nr, const struct object_id *base_oid,
			      off_t base_offset,
			      void *delta, unsigned long size)
{
	struct delta_info *info = xmalloc(sizeof(*info));

	oidcpy(&info->base_oid, base_oid);
	info->base_offset = base_offset;
	info->size = size;
	info->delta = delta;
	info->nr = nr;
	info->next = delta_list;
	delta_list = info;
}

struct obj_info {
	off_t offset;
	struct object_id oid;
	struct object *obj;
};

/* Remember to update object flag allocation in object.h */
#define FLAG_OPEN (1u<<20)
#define FLAG_WRITTEN (1u<<21)
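
/*
 * FLAG_OPEN marks an object whose buffer is being held in core under
 * --strict and has not been written out yet; FLAG_WRITTEN marks an
 * object that is already in the object store, either because we just
 * wrote it or because it was there before the unpack started.
 */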

static struct obj_info *obj_list;
static unsigned nr_objects;

/*
 * Called only from check_object() after it verified this object
 * is Ok.
 */
static void write_cached_object(struct object *obj, struct obj_buffer *obj_buf)
{
	struct object_id oid;

	if (write_object_file(obj_buf->buffer, obj_buf->size,
			      obj->type, &oid) < 0)
		die("failed to write object %s", oid_to_hex(&obj->oid));
	obj->flags |= FLAG_WRITTEN;
}

/*
 * At the very end of processing, write_rest() scans the objects
 * that have reachability requirements and calls this function on
 * each of them. Verify the object's reachability and validity
 * recursively and write it out.
 */
static int check_object(struct object *obj, enum object_type type,
			void *data, struct fsck_options *options)
{
	struct obj_buffer *obj_buf;

	if (!obj)
		return 1;

	if (obj->flags & FLAG_WRITTEN)
		return 0;

	if (type != OBJ_ANY && obj->type != type)
		die("object type mismatch");

	if (!(obj->flags & FLAG_OPEN)) {
		unsigned long size;
		int type = oid_object_info(the_repository, &obj->oid, &size);
		if (type != obj->type || type <= 0)
			die("object of unexpected type");
		obj->flags |= FLAG_WRITTEN;
		return 0;
	}

	obj_buf = lookup_object_buffer(obj);
	if (!obj_buf)
		die("Whoops! Cannot find object '%s'", oid_to_hex(&obj->oid));
	if (fsck_object(obj, obj_buf->buffer, obj_buf->size, &fsck_options))
		die("fsck error in packed object");
	fsck_options.walk = check_object;
	if (fsck_walk(obj, NULL, &fsck_options))
		die("Error on reachable objects of %s", oid_to_hex(&obj->oid));
	write_cached_object(obj, obj_buf);
	return 0;
}

static void write_rest(void)
{
	unsigned i;
	for (i = 0; i < nr_objects; i++) {
		if (obj_list[i].obj)
			check_object(obj_list[i].obj, OBJ_ANY, NULL, NULL);
	}
}

static void added_object(unsigned nr, enum object_type type,
			 void *data, unsigned long size);

/*
 * Write out the nr-th object from the list, now that we know its
 * contents. Under --strict, this buffers structured objects in-core,
 * to be checked at the end.
 */
static void write_object(unsigned nr, enum object_type type,
			 void *buf, unsigned long size)
{
	if (!strict) {
		if (write_object_file(buf, size, type,
				      &obj_list[nr].oid) < 0)
			die("failed to write object");
		added_object(nr, type, buf, size);
		free(buf);
		obj_list[nr].obj = NULL;
	} else if (type == OBJ_BLOB) {
		struct blob *blob;
		if (write_object_file(buf, size, type,
				      &obj_list[nr].oid) < 0)
			die("failed to write object");
		added_object(nr, type, buf, size);
		free(buf);

		blob = lookup_blob(the_repository, &obj_list[nr].oid);
		if (blob)
			blob->object.flags |= FLAG_WRITTEN;
		else
			die("invalid blob object");
		obj_list[nr].obj = NULL;
	} else {
		struct object *obj;
		int eaten;
		hash_object_file(the_hash_algo, buf, size, type,
				 &obj_list[nr].oid);
		added_object(nr, type, buf, size);
		obj = parse_object_buffer(the_repository, &obj_list[nr].oid,
					  type, size, buf,
					  &eaten);
		if (!obj)
			die("invalid %s", type_name(type));
		add_object_buffer(obj, buf, size);
		obj->flags |= FLAG_OPEN;
		obj_list[nr].obj = obj;
	}
}

static void resolve_delta(unsigned nr, enum object_type type,
			  void *base, unsigned long base_size,
			  void *delta, unsigned long delta_size)
{
	void *result;
	unsigned long result_size;

	result = patch_delta(base, base_size,
			     delta, delta_size,
			     &result_size);
	if (!result)
		die("failed to apply delta");
	free(delta);
	write_object(nr, type, result, result_size);
}

/*
 * We now know the contents of the nr-th object in the pack;
 * resolve all the deltified objects that are based on it.
 */
static void added_object(unsigned nr, enum object_type type,
			 void *data, unsigned long size)
{
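	/*
	 * resolve_delta() ends up in write_object(), which calls back
	 * into added_object() for the object it just completed, so the
	 * recursion can rewrite delta_list under us.  That is why the
	 * entry is unlinked first and the scan restarts from the head
	 * after every resolution.
	 */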
	struct delta_info **p = &delta_list;
	struct delta_info *info;

	while ((info = *p) != NULL) {
		if (oideq(&info->base_oid, &obj_list[nr].oid) ||
		    info->base_offset == obj_list[nr].offset) {
			*p = info->next;
			p = &delta_list;
			resolve_delta(info->nr, type, data, size,
				      info->delta, info->size);
			free(info);
			continue;
		}
		p = &info->next;
	}
}

static void unpack_non_delta_entry(enum object_type type, unsigned long size,
				   unsigned nr)
{
	void *buf = get_data(size);

	if (buf)
		write_object(nr, type, buf, size);
}

struct input_zstream_data {
	git_zstream *zstream;
	unsigned char buf[8192];
	int status;
};

static const void *feed_input_zstream(struct input_stream *in_stream,
				      unsigned long *readlen)
{
	struct input_zstream_data *data = in_stream->data;
	git_zstream *zstream = data->zstream;
	void *in = fill(1);

	if (in_stream->is_finished) {
		*readlen = 0;
		return NULL;
	}

	zstream->next_out = data->buf;
	zstream->avail_out = sizeof(data->buf);
	zstream->next_in = in;
	zstream->avail_in = len;

	data->status = git_inflate(zstream, 0);

	in_stream->is_finished = data->status != Z_OK;
	use(len - zstream->avail_in);
	*readlen = sizeof(data->buf) - zstream->avail_out;

	return data->buf;
}

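/*
 * Unpack a blob by streaming the inflated data straight into a loose
 * object file, so the whole blob never has to be held in memory.
 * unpack_one() takes this path for blobs larger than
 * core.bigFileThreshold (except in dry-run mode).
 */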
static void stream_blob(unsigned long size, unsigned nr)
{
	git_zstream zstream = { 0 };
	struct input_zstream_data data = { 0 };
	struct input_stream in_stream = {
		.read = feed_input_zstream,
		.data = &data,
	};
	struct obj_info *info = &obj_list[nr];

	data.zstream = &zstream;
	git_inflate_init(&zstream);

	if (stream_loose_object(&in_stream, size, &info->oid))
		die(_("failed to write object in stream"));

	if (data.status != Z_STREAM_END)
		die(_("inflate returned (%d)"), data.status);
	git_inflate_end(&zstream);

	if (strict) {
		struct blob *blob = lookup_blob(the_repository, &info->oid);

		if (!blob)
			die(_("invalid blob object from stream"));
		blob->object.flags |= FLAG_WRITTEN;
	}
	info->obj = NULL;
}

static int resolve_against_held(unsigned nr, const struct object_id *base,
				void *delta_data, unsigned long delta_size)
{
	struct object *obj;
	struct obj_buffer *obj_buffer;
	obj = lookup_object(the_repository, base);
	if (!obj)
		return 0;
	obj_buffer = lookup_object_buffer(obj);
	if (!obj_buffer)
		return 0;
	resolve_delta(nr, obj->type, obj_buffer->buffer,
		      obj_buffer->size, delta_data, delta_size);
	return 1;
}

static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
			       unsigned nr)
{
	void *delta_data, *base;
	unsigned long base_size;
	struct object_id base_oid;

	if (type == OBJ_REF_DELTA) {
		oidread(&base_oid, fill(the_hash_algo->rawsz));
		use(the_hash_algo->rawsz);
		delta_data = get_data(delta_size);
		if (!delta_data)
			return;
		if (repo_has_object_file(the_repository, &base_oid))
			; /* Ok we have this one */
		else if (resolve_against_held(nr, &base_oid,
					      delta_data, delta_size))
			return; /* we are done */
		else {
			/* cannot resolve yet --- queue it */
			oidclr(&obj_list[nr].oid);
			add_delta_to_list(nr, &base_oid, 0, delta_data, delta_size);
			return;
		}
	} else {
		unsigned base_found = 0;
		unsigned char *pack, c;
		off_t base_offset;
		unsigned lo, mid, hi;

		pack = fill(1);
		c = *pack;
		use(1);
		base_offset = c & 127;
		while (c & 128) {
			base_offset += 1;
			if (!base_offset || MSB(base_offset, 7))
				die("offset value overflow for delta base object");
			pack = fill(1);
			c = *pack;
			use(1);
			base_offset = (base_offset << 7) + (c & 127);
		}
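		/*
		 * Example: the encoded bytes 0x91 0x2e decode to
		 * ((0x11 + 1) << 7) + 0x2e = 2350, meaning the base
		 * object starts 2350 bytes before this entry's own
		 * offset in the pack.
		 */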
		base_offset = obj_list[nr].offset - base_offset;
		if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
			die("offset value out of bound for delta base object");

		delta_data = get_data(delta_size);
		if (!delta_data)
			return;
		lo = 0;
		hi = nr;
		while (lo < hi) {
			mid = lo + (hi - lo) / 2;
			if (base_offset < obj_list[mid].offset) {
				hi = mid;
			} else if (base_offset > obj_list[mid].offset) {
				lo = mid + 1;
			} else {
				oidcpy(&base_oid, &obj_list[mid].oid);
				base_found = !is_null_oid(&base_oid);
				break;
			}
		}
		if (!base_found) {
			/*
			 * The delta base object is itself a delta that
			 * has not been resolved yet.
			 */
			oidclr(&obj_list[nr].oid);
			add_delta_to_list(nr, null_oid(), base_offset,
					  delta_data, delta_size);
			return;
		}
	}

	if (resolve_against_held(nr, &base_oid, delta_data, delta_size))
		return;

	base = repo_read_object_file(the_repository, &base_oid, &type,
				     &base_size);
	if (!base) {
		error("failed to read delta-pack base object %s",
		      oid_to_hex(&base_oid));
		if (!recover)
			exit(1);
		has_errors = 1;
		return;
	}
	resolve_delta(nr, type, base, base_size, delta_data, delta_size);
	free(base);
}

static void unpack_one(unsigned nr)
{
	unsigned shift;
	unsigned char *pack;
	unsigned long size, c;
	enum object_type type;

	obj_list[nr].offset = consumed_bytes;

	pack = fill(1);
	c = *pack;
	use(1);
	type = (c >> 4) & 7;
	size = (c & 15);
	shift = 4;
	while (c & 0x80) {
		pack = fill(1);
		c = *pack;
		use(1);
		size += (c & 0x7f) << shift;
		shift += 7;
	}
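	/*
	 * Example: the header bytes 0xb5 0x07 decode to type 3
	 * (OBJ_BLOB) with size 5 + (7 << 4) = 117 bytes.
	 */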

	switch (type) {
	case OBJ_BLOB:
		if (!dry_run && size > big_file_threshold) {
			stream_blob(size, nr);
			return;
		}
		/* fallthrough */
	case OBJ_COMMIT:
	case OBJ_TREE:
	case OBJ_TAG:
		unpack_non_delta_entry(type, size, nr);
		return;
	case OBJ_REF_DELTA:
	case OBJ_OFS_DELTA:
		unpack_delta_entry(type, size, nr);
		return;
	default:
		error("bad object type %d", type);
		has_errors = 1;
		if (recover)
			return;
		exit(1);
	}
}

static void unpack_all(void)
{
	int i;
	struct pack_header *hdr = fill(sizeof(struct pack_header));

	nr_objects = ntohl(hdr->hdr_entries);

	if (ntohl(hdr->hdr_signature) != PACK_SIGNATURE)
		die("bad pack file");
	if (!pack_version_ok(hdr->hdr_version))
		die("unknown pack file version %"PRIu32,
		    ntohl(hdr->hdr_version));
	use(sizeof(struct pack_header));

	if (!quiet)
		progress = start_progress(_("Unpacking objects"), nr_objects);
	CALLOC_ARRAY(obj_list, nr_objects);
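	/*
	 * Group the writes in an object database transaction (see
	 * bulk-checkin.h) so that the object store may batch the
	 * flushing of the loose objects written in the loop below,
	 * e.g. syncing once at end_odb_transaction() rather than once
	 * per object when batched fsync is configured.
	 */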
	begin_odb_transaction();
	for (i = 0; i < nr_objects; i++) {
		unpack_one(i);
		display_progress(progress, i + 1);
	}
	end_odb_transaction();
	stop_progress(&progress);

	if (delta_list)
		die("unresolved deltas left after unpacking");
}

int cmd_unpack_objects(int argc, const char **argv, const char *prefix)
{
	int i;
	struct object_id oid;

	read_replace_refs = 0;

	git_config(git_default_config, NULL);

	quiet = !isatty(2);

	for (i = 1 ; i < argc; i++) {
		const char *arg = argv[i];

		if (*arg == '-') {
			if (!strcmp(arg, "-n")) {
				dry_run = 1;
				continue;
			}
			if (!strcmp(arg, "-q")) {
				quiet = 1;
				continue;
			}
			if (!strcmp(arg, "-r")) {
				recover = 1;
				continue;
			}
			if (!strcmp(arg, "--strict")) {
				strict = 1;
				continue;
			}
			if (skip_prefix(arg, "--strict=", &arg)) {
				strict = 1;
				fsck_set_msg_types(&fsck_options, arg);
				continue;
			}
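			/*
			 * --pack_header=<version>,<entries> is used by
			 * callers (such as receive-pack) that have already
			 * consumed the pack header from the stream; replay
			 * it by pre-seeding the read buffer with an
			 * equivalent header.
			 */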
			if (starts_with(arg, "--pack_header=")) {
				struct pack_header *hdr;
				char *c;

				hdr = (struct pack_header *)buffer;
				hdr->hdr_signature = htonl(PACK_SIGNATURE);
				hdr->hdr_version = htonl(strtoul(arg + 14, &c, 10));
				if (*c != ',')
					die("bad %s", arg);
				hdr->hdr_entries = htonl(strtoul(c + 1, &c, 10));
				if (*c)
					die("bad %s", arg);
				len = sizeof(*hdr);
				continue;
			}
			if (skip_prefix(arg, "--max-input-size=", &arg)) {
				max_input_size = strtoumax(arg, NULL, 10);
				continue;
			}
			usage(unpack_usage);
		}

		/* We don't take any non-flag arguments now... Maybe some day. */
		usage(unpack_usage);
	}
	the_hash_algo->init_fn(&ctx);
	unpack_all();
	the_hash_algo->update_fn(&ctx, buffer, offset);
	the_hash_algo->final_oid_fn(&oid, &ctx);
	if (strict) {
		write_rest();
		if (fsck_finish(&fsck_options))
			die(_("fsck error in pack objects"));
	}
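	/*
	 * A pack stream ends with the hash of all preceding pack data;
	 * check that trailer against the checksum accumulated while
	 * reading the input.
	 */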
	if (!hasheq(fill(the_hash_algo->rawsz), oid.hash))
		die("final sha1 did not match");
	use(the_hash_algo->rawsz);

	/* Write the last part of the buffer to stdout */
	while (len) {
		int ret = xwrite(1, buffer + offset, len);
		if (ret <= 0)
			break;
		len -= ret;
		offset += ret;
	}

	/* All done */
	return has_errors;
}