builtin/unpack-objects.c

   1 #include "builtin.h"
   2 #include "bulk-checkin.h"
   3 #include "config.h"
   4 #include "environment.h"
   5 #include "gettext.h"
   6 #include "git-zlib.h"
   7 #include "hex.h"
   8 #include "object-store.h"
   9 #include "object.h"
  10 #include "delta.h"
  11 #include "pack.h"
  12 #include "blob.h"
  13 #include "commit.h"
  14 #include "replace-object.h"
  15 #include "tag.h"
  16 #include "tree.h"
  17 #include "tree-walk.h"
  18 #include "progress.h"
  19 #include "decorate.h"
  20 #include "fsck.h"
  21
  22 static int dry_run, quiet, recover, has_errors, strict;
  23 static const char unpack_usage[] = "git unpack-objects [-n] [-q] [-r] [--strict]";
  24
  25 /* We always read in 4kB chunks. */
  26 static unsigned char buffer[4096];
  27 static unsigned int offset, len;
  28 static off_t consumed_bytes;
  29 static off_t max_input_size;
  30 static git_hash_ctx ctx;
  31 static struct fsck_options fsck_options = FSCK_OPTIONS_STRICT;
  32 static struct progress *progress;
  33
  34 /*
  35  * When running under --strict mode, objects whose reachability are
  36  * suspect are kept in core without getting written in the object
  37  * store.
  38  */
  39 struct obj_buffer {
  40         char *buffer;
  41         unsigned long size;
  42 };
  43
  44 static struct decoration obj_decorate;
  45
  46 static struct obj_buffer *lookup_object_buffer(struct object *base)
  47 {
  48         return lookup_decoration(&obj_decorate, base);
  49 }
  50
  51 static void add_object_buffer(struct object *object, char *buffer, unsigned long size)
  52 {
  53         struct obj_buffer *obj;
  54         CALLOC_ARRAY(obj, 1);
  55         obj->buffer = buffer;
  56         obj->size = size;
  57         if (add_decoration(&obj_decorate, object, obj))
  58                 die("object %s tried to add buffer twice!", oid_to_hex(&object->oid));
  59 }
  60
  61 /*
  62  * Make sure at least "min" bytes are available in the buffer, and
  63  * return the pointer to the buffer.
  64  */
  65 static void *fill(int min)
  66 {
  67         if (min <= len)
  68                 return buffer + offset;
  69         if (min > sizeof(buffer))
  70                 die("cannot fill %d bytes", min);
  71         if (offset) {
  72                 the_hash_algo->update_fn(&ctx, buffer, offset);
  73                 memmove(buffer, buffer + offset, len);
  74                 offset = 0;
  75         }
  76         do {
  77                 ssize_t ret = xread(0, buffer + len, sizeof(buffer) - len);
  78                 if (ret <= 0) {
  79                         if (!ret)
  80                                 die("early EOF");
  81                         die_errno("read error on input");
  82                 }
  83                 len += ret;
  84         } while (len < min);
  85         return buffer;
  86 }
  87
  88 static void use(int bytes)
  89 {
  90         if (bytes > len)
  91                 die("used more bytes than were available");
  92         len -= bytes;
  93         offset += bytes;
  94
  95         /* make sure off_t is sufficiently large not to wrap */
  96         if (signed_add_overflows(consumed_bytes, bytes))
  97                 die("pack too large for current definition of off_t");
  98         consumed_bytes += bytes;
  99         if (max_input_size && consumed_bytes > max_input_size)
 100                 die(_("pack exceeds maximum allowed size"));
 101         display_throughput(progress, consumed_bytes);
 102 }
 103
 104 /*
 105  * Decompress zstream from the standard input into a newly
 106  * allocated buffer of specified size and return the buffer.
 107  * The caller is responsible to free the returned buffer.
 108  *
 109  * But for dry_run mode, "get_data()" is only used to check the
 110  * integrity of data, and the returned buffer is not used at all.
 111  * Therefore, in dry_run mode, "get_data()" will release the small
 112  * allocated buffer which is reused to hold temporary zstream output
 113  * and return NULL instead of returning garbage data.
 114  */
 115 static void *get_data(unsigned long size)
 116 {
 117         git_zstream stream;
 118         unsigned long bufsize = dry_run && size > 8192 ? 8192 : size;
 119         void *buf = xmallocz(bufsize);
 120
 121         memset(&stream, 0, sizeof(stream));
 122
 123         stream.next_out = buf;
 124         stream.avail_out = bufsize;
 125         stream.next_in = fill(1);
 126         stream.avail_in = len;
 127         git_inflate_init(&stream);
 128
 129         for (;;) {
 130                 int ret = git_inflate(&stream, 0);
 131                 use(len - stream.avail_in);
 132                 if (stream.total_out == size && ret == Z_STREAM_END)
 133                         break;
 134                 if (ret != Z_OK) {
 135                         error("inflate returned %d", ret);
 136                         FREE_AND_NULL(buf);
 137                         if (!recover)
 138                                 exit(1);
 139                         has_errors = 1;
 140                         break;
 141                 }
 142                 stream.next_in = fill(1);
 143                 stream.avail_in = len;
 144                 if (dry_run) {
 145                         /* reuse the buffer in dry_run mode */
 146                         stream.next_out = buf;
 147                         stream.avail_out = bufsize > size - stream.total_out ?
 148                                                    size - stream.total_out :
 149                                                    bufsize;
 150                 }
 151         }
 152         git_inflate_end(&stream);
 153         if (dry_run)
 154                 FREE_AND_NULL(buf);
 155         return buf;
 156 }
 157
 158 struct delta_info {
 159         struct object_id base_oid;
 160         unsigned nr;
 161         off_t base_offset;
 162         unsigned long size;
 163         void *delta;
 164         struct delta_info *next;
 165 };
 166
 167 static struct delta_info *delta_list;
 168
 169 static void add_delta_to_list(unsigned nr, const struct object_id *base_oid,
 170                               off_t base_offset,
 171                               void *delta, unsigned long size)
 172 {
 173         struct delta_info *info = xmalloc(sizeof(*info));
 174
 175         oidcpy(&info->base_oid, base_oid);
 176         info->base_offset = base_offset;
 177         info->size = size;
 178         info->delta = delta;
 179         info->nr = nr;
 180         info->next = delta_list;
 181         delta_list = info;
 182 }
 183
 184 struct obj_info {
 185         off_t offset;
 186         struct object_id oid;
 187         struct object *obj;
 188 };
 189
 190 /* Remember to update object flag allocation in object.h */
 191 #define FLAG_OPEN (1u<<20)
 192 #define FLAG_WRITTEN (1u<<21)
 193
 194 static struct obj_info *obj_list;
 195 static unsigned nr_objects;
 196
 197 /*
 198  * Called only from check_object() after it verified this object
 199  * is Ok.
 200  */
 201 static void write_cached_object(struct object *obj, struct obj_buffer *obj_buf)
 202 {
 203         struct object_id oid;
 204
 205         if (write_object_file(obj_buf->buffer, obj_buf->size,
 206                               obj->type, &oid) < 0)
 207                 die("failed to write object %s", oid_to_hex(&obj->oid));
 208         obj->flags |= FLAG_WRITTEN;
 209 }
 210
 211 /*
 212  * At the very end of the processing, write_rest() scans the objects
 213  * that have reachability requirements and calls this function.
 214  * Verify its reachability and validity recursively and write it out.
 215  */
 216 static int check_object(struct object *obj, enum object_type type,
 217                         void *data, struct fsck_options *options)
 218 {
 219         struct obj_buffer *obj_buf;
 220
 221         if (!obj)
 222                 return 1;
 223
 224         if (obj->flags & FLAG_WRITTEN)
 225                 return 0;
 226
 227         if (type != OBJ_ANY && obj->type != type)
 228                 die("object type mismatch");
 229
 230         if (!(obj->flags & FLAG_OPEN)) {
 231                 unsigned long size;
 232                 int type = oid_object_info(the_repository, &obj->oid, &size);
 233                 if (type != obj->type || type <= 0)
 234                         die("object of unexpected type");
 235                 obj->flags |= FLAG_WRITTEN;
 236                 return 0;
 237         }
 238
 239         obj_buf = lookup_object_buffer(obj);
 240         if (!obj_buf)
 241                 die("Whoops! Cannot find object '%s'", oid_to_hex(&obj->oid));
 242         if (fsck_object(obj, obj_buf->buffer, obj_buf->size, &fsck_options))
 243                 die("fsck error in packed object");
 244         fsck_options.walk = check_object;
 245         if (fsck_walk(obj, NULL, &fsck_options))
 246                 die("Error on reachable objects of %s", oid_to_hex(&obj->oid));
 247         write_cached_object(obj, obj_buf);
 248         return 0;
 249 }
 250
 251 static void write_rest(void)
 252 {
 253         unsigned i;
 254         for (i = 0; i < nr_objects; i++) {
 255                 if (obj_list[i].obj)
 256                         check_object(obj_list[i].obj, OBJ_ANY, NULL, NULL);
 257         }
 258 }
 259
 260 static void added_object(unsigned nr, enum object_type type,
 261                          void *data, unsigned long size);
 262
 263 /*
 264  * Write out nr-th object from the list, now we know the contents
 265  * of it.  Under --strict, this buffers structured objects in-core,
 266  * to be checked at the end.
 267  */
 268 static void write_object(unsigned nr, enum object_type type,
 269                          void *buf, unsigned long size)
 270 {
 271         if (!strict) {
 272                 if (write_object_file(buf, size, type,
 273                                       &obj_list[nr].oid) < 0)
 274                         die("failed to write object");
 275                 added_object(nr, type, buf, size);
 276                 free(buf);
 277                 obj_list[nr].obj = NULL;
 278         } else if (type == OBJ_BLOB) {
 279                 struct blob *blob;
 280                 if (write_object_file(buf, size, type,
 281                                       &obj_list[nr].oid) < 0)
 282                         die("failed to write object");
 283                 added_object(nr, type, buf, size);
 284                 free(buf);
 285
 286                 blob = lookup_blob(the_repository, &obj_list[nr].oid);
 287                 if (blob)
 288                         blob->object.flags |= FLAG_WRITTEN;
 289                 else
 290                         die("invalid blob object");
 291                 obj_list[nr].obj = NULL;
 292         } else {
 293                 struct object *obj;
 294                 int eaten;
 295                 hash_object_file(the_hash_algo, buf, size, type,
 296                                  &obj_list[nr].oid);
 297                 added_object(nr, type, buf, size);
 298                 obj = parse_object_buffer(the_repository, &obj_list[nr].oid,
 299                                           type, size, buf,
 300                                           &eaten);
 301                 if (!obj)
 302                         die("invalid %s", type_name(type));
 303                 add_object_buffer(obj, buf, size);
 304                 obj->flags |= FLAG_OPEN;
 305                 obj_list[nr].obj = obj;
 306         }
 307 }
 308
 309 static void resolve_delta(unsigned nr, enum object_type type,
 310                           void *base, unsigned long base_size,
 311                           void *delta, unsigned long delta_size)
 312 {
 313         void *result;
 314         unsigned long result_size;
 315
 316         result = patch_delta(base, base_size,
 317                              delta, delta_size,
 318                              &result_size);
 319         if (!result)
 320                 die("failed to apply delta");
 321         free(delta);
 322         write_object(nr, type, result, result_size);
 323 }
 324
 325 /*
 326  * We now know the contents of an object (which is nr-th in the pack);
 327  * resolve all the deltified objects that are based on it.
 328  */
 329 static void added_object(unsigned nr, enum object_type type,
 330                          void *data, unsigned long size)
 331 {
 332         struct delta_info **p = &delta_list;
 333         struct delta_info *info;
 334
 335         while ((info = *p) != NULL) {
 336                 if (oideq(&info->base_oid, &obj_list[nr].oid) ||
 337                     info->base_offset == obj_list[nr].offset) {
 338                         *p = info->next;
 339                         p = &delta_list;
 340                         resolve_delta(info->nr, type, data, size,
 341                                       info->delta, info->size);
 342                         free(info);
 343                         continue;
 344                 }
 345                 p = &info->next;
 346         }
 347 }
 348
 349 static void unpack_non_delta_entry(enum object_type type, unsigned long size,
 350                                    unsigned nr)
 351 {
 352         void *buf = get_data(size);
 353
 354         if (buf)
 355                 write_object(nr, type, buf, size);
 356 }
 357
 358 struct input_zstream_data {
 359         git_zstream *zstream;
 360         unsigned char buf[8192];
 361         int status;
 362 };
 363
 364 static const void *feed_input_zstream(struct input_stream *in_stream,
 365                                       unsigned long *readlen)
 366 {
 367         struct input_zstream_data *data = in_stream->data;
 368         git_zstream *zstream = data->zstream;
 369         void *in = fill(1);
 370
 371         if (in_stream->is_finished) {
 372                 *readlen = 0;
 373                 return NULL;
 374         }
 375
 376         zstream->next_out = data->buf;
 377         zstream->avail_out = sizeof(data->buf);
 378         zstream->next_in = in;
 379         zstream->avail_in = len;
 380
 381         data->status = git_inflate(zstream, 0);
 382
 383         in_stream->is_finished = data->status != Z_OK;
 384         use(len - zstream->avail_in);
 385         *readlen = sizeof(data->buf) - zstream->avail_out;
 386
 387         return data->buf;
 388 }
 389
 390 static void stream_blob(unsigned long size, unsigned nr)
 391 {
 392         git_zstream zstream = { 0 };
 393         struct input_zstream_data data = { 0 };
 394         struct input_stream in_stream = {
 395                 .read = feed_input_zstream,
 396                 .data = &data,
 397         };
 398         struct obj_info *info = &obj_list[nr];
 399
 400         data.zstream = &zstream;
 401         git_inflate_init(&zstream);
 402
 403         if (stream_loose_object(&in_stream, size, &info->oid))
 404                 die(_("failed to write object in stream"));
 405
 406         if (data.status != Z_STREAM_END)
 407                 die(_("inflate returned (%d)"), data.status);
 408         git_inflate_end(&zstream);
 409
 410         if (strict) {
 411                 struct blob *blob = lookup_blob(the_repository, &info->oid);
 412
 413                 if (!blob)
 414                         die(_("invalid blob object from stream"));
 415                 blob->object.flags |= FLAG_WRITTEN;
 416         }
 417         info->obj = NULL;
 418 }
 419
 420 static int resolve_against_held(unsigned nr, const struct object_id *base,
 421                                 void *delta_data, unsigned long delta_size)
 422 {
 423         struct object *obj;
 424         struct obj_buffer *obj_buffer;
 425         obj = lookup_object(the_repository, base);
 426         if (!obj)
 427                 return 0;
 428         obj_buffer = lookup_object_buffer(obj);
 429         if (!obj_buffer)
 430                 return 0;
 431         resolve_delta(nr, obj->type, obj_buffer->buffer,
 432                       obj_buffer->size, delta_data, delta_size);
 433         return 1;
 434 }
 435
 436 static void unpack_delta_entry(enum object_type type, unsigned long delta_size,
 437                                unsigned nr)
 438 {
 439         void *delta_data, *base;
 440         unsigned long base_size;
 441         struct object_id base_oid;
 442
 443         if (type == OBJ_REF_DELTA) {
 444                 oidread(&base_oid, fill(the_hash_algo->rawsz));
 445                 use(the_hash_algo->rawsz);
 446                 delta_data = get_data(delta_size);
 447                 if (!delta_data)
 448                         return;
 449                 if (repo_has_object_file(the_repository, &base_oid))
 450                         ; /* Ok we have this one */
 451                 else if (resolve_against_held(nr, &base_oid,
 452                                               delta_data, delta_size))
 453                         return; /* we are done */
 454                 else {
 455                         /* cannot resolve yet --- queue it */
 456                         oidclr(&obj_list[nr].oid);
 457                         add_delta_to_list(nr, &base_oid, 0, delta_data, delta_size);
 458                         return;
 459                 }
 460         } else {
 461                 unsigned base_found = 0;
 462                 unsigned char *pack, c;
 463                 off_t base_offset;
 464                 unsigned lo, mid, hi;
 465
 466                 pack = fill(1);
 467                 c = *pack;
 468                 use(1);
 469                 base_offset = c & 127;
 470                 while (c & 128) {
 471                         base_offset += 1;
 472                         if (!base_offset || MSB(base_offset, 7))
 473                                 die("offset value overflow for delta base object");
 474                         pack = fill(1);
 475                         c = *pack;
 476                         use(1);
 477                         base_offset = (base_offset << 7) + (c & 127);
 478                 }
 479                 base_offset = obj_list[nr].offset - base_offset;
 480                 if (base_offset <= 0 || base_offset >= obj_list[nr].offset)
 481                         die("offset value out of bound for delta base object");
 482
 483                 delta_data = get_data(delta_size);
 484                 if (!delta_data)
 485                         return;
 486                 lo = 0;
 487                 hi = nr;
 488                 while (lo < hi) {
 489                         mid = lo + (hi - lo) / 2;
 490                         if (base_offset < obj_list[mid].offset) {
 491                                 hi = mid;
 492                         } else if (base_offset > obj_list[mid].offset) {
 493                                 lo = mid + 1;
 494                         } else {
 495                                 oidcpy(&base_oid, &obj_list[mid].oid);
 496                                 base_found = !is_null_oid(&base_oid);
 497                                 break;
 498                         }
 499                 }
 500                 if (!base_found) {
 501                         /*
 502                          * The delta base object is itself a delta that
 503                          * has not been resolved yet.
 504                          */
 505                         oidclr(&obj_list[nr].oid);
 506                         add_delta_to_list(nr, null_oid(), base_offset,
 507                                           delta_data, delta_size);
 508                         return;
 509                 }
 510         }
 511
 512         if (resolve_against_held(nr, &base_oid, delta_data, delta_size))
 513                 return;
 514
 515         base = repo_read_object_file(the_repository, &base_oid, &type,
 516                                      &base_size);
 517         if (!base) {
 518                 error("failed to read delta-pack base object %s",
 519                       oid_to_hex(&base_oid));
 520                 if (!recover)
 521                         exit(1);
 522                 has_errors = 1;
 523                 return;
 524         }
 525         resolve_delta(nr, type, base, base_size, delta_data, delta_size);
 526         free(base);
 527 }
 528
 529 static void unpack_one(unsigned nr)
 530 {
 531         unsigned shift;
 532         unsigned char *pack;
 533         unsigned long size, c;
 534         enum object_type type;
 535
 536         obj_list[nr].offset = consumed_bytes;
 537
 538         pack = fill(1);
 539         c = *pack;
 540         use(1);
 541         type = (c >> 4) & 7;
 542         size = (c & 15);
 543         shift = 4;
 544         while (c & 0x80) {
 545                 pack = fill(1);
 546                 c = *pack;
 547                 use(1);
 548                 size += (c & 0x7f) << shift;
 549                 shift += 7;
 550         }
 551
 552         switch (type) {
 553         case OBJ_BLOB:
 554                 if (!dry_run && size > big_file_threshold) {
 555                         stream_blob(size, nr);
 556                         return;
 557                 }
 558                 /* fallthrough */
 559         case OBJ_COMMIT:
 560         case OBJ_TREE:
 561         case OBJ_TAG:
 562                 unpack_non_delta_entry(type, size, nr);
 563                 return;
 564         case OBJ_REF_DELTA:
 565         case OBJ_OFS_DELTA:
 566                 unpack_delta_entry(type, size, nr);
 567                 return;
 568         default:
 569                 error("bad object type %d", type);
 570                 has_errors = 1;
 571                 if (recover)
 572                         return;
 573                 exit(1);
 574         }
 575 }
 576
 577 static void unpack_all(void)
 578 {
 579         int i;
 580         struct pack_header *hdr = fill(sizeof(struct pack_header));
 581
 582         nr_objects = ntohl(hdr->hdr_entries);
 583
 584         if (ntohl(hdr->hdr_signature) != PACK_SIGNATURE)
 585                 die("bad pack file");
 586         if (!pack_version_ok(hdr->hdr_version))
 587                 die("unknown pack file version %"PRIu32,
 588                         ntohl(hdr->hdr_version));
 589         use(sizeof(struct pack_header));
 590
 591         if (!quiet)
 592                 progress = start_progress(_("Unpacking objects"), nr_objects);
 593         CALLOC_ARRAY(obj_list, nr_objects);
 594         begin_odb_transaction();
 595         for (i = 0; i < nr_objects; i++) {
 596                 unpack_one(i);
 597                 display_progress(progress, i + 1);
 598         }
 599         end_odb_transaction();
 600         stop_progress(&progress);
 601
 602         if (delta_list)
 603                 die("unresolved deltas left after unpacking");
 604 }
 605
 606 int cmd_unpack_objects(int argc, const char **argv, const char *prefix UNUSED)
 607 {
 608         int i;
 609         struct object_id oid;
 610
 611         read_replace_refs = 0;
 612
 613         git_config(git_default_config, NULL);
 614
 615         quiet = !isatty(2);
 616
 617         for (i = 1 ; i < argc; i++) {
 618                 const char *arg = argv[i];
 619
 620                 if (*arg == '-') {
 621                         if (!strcmp(arg, "-n")) {
 622                                 dry_run = 1;
 623                                 continue;
 624                         }
 625                         if (!strcmp(arg, "-q")) {
 626                                 quiet = 1;
 627                                 continue;
 628                         }
 629                         if (!strcmp(arg, "-r")) {
 630                                 recover = 1;
 631                                 continue;
 632                         }
 633                         if (!strcmp(arg, "--strict")) {
 634                                 strict = 1;
 635                                 continue;
 636                         }
 637                         if (skip_prefix(arg, "--strict=", &arg)) {
 638                                 strict = 1;
 639                                 fsck_set_msg_types(&fsck_options, arg);
 640                                 continue;
 641                         }
 642                         if (starts_with(arg, "--pack_header=")) {
 643                                 struct pack_header *hdr;
 644                                 char *c;
 645
 646                                 hdr = (struct pack_header *)buffer;
 647                                 hdr->hdr_signature = htonl(PACK_SIGNATURE);
 648                                 hdr->hdr_version = htonl(strtoul(arg + 14, &c, 10));
 649                                 if (*c != ',')
 650                                         die("bad %s", arg);
 651                                 hdr->hdr_entries = htonl(strtoul(c + 1, &c, 10));
 652                                 if (*c)
 653                                         die("bad %s", arg);
 654                                 len = sizeof(*hdr);
 655                                 continue;
 656                         }
 657                         if (skip_prefix(arg, "--max-input-size=", &arg)) {
 658                                 max_input_size = strtoumax(arg, NULL, 10);
 659                                 continue;
 660                         }
 661                         usage(unpack_usage);
 662                 }
 663
 664                 /* We don't take any non-flag arguments now.. Maybe some day */
 665                 usage(unpack_usage);
 666         }
 667         the_hash_algo->init_fn(&ctx);
 668         unpack_all();
 669         the_hash_algo->update_fn(&ctx, buffer, offset);
 670         the_hash_algo->final_oid_fn(&oid, &ctx);
 671         if (strict) {
 672                 write_rest();
 673                 if (fsck_finish(&fsck_options))
 674                         die(_("fsck error in pack objects"));
 675         }
 676         if (!hasheq(fill(the_hash_algo->rawsz), oid.hash))
 677                 die("final sha1 did not match");
 678         use(the_hash_algo->rawsz);
 679
 680         /* Write the last part of the buffer to stdout */
 681         while (len) {
 682                 int ret = xwrite(1, buffer + offset, len);
 683                 if (ret <= 0)
 684                         break;
 685                 len -= ret;
 686                 offset += ret;
 687         }
 688
 689         /* All done */
 690         return has_errors;
 691 }