git.ipfire.org Git - thirdparty/git.git/commitdiff
Merge branch 'ps/packfile-store' into jch
author    Junio C Hamano <gitster@pobox.com>
          Sun, 5 Oct 2025 22:04:13 +0000 (15:04 -0700)
committer Junio C Hamano <gitster@pobox.com>
          Sun, 5 Oct 2025 22:04:13 +0000 (15:04 -0700)
Code clean-up around the in-core list of all the pack files and
object database(s).

* ps/packfile-store:
  packfile: refactor `get_packed_git_mru()` to work on packfile store
  packfile: refactor `get_all_packs()` to work on packfile store
  packfile: refactor `get_packed_git()` to work on packfile store
  packfile: move `get_multi_pack_index()` into "midx.c"
  packfile: introduce function to load and add packfiles
  packfile: refactor `install_packed_git()` to work on packfile store
  packfile: split up responsibilities of `reprepare_packed_git()`
  packfile: refactor `prepare_packed_git()` to work on packfile store
  packfile: reorder functions to avoid function declaration
  odb: move kept cache into `struct packfile_store`
  odb: move MRU list of packfiles into `struct packfile_store`
  odb: move packfile map into `struct packfile_store`
  odb: move initialization bit into `struct packfile_store`
  odb: move list of packfiles into `struct packfile_store`
  packfile: introduce a new `struct packfile_store`
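
Taken together, the series pulls the per-repository pack state out of `struct object_database` (the old `packed_git` list, `packed_git_mru`, and `pack_map` fields visible in the odb.c hunk below) into one structure. A minimal C sketch of what such a structure plausibly holds, inferred from the shortlog alone (field names, types, and layout are assumptions, not the committed definition):

struct packfile_store {
        struct object_database *odb;    /* assumed back-pointer to the owning database */
        struct packed_git *packs;       /* the "list of packfiles" moved out of odb */
        struct list_head mru;           /* MRU order, formerly odb->packed_git_mru */
        struct hashmap map;             /* the "packfile map", formerly odb->pack_map */
        struct packed_git **kept;       /* the "kept cache" of packs marked to keep */
        size_t kept_nr;
        unsigned initialized : 1;       /* "initialization bit" for prepare_packed_git() */
};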

builtin/fast-import.c
builtin/grep.c
builtin/pack-objects.c
builtin/pack-redundant.c
http.c
object-file.c
object-name.c
odb.c
odb.h

diff --cc builtin/fast-import.c
Simple merge
diff --cc builtin/grep.c
Simple merge
diff --cc builtin/pack-objects.c
Simple merge
diff --cc builtin/pack-redundant.c
Simple merge
diff --cc http.c
Simple merge
diff --cc object-file.c
index 17a236d2fe121bc447f73138c9db4a651b07ca22,2bc36ab3ee8cbf2d83c4b3204a7c5df132b934d6..4675c8ed6b67eb8b1f054aa7326f380d9a0a29b5
@@@ -1331,274 -1243,6 +1331,274 @@@ static int index_core(struct index_stat
        return ret;
  }
  
-       reprepare_packed_git(repo);
 +static int already_written(struct odb_transaction *transaction,
 +                         struct object_id *oid)
 +{
 +      /* The object may already exist in the repository */
 +      if (odb_has_object(transaction->odb, oid,
 +                         HAS_OBJECT_RECHECK_PACKED | HAS_OBJECT_FETCH_PROMISOR))
 +              return 1;
 +
 +      /* Might want to keep the list sorted */
 +      for (uint32_t i = 0; i < transaction->packfile.nr_written; i++)
 +              if (oideq(&transaction->packfile.written[i]->oid, oid))
 +                      return 1;
 +
 +      /* This is a new object we need to keep */
 +      return 0;
 +}
 +
 +/* Lazily create backing packfile for the state */
 +static void prepare_packfile_transaction(struct odb_transaction *transaction,
 +                                       unsigned flags)
 +{
 +      struct transaction_packfile *state = &transaction->packfile;
 +      if (!(flags & INDEX_WRITE_OBJECT) || state->f)
 +              return;
 +
 +      state->f = create_tmp_packfile(transaction->odb->repo,
 +                                     &state->pack_tmp_name);
 +      reset_pack_idx_option(&state->pack_idx_opts);
 +
 +      /* Pretend we are going to write only one object */
 +      state->offset = write_pack_header(state->f, 1);
 +      if (!state->offset)
 +              die_errno("unable to write pack header");
 +}
 +
 +/*
 + * Read the contents from fd for size bytes, streaming it to the
 + * packfile in state while updating the hash in ctx. Signal a failure
 + * by returning a negative value when the resulting pack would exceed
 + * the pack size limit and this is not the first object in the pack,
 + * so that the caller can discard what we wrote from the current pack
 + * by truncating it and opening a new one. The caller will then call
 + * us again after rewinding the input fd.
 + *
 + * The already_hashed_to pointer is kept untouched by the caller to
 + * make sure we do not hash the same byte when we are called
 + * again. This way, the caller does not have to checkpoint its hash
 + * status before calling us just in case we ask it to call us again
 + * with a new pack.
 + */
 +static int stream_blob_to_pack(struct transaction_packfile *state,
 +                             struct git_hash_ctx *ctx, off_t *already_hashed_to,
 +                             int fd, size_t size, const char *path,
 +                             unsigned flags)
 +{
 +      git_zstream s;
 +      unsigned char ibuf[16384];
 +      unsigned char obuf[16384];
 +      unsigned hdrlen;
 +      int status = Z_OK;
 +      int write_object = (flags & INDEX_WRITE_OBJECT);
 +      off_t offset = 0;
 +
 +      git_deflate_init(&s, pack_compression_level);
 +
 +      hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB, size);
 +      s.next_out = obuf + hdrlen;
 +      s.avail_out = sizeof(obuf) - hdrlen;
 +
 +      while (status != Z_STREAM_END) {
 +              if (size && !s.avail_in) {
 +                      size_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf);
 +                      ssize_t read_result = read_in_full(fd, ibuf, rsize);
 +                      if (read_result < 0)
 +                              die_errno("failed to read from '%s'", path);
 +                      if ((size_t)read_result != rsize)
 +                              die("failed to read %u bytes from '%s'",
 +                                  (unsigned)rsize, path);
 +                      offset += rsize;
 +                      if (*already_hashed_to < offset) {
 +                              size_t hsize = offset - *already_hashed_to;
 +                              if (rsize < hsize)
 +                                      hsize = rsize;
 +                              if (hsize)
 +                                      git_hash_update(ctx, ibuf, hsize);
 +                              *already_hashed_to = offset;
 +                      }
 +                      s.next_in = ibuf;
 +                      s.avail_in = rsize;
 +                      size -= rsize;
 +              }
 +
 +              status = git_deflate(&s, size ? 0 : Z_FINISH);
 +
 +              if (!s.avail_out || status == Z_STREAM_END) {
 +                      if (write_object) {
 +                              size_t written = s.next_out - obuf;
 +
 +                              /* would we bust the size limit? */
 +                              if (state->nr_written &&
 +                                  pack_size_limit_cfg &&
 +                                  pack_size_limit_cfg < state->offset + written) {
 +                                      git_deflate_abort(&s);
 +                                      return -1;
 +                              }
 +
 +                              hashwrite(state->f, obuf, written);
 +                              state->offset += written;
 +                      }
 +                      s.next_out = obuf;
 +                      s.avail_out = sizeof(obuf);
 +              }
 +
 +              switch (status) {
 +              case Z_OK:
 +              case Z_BUF_ERROR:
 +              case Z_STREAM_END:
 +                      continue;
 +              default:
 +                      die("unexpected deflate failure: %d", status);
 +              }
 +      }
 +      git_deflate_end(&s);
 +      return 0;
 +}
 +
 +static void flush_packfile_transaction(struct odb_transaction *transaction)
 +{
 +      struct transaction_packfile *state = &transaction->packfile;
 +      struct repository *repo = transaction->odb->repo;
 +      unsigned char hash[GIT_MAX_RAWSZ];
 +      struct strbuf packname = STRBUF_INIT;
 +      char *idx_tmp_name = NULL;
 +
 +      if (!state->f)
 +              return;
 +
 +      if (state->nr_written == 0) {
 +              close(state->f->fd);
 +              free_hashfile(state->f);
 +              unlink(state->pack_tmp_name);
 +              goto clear_exit;
 +      } else if (state->nr_written == 1) {
 +              finalize_hashfile(state->f, hash, FSYNC_COMPONENT_PACK,
 +                                CSUM_HASH_IN_STREAM | CSUM_FSYNC | CSUM_CLOSE);
 +      } else {
 +              int fd = finalize_hashfile(state->f, hash, FSYNC_COMPONENT_PACK, 0);
 +              fixup_pack_header_footer(repo->hash_algo, fd, hash, state->pack_tmp_name,
 +                                       state->nr_written, hash,
 +                                       state->offset);
 +              close(fd);
 +      }
 +
 +      strbuf_addf(&packname, "%s/pack/pack-%s.",
 +                  repo_get_object_directory(transaction->odb->repo),
 +                  hash_to_hex_algop(hash, repo->hash_algo));
 +
 +      stage_tmp_packfiles(repo, &packname, state->pack_tmp_name,
 +                          state->written, state->nr_written, NULL,
 +                          &state->pack_idx_opts, hash, &idx_tmp_name);
 +      rename_tmp_packfile_idx(repo, &packname, &idx_tmp_name);
 +
 +      for (uint32_t i = 0; i < state->nr_written; i++)
 +              free(state->written[i]);
 +
 +clear_exit:
 +      free(idx_tmp_name);
 +      free(state->pack_tmp_name);
 +      free(state->written);
 +      memset(state, 0, sizeof(*state));
 +
 +      strbuf_release(&packname);
 +      /* Make objects we just wrote available to ourselves */
++      odb_reprepare(repo->objects);
 +}
 +
 +/*
 + * This writes the specified object to a packfile. Objects written here
 + * during the same transaction are written to the same packfile. The
 + * packfile is not flushed until the transaction is flushed. The caller
 + * is expected to ensure a valid transaction is setup for objects to be
 + * recorded to.
 + *
 + * This also bypasses the usual "convert-to-git" dance, and that is on
 + * purpose. We could write a streaming version of the converting
 + * functions and insert that before feeding the data to fast-import
 + * (or equivalent in-core API described above). However, that is
 + * somewhat complicated, as we do not know the size of the filter
 + * result, which we need to know beforehand when writing a git object.
 + * Since the primary motivation for trying to stream from the working
 + * tree file and to avoid mmaping it in core is to deal with large
 + * binary blobs, they generally do not want to get any conversion, and
 + * callers should avoid this code path when filters are requested.
 + */
 +static int index_blob_packfile_transaction(struct odb_transaction *transaction,
 +                                         struct object_id *result_oid, int fd,
 +                                         size_t size, const char *path,
 +                                         unsigned flags)
 +{
 +      struct transaction_packfile *state = &transaction->packfile;
 +      off_t seekback, already_hashed_to;
 +      struct git_hash_ctx ctx;
 +      unsigned char obuf[16384];
 +      unsigned header_len;
 +      struct hashfile_checkpoint checkpoint;
 +      struct pack_idx_entry *idx = NULL;
 +
 +      seekback = lseek(fd, 0, SEEK_CUR);
 +      if (seekback == (off_t)-1)
 +              return error("cannot find the current offset");
 +
 +      header_len = format_object_header((char *)obuf, sizeof(obuf),
 +                                        OBJ_BLOB, size);
 +      transaction->odb->repo->hash_algo->init_fn(&ctx);
 +      git_hash_update(&ctx, obuf, header_len);
 +
 +      /* Note: idx is non-NULL when we are writing */
 +      if ((flags & INDEX_WRITE_OBJECT) != 0) {
 +              CALLOC_ARRAY(idx, 1);
 +
 +              prepare_packfile_transaction(transaction, flags);
 +              hashfile_checkpoint_init(state->f, &checkpoint);
 +      }
 +
 +      already_hashed_to = 0;
 +
 +      while (1) {
 +              prepare_packfile_transaction(transaction, flags);
 +              if (idx) {
 +                      hashfile_checkpoint(state->f, &checkpoint);
 +                      idx->offset = state->offset;
 +                      crc32_begin(state->f);
 +              }
 +              if (!stream_blob_to_pack(state, &ctx, &already_hashed_to,
 +                                       fd, size, path, flags))
 +                      break;
 +              /*
 +               * Writing this object to the current pack will make
 +               * it too big; we need to truncate it, start a new
 +               * pack, and write into it.
 +               */
 +              if (!idx)
 +                      BUG("should not happen");
 +              hashfile_truncate(state->f, &checkpoint);
 +              state->offset = checkpoint.offset;
 +              flush_packfile_transaction(transaction);
 +              if (lseek(fd, seekback, SEEK_SET) == (off_t)-1)
 +                      return error("cannot seek back");
 +      }
 +      git_hash_final_oid(result_oid, &ctx);
 +      if (!idx)
 +              return 0;
 +
 +      idx->crc32 = crc32_end(state->f);
 +      if (already_written(transaction, result_oid)) {
 +              hashfile_truncate(state->f, &checkpoint);
 +              state->offset = checkpoint.offset;
 +              free(idx);
 +      } else {
 +              oidcpy(&idx->oid, result_oid);
 +              ALLOC_GROW(state->written,
 +                         state->nr_written + 1,
 +                         state->alloc_written);
 +              state->written[state->nr_written++] = idx;
 +      }
 +      return 0;
 +}
 +
  int index_fd(struct index_state *istate, struct object_id *oid,
             int fd, struct stat *st,
             enum object_type type, const char *path, unsigned flags)
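
The transaction-scoped writers above are file-local to object-file.c; callers reach them through the `odb_transaction_begin()`/`odb_transaction_commit()` pair declared in the odb.h hunk further down. A hedged usage sketch (the helper and its body are illustrative; only the begin/commit pair comes from this merge):

/*
 * Sketch: batch several object writes so they end up in one shared
 * packfile. Everything except odb_transaction_begin() and
 * odb_transaction_commit() is assumed.
 */
static void import_blobs(struct repository *repo)
{
        struct odb_transaction *tx = odb_transaction_begin(repo->objects);

        /*
         * tx is NULL if a transaction is already pending; the writes
         * below then join that pending transaction instead.
         */

        /* ... write objects through the usual entry points ... */

        odb_transaction_commit(tx); /* no-op when tx is NULL */
}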
diff --cc object-name.c
Simple merge
diff --cc odb.c
index 1fc14888919684c2ef75ef8ef40b3385c29c7f6b,65a6cc67b61ccf99e7b7293906e6772f2927da9b..00a6e71568b5985c0b344bfebbe92d1e0bff1294
--- 1/odb.c
--- 2/odb.c
+++ b/odb.c
@@@ -1035,29 -1034,34 +1034,44 @@@ void odb_clear(struct object_database *
                free((char *) o->cached_objects[i].value.buf);
        FREE_AND_NULL(o->cached_objects);
  
-       INIT_LIST_HEAD(&o->packed_git_mru);
        close_object_store(o);
+       packfile_store_free(o->packfiles);
+       o->packfiles = NULL;
+       string_list_clear(&o->submodule_source_paths, 0);
+ }
+ 
+ void odb_reprepare(struct object_database *o)
+ {
+       struct odb_source *source;
+       obj_read_lock();
  
        /*
-        * `close_object_store()` only closes the packfiles, but doesn't free
-        * them. We thus have to do this manually.
+        * Reprepare alt odbs, in case the alternates file was modified
+        * during the course of this process. This only _adds_ odbs to
+        * the linked list, so existing odbs will continue to exist for
+        * the lifetime of the process.
         */
-       for (struct packed_git *p = o->packed_git, *next; p; p = next) {
-               next = p->next;
-               free(p);
-       }
-       o->packed_git = NULL;
+       o->loaded_alternates = 0;
+       odb_prepare_alternates(o);
  
-       hashmap_clear(&o->pack_map);
-       string_list_clear(&o->submodule_source_paths, 0);
+       for (source = o->sources; source; source = source->next)
+               odb_clear_loose_cache(source);
+       o->approximate_object_count_valid = 0;
+       packfile_store_reprepare(o->packfiles);
+       obj_read_unlock();
  }
 +
 +struct odb_transaction *odb_transaction_begin(struct object_database *odb)
 +{
 +      return object_file_transaction_begin(odb->sources);
 +}
 +
 +void odb_transaction_commit(struct odb_transaction *transaction)
 +{
 +      object_file_transaction_commit(transaction);
 +}
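
`odb_reprepare()` above takes over from the old `reprepare_packed_git()`: under `obj_read_lock()` it reloads alternates, clears the per-source loose-object caches, invalidates the approximate object count, and re-scans packs via `packfile_store_reprepare()`. A hedged sketch of the classic caller pattern this serves (the helper name and the flags value 0 are assumptions):

/*
 * Sketch: retry a lookup after reloading object sources, for the
 * case where a concurrent writer added a pack behind our back.
 */
static int has_object_with_retry(struct repository *repo,
                                 const struct object_id *oid)
{
        if (odb_has_object(repo->objects, oid, 0))
                return 1;
        odb_reprepare(repo->objects); /* pick up newly appeared packs */
        return odb_has_object(repo->objects, oid, 0);
}

In-tree callers can get a similar packed-storage recheck in a single call via the `HAS_OBJECT_RECHECK_PACKED` flag seen in the object-file.c hunk above.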
diff --cc odb.h
index 9e3dd9d6df9db78ce4388bb322b60fea35f7041f,ab39e3605d541997d8b12fbf14bc9e78def515a5..7a3cfd34e149247491d242119db42d866b4efedb
--- 1/odb.h
--- 2/odb.h
+++ b/odb.h
@@@ -91,8 -90,8 +90,9 @@@ struct odb_source 
  };
  
  struct packed_git;
+ struct packfile_store;
  struct cached_object_entry;
 +struct odb_transaction;
  
  /*
   * The object database encapsulates access to objects in a repository. It
@@@ -192,20 -160,13 +168,26 @@@ struct object_database 
  
  struct object_database *odb_new(struct repository *repo);
  void odb_clear(struct object_database *o);
+ /*
+  * Clear caches, reload alternates and then reload object sources so that new
+  * objects may become accessible.
+  */
+ void odb_reprepare(struct object_database *o);
  
 +/*
 + * Starts an ODB transaction. Subsequent objects are written to the transaction
 + * and not committed until odb_transaction_commit() is invoked on the
 + * transaction. If the ODB already has a pending transaction, NULL is returned.
 + */
 +struct odb_transaction *odb_transaction_begin(struct object_database *odb);
 +
 +/*
 + * Commits an ODB transaction making the written objects visible. If the
 + * specified transaction is NULL, the function is a no-op.
 + */
 +void odb_transaction_commit(struct odb_transaction *transaction);
 +
  /*
   * Find source by its object directory path. Returns a `NULL` pointer in case
   * the source could not be found.
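
The begin/commit contract documented above composes across call chains: an inner begin while a transaction is pending returns NULL, and committing NULL is a no-op, so only the outermost owner actually flushes. A hedged sketch of that rule (both helpers are illustrative):

static void inner_step(struct object_database *odb)
{
        /* NULL here whenever a transaction is already pending */
        struct odb_transaction *tx = odb_transaction_begin(odb);
        /* ... write objects; they join the pending transaction ... */
        odb_transaction_commit(tx); /* no-op for NULL */
}

static void outer_step(struct object_database *odb)
{
        struct odb_transaction *tx = odb_transaction_begin(odb);
        inner_step(odb);
        odb_transaction_commit(tx); /* the single flush point */
}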