From: Junio C Hamano Date: Mon, 29 Sep 2025 18:46:45 +0000 (-0700) Subject: Merge branch 'ps/packfile-store' into next X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=342bb57fc27401ce6abd975ebb73479d9b8086c7;p=thirdparty%2Fgit.git Merge branch 'ps/packfile-store' into next Code clean-up around the in-core list of all the pack files and object database(s). * ps/packfile-store: packfile: refactor `get_packed_git_mru()` to work on packfile store packfile: refactor `get_all_packs()` to work on packfile store packfile: refactor `get_packed_git()` to work on packfile store packfile: move `get_multi_pack_index()` into "midx.c" packfile: introduce function to load and add packfiles packfile: refactor `install_packed_git()` to work on packfile store packfile: split up responsibilities of `reprepare_packed_git()` packfile: refactor `prepare_packed_git()` to work on packfile store packfile: reorder functions to avoid function declaration odb: move kept cache into `struct packfile_store` odb: move MRU list of packfiles into `struct packfile_store` odb: move packfile map into `struct packfile_store` odb: move initialization bit into `struct packfile_store` odb: move list of packfiles into `struct packfile_store` packfile: introduce a new `struct packfile_store` --- 342bb57fc27401ce6abd975ebb73479d9b8086c7 diff --cc object-file.c index 17a236d2fe,2bc36ab3ee..4675c8ed6b --- a/object-file.c +++ b/object-file.c @@@ -1331,274 -1243,6 +1331,274 @@@ static int index_core(struct index_stat return ret; } +static int already_written(struct odb_transaction *transaction, + struct object_id *oid) +{ + /* The object may already exist in the repository */ + if (odb_has_object(transaction->odb, oid, + HAS_OBJECT_RECHECK_PACKED | HAS_OBJECT_FETCH_PROMISOR)) + return 1; + + /* Might want to keep the list sorted */ + for (uint32_t i = 0; i < transaction->packfile.nr_written; i++) + if (oideq(&transaction->packfile.written[i]->oid, oid)) + return 1; + + /* This is a new object we need to keep */ + return 0; +} + +/* Lazily create backing packfile for the state */ +static void prepare_packfile_transaction(struct odb_transaction *transaction, + unsigned flags) +{ + struct transaction_packfile *state = &transaction->packfile; + if (!(flags & INDEX_WRITE_OBJECT) || state->f) + return; + + state->f = create_tmp_packfile(transaction->odb->repo, + &state->pack_tmp_name); + reset_pack_idx_option(&state->pack_idx_opts); + + /* Pretend we are going to write only one object */ + state->offset = write_pack_header(state->f, 1); + if (!state->offset) + die_errno("unable to write pack header"); +} + +/* + * Read the contents from fd for size bytes, streaming it to the + * packfile in state while updating the hash in ctx. Signal a failure + * by returning a negative value when the resulting pack would exceed + * the pack size limit and this is not the first object in the pack, + * so that the caller can discard what we wrote from the current pack + * by truncating it and opening a new one. The caller will then call + * us again after rewinding the input fd. + * + * The already_hashed_to pointer is kept untouched by the caller to + * make sure we do not hash the same byte when we are called + * again. This way, the caller does not have to checkpoint its hash + * status before calling us just in case we ask it to call us again + * with a new pack. + */ +static int stream_blob_to_pack(struct transaction_packfile *state, + struct git_hash_ctx *ctx, off_t *already_hashed_to, + int fd, size_t size, const char *path, + unsigned flags) +{ + git_zstream s; + unsigned char ibuf[16384]; + unsigned char obuf[16384]; + unsigned hdrlen; + int status = Z_OK; + int write_object = (flags & INDEX_WRITE_OBJECT); + off_t offset = 0; + + git_deflate_init(&s, pack_compression_level); + + hdrlen = encode_in_pack_object_header(obuf, sizeof(obuf), OBJ_BLOB, size); + s.next_out = obuf + hdrlen; + s.avail_out = sizeof(obuf) - hdrlen; + + while (status != Z_STREAM_END) { + if (size && !s.avail_in) { + size_t rsize = size < sizeof(ibuf) ? size : sizeof(ibuf); + ssize_t read_result = read_in_full(fd, ibuf, rsize); + if (read_result < 0) + die_errno("failed to read from '%s'", path); + if ((size_t)read_result != rsize) + die("failed to read %u bytes from '%s'", + (unsigned)rsize, path); + offset += rsize; + if (*already_hashed_to < offset) { + size_t hsize = offset - *already_hashed_to; + if (rsize < hsize) + hsize = rsize; + if (hsize) + git_hash_update(ctx, ibuf, hsize); + *already_hashed_to = offset; + } + s.next_in = ibuf; + s.avail_in = rsize; + size -= rsize; + } + + status = git_deflate(&s, size ? 0 : Z_FINISH); + + if (!s.avail_out || status == Z_STREAM_END) { + if (write_object) { + size_t written = s.next_out - obuf; + + /* would we bust the size limit? */ + if (state->nr_written && + pack_size_limit_cfg && + pack_size_limit_cfg < state->offset + written) { + git_deflate_abort(&s); + return -1; + } + + hashwrite(state->f, obuf, written); + state->offset += written; + } + s.next_out = obuf; + s.avail_out = sizeof(obuf); + } + + switch (status) { + case Z_OK: + case Z_BUF_ERROR: + case Z_STREAM_END: + continue; + default: + die("unexpected deflate failure: %d", status); + } + } + git_deflate_end(&s); + return 0; +} + +static void flush_packfile_transaction(struct odb_transaction *transaction) +{ + struct transaction_packfile *state = &transaction->packfile; + struct repository *repo = transaction->odb->repo; + unsigned char hash[GIT_MAX_RAWSZ]; + struct strbuf packname = STRBUF_INIT; + char *idx_tmp_name = NULL; + + if (!state->f) + return; + + if (state->nr_written == 0) { + close(state->f->fd); + free_hashfile(state->f); + unlink(state->pack_tmp_name); + goto clear_exit; + } else if (state->nr_written == 1) { + finalize_hashfile(state->f, hash, FSYNC_COMPONENT_PACK, + CSUM_HASH_IN_STREAM | CSUM_FSYNC | CSUM_CLOSE); + } else { + int fd = finalize_hashfile(state->f, hash, FSYNC_COMPONENT_PACK, 0); + fixup_pack_header_footer(repo->hash_algo, fd, hash, state->pack_tmp_name, + state->nr_written, hash, + state->offset); + close(fd); + } + + strbuf_addf(&packname, "%s/pack/pack-%s.", + repo_get_object_directory(transaction->odb->repo), + hash_to_hex_algop(hash, repo->hash_algo)); + + stage_tmp_packfiles(repo, &packname, state->pack_tmp_name, + state->written, state->nr_written, NULL, + &state->pack_idx_opts, hash, &idx_tmp_name); + rename_tmp_packfile_idx(repo, &packname, &idx_tmp_name); + + for (uint32_t i = 0; i < state->nr_written; i++) + free(state->written[i]); + +clear_exit: + free(idx_tmp_name); + free(state->pack_tmp_name); + free(state->written); + memset(state, 0, sizeof(*state)); + + strbuf_release(&packname); + /* Make objects we just wrote available to ourselves */ - reprepare_packed_git(repo); ++ odb_reprepare(repo->objects); +} + +/* + * This writes the specified object to a packfile. Objects written here + * during the same transaction are written to the same packfile. The + * packfile is not flushed until the transaction is flushed. The caller + * is expected to ensure a valid transaction is setup for objects to be + * recorded to. + * + * This also bypasses the usual "convert-to-git" dance, and that is on + * purpose. We could write a streaming version of the converting + * functions and insert that before feeding the data to fast-import + * (or equivalent in-core API described above). However, that is + * somewhat complicated, as we do not know the size of the filter + * result, which we need to know beforehand when writing a git object. + * Since the primary motivation for trying to stream from the working + * tree file and to avoid mmaping it in core is to deal with large + * binary blobs, they generally do not want to get any conversion, and + * callers should avoid this code path when filters are requested. + */ +static int index_blob_packfile_transaction(struct odb_transaction *transaction, + struct object_id *result_oid, int fd, + size_t size, const char *path, + unsigned flags) +{ + struct transaction_packfile *state = &transaction->packfile; + off_t seekback, already_hashed_to; + struct git_hash_ctx ctx; + unsigned char obuf[16384]; + unsigned header_len; + struct hashfile_checkpoint checkpoint; + struct pack_idx_entry *idx = NULL; + + seekback = lseek(fd, 0, SEEK_CUR); + if (seekback == (off_t)-1) + return error("cannot find the current offset"); + + header_len = format_object_header((char *)obuf, sizeof(obuf), + OBJ_BLOB, size); + transaction->odb->repo->hash_algo->init_fn(&ctx); + git_hash_update(&ctx, obuf, header_len); + + /* Note: idx is non-NULL when we are writing */ + if ((flags & INDEX_WRITE_OBJECT) != 0) { + CALLOC_ARRAY(idx, 1); + + prepare_packfile_transaction(transaction, flags); + hashfile_checkpoint_init(state->f, &checkpoint); + } + + already_hashed_to = 0; + + while (1) { + prepare_packfile_transaction(transaction, flags); + if (idx) { + hashfile_checkpoint(state->f, &checkpoint); + idx->offset = state->offset; + crc32_begin(state->f); + } + if (!stream_blob_to_pack(state, &ctx, &already_hashed_to, + fd, size, path, flags)) + break; + /* + * Writing this object to the current pack will make + * it too big; we need to truncate it, start a new + * pack, and write into it. + */ + if (!idx) + BUG("should not happen"); + hashfile_truncate(state->f, &checkpoint); + state->offset = checkpoint.offset; + flush_packfile_transaction(transaction); + if (lseek(fd, seekback, SEEK_SET) == (off_t)-1) + return error("cannot seek back"); + } + git_hash_final_oid(result_oid, &ctx); + if (!idx) + return 0; + + idx->crc32 = crc32_end(state->f); + if (already_written(transaction, result_oid)) { + hashfile_truncate(state->f, &checkpoint); + state->offset = checkpoint.offset; + free(idx); + } else { + oidcpy(&idx->oid, result_oid); + ALLOC_GROW(state->written, + state->nr_written + 1, + state->alloc_written); + state->written[state->nr_written++] = idx; + } + return 0; +} + int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags) diff --cc odb.c index 1fc1488891,65a6cc67b6..00a6e71568 --- a/odb.c +++ b/odb.c @@@ -1035,29 -1034,34 +1034,44 @@@ void odb_clear(struct object_database * free((char *) o->cached_objects[i].value.buf); FREE_AND_NULL(o->cached_objects); - INIT_LIST_HEAD(&o->packed_git_mru); close_object_store(o); + packfile_store_free(o->packfiles); + o->packfiles = NULL; + + string_list_clear(&o->submodule_source_paths, 0); + } + + void odb_reprepare(struct object_database *o) + { + struct odb_source *source; + + obj_read_lock(); /* - * `close_object_store()` only closes the packfiles, but doesn't free - * them. We thus have to do this manually. + * Reprepare alt odbs, in case the alternates file was modified + * during the course of this process. This only _adds_ odbs to + * the linked list, so existing odbs will continue to exist for + * the lifetime of the process. */ - for (struct packed_git *p = o->packed_git, *next; p; p = next) { - next = p->next; - free(p); - } - o->packed_git = NULL; + o->loaded_alternates = 0; + odb_prepare_alternates(o); - hashmap_clear(&o->pack_map); - string_list_clear(&o->submodule_source_paths, 0); + for (source = o->sources; source; source = source->next) + odb_clear_loose_cache(source); + + o->approximate_object_count_valid = 0; + + packfile_store_reprepare(o->packfiles); + + obj_read_unlock(); } + +struct odb_transaction *odb_transaction_begin(struct object_database *odb) +{ + return object_file_transaction_begin(odb->sources); +} + +void odb_transaction_commit(struct odb_transaction *transaction) +{ + object_file_transaction_commit(transaction); +} diff --cc odb.h index 9e3dd9d6df,ab39e3605d..7a3cfd34e1 --- a/odb.h +++ b/odb.h @@@ -91,8 -90,8 +90,9 @@@ struct odb_source }; struct packed_git; + struct packfile_store; struct cached_object_entry; +struct odb_transaction; /* * The object database encapsulates access to objects in a repository. It @@@ -192,20 -160,13 +168,26 @@@ struct object_database struct object_database *odb_new(struct repository *repo); void odb_clear(struct object_database *o); + + /* + * Clear caches, reload alternates and then reload object sources so that new + * objects may become accessible. + */ + void odb_reprepare(struct object_database *o); +/* + * Starts an ODB transaction. Subsequent objects are written to the transaction + * and not committed until odb_transaction_commit() is invoked on the + * transaction. If the ODB already has a pending transaction, NULL is returned. + */ +struct odb_transaction *odb_transaction_begin(struct object_database *odb); + +/* + * Commits an ODB transaction making the written objects visible. If the + * specified transaction is NULL, the function is a no-op. + */ +void odb_transaction_commit(struct odb_transaction *transaction); + /* * Find source by its object directory path. Returns a `NULL` pointer in case * the source could not be found.