git.ipfire.org Git - thirdparty/git.git/commitdiff
odb, packfile: use size_t for streaming object sizes
author: Johannes Schindelin <johannes.schindelin@gmx.de>
Fri, 8 May 2026 08:16:41 +0000 (08:16 +0000)
committer: Junio C Hamano <gitster@pobox.com>
Sat, 9 May 2026 02:25:31 +0000 (11:25 +0900)
The odb_read_stream structure uses unsigned long for the size field,
which is 32-bit on Windows even in 64-bit builds. When streaming
objects larger than 4GB, the size would be truncated to zero or an
incorrect value, resulting in empty files being written to disk.

Change the size field in odb_read_stream to size_t and widen
unpack_object_header() and unpack_object_header_buffer() to return
sizes via a size_t pointer. Since object_info.sizep remains unsigned
long for API compatibility, use temporary variables where the types
differ, with comments noting the truncation limitation for code paths
that still use unsigned long.

Widening the producers to size_t in this way introduces a handful of
silent size_t -> unsigned long narrowings on Windows, all in
builtin/pack-objects.c, where the consumers are still typed
unsigned long. Make those narrowings explicit with
cast_size_t_to_ulong() so they assert loudly the moment an object
actually exceeds ULONG_MAX bytes:

  - oe_get_size_slow() returns unsigned long but holds a size_t
    locally; cast at the return.
  - write_reuse_object() passes a size_t into check_pack_inflate(),
    whose expect parameter is unsigned long; cast at the call.
  - check_object() routes a size_t through SET_SIZE() and
    SET_DELTA_SIZE(), both of which take unsigned long via
    oe_set_size() / oe_set_delta_size(); cast at the three call
    sites in the OBJ_OFS_DELTA / OBJ_REF_DELTA branches and in the
    non-delta default arm.

The cast-only treatment is deliberately a stop-gap. Properly
widening oe_set_size, oe_get_size_slow's return type,
check_pack_inflate's expect parameter, object_info.sizep,
patch_delta, and the OE_SIZE_BITS bit-fields cascades into a series
that is too large to be reviewable, so the proper widening is
deferred to a follow-up topic. Until then,
cast_size_t_to_ulong() at least makes the truncation explicit at
the source: it documents the boundary, and on a 64-bit non-Windows
platform it is a no-op.

This was originally authored by LordKiRon <https://github.com/LordKiRon>,
who preferred not to reveal their real name and therefore agreed that I
take over authorship.

Helped-by: Torsten Bögershausen <tboegi@web.de>
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
builtin/pack-objects.c
object-file.c
odb/streaming.c
odb/streaming.h
oss-fuzz/fuzz-pack-headers.c
pack-bitmap.c
pack-check.c
packfile.c
packfile.h

index dd2480a73d2edf0eca3be11029e49650877d6ab3..480cc0bd8c8d227caa875e9a2293c9df73f6f547 100644 (file)
@@ -629,14 +629,21 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
        struct packed_git *p = IN_PACK(entry);
        struct pack_window *w_curs = NULL;
        uint32_t pos;
-       off_t offset;
+       off_t offset, cur;
        enum object_type type = oe_type(entry);
+       enum object_type in_pack_type;
        off_t datalen;
        unsigned char header[MAX_PACK_OBJECT_HEADER],
                      dheader[MAX_PACK_OBJECT_HEADER];
        unsigned hdrlen;
        const unsigned hashsz = the_hash_algo->rawsz;
-       unsigned long entry_size = SIZE(entry);
+       size_t entry_size;
+
+       cur = entry->in_pack_offset;
+       in_pack_type = unpack_object_header(p, &w_curs, &cur, &entry_size);
+       if (in_pack_type < 0)
+               die(_("write_reuse_object: unable to parse object header of %s"),
+                   oid_to_hex(&entry->idx.oid));
 
        if (DELTA(entry))
                type = (allow_ofs_delta && DELTA(entry)->idx.offset) ?
@@ -664,7 +671,8 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
        datalen -= entry->in_pack_header_size;
 
        if (!pack_to_stdout && p->index_version == 1 &&
-           check_pack_inflate(p, &w_curs, offset, datalen, entry_size)) {
+           check_pack_inflate(p, &w_curs, offset, datalen,
+                              cast_size_t_to_ulong(entry_size))) {
                error(_("corrupt packed object for %s"),
                      oid_to_hex(&entry->idx.oid));
                unuse_pack(&w_curs);
@@ -1087,7 +1095,7 @@ static void write_reused_pack_one(struct packed_git *reuse_packfile,
 {
        off_t offset, next, cur;
        enum object_type type;
-       unsigned long size;
+       size_t size;
 
        offset = pack_pos_to_offset(reuse_packfile, pos);
        next = pack_pos_to_offset(reuse_packfile, pos + 1);
@@ -2243,7 +2251,7 @@ static void check_object(struct object_entry *entry, uint32_t object_index)
                off_t ofs;
                unsigned char *buf, c;
                enum object_type type;
-               unsigned long in_pack_size;
+               size_t in_pack_size;
 
                buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail);
 
@@ -2270,7 +2278,7 @@ static void check_object(struct object_entry *entry, uint32_t object_index)
                default:
                        /* Not a delta hence we've already got all we need. */
                        oe_set_type(entry, entry->in_pack_type);
-                       SET_SIZE(entry, in_pack_size);
+                       SET_SIZE(entry, cast_size_t_to_ulong(in_pack_size));
                        entry->in_pack_header_size = used;
                        if (oe_type(entry) < OBJ_COMMIT || oe_type(entry) > OBJ_BLOB)
                                goto give_up;
@@ -2324,8 +2332,8 @@ static void check_object(struct object_entry *entry, uint32_t object_index)
                if (have_base &&
                    can_reuse_delta(&base_ref, entry, &base_entry)) {
                        oe_set_type(entry, entry->in_pack_type);
-                       SET_SIZE(entry, in_pack_size); /* delta size */
-                       SET_DELTA_SIZE(entry, in_pack_size);
+                       SET_SIZE(entry, cast_size_t_to_ulong(in_pack_size)); /* delta size */
+                       SET_DELTA_SIZE(entry, cast_size_t_to_ulong(in_pack_size));
 
                        if (base_entry) {
                                SET_DELTA(entry, base_entry);
@@ -2734,16 +2742,18 @@ unsigned long oe_get_size_slow(struct packing_data *pack,
        struct pack_window *w_curs;
        unsigned char *buf;
        enum object_type type;
-       unsigned long used, avail, size;
+       unsigned long used, avail;
+       size_t size;
 
        if (e->type_ != OBJ_OFS_DELTA && e->type_ != OBJ_REF_DELTA) {
+               unsigned long sz;
                packing_data_lock(&to_pack);
                if (odb_read_object_info(the_repository->objects,
-                                        &e->idx.oid, &size) < 0)
+                                        &e->idx.oid, &sz) < 0)
                        die(_("unable to get size of %s"),
                            oid_to_hex(&e->idx.oid));
                packing_data_unlock(&to_pack);
-               return size;
+               return sz;
        }
 
        p = oe_in_pack(pack, e);
@@ -2760,7 +2770,7 @@ unsigned long oe_get_size_slow(struct packing_data *pack,
 
        unuse_pack(&w_curs);
        packing_data_unlock(&to_pack);
-       return size;
+       return cast_size_t_to_ulong(size);
 }
 
 static int try_delta(struct unpacked *trg, struct unpacked *src,
index 086b2b65ffe65e0cb7c03c21fd201a0411bb87ab..0be2981c7a1f431de0393a2339515be19cc5eca8 100644 (file)
@@ -2326,6 +2326,7 @@ int odb_source_loose_read_object_stream(struct odb_read_stream **out,
        struct object_info oi = OBJECT_INFO_INIT;
        struct odb_loose_read_stream *st;
        unsigned long mapsize;
+       unsigned long size_ul;
        void *mapped;
 
        mapped = odb_source_loose_map_object(source, oid, &mapsize);
@@ -2349,11 +2350,18 @@ int odb_source_loose_read_object_stream(struct odb_read_stream **out,
                goto error;
        }
 
-       oi.sizep = &st->base.size;
+       /*
+        * object_info.sizep is unsigned long* (32-bit on Windows), but
+        * st->base.size is size_t (64-bit). Use temporary variable.
+        * Note: loose objects >4GB would still truncate here, but such
+        * large loose objects are uncommon (they'd normally be packed).
+        */
+       oi.sizep = &size_ul;
        oi.typep = &st->base.type;
 
        if (parse_loose_header(st->hdr, &oi) < 0 || st->base.type < 0)
                goto error;
+       st->base.size = size_ul;
 
        st->mapped = mapped;
        st->mapsize = mapsize;
index 5927a12954ba5901b0e557514eafbdbc6d987f0e..af2adf5ce786d66c3a0e9dfd9f22b4e98291d9f4 100644 (file)
@@ -157,15 +157,26 @@ static int open_istream_incore(struct odb_read_stream **out,
                .base.read = read_istream_incore,
        };
        struct odb_incore_read_stream *st;
+       unsigned long size_ul;
        int ret;
 
        oi.typep = &stream.base.type;
-       oi.sizep = &stream.base.size;
+       /*
+        * object_info.sizep is unsigned long* (32-bit on Windows), but
+        * stream.base.size is size_t (64-bit). We use a temporary variable
+        * because the types are incompatible. Note: this path still truncates
+        * for >4GB objects, but large objects should use pack streaming
+        * (packfile_store_read_object_stream) which handles size_t properly.
+        * This incore fallback is only used for small objects or when pack
+        * streaming is unavailable.
+        */
+       oi.sizep = &size_ul;
        oi.contentp = (void **)&stream.buf;
        ret = odb_read_object_info_extended(odb, oid, &oi,
                                            OBJECT_INFO_DIE_IF_CORRUPT);
        if (ret)
                return ret;
+       stream.base.size = size_ul;
 
        CALLOC_ARRAY(st, 1);
        *st = stream;
index c7861f7e13c606af66d5b54b52b7b1cc3eb9adad..517e2ea2d3f5c300748907e7ef99ab8622115686 100644 (file)
@@ -21,7 +21,7 @@ struct odb_read_stream {
        odb_read_stream_close_fn close;
        odb_read_stream_read_fn read;
        enum object_type type;
-       unsigned long size; /* inflated size of full object */
+       size_t size; /* inflated size of full object */
 };
 
 /*
index 150c0f5fa2d7ec2b9dd6a14f6000e5b58b753aa9..ef61ab577c5098ae5cc1eeb7f6b52be4d7955a21 100644 (file)
@@ -6,7 +6,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size);
 int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
 {
        enum object_type type;
-       unsigned long len;
+       size_t len;
 
        unpack_object_header_buffer((const unsigned char *)data,
                                    (unsigned long)size, &type, &len);
index f6ec18d83afe211783dbc59203b0f6de088887fe..f9af8a96bdf4eed313bb742b7b845eb88567b752 100644 (file)
@@ -2270,7 +2270,7 @@ static int try_partial_reuse(struct bitmap_index *bitmap_git,
 {
        off_t delta_obj_offset;
        enum object_type type;
-       unsigned long size;
+       size_t size;
 
        if (pack_pos >= pack->p->num_objects)
                return -1; /* not actually in the pack */
index 79992bb509f4735e781595177b1b06cf79ed7822..2792f34d2595bf486222f0ac4fbbeb40e61debc4 100644 (file)
@@ -110,7 +110,7 @@ static int verify_packfile(struct repository *r,
                void *data;
                struct object_id oid;
                enum object_type type;
-               unsigned long size;
+               size_t size;
                off_t curpos;
                int data_valid;
 
@@ -143,7 +143,9 @@ static int verify_packfile(struct repository *r,
                        data = NULL;
                        data_valid = 0;
                } else {
-                       data = unpack_entry(r, p, entries[i].offset, &type, &size);
+                       unsigned long sz;
+                       data = unpack_entry(r, p, entries[i].offset, &type, &sz);
+                       size = sz;
                        data_valid = 1;
                }
 
index b012d648adaf2e7f0a76fe539a0d0899b072249d..fdae91dd110682a808620353866e6c45c89bd1be 100644 (file)
@@ -1133,7 +1133,7 @@ out:
 }
 
 unsigned long unpack_object_header_buffer(const unsigned char *buf,
-               unsigned long len, enum object_type *type, unsigned long *sizep)
+               unsigned long len, enum object_type *type, size_t *sizep)
 {
        unsigned shift;
        size_t size, c;
@@ -1144,7 +1144,11 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf,
        size = c & 15;
        shift = 4;
        while (c & 0x80) {
-               if (len <= used || (bitsizeof(long) - 7) < shift) {
+               /*
+                * Each continuation byte adds 7 bits. Ensure shift won't
+                * overflow size_t (use size_t not long for 64-bit on Windows).
+                */
+               if (len <= used || (bitsizeof(size_t) - 7) < shift) {
                        error("bad object header");
                        size = used = 0;
                        break;
@@ -1153,7 +1157,7 @@ unsigned long unpack_object_header_buffer(const unsigned char *buf,
                size = st_add(size, st_left_shift(c & 0x7f, shift));
                shift += 7;
        }
-       *sizep = cast_size_t_to_ulong(size);
+       *sizep = size;
        return used;
 }
 
@@ -1215,7 +1219,7 @@ unsigned long get_size_from_delta(struct packed_git *p,
 int unpack_object_header(struct packed_git *p,
                         struct pack_window **w_curs,
                         off_t *curpos,
-                        unsigned long *sizep)
+                        size_t *sizep)
 {
        unsigned char *base;
        unsigned long left;
@@ -1367,7 +1371,7 @@ static enum object_type packed_to_object_type(struct repository *r,
 
        while (type == OBJ_OFS_DELTA || type == OBJ_REF_DELTA) {
                off_t base_offset;
-               unsigned long size;
+               size_t size;
                /* Push the object we're going to leave behind */
                if (poi_stack_nr >= poi_stack_alloc && poi_stack == small_poi_stack) {
                        poi_stack_alloc = alloc_nr(poi_stack_nr);
@@ -1586,7 +1590,7 @@ static int packed_object_info_with_index_pos(struct packed_git *p, off_t obj_off
                                             uint32_t *maybe_index_pos, struct object_info *oi)
 {
        struct pack_window *w_curs = NULL;
-       unsigned long size;
+       size_t size;
        off_t curpos = obj_offset;
        enum object_type type = OBJ_NONE;
        uint32_t pack_pos;
@@ -1778,7 +1782,7 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset,
        struct pack_window *w_curs = NULL;
        off_t curpos = obj_offset;
        void *data = NULL;
-       unsigned long size;
+       size_t size;
        enum object_type type;
        struct unpack_entry_stack_ent small_delta_stack[UNPACK_ENTRY_STACK_PREALLOC];
        struct unpack_entry_stack_ent *delta_stack = small_delta_stack;
@@ -1943,8 +1947,10 @@ void *unpack_entry(struct repository *r, struct packed_git *p, off_t obj_offset,
                              (uintmax_t)curpos, p->pack_name);
                        data = NULL;
                } else {
+                       unsigned long sz;
                        data = patch_delta(base, base_size, delta_data,
-                                          delta_size, &size);
+                                          delta_size, &sz);
+                       size = sz;
 
                        /*
                         * We could not apply the delta; warn the user, but
@@ -2929,7 +2935,7 @@ int packfile_read_object_stream(struct odb_read_stream **out,
        struct odb_packed_read_stream *stream;
        struct pack_window *window = NULL;
        enum object_type in_pack_type;
-       unsigned long size;
+       size_t size;
 
        in_pack_type = unpack_object_header(pack, &window, &offset, &size);
        unuse_pack(&window);
index 9b647da7dda7c194d0a4f233adb5a3c9c0d56d14..49d6bdecf6ea185534419f3fe00fb9d20d894fbe 100644 (file)
@@ -456,9 +456,9 @@ off_t find_pack_entry_one(const struct object_id *oid, struct packed_git *);
 
 int is_pack_valid(struct packed_git *);
 void *unpack_entry(struct repository *r, struct packed_git *, off_t, enum object_type *, unsigned long *);
-unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep);
+unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, size_t *sizep);
 unsigned long get_size_from_delta(struct packed_git *, struct pack_window **, off_t);
-int unpack_object_header(struct packed_git *, struct pack_window **, off_t *, unsigned long *);
+int unpack_object_header(struct packed_git *, struct pack_window **, off_t *, size_t *);
 off_t get_delta_base(struct packed_git *p, struct pack_window **w_curs,
                     off_t *curpos, enum object_type type,
                     off_t delta_obj_offset);