Skip to content

Commit 9ec2dde

Browse files
pclouds gitster
authored and committed
index-pack: use streaming interface on large blobs (most of the time)
unpack_raw_entry() will not allocate and return decompressed blobs if they are larger than core.bigFileThreshold. sha1_object() may not be called on those objects because there's no actual content. sha1_object() is called later on those objects, where we can safely use get_data_from_pack() to retrieve blob content for checking. However, we only do that when we definitely need the blob content. And we often don't. There are two cases when we may need object content. The first case is when we find an in-repo blob with the same SHA-1. We need to do a collision test, byte-by-byte. If this test is on, the blob must be loaded into memory (i.e. no streaming). Normally (e.g. in fetch/pull/clone) this does not happen because git avoids sending objects that the client already has. The other case is when --strict is specified and the object in question is not a blob, which can't happen in reality because we deal with large _blobs_ here. Note: --verify (or git-verify-pack) on a pack from the current repository will trigger a collision test on every object in the pack, which effectively disables this patch. This could be easily worked around by setting GIT_DIR to an imaginary place with no packs. Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent 681b07d commit 9ec2dde

File tree

2 files changed

+49
-8
lines changed

2 files changed

+49
-8
lines changed

builtin/index-pack.c

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -392,9 +392,10 @@ static int is_delta_type(enum object_type type)
392392
static void *unpack_entry_data(unsigned long offset, unsigned long size,
393393
enum object_type type, unsigned char *sha1)
394394
{
395+
static char fixed_buf[8192];
395396
int status;
396397
git_zstream stream;
397-
void *buf = xmalloc(size);
398+
void *buf;
398399
git_SHA_CTX c;
399400
char hdr[32];
400401
int hdrlen;
@@ -405,11 +406,15 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
405406
git_SHA1_Update(&c, hdr, hdrlen);
406407
} else
407408
sha1 = NULL;
409+
if (type == OBJ_BLOB && size > big_file_threshold)
410+
buf = fixed_buf;
411+
else
412+
buf = xmalloc(size);
408413

409414
memset(&stream, 0, sizeof(stream));
410415
git_inflate_init(&stream);
411416
stream.next_out = buf;
412-
stream.avail_out = size;
417+
stream.avail_out = buf == fixed_buf ? sizeof(fixed_buf) : size;
413418

414419
do {
415420
unsigned char *last_out = stream.next_out;
@@ -419,13 +424,17 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
419424
use(input_len - stream.avail_in);
420425
if (sha1)
421426
git_SHA1_Update(&c, last_out, stream.next_out - last_out);
427+
if (buf == fixed_buf) {
428+
stream.next_out = buf;
429+
stream.avail_out = sizeof(fixed_buf);
430+
}
422431
} while (status == Z_OK);
423432
if (stream.total_out != size || status != Z_STREAM_END)
424433
bad_object(offset, _("inflate returned %d"), status);
425434
git_inflate_end(&stream);
426435
if (sha1)
427436
git_SHA1_Final(sha1, &c);
428-
return buf;
437+
return buf == fixed_buf ? NULL : buf;
429438
}
430439

431440
static void *unpack_raw_entry(struct object_entry *obj,
@@ -591,14 +600,21 @@ static void find_delta_children(const union delta_base *base,
591600
*last_index = last;
592601
}
593602

594-
static void sha1_object(const void *data, unsigned long size,
595-
enum object_type type, const unsigned char *sha1)
603+
static void sha1_object(const void *data, struct object_entry *obj_entry,
604+
unsigned long size, enum object_type type,
605+
const unsigned char *sha1)
596606
{
607+
void *new_data = NULL;
608+
609+
assert(data || obj_entry);
610+
597611
read_lock();
598612
if (has_sha1_file(sha1)) {
599613
void *has_data;
600614
enum object_type has_type;
601615
unsigned long has_size;
616+
if (!data)
617+
data = new_data = get_data_from_pack(obj_entry);
602618
has_data = read_sha1_file(sha1, &has_type, &has_size);
603619
read_unlock();
604620
if (!has_data)
@@ -623,6 +639,9 @@ static void sha1_object(const void *data, unsigned long size,
623639
int eaten;
624640
void *buf = (void *) data;
625641

642+
if (!buf)
643+
buf = new_data = get_data_from_pack(obj_entry);
644+
626645
/*
627646
* we do not need to free the memory here, as the
628647
* buf is deleted by the caller.
@@ -647,6 +666,8 @@ static void sha1_object(const void *data, unsigned long size,
647666
}
648667
read_unlock();
649668
}
669+
670+
free(new_data);
650671
}
651672

652673
/*
@@ -730,7 +751,7 @@ static void resolve_delta(struct object_entry *delta_obj,
730751
bad_object(delta_obj->idx.offset, _("failed to apply delta"));
731752
hash_sha1_file(result->data, result->size,
732753
typename(delta_obj->real_type), delta_obj->idx.sha1);
733-
sha1_object(result->data, result->size, delta_obj->real_type,
754+
sha1_object(result->data, NULL, result->size, delta_obj->real_type,
734755
delta_obj->idx.sha1);
735756
counter_lock();
736757
nr_resolved_deltas++;
@@ -860,7 +881,7 @@ static void *threaded_second_pass(void *data)
860881
*/
861882
static void parse_pack_objects(unsigned char *sha1)
862883
{
863-
int i;
884+
int i, nr_delays = 0;
864885
struct delta_entry *delta = deltas;
865886
struct stat st;
866887

@@ -876,8 +897,12 @@ static void parse_pack_objects(unsigned char *sha1)
876897
nr_deltas++;
877898
delta->obj_no = i;
878899
delta++;
900+
} else if (!data) {
901+
/* large blobs, check later */
902+
obj->real_type = OBJ_BAD;
903+
nr_delays++;
879904
} else
880-
sha1_object(data, obj->size, obj->type, obj->idx.sha1);
905+
sha1_object(data, NULL, obj->size, obj->type, obj->idx.sha1);
881906
free(data);
882907
display_progress(progress, i+1);
883908
}
@@ -897,6 +922,17 @@ static void parse_pack_objects(unsigned char *sha1)
897922
if (S_ISREG(st.st_mode) &&
898923
lseek(input_fd, 0, SEEK_CUR) - input_len != st.st_size)
899924
die(_("pack has junk at the end"));
925+
926+
for (i = 0; i < nr_objects; i++) {
927+
struct object_entry *obj = &objects[i];
928+
if (obj->real_type != OBJ_BAD)
929+
continue;
930+
obj->real_type = obj->type;
931+
sha1_object(NULL, obj, obj->size, obj->type, obj->idx.sha1);
932+
nr_delays--;
933+
}
934+
if (nr_delays)
935+
die(_("confusion beyond insanity in parse_pack_objects()"));
900936
}
901937

902938
/*

t/t1050-large.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,11 @@ test_expect_success 'git-show a large file' '
130130
131131
'
132132

133+
test_expect_success 'index-pack' '
134+
git clone file://"`pwd`"/.git foo &&
135+
GIT_DIR=non-existent git index-pack --strict --verify foo/.git/objects/pack/*.pack
136+
'
137+
133138
test_expect_success 'repack' '
134139
git repack -ad
135140
'

0 commit comments

Comments
 (0)