Skip to content

Commit 6b7d25d

Browse files
Nicolas Pitre authored and Junio C Hamano committed
diff-delta: produce optimal pack data
Indexing based on adler32 has a match precision based on the block size (currently 16). Lowering the block size would produce smaller deltas, but the indexing memory and computing cost increase significantly. For an optimal delta result, the indexing block size should be 3 with an increment of 1 (instead of 16 and 16). With such low parameters, adler32 becomes a clear overhead, increasing the time for git-repack by a factor of 3. And with such small blocks, adler32 is not very useful, as all of the block's bits can be used directly. This patch replaces adler32 with an open-coded index value based on 3 characters directly. This gives sufficient bits for hashing and allows for optimal deltas with reasonable CPU cycles. The resulting packs are 6% smaller on average. The increase in CPU time is about 25%. But this cost is now hidden by the delta reuse patch, while the saving on data transfers is always there. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Junio C Hamano <junkio@cox.net>
1 parent 8e1454b commit 6b7d25d

File tree

1 file changed

+30
-47
lines changed

1 file changed

+30
-47
lines changed

diff-delta.c

Lines changed: 30 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -20,61 +20,46 @@
2020

2121
#include <stdlib.h>
2222
#include <string.h>
23-
#include <zlib.h>
2423
#include "delta.h"
2524

2625

27-
/* block size: min = 16, max = 64k, power of 2 */
28-
#define BLK_SIZE 16
29-
30-
#define MIN(a, b) ((a) < (b) ? (a) : (b))
31-
32-
#define GR_PRIME 0x9e370001
33-
#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift))
34-
3526
struct index {
3627
const unsigned char *ptr;
37-
unsigned int val;
3828
struct index *next;
3929
};
4030

4131
static struct index ** delta_index(const unsigned char *buf,
4232
unsigned long bufsize,
4333
unsigned int *hash_shift)
4434
{
45-
unsigned int hsize, hshift, entries, blksize, i;
35+
unsigned long hsize;
36+
unsigned int hshift, i;
4637
const unsigned char *data;
4738
struct index *entry, **hash;
4839
void *mem;
4940

5041
/* determine index hash size */
51-
entries = (bufsize + BLK_SIZE - 1) / BLK_SIZE;
52-
hsize = entries / 4;
53-
for (i = 4; (1 << i) < hsize && i < 16; i++);
42+
hsize = bufsize / 4;
43+
for (i = 8; (1 << i) < hsize && i < 16; i++);
5444
hsize = 1 << i;
55-
hshift = 32 - i;
45+
hshift = i - 8;
5646
*hash_shift = hshift;
5747

5848
/* allocate lookup index */
59-
mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry));
49+
mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry));
6050
if (!mem)
6151
return NULL;
6252
hash = mem;
6353
entry = mem + hsize * sizeof(*hash);
6454
memset(hash, 0, hsize * sizeof(*hash));
6555

6656
/* then populate it */
67-
data = buf + entries * BLK_SIZE - BLK_SIZE;
68-
blksize = bufsize - (data - buf);
69-
while (data >= buf) {
70-
unsigned int val = adler32(0, data, blksize);
71-
i = HASH(val, hshift);
72-
entry->ptr = data;
73-
entry->val = val;
57+
data = buf + bufsize - 2;
58+
while (data > buf) {
59+
entry->ptr = --data;
60+
i = data[0] ^ data[1] ^ (data[2] << hshift);
7461
entry->next = hash[i];
7562
hash[i] = entry++;
76-
blksize = BLK_SIZE;
77-
data -= BLK_SIZE;
7863
}
7964

8065
return hash;
@@ -141,29 +126,27 @@ void *diff_delta(void *from_buf, unsigned long from_size,
141126

142127
while (data < top) {
143128
unsigned int moff = 0, msize = 0;
144-
unsigned int blksize = MIN(top - data, BLK_SIZE);
145-
unsigned int val = adler32(0, data, blksize);
146-
i = HASH(val, hash_shift);
147-
for (entry = hash[i]; entry; entry = entry->next) {
148-
const unsigned char *ref = entry->ptr;
149-
const unsigned char *src = data;
150-
unsigned int ref_size = ref_top - ref;
151-
if (entry->val != val)
152-
continue;
153-
if (ref_size > top - src)
154-
ref_size = top - src;
155-
while (ref_size && *src++ == *ref) {
156-
ref++;
157-
ref_size--;
158-
}
159-
ref_size = ref - entry->ptr;
160-
if (ref_size > msize) {
161-
/* this is our best match so far */
162-
moff = entry->ptr - ref_data;
163-
msize = ref_size;
164-
if (msize >= 0x10000) {
165-
msize = 0x10000;
129+
if (data + 2 < top) {
130+
i = data[0] ^ data[1] ^ (data[2] << hash_shift);
131+
for (entry = hash[i]; entry; entry = entry->next) {
132+
const unsigned char *ref = entry->ptr;
133+
const unsigned char *src = data;
134+
unsigned int ref_size = ref_top - ref;
135+
if (ref_size > top - src)
136+
ref_size = top - src;
137+
if (ref_size > 0x10000)
138+
ref_size = 0x10000;
139+
if (ref_size <= msize)
166140
break;
141+
while (ref_size && *src++ == *ref) {
142+
ref++;
143+
ref_size--;
144+
}
145+
ref_size = ref - entry->ptr;
146+
if (msize < ref - entry->ptr) {
147+
/* this is our best match so far */
148+
msize = ref - entry->ptr;
149+
moff = entry->ptr - ref_data;
167150
}
168151
}
169152
}

0 commit comments

Comments
 (0)