Skip to content

Commit eae3fe5

Browse files
author
Junio C Hamano
committed
Revert "diff-delta: produce optimal pack data"
This reverts commit 6b7d25d. It turns out that the new algorithm has a really bad corner case: it literally spends minutes on inputs that take less than a quarter of a second to delta with the old algorithm. The resulting delta is 50% smaller, which is admirable, but the performance degradation is simply unacceptable for unconditional use. Some example cases are these blobs in the Linux 2.6 repository: 4917ec5 9af06ba dfc9cd5 Signed-off-by: Junio C Hamano <junkio@cox.net>
1 parent 581845f commit eae3fe5

File tree

1 file changed

+47
-30
lines changed

1 file changed

+47
-30
lines changed

diff-delta.c

Lines changed: 47 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -20,46 +20,61 @@
2020

2121
#include <stdlib.h>
2222
#include <string.h>
23+
#include <zlib.h>
2324
#include "delta.h"
2425

2526

27+
/* block size: min = 16, max = 64k, power of 2 */
28+
#define BLK_SIZE 16
29+
30+
#define MIN(a, b) ((a) < (b) ? (a) : (b))
31+
32+
#define GR_PRIME 0x9e370001
33+
#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift))
34+
2635
struct index {
2736
const unsigned char *ptr;
37+
unsigned int val;
2838
struct index *next;
2939
};
3040

3141
static struct index ** delta_index(const unsigned char *buf,
3242
unsigned long bufsize,
3343
unsigned int *hash_shift)
3444
{
35-
unsigned long hsize;
36-
unsigned int hshift, i;
45+
unsigned int hsize, hshift, entries, blksize, i;
3746
const unsigned char *data;
3847
struct index *entry, **hash;
3948
void *mem;
4049

4150
/* determine index hash size */
42-
hsize = bufsize / 4;
43-
for (i = 8; (1 << i) < hsize && i < 16; i++);
51+
entries = (bufsize + BLK_SIZE - 1) / BLK_SIZE;
52+
hsize = entries / 4;
53+
for (i = 4; (1 << i) < hsize && i < 16; i++);
4454
hsize = 1 << i;
45-
hshift = i - 8;
55+
hshift = 32 - i;
4656
*hash_shift = hshift;
4757

4858
/* allocate lookup index */
49-
mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry));
59+
mem = malloc(hsize * sizeof(*hash) + entries * sizeof(*entry));
5060
if (!mem)
5161
return NULL;
5262
hash = mem;
5363
entry = mem + hsize * sizeof(*hash);
5464
memset(hash, 0, hsize * sizeof(*hash));
5565

5666
/* then populate it */
57-
data = buf + bufsize - 2;
58-
while (data > buf) {
59-
entry->ptr = --data;
60-
i = data[0] ^ data[1] ^ (data[2] << hshift);
67+
data = buf + entries * BLK_SIZE - BLK_SIZE;
68+
blksize = bufsize - (data - buf);
69+
while (data >= buf) {
70+
unsigned int val = adler32(0, data, blksize);
71+
i = HASH(val, hshift);
72+
entry->ptr = data;
73+
entry->val = val;
6174
entry->next = hash[i];
6275
hash[i] = entry++;
76+
blksize = BLK_SIZE;
77+
data -= BLK_SIZE;
6378
}
6479

6580
return hash;
@@ -126,27 +141,29 @@ void *diff_delta(void *from_buf, unsigned long from_size,
126141

127142
while (data < top) {
128143
unsigned int moff = 0, msize = 0;
129-
if (data + 2 < top) {
130-
i = data[0] ^ data[1] ^ (data[2] << hash_shift);
131-
for (entry = hash[i]; entry; entry = entry->next) {
132-
const unsigned char *ref = entry->ptr;
133-
const unsigned char *src = data;
134-
unsigned int ref_size = ref_top - ref;
135-
if (ref_size > top - src)
136-
ref_size = top - src;
137-
if (ref_size > 0x10000)
138-
ref_size = 0x10000;
139-
if (ref_size <= msize)
144+
unsigned int blksize = MIN(top - data, BLK_SIZE);
145+
unsigned int val = adler32(0, data, blksize);
146+
i = HASH(val, hash_shift);
147+
for (entry = hash[i]; entry; entry = entry->next) {
148+
const unsigned char *ref = entry->ptr;
149+
const unsigned char *src = data;
150+
unsigned int ref_size = ref_top - ref;
151+
if (entry->val != val)
152+
continue;
153+
if (ref_size > top - src)
154+
ref_size = top - src;
155+
while (ref_size && *src++ == *ref) {
156+
ref++;
157+
ref_size--;
158+
}
159+
ref_size = ref - entry->ptr;
160+
if (ref_size > msize) {
161+
/* this is our best match so far */
162+
moff = entry->ptr - ref_data;
163+
msize = ref_size;
164+
if (msize >= 0x10000) {
165+
msize = 0x10000;
140166
break;
141-
while (ref_size && *src++ == *ref) {
142-
ref++;
143-
ref_size--;
144-
}
145-
ref_size = ref - entry->ptr;
146-
if (msize < ref - entry->ptr) {
147-
/* this is our best match so far */
148-
msize = ref - entry->ptr;
149-
moff = entry->ptr - ref_data;
150167
}
151168
}
152169
}

0 commit comments

Comments
 (0)