Skip to content

Commit d00e0f8

Browse files
author
Junio C Hamano
committed
Merge part of np/delta
2 parents a369bbf + 2b8d934 commit d00e0f8

File tree

1 file changed

+75
-49
lines changed

1 file changed

+75
-49
lines changed

diff-delta.c

Lines changed: 75 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -20,63 +20,93 @@
2020

2121
#include <stdlib.h>
2222
#include <string.h>
23-
#include <zlib.h>
2423
#include "delta.h"
2524

2625

27-
/* block size: min = 16, max = 64k, power of 2 */
28-
#define BLK_SIZE 16
29-
30-
#define MIN(a, b) ((a) < (b) ? (a) : (b))
31-
32-
#define GR_PRIME 0x9e370001
33-
#define HASH(v, shift) (((unsigned int)(v) * GR_PRIME) >> (shift))
34-
3526
/*
 * One indexed position in the reference buffer.  Entries whose leading
 * three bytes hash to the same value are chained through "next".
 */
struct index {
	const unsigned char *ptr;	/* position inside the reference buffer */
	struct index *next;		/* next entry in the same hash bucket */
};

/*
 * Build a hash index over every 3-byte window of the reference buffer.
 *
 * buf, bufsize:  reference buffer and its length in bytes.
 * trg_bufsize:   size of the target buffer; only used to bound the
 *                length of each hash bucket chain.
 * hash_shift:    receives the shift the caller must use to compute the
 *                same hash over target data:
 *                    d[0] ^ ((d[1] ^ (d[2] << shift)) << shift)
 *
 * Returns the bucket array, or NULL if memory could not be obtained.
 * The bucket array and all entries live in one allocation, so a single
 * free() of the returned pointer releases the whole index.
 */
static struct index ** delta_index(const unsigned char *buf,
				   unsigned long bufsize,
				   unsigned long trg_bufsize,
				   unsigned int *hash_shift)
{
	unsigned long hsize;
	unsigned int i, hshift, hlimit, *hash_count;
	const unsigned char *data;
	struct index *entry, **hash;

	/*
	 * Determine index hash size: a power of 4 between 2^8 and 2^24
	 * buckets.  The hash combines three bytes, each shifted by hshift
	 * relative to the previous one, so its value always fits in
	 * 8 + 2 * hshift bits, i.e. it is always below hsize.
	 */
	hsize = bufsize / 4;
	for (i = 8; (1UL << i) < hsize && i < 24; i += 2)
		; /* nothing */
	hsize = 1UL << i;
	hshift = (i - 8) / 2;
	*hash_shift = hshift;

	/* refuse sizes whose byte count would wrap around in malloc() */
	if (bufsize > ((size_t)-1 - hsize * sizeof(*hash)) / sizeof(*entry))
		return NULL;

	/* allocate lookup index: bucket array followed by the entry pool */
	hash = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry));
	if (!hash)
		return NULL;
	entry = (struct index *)(hash + hsize);
	memset(hash, 0, hsize * sizeof(*hash));

	/* allocate an array to count hash entries */
	hash_count = calloc(hsize, sizeof(*hash_count));
	if (!hash_count) {
		free(hash);
		return NULL;
	}

	/*
	 * Then populate the index, walking backwards so that each bucket
	 * chain ends up ordered from the lowest to the highest offset.
	 * A buffer shorter than 3 bytes has no complete window to index
	 * (and must not be used to form a pointer before its start).
	 */
	if (bufsize >= 3) {
		data = buf + bufsize - 2;
		while (data > buf) {
			entry->ptr = --data;
			i = data[0] ^ ((data[1] ^ (data[2] << hshift)) << hshift);
			entry->next = hash[i];
			hash[i] = entry++;
			hash_count[i]++;
		}
	}

	/*
	 * Determine a limit on the number of entries in the same hash
	 * bucket.  This guards us against pathological data sets causing
	 * really bad hash distribution with most entries in the same hash
	 * bucket, which would bring us to O(m*n) computing costs (m and n
	 * corresponding to reference and target buffer sizes).
	 *
	 * The larger the target buffer, the more important it is to have
	 * short entry lists in each hash bucket.  With such a limit the
	 * cost is bounded to something more like O(m+n).
	 */
	hlimit = trg_bufsize ? (1 << 26) / trg_bufsize : (1 << 26);
	if (hlimit < 16)
		hlimit = 16;

	/*
	 * Now make sure none of the hash buckets has more entries than
	 * we're willing to test.  Otherwise we thin out the entry list
	 * uniformly (keeping one entry out of every "stride") to still
	 * preserve a good repartition across the reference buffer.
	 *
	 * A stride below 2 means the bucket holds fewer than 4 * hlimit
	 * entries and there is nothing to drop.  In particular a stride
	 * of 0 must never reach the inner loop: its pre-decremented
	 * counter would not hit zero and the whole chain after the first
	 * entry would be discarded.
	 */
	for (i = 0; i < hsize; i++) {
		unsigned int stride = hash_count[i] / hlimit / 2;

		if (stride < 2)
			continue;
		entry = hash[i];
		do {
			struct index *keep = entry;
			unsigned int skip = stride;

			do {
				entry = entry->next;
			} while (--skip && entry);
			keep->next = entry;
		} while (entry);
	}
	free(hash_count);

	return hash;
}
82112

@@ -100,7 +130,7 @@ void *diff_delta(void *from_buf, unsigned long from_size,
100130

101131
if (!from_size || !to_size)
102132
return NULL;
103-
hash = delta_index(from_buf, from_size, &hash_shift);
133+
hash = delta_index(from_buf, from_size, to_size, &hash_shift);
104134
if (!hash)
105135
return NULL;
106136

@@ -141,29 +171,25 @@ void *diff_delta(void *from_buf, unsigned long from_size,
141171

142172
while (data < top) {
143173
unsigned int moff = 0, msize = 0;
144-
unsigned int blksize = MIN(top - data, BLK_SIZE);
145-
unsigned int val = adler32(0, data, blksize);
146-
i = HASH(val, hash_shift);
147-
for (entry = hash[i]; entry; entry = entry->next) {
148-
const unsigned char *ref = entry->ptr;
149-
const unsigned char *src = data;
150-
unsigned int ref_size = ref_top - ref;
151-
if (entry->val != val)
152-
continue;
153-
if (ref_size > top - src)
154-
ref_size = top - src;
155-
while (ref_size && *src++ == *ref) {
156-
ref++;
157-
ref_size--;
158-
}
159-
ref_size = ref - entry->ptr;
160-
if (ref_size > msize) {
161-
/* this is our best match so far */
162-
moff = entry->ptr - ref_data;
163-
msize = ref_size;
164-
if (msize >= 0x10000) {
165-
msize = 0x10000;
174+
if (data + 3 <= top) {
175+
i = data[0] ^ ((data[1] ^ (data[2] << hash_shift)) << hash_shift);
176+
for (entry = hash[i]; entry; entry = entry->next) {
177+
const unsigned char *ref = entry->ptr;
178+
const unsigned char *src = data;
179+
unsigned int ref_size = ref_top - ref;
180+
if (ref_size > top - src)
181+
ref_size = top - src;
182+
if (ref_size > 0x10000)
183+
ref_size = 0x10000;
184+
if (ref_size <= msize)
166185
break;
186+
if (*ref != *src)
187+
continue;
188+
while (ref_size-- && *++src == *++ref);
189+
if (msize < ref - entry->ptr) {
190+
/* this is our best match so far */
191+
msize = ref - entry->ptr;
192+
moff = entry->ptr - ref_data;
167193
}
168194
}
169195
}

0 commit comments

Comments
 (0)