Skip to content

Commit 2b8d934

Browse files
Nicolas PitreJunio C Hamano
authored andcommitted
diff-delta: bound hash list length to avoid O(m*n) behavior
The diff-delta code can exhibit O(m*n) behavior with some patological data set where most hash entries end up in the same hash bucket. The latest code rework reduced the block size making it particularly vulnerable to this issue, but the issue was always there and can be triggered regardless of the block size. This patch does two things: 1) the hashing has been reworked to offer a better distribution to atenuate the problem a bit, and 2) a limit is imposed to the number of entries that can exist in the same hash bucket. Because of the above the code is a bit more expensive on average, but the problematic samples used to diagnoze the issue are now orders of magnitude less expensive to process with only a slight loss in compression. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Junio C Hamano <junkio@cox.net>
1 parent bec2a69 commit 2b8d934

File tree

1 file changed

+56
-13
lines changed

1 file changed

+56
-13
lines changed

diff-delta.c

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,20 @@ struct index {
3030

3131
static struct index ** delta_index(const unsigned char *buf,
3232
unsigned long bufsize,
33+
unsigned long trg_bufsize,
3334
unsigned int *hash_shift)
3435
{
3536
unsigned long hsize;
36-
unsigned int hshift, i;
37+
unsigned int i, hshift, hlimit, *hash_count;
3738
const unsigned char *data;
3839
struct index *entry, **hash;
3940
void *mem;
4041

4142
/* determine index hash size */
4243
hsize = bufsize / 4;
43-
for (i = 8; (1 << i) < hsize && i < 16; i++);
44+
for (i = 8; (1 << i) < hsize && i < 24; i += 2);
4445
hsize = 1 << i;
45-
hshift = i - 8;
46+
hshift = (i - 8) / 2;
4647
*hash_shift = hshift;
4748

4849
/* allocate lookup index */
@@ -53,15 +54,59 @@ static struct index ** delta_index(const unsigned char *buf,
5354
entry = mem + hsize * sizeof(*hash);
5455
memset(hash, 0, hsize * sizeof(*hash));
5556

56-
/* then populate it */
57+
/* allocate an array to count hash entries */
58+
hash_count = calloc(hsize, sizeof(*hash_count));
59+
if (!hash_count) {
60+
free(hash);
61+
return NULL;
62+
}
63+
64+
/* then populate the index */
5765
data = buf + bufsize - 2;
5866
while (data > buf) {
5967
entry->ptr = --data;
60-
i = data[0] ^ data[1] ^ (data[2] << hshift);
68+
i = data[0] ^ ((data[1] ^ (data[2] << hshift)) << hshift);
6169
entry->next = hash[i];
6270
hash[i] = entry++;
71+
hash_count[i]++;
6372
}
6473

74+
/*
75+
* Determine a limit on the number of entries in the same hash
76+
* bucket. This guard us against patological data sets causing
77+
* really bad hash distribution with most entries in the same hash
78+
* bucket that would bring us to O(m*n) computing costs (m and n
79+
* corresponding to reference and target buffer sizes).
80+
*
81+
* The more the target buffer is large, the more it is important to
82+
* have small entry lists for each hash buckets. With such a limit
83+
* the cost is bounded to something more like O(m+n).
84+
*/
85+
hlimit = (1 << 26) / trg_bufsize;
86+
if (hlimit < 16)
87+
hlimit = 16;
88+
89+
/*
90+
* Now make sure none of the hash buckets has more entries than
91+
* we're willing to test. Otherwise we short-circuit the entry
92+
* list uniformly to still preserve a good repartition across
93+
* the reference buffer.
94+
*/
95+
for (i = 0; i < hsize; i++) {
96+
if (hash_count[i] < hlimit)
97+
continue;
98+
entry = hash[i];
99+
do {
100+
struct index *keep = entry;
101+
int skip = hash_count[i] / hlimit / 2;
102+
do {
103+
entry = entry->next;
104+
} while(--skip && entry);
105+
keep->next = entry;
106+
} while(entry);
107+
}
108+
free(hash_count);
109+
65110
return hash;
66111
}
67112

@@ -85,7 +130,7 @@ void *diff_delta(void *from_buf, unsigned long from_size,
85130

86131
if (!from_size || !to_size)
87132
return NULL;
88-
hash = delta_index(from_buf, from_size, &hash_shift);
133+
hash = delta_index(from_buf, from_size, to_size, &hash_shift);
89134
if (!hash)
90135
return NULL;
91136

@@ -126,8 +171,8 @@ void *diff_delta(void *from_buf, unsigned long from_size,
126171

127172
while (data < top) {
128173
unsigned int moff = 0, msize = 0;
129-
if (data + 2 < top) {
130-
i = data[0] ^ data[1] ^ (data[2] << hash_shift);
174+
if (data + 3 <= top) {
175+
i = data[0] ^ ((data[1] ^ (data[2] << hash_shift)) << hash_shift);
131176
for (entry = hash[i]; entry; entry = entry->next) {
132177
const unsigned char *ref = entry->ptr;
133178
const unsigned char *src = data;
@@ -138,11 +183,9 @@ void *diff_delta(void *from_buf, unsigned long from_size,
138183
ref_size = 0x10000;
139184
if (ref_size <= msize)
140185
break;
141-
while (ref_size && *src++ == *ref) {
142-
ref++;
143-
ref_size--;
144-
}
145-
ref_size = ref - entry->ptr;
186+
if (*ref != *src)
187+
continue;
188+
while (ref_size-- && *++src == *++ref);
146189
if (msize < ref - entry->ptr) {
147190
/* this is our best match so far */
148191
msize = ref - entry->ptr;

0 commit comments

Comments
 (0)