2020
2121#include <stdlib.h>
2222#include <string.h>
23- #include <zlib.h>
2423#include "delta.h"
2524
2625
27- /* block size: min = 16, max = 64k, power of 2 */
28- #define BLK_SIZE 16
29-
30- #define MIN (a , b ) ((a) < (b) ? (a) : (b))
31-
32- #define GR_PRIME 0x9e370001
33- #define HASH (v , shift ) (((unsigned int)(v) * GR_PRIME) >> (shift))
34-
/* One indexed position in the reference buffer. */
struct index {
	const unsigned char *ptr;	/* position in the reference buffer */
	struct index *next;		/* next entry in the same hash bucket */
};

/*
 * Build a hash index over the reference ("from") buffer.
 *
 * One entry is created for every position followed by at least two more
 * bytes; the 3 bytes starting at the position are hashed and the entry is
 * chained into the corresponding bucket.  The caller later hashes 3 bytes
 * of the target buffer with the returned *hash_shift to find candidate
 * match positions.
 *
 * buf/bufsize:  reference buffer (caller guarantees bufsize > 0).
 * trg_bufsize:  target buffer size, used only to bound per-bucket chain
 *               length; assumed non-zero (caller rejects empty targets)
 *               -- TODO confirm at call sites.
 * hash_shift:   out parameter: shift amount for the 3-byte hash.
 *
 * Returns the bucket array, released by the caller with a single free(),
 * or NULL on allocation failure.
 */
static struct index **delta_index(const unsigned char *buf,
				  unsigned long bufsize,
				  unsigned long trg_bufsize,
				  unsigned int *hash_shift)
{
	unsigned long hsize;
	unsigned int i, hshift, hlimit, *hash_count;
	const unsigned char *data;
	struct index *entry, **hash;
	void *mem;

	/*
	 * Determine index hash size: roughly one bucket per 4 input
	 * bytes, rounded to a power of 4 between 2^8 and 2^24.
	 */
	hsize = bufsize / 4;
	for (i = 8; (1 << i) < hsize && i < 24; i += 2);
	hsize = 1 << i;
	/* hash spreads 3 bytes over i bits: 8 + 2*hshift == i */
	hshift = (i - 8) / 2;
	*hash_shift = hshift;

	/*
	 * Allocate the bucket array and all entries as one chunk so a
	 * single free() of the returned pointer releases everything.
	 */
	mem = malloc(hsize * sizeof(*hash) + bufsize * sizeof(*entry));
	if (!mem)
		return NULL;
	hash = mem;
	/* cast through char *: arithmetic on void * is a GNU extension */
	entry = (struct index *)((char *)mem + hsize * sizeof(*hash));
	memset(hash, 0, hsize * sizeof(*hash));

	/* allocate an array to count hash entries */
	hash_count = calloc(hsize, sizeof(*hash_count));
	if (!hash_count) {
		free(mem);	/* hash == mem; free the whole chunk */
		return NULL;
	}

	/*
	 * Populate the index.  Guard against bufsize < 2, where
	 * "buf + bufsize - 2" would compute an out-of-bounds pointer
	 * (undefined behavior) even though the loop would not execute;
	 * with bufsize == 2 the loop cannot run either, so only
	 * bufsize >= 3 produces entries.
	 */
	if (bufsize >= 3) {
		data = buf + bufsize - 2;
		while (data > buf) {
			entry->ptr = --data;
			i = data[0] ^ ((data[1] ^ (data[2] << hshift)) << hshift);
			entry->next = hash[i];
			hash[i] = entry++;
			hash_count[i]++;
		}
	}

	/*
	 * Determine a limit on the number of entries in the same hash
	 * bucket.  This guards us against pathological data sets causing
	 * really bad hash distribution with most entries in the same
	 * bucket, which would bring us to O(m*n) computing costs (m and n
	 * corresponding to reference and target buffer sizes).
	 *
	 * The larger the target buffer, the more important it is to have
	 * small entry lists for each hash bucket.  With such a limit the
	 * cost is bounded to something more like O(m+n).
	 */
	hlimit = (1 << 26) / trg_bufsize;
	if (hlimit < 16)
		hlimit = 16;

	/*
	 * Now make sure none of the hash buckets has more entries than
	 * we're willing to test.  Otherwise we short-circuit the entry
	 * list uniformly to still preserve a good repartition across
	 * the reference buffer.
	 */
	for (i = 0; i < hsize; i++) {
		if (hash_count[i] < hlimit)
			continue;
		entry = hash[i];
		do {
			struct index *keep = entry;
			int skip = hash_count[i] / hlimit / 2;
			/*
			 * skip can legitimately be 0 (hash_count[i] less
			 * than 2*hlimit); "--skip" would then underflow
			 * to -1 and the inner loop would strip the whole
			 * remaining chain.  Advance at least one link.
			 */
			if (!skip)
				skip = 1;
			do {
				entry = entry->next;
			} while (--skip && entry);
			keep->next = entry;
		} while (entry);
	}
	free(hash_count);

	return hash;
}
82112
@@ -100,7 +130,7 @@ void *diff_delta(void *from_buf, unsigned long from_size,
100130
101131 if (!from_size || !to_size )
102132 return NULL ;
103- hash = delta_index (from_buf , from_size , & hash_shift );
133+ hash = delta_index (from_buf , from_size , to_size , & hash_shift );
104134 if (!hash )
105135 return NULL ;
106136
@@ -141,29 +171,25 @@ void *diff_delta(void *from_buf, unsigned long from_size,
141171
142172 while (data < top ) {
143173 unsigned int moff = 0 , msize = 0 ;
144- unsigned int blksize = MIN (top - data , BLK_SIZE );
145- unsigned int val = adler32 (0 , data , blksize );
146- i = HASH (val , hash_shift );
147- for (entry = hash [i ]; entry ; entry = entry -> next ) {
148- const unsigned char * ref = entry -> ptr ;
149- const unsigned char * src = data ;
150- unsigned int ref_size = ref_top - ref ;
151- if (entry -> val != val )
152- continue ;
153- if (ref_size > top - src )
154- ref_size = top - src ;
155- while (ref_size && * src ++ == * ref ) {
156- ref ++ ;
157- ref_size -- ;
158- }
159- ref_size = ref - entry -> ptr ;
160- if (ref_size > msize ) {
161- /* this is our best match so far */
162- moff = entry -> ptr - ref_data ;
163- msize = ref_size ;
164- if (msize >= 0x10000 ) {
165- msize = 0x10000 ;
174+ if (data + 3 <= top ) {
175+ i = data [0 ] ^ ((data [1 ] ^ (data [2 ] << hash_shift )) << hash_shift );
176+ for (entry = hash [i ]; entry ; entry = entry -> next ) {
177+ const unsigned char * ref = entry -> ptr ;
178+ const unsigned char * src = data ;
179+ unsigned int ref_size = ref_top - ref ;
180+ if (ref_size > top - src )
181+ ref_size = top - src ;
182+ if (ref_size > 0x10000 )
183+ ref_size = 0x10000 ;
184+ if (ref_size <= msize )
166185 break ;
186+ if (* ref != * src )
187+ continue ;
188+ while (ref_size -- && * ++ src == * ++ ref );
189+ if (msize < ref - entry -> ptr ) {
190+ /* this is our best match so far */
191+ msize = ref - entry -> ptr ;
192+ moff = entry -> ptr - ref_data ;
167193 }
168194 }
169195 }
0 commit comments