11#include "cache.h"
22#include "diff.h"
33#include "diffcore.h"
4- #include "delta.h"
5- #include "count-delta.h"
6-
7- static int diffcore_count_changes_1 (void * src , unsigned long src_size ,
8- void * dst , unsigned long dst_size ,
9- unsigned long delta_limit ,
10- unsigned long * src_copied ,
11- unsigned long * literal_added )
4+
5+ /*
6+ * The idea here is very simple.
7+ *
8+ * We have a total of (sz-N+1) N-byte overlapping sequences in buf whose
9+ * size is sz. If the same N-byte sequence appears in both source and
10+ * destination, we say the byte that starts that sequence is shared
11+ * between them (i.e. copied from source to destination).
12+ *
13+ * For each possible N-byte sequence, if the source buffer has more
14+ * instances of it than the destination buffer, that means the
15+ * difference is the number of bytes not copied from source to
16+ * destination. If the counts are the same, everything was copied
17+ * from source to destination. If the destination has more,
18+ * everything was copied, and destination added more.
19+ *
20+ * We are doing an approximation so we do not really have to waste
21+ * memory by actually storing the sequence. We just hash them into
22+ * somewhere around 2^16 hashbuckets and count the occurrences.
23+ *
24+ * The length of the sequence is arbitrarily set to 8 for now.
25+ */
26+
27+ #define HASHBASE 65537 /* next_prime(2^16) */
28+
29+ static void hash_chars (unsigned char * buf , unsigned long sz , int * count )
1230{
13- void * delta ;
14- unsigned long delta_size ;
15-
16- delta = diff_delta (src , src_size ,
17- dst , dst_size ,
18- & delta_size , delta_limit );
19- if (!delta )
20- /* If delta_limit is exceeded, we have too much differences */
21- return -1 ;
31+ unsigned int accum1 , accum2 , i ;
2232
23- /* Estimate the edit size by interpreting delta. */
24- if (count_delta (delta , delta_size , src_copied , literal_added )) {
25- free (delta );
26- return -1 ;
33+ /* an 8-byte shift register made of accum1 and accum2. New
34+ * bytes come in at the LSB of accum2, and are shifted up into accum1
35+ */
36+ for (i = accum1 = accum2 = 0 ; i < 7 ; i ++ , sz -- ) {
37+ accum1 = (accum1 << 8 ) | (accum2 >> 24 );
38+ accum2 = (accum2 << 8 ) | * buf ++ ;
39+ }
40+ while (sz ) {
41+ accum1 = (accum1 << 8 ) | (accum2 >> 24 );
42+ accum2 = (accum2 << 8 ) | * buf ++ ;
43+ /* We want something that hashes permuted byte
44+ * sequences nicely; simpler hash like (accum1 ^
45+ * accum2) does not perform as well.
46+ */
47+ i = (accum1 + accum2 * 0x61 ) % HASHBASE ;
48+ count [i ]++ ;
49+ sz -- ;
2750 }
28- free (delta );
29- return 0 ;
3051}
3152
3253int diffcore_count_changes (void * src , unsigned long src_size ,
@@ -35,9 +56,28 @@ int diffcore_count_changes(void *src, unsigned long src_size,
3556 unsigned long * src_copied ,
3657 unsigned long * literal_added )
3758{
38- return diffcore_count_changes_1 (src , src_size ,
39- dst , dst_size ,
40- delta_limit ,
41- src_copied ,
42- literal_added );
59+ int * src_count , * dst_count , i ;
60+ unsigned long sc , la ;
61+
62+ if (src_size < 8 || dst_size < 8 )
63+ return -1 ;
64+
65+ src_count = xcalloc (HASHBASE * 2 , sizeof (int ));
66+ dst_count = src_count + HASHBASE ;
67+ hash_chars (src , src_size , src_count );
68+ hash_chars (dst , dst_size , dst_count );
69+
70+ sc = la = 0 ;
71+ for (i = 0 ; i < HASHBASE ; i ++ ) {
72+ if (src_count [i ] < dst_count [i ]) {
73+ la += dst_count [i ] - src_count [i ];
74+ sc += src_count [i ];
75+ }
76+ else /* i.e. if (dst_count[i] <= src_count[i]) */
77+ sc += dst_count [i ];
78+ }
79+ * src_copied = sc ;
80+ * literal_added = la ;
81+ free (src_count );
82+ return 0 ;
4383}
0 commit comments