Skip to content

Commit c06c796

Browse files
author
Junio C Hamano
committed
diffcore-rename: somewhat optimized.
This changes diffcore-rename to reuse the statistics gathered during similarity estimation, and updates the hashtable implementation used to keep track of the statistics to be denser. This seems to give better performance.

Signed-off-by: Junio C Hamano <junkio@cox.net>
1 parent ce2a341 commit c06c796

File tree

5 files changed

+149
-21
lines changed

5 files changed

+149
-21
lines changed

diff.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,8 @@ void diff_free_filespec_data(struct diff_filespec *s)
463463
munmap(s->data, s->size);
464464
s->should_free = s->should_munmap = 0;
465465
s->data = NULL;
466+
free(s->cnt_data);
467+
s->cnt_data = NULL;
466468
}
467469

468470
static void prep_temp_blob(struct diff_tempfile *temp,

diffcore-break.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ static int should_break(struct diff_filespec *src,
6868

6969
if (diffcore_count_changes(src->data, src->size,
7070
dst->data, dst->size,
71+
NULL, NULL,
7172
0,
7273
&src_copied, &literal_added))
7374
return 0;

diffcore-delta.c

Lines changed: 140 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,106 @@
2424
* The length of the sequence is arbitrarily set to 8 for now.
2525
*/
2626

27+
/* Wild guess at the initial hash size, given as log2: the table starts with (1 << INITIAL_HASH_SIZE) slots */
28+
#define INITIAL_HASH_SIZE 10
2729
#define HASHBASE 65537 /* next_prime(2^16) */
2830

29-
static void hash_chars(unsigned char *buf, unsigned long sz, int *count)
31+
/* One slot of the span-count hash table. */
struct spanhash {
32+
unsigned long hashval; /* hash of a byte span, as computed by hash_chars() */
33+
unsigned long cnt; /* times hashval was added; 0 marks the slot as empty */
34+
};
35+
/*
 * Header of an open-addressed span-count table; the slots follow the
 * header in the same allocation.
 */
struct spanhash_top {
36+
int alloc_log2; /* the table holds (1 << alloc_log2) slots */
37+
int free; /* new entries that may still be added; going negative triggers a rehash */
38+
struct spanhash data[FLEX_ARRAY]; /* the slots themselves */
39+
};
40+
41+
static struct spanhash *spanhash_find(struct spanhash_top *top, unsigned long hashval)
42+
{
43+
int sz = 1 << top->alloc_log2;
44+
int bucket = hashval & (sz - 1);
45+
while (1) {
46+
struct spanhash *h = &(top->data[bucket++]);
47+
if (!h->cnt)
48+
return NULL;
49+
if (h->hashval == hashval)
50+
return h;
51+
if (sz <= bucket)
52+
bucket = 0;
53+
}
54+
}
55+
56+
static struct spanhash_top *spanhash_rehash(struct spanhash_top *orig)
57+
{
58+
struct spanhash_top *new;
59+
int i;
60+
int osz = 1 << orig->alloc_log2;
61+
int sz = osz << 1;
62+
63+
new = xmalloc(sizeof(*orig) + sizeof(struct spanhash) * sz);
64+
new->alloc_log2 = orig->alloc_log2 + 1;
65+
new->free = osz;
66+
memset(new->data, 0, sizeof(struct spanhash) * sz);
67+
for (i = 0; i < osz; i++) {
68+
struct spanhash *o = &(orig->data[i]);
69+
int bucket;
70+
if (!o->cnt)
71+
continue;
72+
bucket = o->hashval & (sz - 1);
73+
while (1) {
74+
struct spanhash *h = &(new->data[bucket++]);
75+
if (!h->cnt) {
76+
h->hashval = o->hashval;
77+
h->cnt = o->cnt;
78+
new->free--;
79+
break;
80+
}
81+
if (sz <= bucket)
82+
bucket = 0;
83+
}
84+
}
85+
free(orig);
86+
return new;
87+
}
88+
89+
static struct spanhash_top *add_spanhash(struct spanhash_top *top,
90+
unsigned long hashval)
91+
{
92+
int bucket, lim;
93+
struct spanhash *h;
94+
95+
lim = (1 << top->alloc_log2);
96+
bucket = hashval & (lim - 1);
97+
while (1) {
98+
h = &(top->data[bucket++]);
99+
if (!h->cnt) {
100+
h->hashval = hashval;
101+
h->cnt = 1;
102+
top->free--;
103+
if (top->free < 0)
104+
return spanhash_rehash(top);
105+
return top;
106+
}
107+
if (h->hashval == hashval) {
108+
h->cnt++;
109+
return top;
110+
}
111+
if (lim <= bucket)
112+
bucket = 0;
113+
}
114+
}
115+
116+
static struct spanhash_top *hash_chars(unsigned char *buf, unsigned long sz)
30117
{
31-
unsigned int accum1, accum2, i;
118+
int i;
119+
unsigned long accum1, accum2, hashval;
120+
struct spanhash_top *hash;
121+
122+
i = INITIAL_HASH_SIZE;
123+
hash = xmalloc(sizeof(*hash) + sizeof(struct spanhash) * (1<<i));
124+
hash->alloc_log2 = i;
125+
hash->free = (1<<i)/2;
126+
memset(hash->data, 0, sizeof(struct spanhash) * (1<<i));
32127

33128
/* an 8-byte shift register made of accum1 and accum2. New
34129
* bytes come at LSB of accum2, and shifted up to accum1
@@ -40,44 +135,68 @@ static void hash_chars(unsigned char *buf, unsigned long sz, int *count)
40135
while (sz) {
41136
accum1 = (accum1 << 8) | (accum2 >> 24);
42137
accum2 = (accum2 << 8) | *buf++;
43-
/* We want something that hashes permuted byte
44-
* sequences nicely; simpler hash like (accum1 ^
45-
* accum2) does not perform as well.
46-
*/
47-
i = (accum1 + accum2 * 0x61) % HASHBASE;
48-
count[i]++;
138+
hashval = (accum1 + accum2 * 0x61) % HASHBASE;
139+
hash = add_spanhash(hash, hashval);
49140
sz--;
50141
}
142+
return hash;
51143
}
52144

53145
int diffcore_count_changes(void *src, unsigned long src_size,
54146
void *dst, unsigned long dst_size,
147+
void **src_count_p,
148+
void **dst_count_p,
55149
unsigned long delta_limit,
56150
unsigned long *src_copied,
57151
unsigned long *literal_added)
58152
{
59-
int *src_count, *dst_count, i;
153+
int i, ssz;
154+
struct spanhash_top *src_count, *dst_count;
60155
unsigned long sc, la;
61156

62157
if (src_size < 8 || dst_size < 8)
63158
return -1;
64159

65-
src_count = xcalloc(HASHBASE * 2, sizeof(int));
66-
dst_count = src_count + HASHBASE;
67-
hash_chars(src, src_size, src_count);
68-
hash_chars(dst, dst_size, dst_count);
69-
160+
src_count = dst_count = NULL;
161+
if (src_count_p)
162+
src_count = *src_count_p;
163+
if (!src_count) {
164+
src_count = hash_chars(src, src_size);
165+
if (src_count_p)
166+
*src_count_p = src_count;
167+
}
168+
if (dst_count_p)
169+
dst_count = *dst_count_p;
170+
if (!dst_count) {
171+
dst_count = hash_chars(dst, dst_size);
172+
if (dst_count_p)
173+
*dst_count_p = dst_count;
174+
}
70175
sc = la = 0;
71-
for (i = 0; i < HASHBASE; i++) {
72-
if (src_count[i] < dst_count[i]) {
73-
la += dst_count[i] - src_count[i];
74-
sc += src_count[i];
176+
177+
ssz = 1 << src_count->alloc_log2;
178+
for (i = 0; i < ssz; i++) {
179+
struct spanhash *s = &(src_count->data[i]);
180+
struct spanhash *d;
181+
unsigned dst_cnt, src_cnt;
182+
if (!s->cnt)
183+
continue;
184+
src_cnt = s->cnt;
185+
d = spanhash_find(dst_count, s->hashval);
186+
dst_cnt = d ? d->cnt : 0;
187+
if (src_cnt < dst_cnt) {
188+
la += dst_cnt - src_cnt;
189+
sc += src_cnt;
75190
}
76-
else /* i.e. if (dst_count[i] <= src_count[i]) */
77-
sc += dst_count[i];
191+
else
192+
sc += dst_cnt;
78193
}
194+
195+
if (!src_count_p)
196+
free(src_count);
197+
if (!dst_count_p)
198+
free(dst_count);
79199
*src_copied = sc;
80200
*literal_added = la;
81-
free(src_count);
82201
return 0;
83202
}

diffcore-rename.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ static int estimate_similarity(struct diff_filespec *src,
166166
delta_limit = base_size * (MAX_SCORE-minimum_score) / MAX_SCORE;
167167
if (diffcore_count_changes(src->data, src->size,
168168
dst->data, dst->size,
169+
&src->cnt_data, &dst->cnt_data,
169170
delta_limit,
170171
&src_copied, &literal_added))
171172
return 0;
@@ -306,6 +307,8 @@ void diffcore_rename(struct diff_options *options)
306307
m->score = estimate_similarity(one, two,
307308
minimum_score);
308309
}
310+
free(two->cnt_data);
311+
two->cnt_data = NULL;
309312
dst_cnt++;
310313
}
311314
/* cost matrix sorted by most to least similar pair */

diffcore.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ struct diff_filespec {
2626
unsigned char sha1[20];
2727
char *path;
2828
void *data;
29+
void *cnt_data;
2930
unsigned long size;
3031
int xfrm_flags; /* for use by the xfrm */
3132
unsigned short mode; /* file mode */
@@ -103,6 +104,8 @@ void diff_debug_queue(const char *, struct diff_queue_struct *);
103104

104105
/*
 * Compare the span statistics of two buffers.
 *
 * src_count_p and dst_count_p may each be NULL, or point at a slot
 * caching the statistics for the corresponding buffer: a non-NULL
 * value in the slot is reused, and a NULL value is replaced by newly
 * computed statistics, which the caller must eventually free().
 *
 * On success the results are stored through src_copied and
 * literal_added and 0 is returned.  -1 is returned when either buffer
 * is shorter than 8 bytes.
 */
extern int diffcore_count_changes(void *src, unsigned long src_size,
105106
void *dst, unsigned long dst_size,
107+
void **src_count_p,
108+
void **dst_count_p,
106109
unsigned long delta_limit,
107110
unsigned long *src_copied,
108111
unsigned long *literal_added);

0 commit comments

Comments
 (0)