Skip to content

Commit 9027f53

Browse files
torvaldsgitster
authored andcommitted
Do linear-time/space rename logic for exact renames
This implements a smarter rename detector for exact renames, which rather than doing a pairwise comparison (time O(m*n)) will just hash the files into a hash-table (size O(n+m)), and only do pairwise comparisons to renames that have the same hash (time O(n+m) except for unrealistic hash collisions, which we just cull aggressively). Admittedly the exact rename case is not nearly as interesting as the generic case, but it's an important case nonetheless. A similar general approach should work for the generic case too, but even then you do need to handle the exact renames/copies separately (to avoid the inevitable added cost factor that comes from the _size_ of the file), so this is worth doing. In the expectation that we will indeed do the same hashing trick for the general rename case, this code uses a generic hash-table implementation that can be used for other things too. In fact, we might be able to consolidate some of our existing hash tables with the new generic code in hash.[ch]. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent 6447971 commit 9027f53

File tree

4 files changed

+303
-65
lines changed

4 files changed

+303
-65
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ LIB_H = \
290290
run-command.h strbuf.h tag.h tree.h git-compat-util.h revision.h \
291291
tree-walk.h log-tree.h dir.h path-list.h unpack-trees.h builtin.h \
292292
utf8.h reflog-walk.h patch-ids.h attr.h decorate.h progress.h \
293-
mailmap.h remote.h transport.h diffcore.h
293+
mailmap.h remote.h transport.h diffcore.h hash.h
294294

295295
DIFF_OBJS = \
296296
diff.o diff-lib.o diffcore-break.o diffcore-order.o \
@@ -300,7 +300,7 @@ DIFF_OBJS = \
300300
LIB_OBJS = \
301301
blob.o commit.o connect.o csum-file.o cache-tree.o base85.o \
302302
date.o diff-delta.o entry.o exec_cmd.o ident.o \
303-
interpolate.o \
303+
interpolate.o hash.o \
304304
lockfile.o \
305305
patch-ids.o \
306306
object.o pack-check.o pack-write.o patch-delta.o path.o pkt-line.o \

diffcore-rename.c

Lines changed: 148 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "cache.h"
55
#include "diff.h"
66
#include "diffcore.h"
7+
#include "hash.h"
78

89
/* Table of rename/copy destinations */
910

@@ -93,29 +94,6 @@ static struct diff_rename_src *register_rename_src(struct diff_filespec *one,
9394
return &(rename_src[first]);
9495
}
9596

96-
static int is_exact_match(struct diff_filespec *src,
97-
struct diff_filespec *dst,
98-
int contents_too)
99-
{
100-
if (src->sha1_valid && dst->sha1_valid &&
101-
!hashcmp(src->sha1, dst->sha1))
102-
return 1;
103-
if (!contents_too)
104-
return 0;
105-
if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1))
106-
return 0;
107-
if (src->size != dst->size)
108-
return 0;
109-
if (src->sha1_valid && dst->sha1_valid)
110-
return !hashcmp(src->sha1, dst->sha1);
111-
if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0))
112-
return 0;
113-
if (src->size == dst->size &&
114-
!memcmp(src->data, dst->data, src->size))
115-
return 1;
116-
return 0;
117-
}
118-
11997
static int basename_same(struct diff_filespec *src, struct diff_filespec *dst)
12098
{
12199
int src_len = strlen(src->path), dst_len = strlen(dst->path);
@@ -242,56 +220,163 @@ static int score_compare(const void *a_, const void *b_)
242220
return b->score - a->score;
243221
}
244222

223+
/*
 * One node in the chain of files that share a hash-table bucket.
 *
 * src_dst  - negative for a rename source, non-negative for a
 *            destination (find_same_files() tests "src_dst < 0").
 * index    - the position handed to insert_file_table(); later passed
 *            to record_rename_pair().
 * filespec - the file this node describes.
 * next     - next node in the chain, or NULL at the end.
 */
struct file_similarity {
	int src_dst;
	int index;
	struct diff_filespec *filespec;
	struct file_similarity *next;
};
228+
229+
static int find_identical_files(struct file_similarity *src,
230+
struct file_similarity *dst)
231+
{
232+
int renames = 0;
233+
234+
/*
235+
* Walk over all the destinations ...
236+
*/
237+
do {
238+
struct diff_filespec *one = dst->filespec;
239+
struct file_similarity *p, *best;
240+
int i = 100;
241+
242+
/*
243+
* .. to find the best source match
244+
*/
245+
best = NULL;
246+
for (p = src; p; p = p->next) {
247+
struct diff_filespec *two = p->filespec;
248+
249+
/* False hash collission? */
250+
if (hashcmp(one->sha1, two->sha1))
251+
continue;
252+
/* Non-regular files? If so, the modes must match! */
253+
if (!S_ISREG(one->mode) || !S_ISREG(two->mode)) {
254+
if (one->mode != two->mode)
255+
continue;
256+
}
257+
best = p;
258+
if (basename_same(one, two))
259+
break;
260+
261+
/* Too many identical alternatives? Pick one */
262+
if (!--i)
263+
break;
264+
}
265+
if (best) {
266+
record_rename_pair(dst->index, best->index, MAX_SCORE);
267+
renames++;
268+
}
269+
} while ((dst = dst->next) != NULL);
270+
return renames;
271+
}
272+
273+
/*
274+
* Note: the rest of the rename logic depends on this
275+
* phase also populating all the filespecs for any
276+
* entry that isn't matched up with an exact rename.
277+
*/
278+
static void free_similarity_list(struct file_similarity *p)
279+
{
280+
while (p) {
281+
struct file_similarity *entry = p;
282+
p = p->next;
283+
284+
/* Stupid special case, see note above! */
285+
diff_populate_filespec(entry->filespec, 0);
286+
free(entry);
287+
}
288+
}
289+
290+
static int find_same_files(void *ptr)
291+
{
292+
int ret;
293+
struct file_similarity *p = ptr;
294+
struct file_similarity *src = NULL, *dst = NULL;
295+
296+
/* Split the hash list up into sources and destinations */
297+
do {
298+
struct file_similarity *entry = p;
299+
p = p->next;
300+
if (entry->src_dst < 0) {
301+
entry->next = src;
302+
src = entry;
303+
} else {
304+
entry->next = dst;
305+
dst = entry;
306+
}
307+
} while (p);
308+
309+
/*
310+
* If we have both sources *and* destinations, see if
311+
* we can match them up
312+
*/
313+
ret = (src && dst) ? find_identical_files(src, dst) : 0;
314+
315+
/* Free the hashes and return the number of renames found */
316+
free_similarity_list(src);
317+
free_similarity_list(dst);
318+
return ret;
319+
}
320+
321+
static unsigned int hash_filespec(struct diff_filespec *filespec)
322+
{
323+
unsigned int hash;
324+
if (!filespec->sha1_valid) {
325+
if (diff_populate_filespec(filespec, 0))
326+
return 0;
327+
hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1);
328+
}
329+
memcpy(&hash, filespec->sha1, sizeof(hash));
330+
return hash;
331+
}
332+
333+
static void insert_file_table(struct hash_table *table, int src_dst, int index, struct diff_filespec *filespec)
334+
{
335+
void **pos;
336+
unsigned int hash;
337+
struct file_similarity *entry = xmalloc(sizeof(*entry));
338+
339+
entry->src_dst = src_dst;
340+
entry->index = index;
341+
entry->filespec = filespec;
342+
entry->next = NULL;
343+
344+
hash = hash_filespec(filespec);
345+
pos = insert_hash(hash, entry, table);
346+
347+
/* We already had an entry there? */
348+
if (pos) {
349+
entry->next = *pos;
350+
*pos = entry;
351+
}
352+
}
353+
245354
/*
246355
* Find exact renames first.
247356
*
248357
* The first round matches up the up-to-date entries,
249358
* and then during the second round we try to match
250359
* cache-dirty entries as well.
251-
*
252-
* Note: the rest of the rename logic depends on this
253-
* phase also populating all the filespecs for any
254-
* entry that isn't matched up with an exact rename,
255-
* see "is_exact_match()".
256360
*/
257361
static int find_exact_renames(void)
258362
{
259-
int rename_count = 0;
260-
int contents_too;
261-
262-
for (contents_too = 0; contents_too < 2; contents_too++) {
263-
int i;
264-
265-
for (i = 0; i < rename_dst_nr; i++) {
266-
struct diff_filespec *two = rename_dst[i].two;
267-
int j;
268-
269-
if (rename_dst[i].pair)
270-
continue; /* dealt with an earlier round */
271-
for (j = 0; j < rename_src_nr; j++) {
272-
int k;
273-
struct diff_filespec *one = rename_src[j].one;
274-
if (!is_exact_match(one, two, contents_too))
275-
continue;
363+
int i;
364+
struct hash_table file_table;
276365

277-
/* see if there is a basename match, too */
278-
for (k = j; k < rename_src_nr; k++) {
279-
one = rename_src[k].one;
280-
if (basename_same(one, two) &&
281-
is_exact_match(one, two,
282-
contents_too)) {
283-
j = k;
284-
break;
285-
}
286-
}
287-
288-
record_rename_pair(i, j, (int)MAX_SCORE);
289-
rename_count++;
290-
break; /* we are done with this entry */
291-
}
292-
}
293-
}
294-
return rename_count;
366+
init_hash(&file_table);
367+
for (i = 0; i < rename_src_nr; i++)
368+
insert_file_table(&file_table, -1, i, rename_src[i].one);
369+
370+
for (i = 0; i < rename_dst_nr; i++)
371+
insert_file_table(&file_table, 1, i, rename_dst[i].two);
372+
373+
/* Find the renames */
374+
i = for_each_hash(&file_table, find_same_files);
375+
376+
/* .. and free the hash data structure */
377+
free_hash(&file_table);
378+
379+
return i;
295380
}
296381

297382
void diffcore_rename(struct diff_options *options)

hash.c

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
/*
2+
* Some generic hashing helpers.
3+
*/
4+
#include "cache.h"
5+
#include "hash.h"
6+
7+
/*
8+
* Look up a hash entry in the hash table. Return the pointer to
9+
* the existing entry, or the empty slot if none existed. The caller
10+
* can then look at the (*ptr) to see whether it existed or not.
11+
*/
12+
static struct hash_table_entry *lookup_hash_entry(unsigned int hash, struct hash_table *table)
13+
{
14+
unsigned int size = table->size, nr = hash % size;
15+
struct hash_table_entry *array = table->array;
16+
17+
while (array[nr].ptr) {
18+
if (array[nr].hash == hash)
19+
break;
20+
nr++;
21+
if (nr >= size)
22+
nr = 0;
23+
}
24+
return array + nr;
25+
}
26+
27+
28+
/*
29+
* Insert a new hash entry pointer into the table.
30+
*
31+
* If that hash entry already existed, return the pointer to
32+
* the existing entry (and the caller can create a list of the
33+
* pointers or do anything else). If it didn't exist, return
34+
* NULL (and the caller knows the pointer has been inserted).
35+
*/
36+
static void **insert_hash_entry(unsigned int hash, void *ptr, struct hash_table *table)
37+
{
38+
struct hash_table_entry *entry = lookup_hash_entry(hash, table);
39+
40+
if (!entry->ptr) {
41+
entry->ptr = ptr;
42+
entry->hash = hash;
43+
table->nr++;
44+
return NULL;
45+
}
46+
return &entry->ptr;
47+
}
48+
49+
static void grow_hash_table(struct hash_table *table)
50+
{
51+
unsigned int i;
52+
unsigned int old_size = table->size, new_size;
53+
struct hash_table_entry *old_array = table->array, *new_array;
54+
55+
new_size = alloc_nr(old_size);
56+
new_array = xcalloc(sizeof(struct hash_table_entry), new_size);
57+
table->size = new_size;
58+
table->array = new_array;
59+
table->nr = 0;
60+
for (i = 0; i < old_size; i++) {
61+
unsigned int hash = old_array[i].hash;
62+
void *ptr = old_array[i].ptr;
63+
if (ptr)
64+
insert_hash_entry(hash, ptr, table);
65+
}
66+
free(old_array);
67+
}
68+
69+
void *lookup_hash(unsigned int hash, struct hash_table *table)
70+
{
71+
if (!table->array)
72+
return NULL;
73+
return &lookup_hash_entry(hash, table)->ptr;
74+
}
75+
76+
void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table)
77+
{
78+
unsigned int nr = table->nr;
79+
if (nr >= table->size/2)
80+
grow_hash_table(table);
81+
return insert_hash_entry(hash, ptr, table);
82+
}
83+
84+
int for_each_hash(struct hash_table *table, int (*fn)(void *))
85+
{
86+
int sum = 0;
87+
unsigned int i;
88+
unsigned int size = table->size;
89+
struct hash_table_entry *array = table->array;
90+
91+
for (i = 0; i < size; i++) {
92+
void *ptr = array->ptr;
93+
array++;
94+
if (ptr) {
95+
int val = fn(ptr);
96+
if (val < 0)
97+
return val;
98+
sum += val;
99+
}
100+
}
101+
return sum;
102+
}
103+
104+
void free_hash(struct hash_table *table)
105+
{
106+
free(table->array);
107+
table->array = NULL;
108+
table->size = 0;
109+
table->nr = 0;
110+
}

0 commit comments

Comments
 (0)