Skip to content

Commit 1f7c926

Browse files
peffgitster
authored andcommitted
xdiff: drop XDL_FAST_HASH
The xdiff code hashes every line of both sides of a diff, and then compares those hashes to find duplicates. The overall performance depends both on how fast we can compute the hashes, but also on how many hash collisions we see. The idea of XDL_FAST_HASH is to speed up the hash computation. But the generated hashes have worse collision behavior. This means that in some cases it speeds diffs up (running "git log -p" on git.git improves by ~8% with it), but in others it can slow things down. One pathological case saw over a 100x slowdown[1]. There may be a better hash function that covers both properties, but in the meantime we are better off with the original hash. It's slightly slower in the common case, but it has fewer surprising pathological cases. [1] http://public-inbox.org/git/20141222041944.GA441@peff.net/ Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent 8d7a455 commit 1f7c926

File tree

3 files changed

+0
-118
lines changed

3 files changed

+0
-118
lines changed

Makefile

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -338,11 +338,6 @@ all::
338338
#
339339
# Define NATIVE_CRLF if your platform uses CRLF for line endings.
340340
#
341-
# Define XDL_FAST_HASH to use an alternative line-hashing method in
342-
# the diff algorithm. It gives a nice speedup if your processor has
343-
# fast unaligned word loads. Does NOT work on big-endian systems!
344-
# Enabled by default on x86_64.
345-
#
346341
# Define GIT_USER_AGENT if you want to change how git identifies itself during
347342
# network interactions. The default is "git/$(GIT_VERSION)".
348343
#
@@ -1485,10 +1480,6 @@ ifndef NO_MSGFMT_EXTENDED_OPTIONS
14851480
MSGFMT += --check --statistics
14861481
endif
14871482

1488-
ifneq (,$(XDL_FAST_HASH))
1489-
BASIC_CFLAGS += -DXDL_FAST_HASH
1490-
endif
1491-
14921483
ifdef GMTIME_UNRELIABLE_ERRORS
14931484
COMPAT_OBJS += compat/gmtime.o
14941485
BASIC_CFLAGS += -DGMTIME_UNRELIABLE_ERRORS

config.mak.uname

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,6 @@ endif
1717
# because maintaining the nesting to match is a pain. If
1818
# we had "elif" things would have been much nicer...
1919

20-
ifeq ($(uname_M),x86_64)
21-
XDL_FAST_HASH = YesPlease
22-
endif
2320
ifeq ($(uname_S),OSF1)
2421
# Need this for u_short definitions et al
2522
BASIC_CFLAGS += -D_OSF_SOURCE

xdiff/xutils.c

Lines changed: 0 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -264,110 +264,6 @@ static unsigned long xdl_hash_record_with_whitespace(char const **data,
264264
return ha;
265265
}
266266

267-
#ifdef XDL_FAST_HASH
268-
269-
#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x))
270-
271-
#define ONEBYTES REPEAT_BYTE(0x01)
272-
#define NEWLINEBYTES REPEAT_BYTE(0x0a)
273-
#define HIGHBITS REPEAT_BYTE(0x80)
274-
275-
/* Return the high bit set in the first byte that is a zero */
276-
static inline unsigned long has_zero(unsigned long a)
277-
{
278-
return ((a - ONEBYTES) & ~a) & HIGHBITS;
279-
}
280-
281-
static inline long count_masked_bytes(unsigned long mask)
282-
{
283-
if (sizeof(long) == 8) {
284-
/*
285-
* Jan Achrenius on G+: microoptimized version of
286-
* the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
287-
* that works for the bytemasks without having to
288-
* mask them first.
289-
*/
290-
/*
291-
* return mask * 0x0001020304050608 >> 56;
292-
*
293-
* Doing it like this avoids warnings on 32-bit machines.
294-
*/
295-
long a = (REPEAT_BYTE(0x01) / 0xff + 1);
296-
return mask * a >> (sizeof(long) * 7);
297-
} else {
298-
/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
299-
/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
300-
long a = (0x0ff0001 + mask) >> 23;
301-
/* Fix the 1 for 00 case */
302-
return a & mask;
303-
}
304-
}
305-
306-
unsigned long xdl_hash_record(char const **data, char const *top, long flags)
307-
{
308-
unsigned long hash = 5381;
309-
unsigned long a = 0, mask = 0;
310-
char const *ptr = *data;
311-
char const *end = top - sizeof(unsigned long) + 1;
312-
313-
if (flags & XDF_WHITESPACE_FLAGS)
314-
return xdl_hash_record_with_whitespace(data, top, flags);
315-
316-
ptr -= sizeof(unsigned long);
317-
do {
318-
hash += hash << 5;
319-
hash ^= a;
320-
ptr += sizeof(unsigned long);
321-
if (ptr >= end)
322-
break;
323-
a = *(unsigned long *)ptr;
324-
/* Do we have any '\n' bytes in this word? */
325-
mask = has_zero(a ^ NEWLINEBYTES);
326-
} while (!mask);
327-
328-
if (ptr >= end) {
329-
/*
330-
* There is only a partial word left at the end of the
331-
* buffer. Because we may work with a memory mapping,
332-
* we have to grab the rest byte by byte instead of
333-
* blindly reading it.
334-
*
335-
* To avoid problems with masking in a signed value,
336-
* we use an unsigned char here.
337-
*/
338-
const char *p;
339-
for (p = top - 1; p >= ptr; p--)
340-
a = (a << 8) + *((const unsigned char *)p);
341-
mask = has_zero(a ^ NEWLINEBYTES);
342-
if (!mask)
343-
/*
344-
* No '\n' found in the partial word. Make a
345-
* mask that matches what we read.
346-
*/
347-
mask = 1UL << (8 * (top - ptr) + 7);
348-
}
349-
350-
/* The mask *below* the first high bit set */
351-
mask = (mask - 1) & ~mask;
352-
mask >>= 7;
353-
hash += hash << 5;
354-
hash ^= a & mask;
355-
356-
/* Advance past the last (possibly partial) word */
357-
ptr += count_masked_bytes(mask);
358-
359-
if (ptr < top) {
360-
assert(*ptr == '\n');
361-
ptr++;
362-
}
363-
364-
*data = ptr;
365-
366-
return hash;
367-
}
368-
369-
#else /* XDL_FAST_HASH */
370-
371267
unsigned long xdl_hash_record(char const **data, char const *top, long flags) {
372268
unsigned long ha = 5381;
373269
char const *ptr = *data;
@@ -384,8 +280,6 @@ unsigned long xdl_hash_record(char const **data, char const *top, long flags) {
384280
return ha;
385281
}
386282

387-
#endif /* XDL_FAST_HASH */
388-
389283
unsigned int xdl_hashbits(unsigned int size) {
390284
unsigned int val = 1, bits = 0;
391285

0 commit comments

Comments
 (0)