Skip to content

Commit 384b32c

Browse files
Nicolas Pitre
authored and gitster committed
pack-objects: fix threaded load balancing
The current method consists of a master thread serving chunks of objects to work threads when they're done with their previous chunk. The issue is to determine the best chunk size: making it too large creates poor load balancing, while making it too small has a negative effect on pack size because of the increased number of chunk boundaries and poor delta window utilization. This patch implements a completely different approach by initially splitting the work in large chunks uniformly amongst all threads, and whenever a thread is done then it steals half of the remaining work from another thread with the largest amount of unprocessed objects. This has the advantage of greatly reducing the number of chunk boundaries with an almost perfect load balancing. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent b904166 commit 384b32c

File tree

1 file changed

+85
-32
lines changed

1 file changed

+85
-32
lines changed

builtin-pack-objects.c

Lines changed: 85 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1479,22 +1479,34 @@ static unsigned long free_unpacked(struct unpacked *n)
14791479
return freed_mem;
14801480
}
14811481

1482-
static void find_deltas(struct object_entry **list, unsigned list_size,
1482+
static void find_deltas(struct object_entry **list, unsigned *list_size,
14831483
int window, int depth, unsigned *processed)
14841484
{
1485-
uint32_t i = 0, idx = 0, count = 0;
1485+
uint32_t i, idx = 0, count = 0;
14861486
unsigned int array_size = window * sizeof(struct unpacked);
14871487
struct unpacked *array;
14881488
unsigned long mem_usage = 0;
14891489

14901490
array = xmalloc(array_size);
14911491
memset(array, 0, array_size);
14921492

1493-
do {
1494-
struct object_entry *entry = list[i++];
1493+
for (;;) {
1494+
struct object_entry *entry = *list++;
14951495
struct unpacked *n = array + idx;
14961496
int j, max_depth, best_base = -1;
14971497

1498+
progress_lock();
1499+
if (!*list_size) {
1500+
progress_unlock();
1501+
break;
1502+
}
1503+
(*list_size)--;
1504+
if (!entry->preferred_base) {
1505+
(*processed)++;
1506+
display_progress(progress_state, *processed);
1507+
}
1508+
progress_unlock();
1509+
14981510
mem_usage -= free_unpacked(n);
14991511
n->entry = entry;
15001512

@@ -1512,11 +1524,6 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
15121524
if (entry->preferred_base)
15131525
goto next;
15141526

1515-
progress_lock();
1516-
(*processed)++;
1517-
display_progress(progress_state, *processed);
1518-
progress_unlock();
1519-
15201527
/*
15211528
* If the current object is at pack edge, take the depth the
15221529
* objects that depend on the current object into account
@@ -1576,7 +1583,7 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
15761583
count++;
15771584
if (idx >= window)
15781585
idx = 0;
1579-
} while (i < list_size);
1586+
}
15801587

15811588
for (i = 0; i < window; ++i) {
15821589
free_delta_index(array[i].index);
@@ -1591,6 +1598,7 @@ struct thread_params {
15911598
pthread_t thread;
15921599
struct object_entry **list;
15931600
unsigned list_size;
1601+
unsigned remaining;
15941602
int window;
15951603
int depth;
15961604
unsigned *processed;
@@ -1612,10 +1620,10 @@ static void *threaded_find_deltas(void *arg)
16121620
pthread_mutex_lock(&data_ready);
16131621
pthread_mutex_unlock(&data_request);
16141622

1615-
if (!me->list_size)
1623+
if (!me->remaining)
16161624
return NULL;
16171625

1618-
find_deltas(me->list, me->list_size,
1626+
find_deltas(me->list, &me->remaining,
16191627
me->window, me->depth, me->processed);
16201628
}
16211629
}
@@ -1624,57 +1632,102 @@ static void ll_find_deltas(struct object_entry **list, unsigned list_size,
16241632
int window, int depth, unsigned *processed)
16251633
{
16261634
struct thread_params *target, p[delta_search_threads];
1627-
int i, ret;
1628-
unsigned chunk_size;
1635+
int i, ret, active_threads = 0;
16291636

16301637
if (delta_search_threads <= 1) {
1631-
find_deltas(list, list_size, window, depth, processed);
1638+
find_deltas(list, &list_size, window, depth, processed);
16321639
return;
16331640
}
16341641

16351642
pthread_mutex_lock(&data_provider);
16361643
pthread_mutex_lock(&data_ready);
16371644

1645+
/* Start work threads. */
16381646
for (i = 0; i < delta_search_threads; i++) {
16391647
p[i].window = window;
16401648
p[i].depth = depth;
16411649
p[i].processed = processed;
1650+
p[i].remaining = 0;
16421651
ret = pthread_create(&p[i].thread, NULL,
16431652
threaded_find_deltas, &p[i]);
16441653
if (ret)
16451654
die("unable to create thread: %s", strerror(ret));
1655+
active_threads++;
16461656
}
16471657

1648-
/* this should be auto-tuned somehow */
1649-
chunk_size = window * 1000;
1658+
/* Then partition the work amongst them. */
1659+
for (i = 0; i < delta_search_threads; i++) {
1660+
unsigned sub_size = list_size / (delta_search_threads - i);
16501661

1651-
do {
1652-
unsigned sublist_size = chunk_size;
1653-
if (sublist_size > list_size)
1654-
sublist_size = list_size;
1662+
pthread_mutex_lock(&data_provider);
1663+
target = data_requester;
1664+
if (!sub_size) {
1665+
pthread_mutex_unlock(&data_ready);
1666+
pthread_join(target->thread, NULL);
1667+
active_threads--;
1668+
continue;
1669+
}
16551670

16561671
/* try to split chunks on "path" boundaries */
1657-
while (sublist_size < list_size && list[sublist_size]->hash &&
1658-
list[sublist_size]->hash == list[sublist_size-1]->hash)
1659-
sublist_size++;
1672+
while (sub_size < list_size && list[sub_size]->hash &&
1673+
list[sub_size]->hash == list[sub_size-1]->hash)
1674+
sub_size++;
1675+
1676+
target->list = list;
1677+
target->list_size = sub_size;
1678+
target->remaining = sub_size;
1679+
pthread_mutex_unlock(&data_ready);
16601680

1681+
list += sub_size;
1682+
list_size -= sub_size;
1683+
}
1684+
1685+
/*
1686+
* Now let's wait for work completion. Each time a thread is done
1687+
* with its work, we steal half of the remaining work from the
1688+
* thread with the largest number of unprocessed objects and give
1689+
it to that newly idle thread. This ensures good load balancing
1690+
* until the remaining object list segments are simply too short
1691+
* to be worth splitting anymore.
1692+
*/
1693+
do {
1694+
struct thread_params *victim = NULL;
1695+
unsigned sub_size = 0;
16611696
pthread_mutex_lock(&data_provider);
16621697
target = data_requester;
1663-
target->list = list;
1664-
target->list_size = sublist_size;
1698+
1699+
progress_lock();
1700+
for (i = 0; i < delta_search_threads; i++)
1701+
if (p[i].remaining > 2*window &&
1702+
(!victim || victim->remaining < p[i].remaining))
1703+
victim = &p[i];
1704+
if (victim) {
1705+
sub_size = victim->remaining / 2;
1706+
list = victim->list + victim->list_size - sub_size;
1707+
while (sub_size && list[0]->hash &&
1708+
list[0]->hash == list[-1]->hash) {
1709+
list++;
1710+
sub_size--;
1711+
}
1712+
target->list = list;
1713+
victim->list_size -= sub_size;
1714+
victim->remaining -= sub_size;
1715+
}
1716+
progress_unlock();
1717+
1718+
target->list_size = sub_size;
1719+
target->remaining = sub_size;
16651720
pthread_mutex_unlock(&data_ready);
16661721

1667-
list += sublist_size;
1668-
list_size -= sublist_size;
1669-
if (!sublist_size) {
1722+
if (!sub_size) {
16701723
pthread_join(target->thread, NULL);
1671-
i--;
1724+
active_threads--;
16721725
}
1673-
} while (i);
1726+
} while (active_threads);
16741727
}
16751728

16761729
#else
1677-
#define ll_find_deltas find_deltas
1730+
#define ll_find_deltas(l, s, w, d, p) find_deltas(l, &s, w, d, p)
16781731
#endif
16791732

16801733
static void prepare_pack(int window, int depth)

0 commit comments

Comments (0)