Skip to content

Commit 5fc2fc8

Browse files
pclouds authored and gitster committed
read-cache: split-index mode
This split-index mode is designed to keep write cost proportional to the number of changes the user has made, not the size of the work tree. (Read cost is another matter, to be dealt with separately.) This mode stores index info in a pair of $GIT_DIR/index and $GIT_DIR/sharedindex.<SHA-1>. sharedindex is large and unchanged over time while "index" is smaller and updated often. Format details are in index-format.txt, although not everything is implemented in this patch. Shared indexes are not automatically removed, because it's unclear if the shared index is needed by any (even temporary) indexes by just looking at it. After a while you'll collect stale shared indexes. The good news is one shared index is useable for long, until $GIT_DIR/index becomes so big and sluggish that a new shared index must be created. The safest way to clean shared indexes is to turn off split index mode, so shared files are all garbage, delete them all, then turn on split index mode again. Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent e93021b commit 5fc2fc8

File tree

8 files changed

+253
-5
lines changed

8 files changed

+253
-5
lines changed

Documentation/gitrepository-layout.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,10 @@ index::
155155
The current index file for the repository. It is
156156
usually not found in a bare repository.
157157

158+
sharedindex.<SHA-1>::
159+
The shared index part, to be referenced by $GIT_DIR/index and
160+
other temporary index files. Only valid in split index mode.
161+
158162
info::
159163
Additional information about the repository is recorded
160164
in this directory.

Documentation/technical/index-format.txt

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,9 @@ Git index format
129129
(Version 4) In version 4, the padding after the pathname does not
130130
exist.
131131

132+
Interpretation of index entries in split index mode is completely
133+
different. See below for details.
134+
132135
== Extensions
133136

134137
=== Cached tree
@@ -198,3 +201,35 @@ Git index format
198201
- At most three 160-bit object names of the entry in stages from 1 to 3
199202
(nothing is written for a missing stage).
200203

204+
=== Split index
205+
206+
In split index mode, the majority of index entries could be stored
207+
in a separate file. This extension records the changes to be made on
208+
top of that to produce the final index.
209+
210+
The signature for this extension is { 'l', 'i', 'n', 'k' }.
211+
212+
The extension consists of:
213+
214+
- 160-bit SHA-1 of the shared index file. The shared index file path
215+
is $GIT_DIR/sharedindex.<SHA-1>. If all 160 bits are zero, the
216+
index does not require a shared index file.
217+
218+
- An ewah-encoded delete bitmap, each bit represents an entry in the
219+
shared index. If a bit is set, its corresponding entry in the
220+
shared index will be removed from the final index. Note, because
221+
a delete operation changes index entry positions, but we do need
222+
original positions in replace phase, it's best to just mark
223+
entries for removal, then do a mass deletion after replacement.
224+
225+
- An ewah-encoded replace bitmap, each bit represents an entry in
226+
the shared index. If a bit is set, its corresponding entry in the
227+
shared index will be replaced with an entry in this index
228+
file. All replaced entries are stored in sorted order in this
229+
index. The first "1" bit in the replace bitmap corresponds to the
230+
first index entry, the second "1" bit to the second entry and so
231+
on. Replaced entries may have empty path names to save space.
232+
233+
The remaining index entries after replaced ones will be added to the
234+
final index. These added entries are also sorted by entry name then
235+
stage.

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,7 @@ LIB_OBJS += sha1_name.o
887887
LIB_OBJS += shallow.o
888888
LIB_OBJS += sideband.o
889889
LIB_OBJS += sigchain.o
890+
LIB_OBJS += split-index.o
890891
LIB_OBJS += strbuf.o
891892
LIB_OBJS += streaming.o
892893
LIB_OBJS += string-list.o

cache.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ struct cache_entry {
135135
unsigned int ce_mode;
136136
unsigned int ce_flags;
137137
unsigned int ce_namelen;
138+
unsigned int index; /* for link extension */
138139
unsigned char sha1[20];
139140
char name[FLEX_ARRAY]; /* more */
140141
};
@@ -275,12 +276,14 @@ static inline unsigned int canon_mode(unsigned int mode)
275276
#define RESOLVE_UNDO_CHANGED (1 << 4)
276277
#define CACHE_TREE_CHANGED (1 << 5)
277278

279+
struct split_index;
278280
struct index_state {
279281
struct cache_entry **cache;
280282
unsigned int version;
281283
unsigned int cache_nr, cache_alloc, cache_changed;
282284
struct string_list *resolve_undo;
283285
struct cache_tree *cache_tree;
286+
struct split_index *split_index;
284287
struct cache_time timestamp;
285288
unsigned name_hash_initialized : 1,
286289
initialized : 1;

read-cache.c

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "resolve-undo.h"
1515
#include "strbuf.h"
1616
#include "varint.h"
17+
#include "split-index.h"
1718

1819
static struct cache_entry *refresh_cache_entry(struct cache_entry *ce,
1920
unsigned int options);
@@ -34,6 +35,10 @@ static struct cache_entry *refresh_cache_entry(struct cache_entry *ce,
3435
#define CACHE_EXT(s) ( (s[0]<<24)|(s[1]<<16)|(s[2]<<8)|(s[3]) )
3536
#define CACHE_EXT_TREE 0x54524545 /* "TREE" */
3637
#define CACHE_EXT_RESOLVE_UNDO 0x52455543 /* "REUC" */
38+
#define CACHE_EXT_LINK 0x6c696e6b /* "link" */
39+
40+
/* changes that can be kept in $GIT_DIR/index (basically all extensions) */
41+
#define EXTMASK (RESOLVE_UNDO_CHANGED | CACHE_TREE_CHANGED)
3742

3843
struct index_state the_index;
3944
static const char *alternate_index_output;
@@ -63,6 +68,7 @@ void rename_index_entry_at(struct index_state *istate, int nr, const char *new_n
6368
copy_cache_entry(new, old);
6469
new->ce_flags &= ~CE_HASHED;
6570
new->ce_namelen = namelen;
71+
new->index = 0;
6672
memcpy(new->name, new_name, namelen + 1);
6773

6874
cache_tree_invalidate_path(istate, old->name);
@@ -1335,6 +1341,10 @@ static int read_index_extension(struct index_state *istate,
13351341
case CACHE_EXT_RESOLVE_UNDO:
13361342
istate->resolve_undo = resolve_undo_read(data, sz);
13371343
break;
1344+
case CACHE_EXT_LINK:
1345+
if (read_link_extension(istate, data, sz))
1346+
return -1;
1347+
break;
13381348
default:
13391349
if (*ext < 'A' || 'Z' < *ext)
13401350
return error("index uses %.4s extension, which we do not understand",
@@ -1369,6 +1379,7 @@ static struct cache_entry *cache_entry_from_ondisk(struct ondisk_cache_entry *on
13691379
ce->ce_stat_data.sd_size = get_be32(&ondisk->size);
13701380
ce->ce_flags = flags & ~CE_NAMEMASK;
13711381
ce->ce_namelen = len;
1382+
ce->index = 0;
13721383
hashcpy(ce->sha1, ondisk->sha1);
13731384
memcpy(ce->name, name, len);
13741385
ce->name[len] = '\0';
@@ -1443,7 +1454,8 @@ static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk,
14431454
}
14441455

14451456
/* remember to discard_cache() before reading a different cache! */
1446-
int read_index_from(struct index_state *istate, const char *path)
1457+
static int do_read_index(struct index_state *istate, const char *path,
1458+
int must_exist)
14471459
{
14481460
int fd, i;
14491461
struct stat st;
@@ -1460,9 +1472,9 @@ int read_index_from(struct index_state *istate, const char *path)
14601472
istate->timestamp.nsec = 0;
14611473
fd = open(path, O_RDONLY);
14621474
if (fd < 0) {
1463-
if (errno == ENOENT)
1475+
if (!must_exist && errno == ENOENT)
14641476
return 0;
1465-
die_errno("index file open failed");
1477+
die_errno("%s: index file open failed", path);
14661478
}
14671479

14681480
if (fstat(fd, &st))
@@ -1535,6 +1547,42 @@ int read_index_from(struct index_state *istate, const char *path)
15351547
die("index file corrupt");
15361548
}
15371549

1550+
int read_index_from(struct index_state *istate, const char *path)
1551+
{
1552+
struct split_index *split_index;
1553+
int ret;
1554+
1555+
/* istate->initialized covers both .git/index and .git/sharedindex.xxx */
1556+
if (istate->initialized)
1557+
return istate->cache_nr;
1558+
1559+
ret = do_read_index(istate, path, 0);
1560+
split_index = istate->split_index;
1561+
if (!split_index)
1562+
return ret;
1563+
1564+
if (is_null_sha1(split_index->base_sha1))
1565+
return ret;
1566+
if (istate->cache_nr)
1567+
die("index in split-index mode must contain no entries");
1568+
1569+
if (split_index->base)
1570+
discard_index(split_index->base);
1571+
else
1572+
split_index->base = xcalloc(1, sizeof(*split_index->base));
1573+
ret = do_read_index(split_index->base,
1574+
git_path("sharedindex.%s",
1575+
sha1_to_hex(split_index->base_sha1)), 1);
1576+
if (hashcmp(split_index->base_sha1, split_index->base->sha1))
1577+
die("broken index, expect %s in %s, got %s",
1578+
sha1_to_hex(split_index->base_sha1),
1579+
git_path("sharedindex.%s",
1580+
sha1_to_hex(split_index->base_sha1)),
1581+
sha1_to_hex(split_index->base->sha1));
1582+
merge_base_index(istate);
1583+
return ret;
1584+
}
1585+
15381586
int is_index_unborn(struct index_state *istate)
15391587
{
15401588
return (!istate->cache_nr && !istate->timestamp.sec);
@@ -1544,8 +1592,15 @@ int discard_index(struct index_state *istate)
15441592
{
15451593
int i;
15461594

1547-
for (i = 0; i < istate->cache_nr; i++)
1595+
for (i = 0; i < istate->cache_nr; i++) {
1596+
if (istate->cache[i]->index &&
1597+
istate->split_index &&
1598+
istate->split_index->base &&
1599+
istate->cache[i]->index <= istate->split_index->base->cache_nr &&
1600+
istate->cache[i] == istate->split_index->base->cache[istate->cache[i]->index - 1])
1601+
continue;
15481602
free(istate->cache[i]);
1603+
}
15491604
resolve_undo_clear_index(istate);
15501605
istate->cache_nr = 0;
15511606
istate->cache_changed = 0;
@@ -1557,6 +1612,7 @@ int discard_index(struct index_state *istate)
15571612
free(istate->cache);
15581613
istate->cache = NULL;
15591614
istate->cache_alloc = 0;
1615+
discard_split_index(istate);
15601616
return 0;
15611617
}
15621618

@@ -1852,6 +1908,17 @@ static int do_write_index(struct index_state *istate, int newfd)
18521908
strbuf_release(&previous_name_buf);
18531909

18541910
/* Write extension data here */
1911+
if (istate->split_index) {
1912+
struct strbuf sb = STRBUF_INIT;
1913+
1914+
err = write_link_extension(&sb, istate) < 0 ||
1915+
write_index_ext_header(&c, newfd, CACHE_EXT_LINK,
1916+
sb.len) < 0 ||
1917+
ce_write(&c, newfd, sb.buf, sb.len) < 0;
1918+
strbuf_release(&sb);
1919+
if (err)
1920+
return -1;
1921+
}
18551922
if (istate->cache_tree) {
18561923
struct strbuf sb = STRBUF_INIT;
18571924

@@ -1916,10 +1983,29 @@ static int do_write_locked_index(struct index_state *istate, struct lock_file *l
19161983
return ret;
19171984
}
19181985

1986+
/*
 * Write `istate` through do_write_locked_index() bracketed by
 * prepare_to_write_split_index() / finish_writing_split_index().
 * Returns whatever do_write_locked_index() returned; the finish step
 * always runs, regardless of that result.
 */
static int write_split_index(struct index_state *istate,
			     struct lock_file *lock,
			     unsigned flags)
{
	int status;

	prepare_to_write_split_index(istate);
	status = do_write_locked_index(istate, lock, flags);
	finish_writing_split_index(istate);

	return status;
}
1996+
19191997
int write_locked_index(struct index_state *istate, struct lock_file *lock,
19201998
unsigned flags)
19211999
{
1922-
return do_write_locked_index(istate, lock, flags);
2000+
struct split_index *si = istate->split_index;
2001+
2002+
if (!si || (istate->cache_changed & ~EXTMASK)) {
2003+
if (si)
2004+
hashclr(si->base_sha1);
2005+
return do_write_locked_index(istate, lock, flags);
2006+
}
2007+
2008+
return write_split_index(istate, lock, flags);
19232009
}
19242010

19252011
/*

split-index.c

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#include "cache.h"
2+
#include "split-index.h"
3+
4+
struct split_index *init_split_index(struct index_state *istate)
5+
{
6+
if (!istate->split_index) {
7+
istate->split_index = xcalloc(1, sizeof(*istate->split_index));
8+
istate->split_index->refcount = 1;
9+
}
10+
return istate->split_index;
11+
}
12+
13+
int read_link_extension(struct index_state *istate,
14+
const void *data_, unsigned long sz)
15+
{
16+
const unsigned char *data = data_;
17+
struct split_index *si;
18+
if (sz < 20)
19+
return error("corrupt link extension (too short)");
20+
si = init_split_index(istate);
21+
hashcpy(si->base_sha1, data);
22+
data += 20;
23+
sz -= 20;
24+
if (sz)
25+
return error("garbage at the end of link extension");
26+
return 0;
27+
}
28+
29+
int write_link_extension(struct strbuf *sb,
30+
struct index_state *istate)
31+
{
32+
struct split_index *si = istate->split_index;
33+
strbuf_add(sb, si->base_sha1, 20);
34+
return 0;
35+
}
36+
37+
static void mark_base_index_entries(struct index_state *base)
38+
{
39+
int i;
40+
/*
41+
* To keep track of the shared entries between
42+
* istate->base->cache[] and istate->cache[], base entry
43+
* position is stored in each base entry. All positions start
44+
* from 1 instead of 0, which is resrved to say "this is a new
45+
* entry".
46+
*/
47+
for (i = 0; i < base->cache_nr; i++)
48+
base->cache[i]->index = i + 1;
49+
}
50+
51+
void merge_base_index(struct index_state *istate)
52+
{
53+
struct split_index *si = istate->split_index;
54+
55+
mark_base_index_entries(si->base);
56+
istate->cache_nr = si->base->cache_nr;
57+
ALLOC_GROW(istate->cache, istate->cache_nr, istate->cache_alloc);
58+
memcpy(istate->cache, si->base->cache,
59+
sizeof(*istate->cache) * istate->cache_nr);
60+
}
61+
62+
void prepare_to_write_split_index(struct index_state *istate)
63+
{
64+
struct split_index *si = init_split_index(istate);
65+
/* take cache[] out temporarily */
66+
si->saved_cache_nr = istate->cache_nr;
67+
istate->cache_nr = 0;
68+
}
69+
70+
void finish_writing_split_index(struct index_state *istate)
71+
{
72+
struct split_index *si = init_split_index(istate);
73+
istate->cache_nr = si->saved_cache_nr;
74+
}
75+
76+
void discard_split_index(struct index_state *istate)
77+
{
78+
struct split_index *si = istate->split_index;
79+
if (!si)
80+
return;
81+
istate->split_index = NULL;
82+
si->refcount--;
83+
if (si->refcount)
84+
return;
85+
if (si->base) {
86+
discard_index(si->base);
87+
free(si->base);
88+
}
89+
free(si);
90+
}

split-index.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#ifndef SPLIT_INDEX_H
2+
#define SPLIT_INDEX_H
3+
4+
struct index_state;
5+
struct strbuf;
6+
7+
/*
 * Per-index bookkeeping for split-index mode; reached through
 * index_state.split_index.
 */
struct split_index {
8+
/* SHA-1 of the shared index file (sharedindex.<SHA-1>); all zero means none */
unsigned char base_sha1[20];
9+
/* in-memory copy of the shared index, or NULL when not loaded */
struct index_state *base;
10+
/* entry count stashed while cache[] is hidden during a write */
unsigned int saved_cache_nr;
11+
/* set to 1 on creation; the structure is freed when this drops to 0 */
int refcount;
12+
};
13+
14+
struct split_index *init_split_index(struct index_state *istate);
15+
int read_link_extension(struct index_state *istate,
16+
const void *data, unsigned long sz);
17+
int write_link_extension(struct strbuf *sb,
18+
struct index_state *istate);
19+
void move_cache_to_base_index(struct index_state *istate);
20+
void merge_base_index(struct index_state *istate);
21+
void prepare_to_write_split_index(struct index_state *istate);
22+
void finish_writing_split_index(struct index_state *istate);
23+
void discard_split_index(struct index_state *istate);
24+
25+
#endif

0 commit comments

Comments
 (0)