Skip to content

Commit b856ad6

Browse files
committed
Merge branch 'tb/sanitize-decomposed-utf-8-pathname'
Teaches git to normalize pathnames read from readdir(3) and all arguments from the command line into precomposed UTF-8 (assuming that they come as decomposed UTF-8) to work around issues on Mac OS. I think there still are other places that need conversion (e.g. paths that are read from stdin for some commands), but this should be a good first step in the right direction. * tb/sanitize-decomposed-utf-8-pathname: git on Mac OS and precomposed unicode
2 parents 6a9aa0c + 76759c7 commit b856ad6

File tree

13 files changed

+446
-10
lines changed

13 files changed

+446
-10
lines changed

Documentation/config.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,15 @@ The default is false, except linkgit:git-clone[1] or linkgit:git-init[1]
211211
will probe and set core.ignorecase true if appropriate when the repository
212212
is created.
213213

214+
core.precomposeunicode::
215+
This option is only used by the Mac OS implementation of git.
216+
When core.precomposeunicode=true, git reverts the unicode decomposition
217+
of filenames done by Mac OS. This is useful when sharing a repository
218+
between Mac OS and Linux or Windows.
219+
(Git for Windows 1.7.10 or higher is needed, or git under cygwin 1.7).
220+
When false, file names are handled fully transparently by git,
221+
which is backward compatible with older versions of git.
222+
214223
core.trustctime::
215224
If false, the ctime differences between the index and the
216225
working tree are ignored; useful when the inode change time

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,7 @@ LIB_H += compat/bswap.h
607607
LIB_H += compat/cygwin.h
608608
LIB_H += compat/mingw.h
609609
LIB_H += compat/obstack.h
610+
LIB_H += compat/precompose_utf8.h
610611
LIB_H += compat/terminal.h
611612
LIB_H += compat/win32/dirent.h
612613
LIB_H += compat/win32/poll.h
@@ -1001,6 +1002,8 @@ ifeq ($(uname_S),Darwin)
10011002
NO_MEMMEM = YesPlease
10021003
USE_ST_TIMESPEC = YesPlease
10031004
HAVE_DEV_TTY = YesPlease
1005+
COMPAT_OBJS += compat/precompose_utf8.o
1006+
BASIC_CFLAGS += -DPRECOMPOSE_UNICODE
10041007
endif
10051008
ifeq ($(uname_S),SunOS)
10061009
NEEDS_SOCKET = YesPlease

builtin/init-db.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,7 @@ static int create_default_files(const char *template_path)
290290
strcpy(path + len, "CoNfIg");
291291
if (!access(path, F_OK))
292292
git_config_set("core.ignorecase", "true");
293+
probe_utf8_pathname_composition(path, len);
293294
}
294295

295296
return reinit;

cache.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,7 @@ extern int read_replace_refs;
563563
extern int fsync_object_files;
564564
extern int core_preload_index;
565565
extern int core_apply_sparse_checkout;
566+
extern int precomposed_unicode;
566567

567568
enum branch_track {
568569
BRANCH_TRACK_UNSPECIFIED = -1,

compat/precompose_utf8.c

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
/*
2+
* Converts filenames from decomposed unicode into precomposed unicode.
3+
* Used on MacOS X.
4+
*/
5+
6+
7+
#define PRECOMPOSE_UNICODE_C
8+
9+
#include "cache.h"
10+
#include "utf8.h"
11+
#include "precompose_utf8.h"
12+
13+
/* Type of the input-buffer pointer handed to iconv() in this file. */
typedef char *iconv_ibp;
/*
 * Encoding names passed to iconv_open(): repo_encoding is the conversion
 * target, path_encoding the source.
 * The storage-class specifier comes first; placing it after the qualifier
 * ("const static") is an obsolescent form in ISO C.
 */
static const char *repo_encoding = "UTF-8";
static const char *path_encoding = "UTF-8-MAC";
16+
17+
18+
/*
 * Count the bytes in s that have the high bit set (i.e. non-ASCII bytes).
 *
 * Scanning stops at the terminating NUL or after maxlen bytes, whichever
 * comes first.
 *
 * s         string to inspect; may be NULL
 * maxlen    upper bound on the number of bytes examined
 * strlen_c  if not NULL, receives the number of bytes scanned; this is
 *           always written (0 for NULL or empty input) so callers never
 *           read an indeterminate value
 *
 * Returns the number of bytes with the high bit set, 0 for pure ASCII,
 * empty or NULL input.
 */
static size_t has_utf8(const char *s, size_t maxlen, size_t *strlen_c)
{
	const uint8_t *utf8p = (const uint8_t *)s;
	size_t strlen_chars = 0;
	size_t ret = 0;

	if (utf8p) {
		while (*utf8p && maxlen) {
			if (*utf8p & 0x80)
				ret++;
			strlen_chars++;
			utf8p++;
			maxlen--;
		}
	}
	if (strlen_c)
		*strlen_c = strlen_chars;

	return ret;
}
40+
41+
42+
void probe_utf8_pathname_composition(char *path, int len)
43+
{
44+
const static char *auml_nfc = "\xc3\xa4";
45+
const static char *auml_nfd = "\x61\xcc\x88";
46+
int output_fd;
47+
if (precomposed_unicode != -1)
48+
return; /* We found it defined in the global config, respect it */
49+
path[len] = 0;
50+
strcpy(path + len, auml_nfc);
51+
output_fd = open(path, O_CREAT|O_EXCL|O_RDWR, 0600);
52+
if (output_fd >=0) {
53+
close(output_fd);
54+
path[len] = 0;
55+
strcpy(path + len, auml_nfd);
56+
/* Indicate to the user, that we can configure it to true */
57+
if (0 == access(path, R_OK))
58+
git_config_set("core.precomposeunicode", "false");
59+
/* To be backward compatible, set precomposed_unicode to 0 */
60+
precomposed_unicode = 0;
61+
path[len] = 0;
62+
strcpy(path + len, auml_nfc);
63+
unlink(path);
64+
}
65+
}
66+
67+
68+
void precompose_argv(int argc, const char **argv)
69+
{
70+
int i = 0;
71+
const char *oldarg;
72+
char *newarg;
73+
iconv_t ic_precompose;
74+
75+
if (precomposed_unicode != 1)
76+
return;
77+
78+
ic_precompose = iconv_open(repo_encoding, path_encoding);
79+
if (ic_precompose == (iconv_t) -1)
80+
return;
81+
82+
while (i < argc) {
83+
size_t namelen;
84+
oldarg = argv[i];
85+
if (has_utf8(oldarg, (size_t)-1, &namelen)) {
86+
newarg = reencode_string_iconv(oldarg, namelen, ic_precompose);
87+
if (newarg)
88+
argv[i] = newarg;
89+
}
90+
i++;
91+
}
92+
iconv_close(ic_precompose);
93+
}
94+
95+
96+
PREC_DIR *precompose_utf8_opendir(const char *dirname)
97+
{
98+
PREC_DIR *prec_dir = xmalloc(sizeof(PREC_DIR));
99+
prec_dir->dirent_nfc = xmalloc(sizeof(dirent_prec_psx));
100+
prec_dir->dirent_nfc->max_name_len = sizeof(prec_dir->dirent_nfc->d_name);
101+
102+
prec_dir->dirp = opendir(dirname);
103+
if (!prec_dir->dirp) {
104+
free(prec_dir->dirent_nfc);
105+
free(prec_dir);
106+
return NULL;
107+
} else {
108+
int ret_errno = errno;
109+
prec_dir->ic_precompose = iconv_open(repo_encoding, path_encoding);
110+
/* if iconv_open() fails, die() in readdir() if needed */
111+
errno = ret_errno;
112+
}
113+
114+
return prec_dir;
115+
}
116+
117+
/*
 * readdir() replacement that returns the entry name in precomposed form.
 *
 * Reads the next entry from prec_dir->dirp and copies d_ino, d_type and the
 * name into prec_dir->dirent_nfc, growing that buffer when the name does
 * not fit.  When precomposed_unicode == 1 and the name contains non-ASCII
 * bytes, the name is converted with iconv(); if the conversion fails the
 * original name is copied instead.  die()s if a conversion is needed but
 * iconv_open() had failed in precompose_utf8_opendir().
 *
 * Returns prec_dir->dirent_nfc (overwritten by the next call), or NULL when
 * readdir() returns NULL.  errno as observed right after readdir() is
 * restored before returning an entry.
 */
struct dirent_prec_psx *precompose_utf8_readdir(PREC_DIR *prec_dir)
{
	struct dirent *res;
	size_t namelenz;
	size_t new_maxlen;
	int ret_errno;

	res = readdir(prec_dir->dirp);
	if (!res)
		return NULL;

	namelenz = strlen(res->d_name) + 1; /* \0 */
	new_maxlen = namelenz;
	ret_errno = errno;

	if (new_maxlen > prec_dir->dirent_nfc->max_name_len) {
		size_t new_len = sizeof(dirent_prec_psx) + new_maxlen -
			sizeof(prec_dir->dirent_nfc->d_name);

		prec_dir->dirent_nfc = xrealloc(prec_dir->dirent_nfc, new_len);
		prec_dir->dirent_nfc->max_name_len = new_maxlen;
	}

	prec_dir->dirent_nfc->d_ino = res->d_ino;
	prec_dir->dirent_nfc->d_type = res->d_type;

	if ((precomposed_unicode == 1) && has_utf8(res->d_name, (size_t)-1, NULL)) {
		if (prec_dir->ic_precompose == (iconv_t)-1) {
			die("iconv_open(%s,%s) failed, but needed:\n"
			    "    precomposed unicode is not supported.\n"
			    "    If you want to use decomposed unicode, run\n"
			    "    \"git config core.precomposeunicode false\"\n",
			    repo_encoding, path_encoding);
		} else {
			iconv_ibp cp = (iconv_ibp)res->d_name;
			size_t inleft = namelenz;
			char *outpos = &prec_dir->dirent_nfc->d_name[0];
			size_t outsz = prec_dir->dirent_nfc->max_name_len;

			/* success is judged by errno and leftover input only */
			errno = 0;
			iconv(prec_dir->ic_precompose, &cp, &inleft, &outpos, &outsz);
			if (errno || inleft) {
				/*
				 * iconv() failed and errno could be E2BIG, EILSEQ, EINVAL, EBADF
				 * MacOS X avoids illegal byte sequences.
				 * If they occur on a mounted drive (e.g. NFS) it is not worth to
				 * die() for that, but rather let the user see the original name
				 */
				namelenz = 0; /* trigger strlcpy */
			}
		}
	} else {
		namelenz = 0; /* no conversion wanted: copy the name verbatim */
	}

	/* namelenz == 0 means "d_name has not been filled in by iconv()" */
	if (!namelenz)
		strlcpy(prec_dir->dirent_nfc->d_name, res->d_name,
			prec_dir->dirent_nfc->max_name_len);

	errno = ret_errno;
	return prec_dir->dirent_nfc;
}
176+
177+
178+
/*
 * closedir() replacement: closes the directory stream, releases the iconv
 * descriptor (if one was opened) and frees the PREC_DIR with its scratch
 * entry.
 *
 * Returns the result of closedir(); errno is restored to the value seen
 * right after closedir() so the cleanup calls cannot disturb it.
 */
int precompose_utf8_closedir(PREC_DIR *prec_dir)
{
	int rc = closedir(prec_dir->dirp);
	int saved_errno = errno;

	if (prec_dir->ic_precompose != (iconv_t)-1)
		iconv_close(prec_dir->ic_precompose);
	free(prec_dir->dirent_nfc);
	free(prec_dir);

	errno = saved_errno;
	return rc;
}

compat/precompose_utf8.h

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#ifndef PRECOMPOSE_UNICODE_H
#define PRECOMPOSE_UNICODE_H

#include <sys/stat.h>
#include <sys/types.h>
#include <limits.h> /* NAME_MAX */
#include <dirent.h>
#include <iconv.h>


/*
 * Directory entry handed out by precompose_utf8_readdir() in place of
 * struct dirent.
 */
typedef struct dirent_prec_psx {
	ino_t d_ino;            /* Posix */
	size_t max_name_len;    /* capacity of d_name in bytes, see below */
	unsigned char d_type;   /* available on all systems git runs on */

	/*
	 * See http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/dirent.h.html
	 * NAME_MAX + 1 should be enough, but some systems have
	 * NAME_MAX=255 and strlen(d_name) may return 508 or 510
	 * Solution: allocate more when needed, see precompose_utf8_readdir()
	 */
	char   d_name[NAME_MAX+1];
} dirent_prec_psx;


/*
 * Directory handle returned by precompose_utf8_opendir(): the underlying
 * stream, the iconv descriptor used for name conversion, and the entry
 * buffer that precompose_utf8_readdir() fills and returns.
 */
typedef struct {
	iconv_t ic_precompose;
	DIR *dirp;
	struct dirent_prec_psx *dirent_nfc;
} PREC_DIR;

void precompose_argv(int argc, const char **argv);
void probe_utf8_pathname_composition(char *path, int len);

PREC_DIR *precompose_utf8_opendir(const char *dirname);
struct dirent_prec_psx *precompose_utf8_readdir(PREC_DIR *dirp);
int precompose_utf8_closedir(PREC_DIR *dirp);

/*
 * Everywhere except in the implementation file (which defines
 * PRECOMPOSE_UNICODE_C), redirect the dirent API to the wrappers above.
 */
#ifndef PRECOMPOSE_UNICODE_C
#define dirent dirent_prec_psx
#define opendir(n) precompose_utf8_opendir(n)
#define readdir(d) precompose_utf8_readdir(d)
#define closedir(d) precompose_utf8_closedir(d)
#define DIR PREC_DIR
#endif /* PRECOMPOSE_UNICODE_C */

#endif /* PRECOMPOSE_UNICODE_H */

config.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -758,6 +758,11 @@ static int git_default_core_config(const char *var, const char *value)
758758
return 0;
759759
}
760760

761+
if (!strcmp(var, "core.precomposeunicode")) {
762+
precomposed_unicode = git_config_bool(var, value);
763+
return 0;
764+
}
765+
761766
/* Add other config variables here and to Documentation/config.txt. */
762767
return 0;
763768
}

environment.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ char *notes_ref_name;
5858
int grafts_replace_parents = 1;
5959
int core_apply_sparse_checkout;
6060
int merge_log_config = -1;
61+
int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
6162
struct startup_info *startup_info;
6263
unsigned long pack_size_limit_cfg;
6364

git-compat-util.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,15 @@
153153
#endif
154154
#endif
155155

156+
/* used on Mac OS X */
157+
#ifdef PRECOMPOSE_UNICODE
158+
#include "compat/precompose_utf8.h"
159+
#else
160+
#define precompose_str(in,i_nfd2nfc)
161+
#define precompose_argv(c,v)
162+
#define probe_utf8_pathname_composition(a,b)
163+
#endif
164+
156165
#ifndef NO_LIBGEN_H
157166
#include <libgen.h>
158167
#else

parse-options.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,7 @@ int parse_options(int argc, const char **argv, const char *prefix,
476476
usage_with_options(usagestr, options);
477477
}
478478

479+
precompose_argv(argc, argv);
479480
return parse_options_end(&ctx);
480481
}
481482

0 commit comments

Comments
 (0)