Skip to content

Commit 69cd8f6

Browse files
angavrilov authored and gitster committed
builtin-blame: Reencode commit messages according to git-log rules.
Currently git-blame outputs text from the commit messages (e.g. the author name and the summary string) as-is, without even providing any information about the encoding used for the data. This makes interpreting the data in a multilingual environment very difficult. This commit changes the blame implementation to recode the messages using the rules used by other commands like git-log. Namely, the target encoding can be specified through the i18n.commitEncoding or i18n.logOutputEncoding options, or directly on the command line using the --encoding parameter. Converting the encoding before output seems to be more friendly to the porcelain tools than simply providing the value of the encoding header, and does not require changing the output format. If anybody needs the old behavior, it is possible to achieve it by specifying --encoding=none. Signed-off-by: Alexander Gavrilov <angavrilov@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent 208f689 commit 69cd8f6

File tree

9 files changed

+136
-14
lines changed

9 files changed

+136
-14
lines changed

Documentation/blame-options.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ of lines before or after the line given by <start>.
4949
Show the result incrementally in a format designed for
5050
machine consumption.
5151

52+
--encoding=<encoding>::
53+
Specifies the encoding used to output author names
54+
and commit summaries. Setting it to `none` makes blame
55+
output unconverted data. For more information see the
56+
discussion about encoding in the linkgit:git-log[1]
57+
manual page.
58+
5259
--contents <file>::
5360
When <rev> is not specified, the command annotates the
5461
changes starting backwards from the working tree copy.

Documentation/i18n.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@ of `i18n.commitencoding` in its `encoding` header. This is to
3737
help other people who look at them later. Lack of this header
3838
implies that the commit log message is encoded in UTF-8.
3939

40-
. 'git-log', 'git-show' and friends looks at the `encoding`
41-
header of a commit object, and tries to re-code the log
42-
message into UTF-8 unless otherwise specified. You can
40+
. 'git-log', 'git-show', 'git-blame' and friends look at the
41+
`encoding` header of a commit object, and try to re-code the
42+
log message into UTF-8 unless otherwise specified. You can
4343
specify the desired output encoding with
4444
`i18n.logoutputencoding` in `.git/config` file, like this:
4545
+

builtin-blame.c

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1431,7 +1431,7 @@ static void get_commit_info(struct commit *commit,
14311431
int detailed)
14321432
{
14331433
int len;
1434-
char *tmp, *endp;
1434+
char *tmp, *endp, *reencoded, *message;
14351435
static char author_buf[1024];
14361436
static char committer_buf[1024];
14371437
static char summary_buf[1024];
@@ -1449,24 +1449,29 @@ static void get_commit_info(struct commit *commit,
14491449
die("Cannot read commit %s",
14501450
sha1_to_hex(commit->object.sha1));
14511451
}
1452+
reencoded = reencode_commit_message(commit, NULL);
1453+
message = reencoded ? reencoded : commit->buffer;
14521454
ret->author = author_buf;
1453-
get_ac_line(commit->buffer, "\nauthor ",
1455+
get_ac_line(message, "\nauthor ",
14541456
sizeof(author_buf), author_buf, &ret->author_mail,
14551457
&ret->author_time, &ret->author_tz);
14561458

1457-
if (!detailed)
1459+
if (!detailed) {
1460+
free(reencoded);
14581461
return;
1462+
}
14591463

14601464
ret->committer = committer_buf;
1461-
get_ac_line(commit->buffer, "\ncommitter ",
1465+
get_ac_line(message, "\ncommitter ",
14621466
sizeof(committer_buf), committer_buf, &ret->committer_mail,
14631467
&ret->committer_time, &ret->committer_tz);
14641468

14651469
ret->summary = summary_buf;
1466-
tmp = strstr(commit->buffer, "\n\n");
1470+
tmp = strstr(message, "\n\n");
14671471
if (!tmp) {
14681472
error_out:
14691473
sprintf(summary_buf, "(%s)", sha1_to_hex(commit->object.sha1));
1474+
free(reencoded);
14701475
return;
14711476
}
14721477
tmp += 2;
@@ -1478,6 +1483,7 @@ static void get_commit_info(struct commit *commit,
14781483
goto error_out;
14791484
memcpy(summary_buf, tmp, len);
14801485
summary_buf[len] = 0;
1486+
free(reencoded);
14811487
}
14821488

14831489
/*

commit.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ enum cmit_fmt {
6565

6666
extern int non_ascii(int);
6767
struct rev_info; /* in revision.h, it circularly uses enum cmit_fmt */
68+
extern char *reencode_commit_message(const struct commit *commit,
69+
const char **encoding_p);
6870
extern void get_commit_format(const char *arg, struct rev_info *);
6971
extern void format_commit_message(const struct commit *commit,
7072
const void *format, struct strbuf *sb,

pretty.c

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,20 @@ void pp_remainder(enum cmit_fmt fmt,
783783
}
784784
}
785785

786+
/*
 * Re-encode the message of `commit` into the configured output encoding.
 *
 * The target encoding is git_log_output_encoding when that is set,
 * otherwise git_commit_encoding, otherwise the literal "utf-8".
 *
 * commit:     the commit whose message is handed to logmsg_reencode().
 * encoding_p: optional out-parameter; when non-NULL it receives the
 *             encoding name that was selected above.
 *
 * Returns whatever logmsg_reencode() returns for the selected encoding.
 * The callers visible in this change treat a NULL return as "use
 * commit->buffer unchanged" and free() a non-NULL result when done.
 */
char *reencode_commit_message(const struct commit *commit, const char **encoding_p)
787+
{
788+
const char *encoding;
789+
790+
encoding = (git_log_output_encoding
791+
? git_log_output_encoding
792+
: git_commit_encoding);
793+
if (!encoding)
794+
encoding = "utf-8";
795+
/* Report the chosen encoding only to callers that asked for it. */
if (encoding_p)
796+
*encoding_p = encoding;
797+
return logmsg_reencode(commit, encoding);
798+
}
799+
786800
void pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit,
787801
struct strbuf *sb, int abbrev,
788802
const char *subject, const char *after_subject,
@@ -799,12 +813,7 @@ void pretty_print_commit(enum cmit_fmt fmt, const struct commit *commit,
799813
return;
800814
}
801815

802-
encoding = (git_log_output_encoding
803-
? git_log_output_encoding
804-
: git_commit_encoding);
805-
if (!encoding)
806-
encoding = "utf-8";
807-
reencoded = logmsg_reencode(commit, encoding);
816+
reencoded = reencode_commit_message(commit, &encoding);
808817
if (reencoded) {
809818
msg = reencoded;
810819
}

t/t8005-blame-i18n.sh

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#!/bin/sh
2+
3+
test_description='git blame encoding conversion'
4+
# Pull in the shared test harness before using any test_* helper.
. ./test-lib.sh
5+
6+
# Source the fixtures. Each file defines a <ENC>_NAME and a <ENC>_MSG shell
# variable (UTF8_*, CP1251_*, SJIS_*) that the tests below use as the author
# name and commit message.
# NOTE(review): the cp1251 and sjis fixtures presumably hold the same text as
# the utf8 one, stored in the named encoding -- confirm against the raw files.
. "$TEST_DIRECTORY"/t8005/utf8.txt
7+
. "$TEST_DIRECTORY"/t8005/cp1251.txt
8+
. "$TEST_DIRECTORY"/t8005/sjis.txt
9+
10+
# Build a three-commit history of `file`, appending one line per commit:
#   1. committed before this script touches i18n.commitencoding,
#      using the UTF8_* fixture values;
#   2. committed after setting i18n.commitencoding to cp1251,
#      using the CP1251_* fixture values;
#   3. committed after setting i18n.commitencoding to shift-jis,
#      using the SJIS_* fixture values.
# i18n.commitencoding is left at shift-jis when this test finishes, which the
# following tests rely on.
test_expect_success 'setup the repository' '
11+
# Create the file
12+
echo "UTF-8 LINE" > file &&
13+
git add file &&
14+
git commit --author "$UTF8_NAME <utf8@localhost>" -m "$UTF8_MSG" &&
15+
16+
echo "CP1251 LINE" >> file &&
17+
git add file &&
18+
git config i18n.commitencoding cp1251 &&
19+
git commit --author "$CP1251_NAME <cp1251@localhost>" -m "$CP1251_MSG" &&
20+
21+
echo "SJIS LINE" >> file &&
22+
git add file &&
23+
git config i18n.commitencoding shift-jis &&
24+
git commit --author "$SJIS_NAME <sjis@localhost>" -m "$SJIS_MSG"
25+
'
26+
27+
# Expected output when only i18n.commitencoding (shift-jis, left over from the
# setup test) is configured: the author and summary of all three commits are
# reported with the SJIS fixture values.
#
# NOTE(review): the grep pattern in this and the later tests uses backslash-pipe
# alternation inside a basic regular expression; that is an extension rather
# than POSIX BRE, so it may not match on every grep -- confirm portability.
# NOTE(review): the pipeline status is that of grep, so a failing git blame
# on its own would not fail the test -- confirm this is acceptable.
cat >expected <<EOF
28+
author $SJIS_NAME
29+
summary $SJIS_MSG
30+
author $SJIS_NAME
31+
summary $SJIS_MSG
32+
author $SJIS_NAME
33+
summary $SJIS_MSG
34+
EOF
35+
36+
test_expect_success \
37+
'blame respects i18n.commitencoding' '
38+
git blame --incremental file | \
39+
grep "^\(author\|summary\) " > actual &&
40+
test_cmp actual expected
41+
'
42+
43+
# Expected output once i18n.logoutputencoding is set to cp1251: the author and
# summary of all three commits are reported with the CP1251 fixture values,
# even though i18n.commitencoding is still shift-jis from the setup test.
# The test body sets i18n.logoutputencoding and does not reset it, so the
# setting stays in effect for the tests that follow.
cat >expected <<EOF
44+
author $CP1251_NAME
45+
summary $CP1251_MSG
46+
author $CP1251_NAME
47+
summary $CP1251_MSG
48+
author $CP1251_NAME
49+
summary $CP1251_MSG
50+
EOF
51+
52+
test_expect_success \
53+
'blame respects i18n.logoutputencoding' '
54+
git config i18n.logoutputencoding cp1251 &&
55+
git blame --incremental file | \
56+
grep "^\(author\|summary\) " > actual &&
57+
test_cmp actual expected
58+
'
59+
60+
# Expected output with --encoding=utf-8 on the command line: the author and
# summary of all three commits are reported with the UTF8 fixture values.
# The i18n.commitencoding and i18n.logoutputencoding settings made by earlier
# tests are still configured here, so this checks that the command-line option
# takes precedence over them.
cat >expected <<EOF
61+
author $UTF8_NAME
62+
summary $UTF8_MSG
63+
author $UTF8_NAME
64+
summary $UTF8_MSG
65+
author $UTF8_NAME
66+
summary $UTF8_MSG
67+
EOF
68+
69+
test_expect_success \
70+
'blame respects --encoding=utf-8' '
71+
git blame --incremental --encoding=utf-8 file | \
72+
grep "^\(author\|summary\) " > actual &&
73+
test_cmp actual expected
74+
'
75+
76+
# Expected output with --encoding=none: no conversion is applied, so each
# commit is reported with the fixture values it was created with in the setup
# test. The expected order is the SJIS commit, then the CP1251 commit, then
# the UTF8 commit.
cat >expected <<EOF
77+
author $SJIS_NAME
78+
summary $SJIS_MSG
79+
author $CP1251_NAME
80+
summary $CP1251_MSG
81+
author $UTF8_NAME
82+
summary $UTF8_MSG
83+
EOF
84+
85+
test_expect_success \
86+
'blame respects --encoding=none' '
87+
git blame --incremental --encoding=none file | \
88+
grep "^\(author\|summary\) " > actual &&
89+
test_cmp actual expected
90+
'
91+
92+
# Finish the test script; must come after the last test case.
test_done

t/t8005/cp1251.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
CP1251_NAME="���� �������� �������"
2+
CP1251_MSG="�������� ���������"

t/t8005/sjis.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
SJIS_NAME="�I�r�p�~ �P�u�������r�y�� �R�y�t�������r"
2+
SJIS_MSG="�S�u�������r���u �������q���u�~�y�u"

t/t8005/utf8.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
UTF8_NAME="Иван Петрович Сидоров"
2+
UTF8_MSG="Тестовое сообщение"

0 commit comments

Comments
 (0)