Skip to content

Commit 4457018

Browse files
avargitster
authored andcommitted
grep: don't use PCRE2?_UTF8 with "log --encoding=<non-utf8>"
Fix a bug introduced in 18547aa ("grep/pcre: support utf-8", 2016-06-25) that was missed due to a blindspot in our tests, as discussed in the previous commit. I then blindly copied the same bug in 94da919 ("grep: add support for PCRE v2", 2017-06-01) when adding the PCRE v2 code. We should not tell PCRE that we're processing UTF-8 just because we're dealing with non-ASCII. In the case of e.g. "log --encoding=<...>" under is_utf8_locale() the haystack might be in ISO-8859-1, and the needle might be in a non-UTF-8 encoding. Maybe we should be more strict here and die earlier? Should we also be converting the needle to the encoding in question, and failing if it's not a string that's valid in that encoding? Maybe. But for now matching this as non-UTF8 at least has some hope of producing sensible results, since we know that our default heuristic of assuming the text to be matched is in the user locale encoding isn't true when we've explicitly encoded it to be in a different encoding. Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent 4e2443b commit 4457018

File tree

4 files changed

+10
-8
lines changed

4 files changed

+10
-8
lines changed

grep.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -388,11 +388,11 @@ static void compile_pcre1_regexp(struct grep_pat *p, const struct grep_opt *opt)
388388
int options = PCRE_MULTILINE;
389389

390390
if (opt->ignore_case) {
391-
if (has_non_ascii(p->pattern))
391+
if (!opt->ignore_locale && has_non_ascii(p->pattern))
392392
p->pcre1_tables = pcre_maketables();
393393
options |= PCRE_CASELESS;
394394
}
395-
if (is_utf8_locale() && has_non_ascii(p->pattern))
395+
if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern))
396396
options |= PCRE_UTF8;
397397

398398
p->pcre1_regexp = pcre_compile(p->pattern, options, &error, &erroffset,
@@ -498,14 +498,14 @@ static void compile_pcre2_pattern(struct grep_pat *p, const struct grep_opt *opt
498498
p->pcre2_compile_context = NULL;
499499

500500
if (opt->ignore_case) {
501-
if (has_non_ascii(p->pattern)) {
501+
if (!opt->ignore_locale && has_non_ascii(p->pattern)) {
502502
character_tables = pcre2_maketables(NULL);
503503
p->pcre2_compile_context = pcre2_compile_context_create(NULL);
504504
pcre2_set_character_tables(p->pcre2_compile_context, character_tables);
505505
}
506506
options |= PCRE2_CASELESS;
507507
}
508-
if (is_utf8_locale() && has_non_ascii(p->pattern))
508+
if (!opt->ignore_locale && is_utf8_locale() && has_non_ascii(p->pattern))
509509
options |= PCRE2_UTF;
510510

511511
p->pcre2_pattern = pcre2_compile((PCRE2_SPTR)p->pattern,

grep.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ struct grep_opt {
173173
int funcbody;
174174
int extended_regexp_option;
175175
int pattern_type_option;
176+
int ignore_locale;
176177
char colors[NR_GREP_COLORS][COLOR_MAXLEN];
177178
unsigned pre_context;
178179
unsigned post_context;

revision.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "commit-graph.h"
2929
#include "prio-queue.h"
3030
#include "hashmap.h"
31+
#include "utf8.h"
3132

3233
volatile show_early_output_fn_t show_early_output;
3334

@@ -2655,6 +2656,8 @@ int setup_revisions(int argc, const char **argv, struct rev_info *revs, struct s
26552656

26562657
grep_commit_pattern_type(GREP_PATTERN_TYPE_UNSPECIFIED,
26572658
&revs->grep_filter);
2659+
if (!is_encoding_utf8(get_log_output_encoding()))
2660+
revs->grep_filter.ignore_locale = 1;
26582661
compile_grep_patterns(&revs->grep_filter);
26592662

26602663
if (revs->reverse && revs->reflog_info)

t/t4210-log-i18n.sh

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,8 @@ test_expect_success 'log --grep does not find non-reencoded values (latin1)' '
5959
for engine in fixed basic extended perl
6060
do
6161
prereq=
62-
result=success
6362
if test $engine = "perl"
6463
then
65-
result=failure
6664
prereq="PCRE"
6765
else
6866
prereq=""
@@ -72,7 +70,7 @@ do
7270
then
7371
force_regex=.*
7472
fi
75-
test_expect_$result GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" "
73+
test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not find non-reencoded values (latin1 + locale)" "
7674
cat >expect <<-\EOF &&
7775
latin1
7876
utf8
@@ -86,7 +84,7 @@ do
8684
test_must_be_empty actual
8785
"
8886

89-
test_expect_$result GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not die on invalid UTF-8 value (latin1 + locale + invalid needle)" "
87+
test_expect_success GETTEXT_LOCALE,$prereq "-c grep.patternType=$engine log --grep does not die on invalid UTF-8 value (latin1 + locale + invalid needle)" "
9088
LC_ALL=\"$is_IS_locale\" git -c grep.patternType=$engine log --encoding=ISO-8859-1 --format=%s --grep=\"$force_regex$invalid_e\" >actual &&
9189
test_must_be_empty actual
9290
"

0 commit comments

Comments
 (0)