Skip to content

Commit a26345b

Browse files
committed
grep: optimize built-in grep by skipping lines that do not hit
The internal "grep" engine we use checks for hits line-by-line, instead of letting the underlying regexec()/fixmatch() routines scan for the first match from the rest of the buffer. This was a major source of overhead compared to the external grep. Introduce a "look-ahead" mechanism to find the next line that would potentially match by using regexec()/fixmatch() in the remainder of the text to skip unmatching lines, and use it when the query criteria is simple enough (i.e. punt for an advanced grep boolean expression like "lines that have both X and Y but not Z" for now) and we are not running under "-v" (aka "--invert-match") option. Note that "-L" (aka "--files-without-match") is not a reason to disable this optimization. Under the option, we are interested if the file has any hit at all, and that is what we determine reliably with or without the optimization. Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent cb57220 commit a26345b

File tree

1 file changed

+75
-0
lines changed

1 file changed

+75
-0
lines changed

grep.c

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,65 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf,
608608
}
609609
}
610610

611+
static int should_lookahead(struct grep_opt *opt)
612+
{
613+
struct grep_pat *p;
614+
615+
if (opt->extended)
616+
return 0; /* punt for too complex stuff */
617+
if (opt->invert)
618+
return 0;
619+
for (p = opt->pattern_list; p; p = p->next) {
620+
if (p->token != GREP_PATTERN)
621+
return 0; /* punt for "header only" and stuff */
622+
}
623+
return 1;
624+
}
625+
626+
static int look_ahead(struct grep_opt *opt,
627+
unsigned long *left_p,
628+
unsigned *lno_p,
629+
char **bol_p)
630+
{
631+
unsigned lno = *lno_p;
632+
char *bol = *bol_p;
633+
struct grep_pat *p;
634+
char *sp, *last_bol;
635+
regoff_t earliest = -1;
636+
637+
for (p = opt->pattern_list; p; p = p->next) {
638+
int hit;
639+
regmatch_t m;
640+
641+
if (p->fixed)
642+
hit = !fixmatch(p->pattern, bol, &m);
643+
else
644+
hit = !regexec(&p->regexp, bol, 1, &m, 0);
645+
if (!hit || m.rm_so < 0 || m.rm_eo < 0)
646+
continue;
647+
if (earliest < 0 || m.rm_so < earliest)
648+
earliest = m.rm_so;
649+
}
650+
651+
if (earliest < 0) {
652+
*bol_p = bol + *left_p;
653+
*left_p = 0;
654+
return 1;
655+
}
656+
for (sp = bol + earliest; bol < sp && sp[-1] != '\n'; sp--)
657+
; /* find the beginning of the line */
658+
last_bol = sp;
659+
660+
for (sp = bol; sp < last_bol; sp++) {
661+
if (*sp == '\n')
662+
lno++;
663+
}
664+
*left_p -= last_bol - bol;
665+
*bol_p = last_bol;
666+
*lno_p = lno;
667+
return 0;
668+
}
669+
611670
static int grep_buffer_1(struct grep_opt *opt, const char *name,
612671
char *buf, unsigned long size, int collect_hits)
613672
{
@@ -617,6 +676,7 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
617676
unsigned last_hit = 0;
618677
int binary_match_only = 0;
619678
unsigned count = 0;
679+
int try_lookahead = 0;
620680
enum grep_context ctx = GREP_CONTEXT_HEAD;
621681
xdemitconf_t xecfg;
622682

@@ -645,11 +705,26 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
645705
opt->priv = &xecfg;
646706
}
647707
}
708+
try_lookahead = should_lookahead(opt);
648709

649710
while (left) {
650711
char *eol, ch;
651712
int hit;
652713

714+
/*
715+
* look_ahead() skips quickly to the line that possibly
716+
* has the next hit; don't call it if we need to do
717+
* something more than just skipping the current line
718+
* in response to an unmatch for the current line. E.g.
719+
* inside a post-context window, we will show the current
720+
* line as a context around the previous hit when it
721+
* doesn't hit.
722+
*/
723+
if (try_lookahead
724+
&& !(last_hit
725+
&& lno <= last_hit + opt->post_context)
726+
&& look_ahead(opt, &left, &lno, &bol))
727+
break;
653728
eol = end_of_line(bol, &left);
654729
ch = *eol;
655730
*eol = 0;

0 commit comments

Comments
 (0)