3434
3535/*
3636 * longest - longest-preferred matching engine
37+ *
38+ * On success, returns match endpoint address. Returns NULL on no match.
39+ * Internal errors also return NULL, with v->err set.
3740 */
38- static chr * /* endpoint, or NULL */
39- longest (struct vars * v , /* used only for debug and exec flags */
41+ static chr *
42+ longest (struct vars * v ,
4043 struct dfa * d ,
4144 chr * start , /* where the match should start */
4245 chr * stop , /* match must end at or before here */
@@ -51,11 +54,15 @@ longest(struct vars * v, /* used only for debug and exec flags */
5154 int i ;
5255 struct colormap * cm = d -> cm ;
5356
57+ /* prevent "uninitialized variable" warnings */
58+ if (hitstopp != NULL )
59+ * hitstopp = 0 ;
60+
5461 /* initialize */
5562 css = initialize (v , d , start );
63+ if (css == NULL )
64+ return NULL ;
5665 cp = start ;
57- if (hitstopp != NULL )
58- * hitstopp = 0 ;
5966
6067 /* startup */
6168 FDEBUG (("+++ startup +++\n" ));
@@ -74,8 +81,14 @@ longest(struct vars * v, /* used only for debug and exec flags */
7481 return NULL ;
7582 css -> lastseen = cp ;
7683
77- /* main loop */
84+ /*
85+ * This is the main text-scanning loop. It seems worth having two copies
86+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
87+ * builds, when you're not actively tracing.
88+ */
89+ #ifdef REG_DEBUG
7890 if (v -> eflags & REG_FTRACE )
91+ {
7992 while (cp < realstop )
8093 {
8194 FDEBUG (("+++ at c%d +++\n" , (int ) (css - d -> ssets )));
@@ -92,7 +105,10 @@ longest(struct vars * v, /* used only for debug and exec flags */
92105 ss -> lastseen = cp ;
93106 css = ss ;
94107 }
108+ }
95109 else
110+ #endif
111+ {
96112 while (cp < realstop )
97113 {
98114 co = GETCOLOR (cm , * cp );
@@ -107,6 +123,10 @@ longest(struct vars * v, /* used only for debug and exec flags */
107123 ss -> lastseen = cp ;
108124 css = ss ;
109125 }
126+ }
127+
128+ if (ISERR ())
129+ return NULL ;
110130
111131 /* shutdown */
112132 FDEBUG (("+++ shutdown at c%d +++\n" , (int ) (css - d -> ssets )));
@@ -117,6 +137,8 @@ longest(struct vars * v, /* used only for debug and exec flags */
117137 co = d -> cnfa -> eos [(v -> eflags & REG_NOTEOL ) ? 0 : 1 ];
118138 FDEBUG (("color %ld\n" , (long ) co ));
119139 ss = miss (v , d , css , co , cp , start );
140+ if (ISERR ())
141+ return NULL ;
120142 /* special case: match ended at eol? */
121143 if (ss != NULL && (ss -> flags & POSTSTATE ))
122144 return cp ;
@@ -138,14 +160,17 @@ longest(struct vars * v, /* used only for debug and exec flags */
138160
139161/*
140162 * shortest - shortest-preferred matching engine
163+ *
164+ * On success, returns match endpoint address. Returns NULL on no match.
165+ * Internal errors also return NULL, with v->err set.
141166 */
142- static chr * /* endpoint, or NULL */
167+ static chr *
143168shortest (struct vars * v ,
144169 struct dfa * d ,
145170 chr * start , /* where the match should start */
146171 chr * min , /* match must end at or after here */
147172 chr * max , /* match must end at or before here */
148- chr * * coldp , /* store coldstart pointer here, if nonNULL */
173+ chr * * coldp , /* store coldstart pointer here, if non-NULL */
149174 int * hitstopp ) /* record whether hit v->stop, if non-NULL */
150175{
151176 chr * cp ;
@@ -156,11 +181,17 @@ shortest(struct vars * v,
156181 struct sset * ss ;
157182 struct colormap * cm = d -> cm ;
158183
184+ /* prevent "uninitialized variable" warnings */
185+ if (coldp != NULL )
186+ * coldp = NULL ;
187+ if (hitstopp != NULL )
188+ * hitstopp = 0 ;
189+
159190 /* initialize */
160191 css = initialize (v , d , start );
192+ if (css == NULL )
193+ return NULL ;
161194 cp = start ;
162- if (hitstopp != NULL )
163- * hitstopp = 0 ;
164195
165196 /* startup */
166197 FDEBUG (("--- startup ---\n" ));
@@ -180,8 +211,14 @@ shortest(struct vars * v,
180211 css -> lastseen = cp ;
181212 ss = css ;
182213
183- /* main loop */
214+ /*
215+ * This is the main text-scanning loop. It seems worth having two copies
216+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
217+ * builds, when you're not actively tracing.
218+ */
219+ #ifdef REG_DEBUG
184220 if (v -> eflags & REG_FTRACE )
221+ {
185222 while (cp < realmax )
186223 {
187224 FDEBUG (("--- at c%d ---\n" , (int ) (css - d -> ssets )));
@@ -200,7 +237,10 @@ shortest(struct vars * v,
200237 if ((ss -> flags & POSTSTATE ) && cp >= realmin )
201238 break ; /* NOTE BREAK OUT */
202239 }
240+ }
203241 else
242+ #endif
243+ {
204244 while (cp < realmax )
205245 {
206246 co = GETCOLOR (cm , * cp );
@@ -217,6 +257,7 @@ shortest(struct vars * v,
217257 if ((ss -> flags & POSTSTATE ) && cp >= realmin )
218258 break ; /* NOTE BREAK OUT */
219259 }
260+ }
220261
221262 if (ss == NULL )
222263 return NULL ;
@@ -390,7 +431,7 @@ hash(unsigned *uv,
390431 * initialize - hand-craft a cache entry for startup, otherwise get ready
391432 */
392433static struct sset *
393- initialize (struct vars * v , /* used only for debug flags */
434+ initialize (struct vars * v ,
394435 struct dfa * d ,
395436 chr * start )
396437{
@@ -403,6 +444,8 @@ initialize(struct vars * v, /* used only for debug flags */
403444 else
404445 { /* no, must (re)build it */
405446 ss = getvacant (v , d , start , start );
447+ if (ss == NULL )
448+ return NULL ;
406449 for (i = 0 ; i < d -> wordsper ; i ++ )
407450 ss -> states [i ] = 0 ;
408451 BSET (ss -> states , d -> cnfa -> pre );
@@ -421,10 +464,20 @@ initialize(struct vars * v, /* used only for debug flags */
421464}
422465
423466/*
424- * miss - handle a cache miss
467+ * miss - handle a stateset cache miss
468+ *
469+ * css is the current stateset, co is the color of the current input character,
470+ * cp points to the character after that (which is where we may need to test
471+ * LACONs). start does not affect matching behavior but is needed for pickss'
472+ * heuristics about which stateset cache entry to replace.
473+ *
474+ * Ordinarily, returns the address of the next stateset (the one that is
475+ * valid after consuming the input character). Returns NULL if no valid
476+ * NFA states remain, ie we have a certain match failure.
477+ * Internal errors also return NULL, with v->err set.
425478 */
426- static struct sset * /* NULL if goes to empty set */
427- miss (struct vars * v , /* used only for debug flags */
479+ static struct sset *
480+ miss (struct vars * v ,
428481 struct dfa * d ,
429482 struct sset * css ,
430483 pcolor co ,
@@ -450,9 +503,23 @@ miss(struct vars * v, /* used only for debug flags */
450503 }
451504 FDEBUG (("miss\n" ));
452505
453- /* first, what set of states would we end up in? */
506+ /*
507+ * Checking for operation cancel in the inner text search loop seems
508+ * unduly expensive. As a compromise, check during cache misses.
509+ */
510+ if (CANCEL_REQUESTED (v -> re ))
511+ {
512+ ERR (REG_CANCEL );
513+ return NULL ;
514+ }
515+
516+ /*
517+ * What set of states would we end up in after consuming the co character?
518+ * We first consider PLAIN arcs that consume the character, and then look
519+ * to see what LACON arcs could be traversed after consuming it.
520+ */
454521 for (i = 0 ; i < d -> wordsper ; i ++ )
455- d -> work [i ] = 0 ;
522+ d -> work [i ] = 0 ; /* build new stateset bitmap in d->work */
456523 ispost = 0 ;
457524 noprogress = 1 ;
458525 gotstate = 0 ;
@@ -469,22 +536,31 @@ miss(struct vars * v, /* used only for debug flags */
469536 noprogress = 0 ;
470537 FDEBUG (("%d -> %d\n" , i , ca -> to ));
471538 }
472- dolacons = (gotstate ) ? (cnfa -> flags & HASLACONS ) : 0 ;
539+ if (!gotstate )
540+ return NULL ; /* character cannot reach any new state */
541+ dolacons = (cnfa -> flags & HASLACONS );
473542 sawlacons = 0 ;
543+ /* outer loop handles transitive closure of reachable-by-LACON states */
474544 while (dolacons )
475- { /* transitive closure */
545+ {
476546 dolacons = 0 ;
477547 for (i = 0 ; i < d -> nstates ; i ++ )
478548 if (ISBSET (d -> work , i ))
479549 for (ca = cnfa -> states [i ]; ca -> co != COLORLESS ; ca ++ )
480550 {
481551 if (ca -> co < cnfa -> ncolors )
482- continue ; /* NOTE CONTINUE */
483- sawlacons = 1 ;
552+ continue ; /* not a LACON arc */
484553 if (ISBSET (d -> work , ca -> to ))
485- continue ; /* NOTE CONTINUE */
554+ continue ; /* arc would be a no-op anyway */
555+ sawlacons = 1 ; /* this LACON affects our result */
486556 if (!lacon (v , cnfa , cp , ca -> co ))
487- continue ; /* NOTE CONTINUE */
557+ {
558+ if (ISERR ())
559+ return NULL ;
560+ continue ; /* LACON arc cannot be traversed */
561+ }
562+ if (ISERR ())
563+ return NULL ;
488564 BSET (d -> work , ca -> to );
489565 dolacons = 1 ;
490566 if (ca -> to == cnfa -> post )
@@ -494,11 +570,9 @@ miss(struct vars * v, /* used only for debug flags */
494570 FDEBUG (("%d :> %d\n" , i , ca -> to ));
495571 }
496572 }
497- if (!gotstate )
498- return NULL ;
499573 h = HASH (d -> work , d -> wordsper );
500574
501- /* next, is that in the cache? */
575+ /* Is this stateset already in the cache? */
502576 for (p = d -> ssets , i = d -> nssused ; i > 0 ; p ++ , i -- )
503577 if (HIT (h , d -> work , p , d -> wordsper ))
504578 {
@@ -508,6 +582,8 @@ miss(struct vars * v, /* used only for debug flags */
508582 if (i == 0 )
509583 { /* nope, need a new cache entry */
510584 p = getvacant (v , d , cp , start );
585+ if (p == NULL )
586+ return NULL ;
511587 assert (p != css );
512588 for (i = 0 ; i < d -> wordsper ; i ++ )
513589 p -> states [i ] = d -> work [i ];
@@ -518,8 +594,15 @@ miss(struct vars * v, /* used only for debug flags */
518594 /* lastseen to be dealt with by caller */
519595 }
520596
597+ /*
598+ * Link new stateset to old, unless a LACON affected the result, in which
599+ * case we don't create the link. That forces future transitions across
600+ * this same arc (same prior stateset and character color) to come through
601+ * miss() again, so that we can recheck the LACON(s), which might or might
602+ * not pass since context will be different.
603+ */
521604 if (!sawlacons )
522- { /* lookahead conds. always cache miss */
605+ {
523606 FDEBUG (("c%d[%d]->c%d\n" ,
524607 (int ) (css - d -> ssets ), co , (int ) (p - d -> ssets )));
525608 css -> outs [co ] = p ;
@@ -563,11 +646,12 @@ lacon(struct vars * v,
563646
564647/*
565648 * getvacant - get a vacant state set
649+ *
566650 * This routine clears out the inarcs and outarcs, but does not otherwise
567651 * clear the innards of the state set -- that's up to the caller.
568652 */
569653static struct sset *
570- getvacant (struct vars * v , /* used only for debug flags */
654+ getvacant (struct vars * v ,
571655 struct dfa * d ,
572656 chr * cp ,
573657 chr * start )
@@ -579,6 +663,8 @@ getvacant(struct vars * v, /* used only for debug flags */
579663 color co ;
580664
581665 ss = pickss (v , d , cp , start );
666+ if (ss == NULL )
667+ return NULL ;
582668 assert (!(ss -> flags & LOCKED ));
583669
584670 /* clear out its inarcs, including self-referential ones */
@@ -636,7 +722,7 @@ getvacant(struct vars * v, /* used only for debug flags */
636722 * pickss - pick the next stateset to be used
637723 */
638724static struct sset *
639- pickss (struct vars * v , /* used only for debug flags */
725+ pickss (struct vars * v ,
640726 struct dfa * d ,
641727 chr * cp ,
642728 chr * start )
@@ -692,7 +778,6 @@ pickss(struct vars * v, /* used only for debug flags */
692778
693779 /* nobody's old enough?!? -- something's really wrong */
694780 FDEBUG (("cannot find victim to replace!\n" ));
695- assert (NOTREACHED );
696781 ERR (REG_ASSERT );
697- return d -> ssets ;
782+ return NULL ;
698783}
0 commit comments