@@ -17,7 +17,7 @@ import javascript
1717/**
1818 * A configuration for which parts of a regular expression should be considered relevant for
1919 * the different predicates in `ReDoS.qll`.
20- * Used to adjust the computations for either superliniear or exponential backtracking.
20+ * Used to adjust the computations for either superlinear or exponential backtracking.
2121 */
2222abstract class ReDoSConfiguration extends string {
2323 bindingset [ this ]
@@ -124,12 +124,12 @@ private class RegexpCharacterConstant extends RegExpConstant {
124124}
125125
126126/**
127- * Holds if `term` is the chosen cannonical representative for all terms with string representation `str`.
127+ * Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
128128 *
129- * Using cannonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
129+ * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
130130 * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
131131 */
132- private predicate isCannonicalTerm ( RegExpTerm term , string str ) {
132+ private predicate isCanonicalTerm ( RegExpTerm term , string str ) {
133133 term =
134134 rank [ 1 ] ( RegExpTerm t , Location loc , File file |
135135 loc = t .getLocation ( ) and
@@ -154,7 +154,7 @@ private newtype TInputSymbol =
154154 */
155155 CharClass ( string charClassString ) {
156156 exists ( RegExpTerm term | term .getRawValue ( ) = charClassString | getRoot ( term ) .isRelevant ( ) ) and
157- exists ( RegExpTerm recc | isCannonicalTerm ( recc , charClassString ) |
157+ exists ( RegExpTerm recc | isCanonicalTerm ( recc , charClassString ) |
158158 recc instanceof RegExpCharacterClass and
159159 not recc .( RegExpCharacterClass ) .isUniversalClass ( )
160160 or
@@ -168,6 +168,13 @@ private newtype TInputSymbol =
168168 /** An epsilon transition in the automaton. */
169169 Epsilon ( )
170170
171+ /**
172+ * Gets the canonical CharClass for `term`.
173+ */
174+ CharClass getCanonicalCharClass ( RegExpTerm term ) {
175+ exists ( string str | isCanonicalTerm ( term , str ) | result = CharClass ( str ) )
176+ }
177+
171178/**
172179 * Holds if `a` and `b` are input symbols from the same regexp.
173180 * (And not a `Dot()`, `Any()` or `Epsilon()`)
@@ -251,10 +258,7 @@ private module CharacterClasses {
251258 */
252259 pragma [ noinline]
253260 predicate hasChildThatMatches ( RegExpCharacterClass cc , string char ) {
254- exists ( string str |
255- isCannonicalTerm ( cc , str ) and
256- exists ( CharClass ( str ) )
257- ) and
261+ exists ( getCanonicalCharClass ( cc ) ) and
258262 exists ( RegExpTerm child | child = cc .getAChild ( ) |
259263 char = child .( RegexpCharacterConstant ) .getValue ( )
260264 or
@@ -350,9 +354,7 @@ private module CharacterClasses {
350354 private class PositiveCharacterClass extends CharacterClass {
351355 RegExpCharacterClass cc ;
352356
353- PositiveCharacterClass ( ) {
354- exists ( string str | isCannonicalTerm ( cc , str ) | this = CharClass ( str ) and not cc .isInverted ( ) )
355- }
357+ PositiveCharacterClass ( ) { this = getCanonicalCharClass ( cc ) and not cc .isInverted ( ) }
356358
357359 override string getARelevantChar ( ) { result = getAMentionedChar ( cc ) }
358360
@@ -365,9 +367,7 @@ private module CharacterClasses {
365367 private class InvertedCharacterClass extends CharacterClass {
366368 RegExpCharacterClass cc ;
367369
368- InvertedCharacterClass ( ) {
369- exists ( string str | isCannonicalTerm ( cc , str ) | this = CharClass ( str ) and cc .isInverted ( ) )
370- }
370+ InvertedCharacterClass ( ) { this = getCanonicalCharClass ( cc ) and cc .isInverted ( ) }
371371
372372 override string getARelevantChar ( ) {
373373 result = nextChar ( getAMentionedChar ( cc ) ) or
@@ -405,9 +405,7 @@ private module CharacterClasses {
405405 RegExpCharacterClassEscape cc ;
406406
407407 PositiveCharacterClassEscape ( ) {
408- exists ( string str | isCannonicalTerm ( cc , str ) |
409- this = CharClass ( str ) and cc .getValue ( ) = [ "d" , "s" , "w" ]
410- )
408+ this = getCanonicalCharClass ( cc ) and cc .getValue ( ) = [ "d" , "s" , "w" ]
411409 }
412410
413411 override string getARelevantChar ( ) {
@@ -442,9 +440,7 @@ private module CharacterClasses {
442440 RegExpCharacterClassEscape cc ;
443441
444442 NegativeCharacterClassEscape ( ) {
445- exists ( string str | isCannonicalTerm ( cc , str ) |
446- this = CharClass ( str ) and cc .getValue ( ) = [ "D" , "S" , "W" ]
447- )
443+ this = getCanonicalCharClass ( cc ) and cc .getValue ( ) = [ "D" , "S" , "W" ]
448444 }
449445
450446 override string getARelevantChar ( ) {
@@ -802,14 +798,14 @@ private module PrefixConstruction {
802798 result = prefix ( prev ) and delta ( prev , Epsilon ( ) , state )
803799 or
804800 not delta ( prev , Epsilon ( ) , state ) and
805- result = prefix ( prev ) + getMinimumEdgeChar ( prev , state )
801+ result = prefix ( prev ) + getCanonicalEdgeChar ( prev , state )
806802 )
807803 }
808804
809805 /**
810- * Gets the minimum char for which there exists a transition from `prev` to `next` in the NFA.
806+ * Gets a canonical char for which there exists a transition from `prev` to `next` in the NFA.
811807 */
812- private string getMinimumEdgeChar ( State prev , State next ) {
808+ private string getCanonicalEdgeChar ( State prev , State next ) {
813809 result =
814810 min ( string c | delta ( prev , any ( InputSymbol symbol | c = intersect ( Any ( ) , symbol ) ) , next ) )
815811 }
@@ -882,6 +878,7 @@ private module SuffixConstruction {
882878 // all edges (at least one) with some char leads to another state that is rejectable.
883879 // the `next` states might not share a common suffix, which can cause FPs.
884880 exists ( string char | char = hasEdgeToLikelyRejectableHelper ( s ) |
881+ // noopt to force `hasEdgeToLikelyRejectableHelper` to be first in the join-order.
885882 exists ( State next | deltaClosedChar ( s , char , next ) | isLikelyRejectable ( next ) ) and
886883 forall ( State next | deltaClosedChar ( s , char , next ) | isLikelyRejectable ( next ) )
887884 )
@@ -935,7 +932,7 @@ private module SuffixConstruction {
935932 }
936933
937934 /**
938- * Holds if there is not edge from `s` labeled with "|", "\n", or "Z" in our NFA.
935+ * Holds if there is no edge from `s` labeled with "|", "\n", or "Z" in our NFA.
939936 * This predicate is used as a cheap pre-processing to speed up `hasRejectEdge`.
940937 */
941938 private predicate hasSimpleRejectEdge ( State s ) {
@@ -953,6 +950,8 @@ private module SuffixConstruction {
953950 exists ( string char , InputSymbol sym |
954951 char = w .charAt ( i ) and
955952 deltaClosed ( prev , sym , result ) and
953+ // noopt to prevent joining `prev` with all possible `chars` that could transition away from `prev`.
954+ // Instead only join with the set of `chars` where a relevant `InputSymbol` has already been found.
956955 sym = getAProcessInputSymbol ( char )
957956 )
958957 )
@@ -1015,8 +1014,8 @@ private string rotate(string str, int i) {
10151014}
10161015
10171016/**
1018- * Holds if `term` may cause superliniear backtracking on strings containing many repetitions of `pump`.
1019- * Gets the minimum possible string that causes superliniear backtracking.
1017+ * Holds if `term` may cause superlinear backtracking on strings containing many repetitions of `pump`.
1018+ * Gets the shortest string that causes superlinear backtracking.
10201019 */
10211020private predicate isReDoSAttackable ( RegExpTerm term , string pump , State s ) {
10221021 exists ( int i , string c | s = Match ( term , i ) |
0 commit comments