-
Notifications
You must be signed in to change notification settings - Fork 2k
Expand file tree
/
Copy pathNfaUtils.qll
More file actions
1386 lines (1249 loc) · 44.9 KB
/
NfaUtils.qll
File metadata and controls
1386 lines (1249 loc) · 44.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* A shared library for creating and reasoning about NFAs.
*/
overlay[local?]
module;
private import codeql.regex.RegexTreeView
private import codeql.util.Numbers
private import codeql.util.Strings
/**
* Classes and predicates that create an NFA and various algorithms for working with it.
*/
module Make<RegexTreeViewSig TreeImpl> {
private import TreeImpl
/**
 * Gets the character that directly follows `c` in the simplified ASCII table
 * given by `asciiPrintable`.
 */
private string nextChar(string c) {
  asciiPrintable(result) = asciiPrintable(c) + 1
}
/**
 * Holds if `t` matches at least an epsilon symbol, i.e. the empty string.
 *
 * Such a term does not restrict the language of the enclosing regular expression.
 *
 * This is an under-approximation; in particular, it does not hold for sub-patterns.
 */
predicate matchesEpsilon(RegExpTerm t) {
  // quantifiers that allow zero repetitions
  t instanceof RegExpOpt
  or
  t instanceof RegExpStar
  or
  t.(RegExpRange).getLowerBound() = 0
  or
  // a sequence is empty only when every one of its elements can be empty
  forex(RegExpTerm element | element = t.(RegExpSequence).getAChild() | matchesEpsilon(element))
  or
  // a back-reference is empty when the group it refers to can be empty
  matchesEpsilon(t.(RegExpBackRef).getGroup())
  or
  // terms that can be empty as soon as one of their children can be empty
  (
    t instanceof RegExpRange or
    t instanceof RegExpPlus or
    t instanceof RegExpGroup or
    t instanceof RegExpAlt
  ) and
  matchesEpsilon(t.getAChild())
}
final private class FinalRegExpSubPattern = RegExpSubPattern;

/**
 * A positive lookahead or lookbehind whose operand matches the empty string.
 */
class EmptyPositiveSubPattern extends FinalRegExpSubPattern {
  EmptyPositiveSubPattern() {
    matchesEpsilon(this.getOperand()) and
    (
      this instanceof RegExpPositiveLookbehind
      or
      this instanceof RegExpPositiveLookahead
    )
  }
}
final private class FinalRegExpTerm = RegExpTerm;

/**
 * The root term of a regular expression literal, provided it is not a disjunction,
 * or one of the branches of a disjunction that is itself the root term.
 */
class RegExpRoot extends FinalRegExpTerm {
  RegExpRoot() {
    exists(RegExpParent parent |
      // a root term that is not a disjunction
      not this instanceof RegExpAlt and
      this.isRootTerm() and
      parent = this.getParent()
      or
      // a branch of a disjunction that is the root term
      exists(RegExpAlt rootAlt |
        this = rootAlt.getAChild() and
        rootAlt.isRootTerm() and
        parent = rootAlt.getParent()
      )
    )
  }

  /**
   * Holds if this root term is relevant to the ReDoS analysis.
   */
  predicate isRelevant() {
    // not excluded for library specific reasons
    not isExcluded(super.getRootTerm().getParent()) and
    // is actually used as a RegExp
    super.isUsedAsRegExp()
  }
}
/**
 * A regular expression constant for which `isCharacter()` holds, that is,
 * a constant that represents valid Unicode character(s).
 */
private class RegexpCharacterConstant instanceof RegExpConstant {
  RegexpCharacterConstant() { this.isCharacter() }

  /** Gets the value of the underlying constant. */
  string getValue() {
    result = super.getValue()
  }

  /** Gets the root term of the regular expression containing this constant. */
  RegExpTerm getRootTerm() {
    result = super.getRootTerm()
  }

  /** Gets the string representation of the underlying constant. */
  string toString() {
    result = super.toString()
  }
}
/**
 * A regexp term whose enclosing root (see `getRoot`) is relevant for this ReDoS analysis.
 */
class RelevantRegExpTerm extends FinalRegExpTerm {
  RelevantRegExpTerm() {
    exists(RegExpRoot root | root = getRoot(this) | root.isRelevant())
  }
}
/**
 * Gets a string describing the full location of `t`, formatted as
 * `file:startLine:startColumn-endLine:endColumn`.
 */
bindingset[t]
pragma[inline_late]
string getTermLocationString(RegExpTerm t) {
  exists(string path, int sl, int sc, int el, int ec |
    t.hasLocationInfo(path, sl, sc, el, ec)
  |
    result = path + ":" + sl + ":" + sc + "-" + el + ":" + ec
  )
}
/**
 * Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
 * The string representation includes which flags are used with the regular expression.
 *
 * The representative is the term that sorts first by location string and then by `toString()`.
 *
 * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
 * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
 */
private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
  term =
    min(RelevantRegExpTerm candidate |
      getCanonicalizationString(candidate) = str
    |
      candidate order by getTermLocationString(candidate), candidate.toString()
    )
}
/**
 * Gets a string representation of `term` that is used for canonicalization.
 *
 * It is the raw value of `term` followed by `|`, and then by `i` when the
 * enclosing regular expression ignores case.
 */
private string getCanonicalizationString(RelevantRegExpTerm term) {
  if isIgnoreCase(term.getRootTerm())
  then result = term.getRawValue() + "|" + "i"
  else result = term.getRawValue() + "|" + ""
}
/**
* An abstract input symbol, representing a set of concrete characters.
*
* Besides the character-matching symbols, this type also contains `Epsilon()`,
* which labels transitions that consume no input.
*/
private newtype TInputSymbol =
/** An input symbol corresponding to character `c`. */
Char(string c) {
// code points of relevant constants in case-sensitive regexps, taken as they are
c =
getACodepoint(any(RegexpCharacterConstant cc |
cc instanceof RelevantRegExpTerm and
not isIgnoreCase(cc.getRootTerm())
).getValue())
or
// normalize everything to lower case if the regexp is case insensitive
c =
any(RegexpCharacterConstant cc, string char |
cc instanceof RelevantRegExpTerm and
isIgnoreCase(cc.getRootTerm()) and
char = getACodepoint(cc.getValue())
|
char.toLowerCase()
)
} or
/**
* An input symbol representing all characters matched by
* a (non-universal) character class that has string representation `charClassString`.
*
* Only canonical representatives (see `isCanonicalTerm`) give rise to a symbol;
* the representative is either a character class or a character class escape.
*/
CharClass(string charClassString) {
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
recc instanceof RegExpCharacterClass and
not recc.(RegExpCharacterClass).isUniversalClass()
or
isEscapeClass(recc, _)
)
} or
/** An input symbol representing all characters matched by `.`. */
Dot() or
/** An input symbol representing all characters. */
Any() or
/** An epsilon transition in the automaton. */
Epsilon()
/**
 * Gets the `CharClass` corresponding to the canonical representative `term`.
 *
 * Has no result when `term` is not a canonical representative.
 */
private CharClass getCharClassForCanonicalTerm(RegExpTerm term) {
  result = CharClass(any(string str | isCanonicalTerm(term, str)))
}
/**
 * Gets the char class that represents `term`, looked up through the canonicalization
 * string of `term`, so it also works when `term` is not the canonical representative.
 */
CharacterClass getCanonicalCharClass(RegExpTerm term) {
  result = CharClass(getCanonicalizationString(term))
}
/**
 * Holds if `a` and `b` are input symbols from the same regexp,
 * that is, both belong to a common `RegExpRoot`.
 */
private predicate sharesRoot(InputSymbol a, InputSymbol b) {
  exists(RegExpRoot common | belongsTo(b, common) | belongsTo(a, common))
}
/**
 * Holds if `a` is an input symbol from a regexp that has root `root`.
 *
 * That is, `a` labels a transition that starts or ends in a state whose
 * term has root `root`.
 */
private predicate belongsTo(InputSymbol a, RegExpRoot root) {
  exists(State s |
    (
      delta(_, a, s)
      or
      delta(s, a, _)
    ) and
    root = getRoot(s.getRepr())
  )
}
/**
 * An abstract input symbol, representing a set of concrete characters.
 *
 * This is every `TInputSymbol` except `Epsilon()`.
 */
class InputSymbol extends TInputSymbol {
  InputSymbol() { not this = Epsilon() }

  /**
   * Gets a string representation of this input symbol.
   */
  string toString() {
    this = Any() and result = "[^]"
    or
    this = Dot() and result = "."
    or
    this = CharClass(result)
    or
    this = Char(result)
  }
}
/**
 * An abstract input symbol that represents a character class.
 */
abstract class CharacterClass extends InputSymbol {
  /**
   * Gets a character that is relevant for intersection-tests involving this
   * character class.
   *
   * Specifically, this is any of the characters mentioned explicitly in the
   * character class, offset by one if it is inverted. For character class escapes,
   * the result is as if the class had been written out as a series of intervals.
   *
   * This set is large enough to ensure that for any two intersecting character
   * classes, one contains a relevant character from the other.
   */
  abstract string getARelevantChar();

  /**
   * Holds if this character class matches `char`.
   */
  bindingset[char]
  abstract predicate matches(string char);

  /**
   * Gets a character matched by this character class, picked among its relevant characters.
   */
  string choose() {
    this.matches(result) and
    result = this.getARelevantChar()
  }
}
/**
* Provides implementations for `CharacterClass`.
*/
private module CharacterClasses {
/**
 * Holds if the character class `cc` has a child (constant or range) that matches `char`.
 *
 * When the enclosing regexp ignores case, the matched chars are reported in lower case.
 */
pragma[noinline]
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
  // case sensitive: report the matched chars as they are
  not isIgnoreCase(cc.getRootTerm()) and
  hasChildThatMatchesIgnoringCasingFlags(cc, char)
  or
  // normalize everything to lower case if the regexp is case insensitive
  isIgnoreCase(cc.getRootTerm()) and
  char = any(string c | hasChildThatMatchesIgnoringCasingFlags(cc, c)).toLowerCase()
}
/**
 * Holds if the character class `cc` has a child (constant, range, or class escape) that matches `char`.
 * Ignores whether the character class is inside a regular expression that has the ignore case flag.
 *
 * Only holds for character classes that are canonical representatives.
 */
pragma[noinline]
predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) {
  // restrict to canonical representatives
  exists(getCharClassForCanonicalTerm(cc)) and
  exists(RegExpTerm child | child = cc.getAChild() |
    // a constant matches exactly its own value
    char = child.(RegexpCharacterConstant).getValue()
    or
    // cheap case: ranges over letters or digits
    rangeMatchesOnLetterOrDigits(child, char)
    or
    // general case: any other range, checked against all relevant chars
    not rangeMatchesOnLetterOrDigits(child, _) and
    char = getARelevantChar() and
    exists(string lo, string hi | child.(RegExpCharacterRange).isRange(lo, hi) |
      lo <= char and
      char <= hi
    )
    or
    exists(string charClass | isEscapeClass(child, charClass) |
      // a lower-case escape (e.g. `\d`) matches what `classEscapeMatches` says
      charClass.toLowerCase() = charClass and
      classEscapeMatches(charClass, char)
      or
      // an upper-case escape (e.g. `\D`) matches the relevant chars that the
      // corresponding lower-case escape does not match. `classEscapeMatches` is only
      // defined for the lower-case names, so the name must be lower-cased before the
      // lookup; otherwise the negation holds for every char.
      char = getARelevantChar() and
      charClass.toUpperCase() = charClass and
      not classEscapeMatches(charClass.toLowerCase(), char)
    )
  )
}
/**
* Holds if `range` is a range whose bounds are both lower-case letters, both upper-case letters,
* or both digits, and `char` is a character of that same kind that lies within the range.
*
* This predicate is used to restrict the search space for ranges: the much larger set of
* chars from `getARelevantChar` only has to be joined on the few ranges that this
* predicate does not handle.
*/
private predicate rangeMatchesOnLetterOrDigits(RegExpCharacterRange range, string char) {
// both bounds are lower-case letters
exists(string lo, string hi |
range.isRange(lo, hi) and lo = lowercaseLetter() and hi = lowercaseLetter()
|
lo <= char and
char <= hi and
char = lowercaseLetter()
)
or
// both bounds are upper-case letters
exists(string lo, string hi |
range.isRange(lo, hi) and lo = upperCaseLetter() and hi = upperCaseLetter()
|
lo <= char and
char <= hi and
char = upperCaseLetter()
)
or
// both bounds are digits
exists(string lo, string hi | range.isRange(lo, hi) and lo = digit() and hi = digit() |
lo <= char and
char <= hi and
char = digit()
)
}
/** Gets a lower-case letter from `a` to `z`. */
private string lowercaseLetter() {
  exists(int i | result = "abcdefghijklmnopqrstuvwxyz".charAt(i))
}

/** Gets an upper-case letter from `A` to `Z`. */
private string upperCaseLetter() {
  exists(int i | result = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".charAt(i))
}

/** Gets a decimal digit from `0` to `9`, as a string. */
private string digit() {
  exists(int d | d = [0 .. 9] | result = d.toString())
}
/**
 * Gets a char that could be matched by a regular expression.
 * Includes all printable ascii chars, all constants mentioned in a regexp, and all chars matched by the regexp `/\s|\d|\w/`.
 */
string getARelevantChar() {
  // chars matched by one of the supported class escapes
  classEscapeMatches(_, result)
  or
  // code points of constants mentioned in a regexp
  result = getACodepoint(any(RegexpCharacterConstant c).getValue())
  or
  // printable ascii chars
  exists(asciiPrintable(result))
}
/**
 * Gets a char that is mentioned in the character class `c`.
 *
 * This is the value of a constant child, either bound of a range child, or, for a
 * class escape child, the smallest or largest char matched by the lower-cased escape.
 */
private string getAMentionedChar(RegExpCharacterClass c) {
  exists(RegExpTerm member | member = c.getAChild() |
    // the extremes of a class escape
    exists(string escape, string lowered |
      isEscapeClass(member, escape) and
      lowered = escape.toLowerCase()
    |
      result = max(string s | classEscapeMatches(lowered, s))
      or
      result = min(string s | classEscapeMatches(lowered, s))
    )
    or
    // the upper or lower bound of a range
    member.(RegExpCharacterRange).isRange(_, result)
    or
    member.(RegExpCharacterRange).isRange(result, _)
    or
    // a plain constant
    result = member.(RegexpCharacterConstant).getValue()
  )
}
/**
 * Gets `char`, converted to lower case if the regexp containing `cc` ignores case.
 */
bindingset[char, cc]
private string caseNormalize(string char, RegExpTerm cc) {
  isIgnoreCase(cc.getRootTerm()) and
  result = char.toLowerCase()
  or
  not isIgnoreCase(cc.getRootTerm()) and
  result = char
}
/**
 * An implementation of `CharacterClass` for positive (non inverted) character classes.
 */
private class PositiveCharacterClass extends CharacterClass {
  RegExpCharacterClass cc;

  PositiveCharacterClass() {
    not cc.isInverted() and
    this = getCharClassForCanonicalTerm(cc)
  }

  override predicate matches(string char) { hasChildThatMatches(cc, char) }

  override string getARelevantChar() {
    exists(string mentioned | mentioned = getAMentionedChar(cc) |
      result = caseNormalize(mentioned, cc)
    )
  }
}
/**
 * An implementation of `CharacterClass` for inverted character classes.
 */
private class InvertedCharacterClass extends CharacterClass {
  RegExpCharacterClass cc;

  InvertedCharacterClass() {
    cc.isInverted() and
    this = getCharClassForCanonicalTerm(cc)
  }

  override string getARelevantChar() {
    exists(string mentioned | mentioned = caseNormalize(getAMentionedChar(cc), cc) |
      // the char just before a mentioned char
      nextChar(result) = mentioned
      or
      // the char just after a mentioned char
      result = nextChar(mentioned)
    )
  }

  bindingset[char]
  override predicate matches(string char) {
    (
      not exists(cc.getAChild()) // [^] still matches everything
      or
      // detect unsupported char classes that don't match anything (e.g. `\p{L}` in ruby), and don't report any matches
      hasChildThatMatches(cc, _)
    ) and
    not hasChildThatMatches(cc, char)
  }
}
/**
 * Holds if the character class escape `clazz` (\d, \s, or \w) matches `char`.
 *
 * `clazz` is the escape name without the backslash: "d", "s", or "w".
 */
pragma[noinline]
private predicate classEscapeMatches(string clazz, string char) {
  clazz = "w" and
  char = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_".charAt(_)
  or
  clazz = "s" and
  // 11.toUnicode() = \v, 12.toUnicode() = \f
  char = [" ", "\t", "\r", "\n", 11.toUnicode(), 12.toUnicode()]
  or
  clazz = "d" and
  char = "0123456789".charAt(_)
}
/**
 * An implementation of `CharacterClass` for \d, \s, and \w.
 */
private class PositiveCharacterClassEscape extends CharacterClass {
  string charClass;
  RegExpTerm cc;

  PositiveCharacterClassEscape() {
    charClass = ["d", "s", "w"] and
    isEscapeClass(cc, charClass) and
    this = getCharClassForCanonicalTerm(cc)
  }

  override string getARelevantChar() {
    charClass = "w" and
    (
      if isIgnoreCase(cc.getRootTerm())
      then result = ["a", "z", "_", "0", "9"]
      else result = ["a", "Z", "_", "0", "9"]
    )
    or
    charClass = "s" and
    result = " "
    or
    charClass = "d" and
    result = ["0", "9"]
  }

  override predicate matches(string char) { classEscapeMatches(charClass, char) }

  /** Gets a fixed representative char for this escape. */
  override string choose() {
    charClass = "w" and
    result = "a"
    or
    charClass = "s" and
    result = " "
    or
    charClass = "d" and
    result = "9"
  }
}
/**
 * An implementation of `CharacterClass` for \D, \S, and \W.
 */
private class NegativeCharacterClassEscape extends CharacterClass {
  string charClass;

  NegativeCharacterClassEscape() {
    charClass = ["D", "S", "W"] and
    exists(RegExpTerm cc |
      this = getCharClassForCanonicalTerm(cc) and
      isEscapeClass(cc, charClass)
    )
  }

  override string getARelevantChar() {
    charClass = "W" and
    result = [" ", "!"]
    or
    charClass = "S" and
    result = ["a", "9", "!"]
    or
    charClass = "D" and
    result = ["a", "Z", "!"]
  }

  bindingset[char]
  override predicate matches(string char) {
    // detect unsupported char classes (e.g. `\p{L}` in ruby), and don't report any matches
    classEscapeMatches(charClass.toLowerCase(), _) and
    // the complement of the corresponding lower-case escape
    not classEscapeMatches(charClass.toLowerCase(), char)
  }
}
/**
 * Gets a representative for all char classes that match the same chars as `c`.
 *
 * Among the classes with the same normalization string as `c`, the representative
 * is the one whose `CharClass` string sorts first.
 */
CharacterClass normalize(CharacterClass c) {
  exists(string key | key = getNormalizationString(c) |
    result =
      min(CharacterClass candidate, string raw |
        candidate = CharClass(raw) and
        getNormalizationString(candidate) = key
      |
        candidate order by raw
      )
  )
}
/**
* Gets a string representing all the chars matched by `c`.
*
* For positive classes and positive class escapes, this is the concatenation of the relevant
* chars that `c` matches. For inverted classes and negative class escapes, it is `"nn:"`
* followed by the concatenation of the relevant chars that `c` does not match.
*/
private string getNormalizationString(CharacterClass c) {
(c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape) and
result = concat(string char | c.matches(char) and char = CharacterClasses::getARelevantChar())
or
(c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape) and
// the string produced by the concat can not contain repeated chars,
// so by starting the string below with "nn" we can guarantee that
// it will not overlap with the above case.
// A negative char class can never match the same chars as a positive one, so we don't miss any results from this.
result =
"nn:" +
concat(string char | not c.matches(char) and char = CharacterClasses::getARelevantChar())
}
}
/**
 * A label on a transition in the NFA: either an input symbol or `Epsilon()`.
 */
private class EdgeLabel extends TInputSymbol {
  /** Gets a string representation of this label; the empty string for `Epsilon()`. */
  string toString() {
    result = this.(InputSymbol).toString()
    or
    this = Epsilon() and result = ""
  }
}
/**
 * A RegExp term that acts like a plus.
 * Either it's a RegExpPlus, or it is a range {1,X} where X is >= 30 or where there is no upper bound.
 * 30 has been chosen as a threshold because for exponential blowup 2^30 is enough to get a decent DOS attack.
 */
private class EffectivelyPlus instanceof RegExpTerm {
  EffectivelyPlus() {
    exists(RegExpRange range | this = range and range.getLowerBound() = 1 |
      not exists(range.getUpperBound())
      or
      range.getUpperBound() >= 30
    )
    or
    this instanceof RegExpPlus
  }

  /** Gets the `i`th child of this term. */
  RegExpTerm getChild(int i) { result = super.getChild(i) }

  /** Gets a child of this term. */
  RegExpTerm getAChild() { result = super.getChild(_) }

  /** Gets the string representation of the underlying term. */
  string toString() { result = super.toString() }
}
/**
 * A RegExp term that acts like a star.
 * Either it's a RegExpStar, or it is a range {0,X} where X is >= 30 or where there is no upper bound.
 */
private class EffectivelyStar instanceof RegExpTerm {
  EffectivelyStar() {
    exists(RegExpRange range | this = range and range.getLowerBound() = 0 |
      not exists(range.getUpperBound())
      or
      range.getUpperBound() >= 30
    )
    or
    this instanceof RegExpStar
  }

  /** Gets the `i`th child of this term. */
  RegExpTerm getChild(int i) { result = super.getChild(i) }

  /** Gets a child of this term. */
  RegExpTerm getAChild() { result = super.getAChild() }

  /** Gets the string representation of the underlying term. */
  string toString() { result = super.toString() }
}
/**
 * A RegExp term that acts like a question mark.
 * Either it's a RegExpOpt, or it is a range {0,1}.
 */
private class EffectivelyQuestion instanceof RegExpTerm {
  EffectivelyQuestion() {
    exists(RegExpRange range | this = range |
      range.getUpperBound() = 1 and
      range.getLowerBound() = 0
    )
    or
    this instanceof RegExpOpt
  }

  /** Gets the `i`th child of this term. */
  RegExpTerm getChild(int i) { result = super.getChild(i) }

  /** Gets a child of this term. */
  RegExpTerm getAChild() { result = super.getAChild() }

  /** Gets the string representation of the underlying term. */
  string toString() { result = super.toString() }
}
/**
 * Gets the state before matching `t`, that is, `Match(t, 0)`.
 */
pragma[inline]
private State before(RegExpTerm t) {
  Match(t, 0) = result
}
/**
* Gets a state the NFA may be in after matching `t`.
*
* The result depends on the construct that `t` is a child of (or on `t` being a root).
*/
State after(RegExpTerm t) {
// a branch of an alternation: continue after the alternation
exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
or
// an element of a sequence: continue with the next element, or after the sequence for the last one
exists(RegExpSequence seq, int i | t = seq.getChild(i) |
result = before(seq.getChild(i + 1))
or
i + 1 = seq.getNumChild() and result = after(seq)
)
or
// the content of a group: continue after the group
exists(RegExpGroup grp | t = grp.getAChild() | result = after(grp))
or
// the body of a non-possessive star: go back to the state before the star
exists(EffectivelyStar star | t = star.getAChild() |
not isPossessive(star) and
result = before(star)
)
or
// the body of a plus: repeat (unless possessive), or continue after the plus
exists(EffectivelyPlus plus | t = plus.getAChild() |
not isPossessive(plus) and
result = before(plus)
or
result = after(plus)
)
or
// the body of an optional term: continue after the optional term
exists(EffectivelyQuestion opt | t = opt.getAChild() | result = after(opt))
or
// a root: go to one of the two kinds of accept state
exists(RegExpRoot root | t = root |
if matchesAnySuffix(root) then result = AcceptAnySuffix(root) else result = Accept(root)
)
}
/**
 * Gets the length in code points of `s`, where `s` is the value of some character constant.
 */
pragma[noinline]
private int getCodepointLengthForState(string s) {
  exists(RegexpCharacterConstant reg | s = reg.getValue()) and
  result = getCodepointLength(s)
}
/**
* Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`.
*
* `lbl` is `Epsilon()` for transitions that consume no input.
*/
predicate delta(State q1, EdgeLabel lbl, State q2) {
// a character constant: consume the `i`th code point of the constant
exists(RegexpCharacterConstant s, int i |
q1 = Match(s, i) and
(
not isIgnoreCase(s.getRootTerm()) and
lbl = Char(getCodepointAt(s.getValue(), i))
or
// normalize everything to lower case if the regexp is case insensitive
isIgnoreCase(s.getRootTerm()) and
exists(string c | c = getCodepointAt(s.getValue(), i) | lbl = Char(c.toLowerCase()))
) and
(
// move to the next code point of the constant
q2 = Match(s, i + 1)
or
// this was the last code point: continue after the constant
getCodepointLengthForState(s.getValue()) = i + 1 and
q2 = after(s)
)
)
or
// `.`: labelled `Any()` in dot-all mode, `Dot()` otherwise
exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
if isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
)
or
// a character class: `Any()` if universal, otherwise the normalized class symbol
exists(RegExpCharacterClass cc |
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
or
q1 = before(cc) and
lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
q2 = after(cc)
)
or
// a character class escape: the normalized class symbol
exists(RegExpTerm cc | isEscapeClass(cc, _) |
q1 = before(cc) and
lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and
q2 = after(cc)
)
or
// an alternation: epsilon into any branch
exists(RegExpAlt alt | lbl = Epsilon() | q1 = before(alt) and q2 = before(alt.getAChild()))
or
// a sequence: epsilon into its first element
exists(RegExpSequence seq | lbl = Epsilon() | q1 = before(seq) and q2 = before(seq.getChild(0)))
or
// a group: epsilon into its content
exists(RegExpGroup grp | lbl = Epsilon() | q1 = before(grp) and q2 = before(grp.getChild(0)))
or
// a group without children: epsilon to the state before its successor
exists(RegExpGroup grp | lbl = Epsilon() |
not exists(grp.getAChild()) and
q1 = before(grp) and
q2 = before(grp.getSuccessor())
)
or
// a star: epsilon into the body, or skip it entirely
exists(EffectivelyStar star | lbl = Epsilon() |
q1 = before(star) and q2 = before(star.getChild(0))
or
q1 = before(star) and q2 = after(star)
)
or
// a plus: epsilon into the body (there is no skip transition)
exists(EffectivelyPlus plus | lbl = Epsilon() |
q1 = before(plus) and q2 = before(plus.getChild(0))
)
or
// an optional term: epsilon into the body, or skip it entirely
exists(EffectivelyQuestion opt | lbl = Epsilon() |
q1 = before(opt) and q2 = before(opt.getChild(0))
or
q1 = before(opt) and q2 = after(opt)
)
or
// the accept-any-suffix state: loop on any char, or epsilon to the plain accept state
exists(RegExpRoot root | q1 = AcceptAnySuffix(root) |
lbl = Any() and q2 = q1
or
lbl = Epsilon() and q2 = Accept(root)
)
or
// a root that matches any prefix: loop on any char in its initial state
exists(RegExpRoot root | q1 = Match(root, 0) |
matchesAnyPrefix(root) and lbl = Any() and q2 = q1
)
or
// `$`: epsilon to the accept state of the enclosing root
exists(RegExpDollar dollar | q1 = before(dollar) |
lbl = Epsilon() and q2 = Accept(getRoot(dollar))
)
or
// a positive lookahead/lookbehind that matches the empty string: epsilon past it
exists(EmptyPositiveSubPattern empty | q1 = before(empty) |
lbl = Epsilon() and q2 = after(empty)
)
}
/**
 * Gets a state that `q` has an epsilon transition to.
 */
State epsilonSucc(State q) {
  exists(EdgeLabel eps | eps = Epsilon() | delta(q, eps, result))
}
/**
 * Gets a state that has an epsilon transition to `q`.
 *
 * This is the inverse of `epsilonSucc`.
 */
State epsilonPred(State q) {
  epsilonSucc(result) = q
}
/**
 * Holds if there is a state `q` that can be reached from `q1`
 * along zero or more epsilon edges, such that there is a transition from
 * `q` to `q2` that consumes symbol `s`.
 */
predicate deltaClosed(State q1, InputSymbol s, State q2) {
  exists(State q | q = epsilonSucc*(q1) | delta(q, s, q2))
}
/**
 * Gets the root containing the given term, that is, the root of the literal,
 * or a branch of the root disjunction.
 *
 * The result is `term` itself when it is a `RegExpRoot`, or a root of its parent.
 */
RegExpRoot getRoot(RegExpTerm term) {
  result = getRoot(term.getParent())
  or
  result = term
}
/**
* A state in the NFA.
*
* States only exist for relevant terms and relevant roots.
*/
newtype TState =
/**
* A state representing that the NFA is about to match a term.
* `i` is used to index into multi-char literals.
*/
Match(RelevantRegExpTerm t, int i) {
// every relevant term has a state at index 0
i = 0
or
// character constants additionally have one state per code point
exists(getCodepointAt(t.(RegexpCharacterConstant).getValue(), i))
} or
/**
* An accept state, where exactly the given input string is accepted.
*/
Accept(RegExpRoot l) { l.isRelevant() } or
/**
* An accept state, where the given input string, or any string that has this
* string as a prefix, is accepted.
*/
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
/**
 * Gets the state that is about to match the regular expression `t`, that is, `Match(t, 0)`.
 */
State mkMatch(RegExpTerm t) {
  Match(t, 0) = result
}
/**
 * A state in the NFA corresponding to a regular expression.
 *
 * Each regular expression literal `l` has one accepting state
 * `Accept(l)`, one state that accepts all suffixes `AcceptAnySuffix(l)`,
 * and a state `Match(t, i)` for every subterm `t`,
 * which represents the state of the NFA before starting to
 * match `t`, or the `i`th character in `t` if `t` is a constant.
 */
final class State extends TState {
  RegExpTerm repr;

  State() {
    this = AcceptAnySuffix(repr) or
    this = Accept(repr) or
    this = Match(repr, _)
  }

  /**
   * Gets a string representation for this state in a regular expression.
   */
  string toString() {
    this instanceof AcceptAnySuffix and
    result = "AcceptAny(" + repr + ")"
    or
    this instanceof Accept and
    result = "Accept(" + repr + ")"
    or
    exists(int i | this = Match(repr, i) | result = "Match(" + repr + "," + i + ")")
  }

  /**
   * Gets the term represented by this state.
   */
  RegExpTerm getRepr() { repr = result }

  /**
   * Holds if the term represented by this state is found at the specified location offsets.
   */
  predicate hasLocationInfo(string file, int line, int column, int endline, int endcolumn) {
    this.getRepr().hasLocationInfo(file, line, column, endline, endcolumn)
  }
}
/**
 * Gets the minimum char that is matched by both the character classes `c` and `d`.
 */
private string getMinOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) {
  result = min(string overlap | overlap = getAOverlapBetweenCharacterClasses(c, d))
}
/**
 * Gets a char that is matched by both the character classes `c` and `d`,
 * where `c` and `d` are distinct character classes from the same regexp.
 *
 * Only the relevant chars of `c` and `d` are considered.
 */
private string getAOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) {
  c != d and
  sharesRoot(c, d) and
  (
    result = c.getARelevantChar()
    or
    result = d.getARelevantChar()
  ) and
  d.matches(result) and
  c.matches(result)
}
/**
* Gets a character that is represented by both `c` and `d`.
*
* The predicate is symmetric in `c` and `d`: the last disjunct swaps the arguments.
*/
string intersect(InputSymbol c, InputSymbol d) {
// the symbols must come from the same regexp, unless one of them is `Any()`
(sharesRoot(c, d) or [c, d] = Any()) and
(
// a single char against any symbol that matches it
c = Char(result) and
d = getAnInputSymbolMatching(result)
or
// two distinct character classes: their minimum overlapping char
result = getMinOverlapBetweenCharacterClasses(c, d)
or
// a character class against itself, `.`, or the any-char symbol
result = c.(CharacterClass).choose() and
(
d = c
or
d = Dot() and
not (result = "\n" or result = "\r")
or
d = Any()
)
or
// `.` / any-char against `.` / any-char: use "a" as the witness
(c = Dot() or c = Any()) and
(d = Dot() or d = Any()) and
result = "a"
)
or
// symmetric case
result = intersect(d, c)
}
/**
 * Gets a symbol that matches `char`.
 *
 * `Dot()` is excluded for the chars `\n` and `\r`.
 */
bindingset[char]
InputSymbol getAnInputSymbolMatching(string char) {
  result = Any()
  or
  result = Dot() and
  char != "\n" and
  char != "\r"
  or
  result.(CharacterClass).matches(char)
  or
  result = Char(char)
}
/**
 * Holds if `state` is a start state.
 *
 * That is, the state about to match a `RegExpRoot`, or a state after a caret.
 */
predicate isStartState(State state) {
  state = after(any(RegExpCaret car))
  or
  exists(RegExpRoot r | state = mkMatch(r))
}
/**
* Holds if `state` is a candidate for ReDoS with string `pump`.
*
* This is the two-argument signature; there is also a one-argument variant with the same name.
*/
signature predicate isCandidateSig(State state, string pump);
/**
* Holds if `state` is a candidate for ReDoS.
*
* This is the one-argument signature; there is also a two-argument variant with the same name.
*/
signature predicate isCandidateSig(State state);
/**
* Predicates for constructing a prefix string that leads to a given state.
*/
module PrefixConstruction<isCandidateSig/1 isCandidate> {
/**
 * Holds if `state` is the textually last start state for the regular expression.
 *
 * For each root, this is the start state that sorts last by the location string
 * of its term and then by the term's `toString()`.
 */
private predicate lastStartState(RelevantState state) {
  exists(RegExpRoot root |
    state =
      max(RelevantState candidate |
        getRoot(candidate.getRepr()) = root and
        isStartState(candidate)
      |
        candidate
        order by
          getTermLocationString(candidate.getRepr()), candidate.getRepr().toString()
      )
  )
}
/**
 * Holds if there exists any transition (Epsilon() or other) from `a` to `b`.
 */
private predicate existsTransition(State a, State b) {
  exists(EdgeLabel lbl | delta(a, lbl, b))
}
/**
* Gets the minimum number of transitions it takes to reach `state` from the `start` state.
*
* Computed with `shortestDistances`, using `lastStartState` as the sources
* and `existsTransition` as the edge relation.
*/
int prefixLength(State start, State state) =
shortestDistances(lastStartState/1, existsTransition/2)(start, state, result)
/**
 * Gets the minimum number of transitions it takes to reach `state` from the start state.
 *
 * This is `prefixLength` with the start state left unconstrained.
 */
private int lengthFromStart(State state) {
  exists(State start | result = prefixLength(start, state))
}
/**
* Gets a string for which the regular expression will reach `state`.
*
* Has at most one result for any given `state`.
* This predicate will not always have a result even if there is a ReDoS issue in
* the regular expression.
*/
string prefix(State state) {
lastStartState(state) and
result = ""
or
// the search stops past the last redos candidate state.
lengthFromStart(state) <= max(lengthFromStart(any(State s | isCandidate(s)))) and
exists(State prev |
// select a unique predecessor (by an arbitrary measure)
prev =
min(State s |
lengthFromStart(s) = lengthFromStart(state) - 1 and
delta(s, _, state)
|
s order by getTermLocationString(s.getRepr()), s.getRepr().toString()
)
|