-
Notifications
You must be signed in to change notification settings - Fork 2k
Expand file tree
/
Copy pathMissingRegExpAnchor.ql
More file actions
228 lines (212 loc) · 7.21 KB
/
MissingRegExpAnchor.ql
File metadata and controls
228 lines (212 loc) · 7.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/**
* @name Missing regular expression anchor
* @description Regular expressions without anchors can be vulnerable to bypassing.
* @kind problem
* @problem.severity warning
* @precision medium
* @id js/regex/missing-regexp-anchor
* @tags correctness
* security
* external/cwe/cwe-20
*/
import javascript
import HostnameRegexpShared
/**
 * Holds if `term` is the root term of a regexp, or is the child at index 0
 * of a term for which this predicate already holds.
 */
predicate isLeftArmTerm(RegExpTerm term) {
  term.isRootTerm()
  or
  // Step down from any left-arm parent to its first child.
  isLeftArmTerm(any(RegExpTerm parent | parent.getChild(0) = term))
}
/**
 * Holds if `term` is the root term of a regexp, or is the last child
 * of a term for which this predicate already holds.
 */
predicate isRightArmTerm(RegExpTerm term) {
  term.isRootTerm()
  or
  // Step down from any right-arm parent to its last child.
  isRightArmTerm(any(RegExpTerm parent | parent.getLastChild() = term))
}
/**
 * Holds if the anchor `term` lies neither on the left arm nor on the
 * right arm of its regexp tree (see `isLeftArmTerm` and `isRightArmTerm`).
 */
predicate isInteriorAnchor(RegExpAnchor term) {
  not (
    isLeftArmTerm(term)
    or
    isRightArmTerm(term)
  )
}
/**
 * Holds if `term` itself, or one of its transitive children, is an
 * interior anchor. An example is the `$` in `(foo|bar$|baz)`.
 */
predicate containsInteriorAnchor(RegExpTerm term) {
  exists(RegExpAnchor anchor |
    anchor = term.getAChild*() and
    isInteriorAnchor(anchor)
  )
}
/**
 * Holds if the first child of the sequence `term` is a word boundary,
 * a non-word boundary, or a lookbehind assertion.
 *
 * Such a start suggests the sequence is not meant to be anchored on its left side.
 */
predicate containsLeadingPseudoAnchor(RegExpSequence term) {
  term.getChild(0) instanceof RegExpWordBoundary
  or
  term.getChild(0) instanceof RegExpNonWordBoundary
  or
  term.getChild(0) instanceof RegExpLookbehind
}
/**
 * Holds if the last child of the sequence `term` is a word boundary,
 * a non-word boundary, or a lookahead assertion.
 *
 * Such an ending suggests the sequence is not meant to be anchored on its right side.
 */
predicate containsTrailingPseudoAnchor(RegExpSequence term) {
  term.getLastChild() instanceof RegExpWordBoundary
  or
  term.getLastChild() instanceof RegExpNonWordBoundary
  or
  term.getLastChild() instanceof RegExpLookahead
}
/**
 * Holds if the sequence `term` has a child count of zero.
 *
 * Empty sequences typically come from a dangling alternative, as in `foo|`.
 */
predicate isEmpty(RegExpSequence term) {
  0 = term.getNumChild()
}
/**
 * Holds if `term` or one of its transitive children is a constant whose
 * value contains at least one ASCII letter.
 *
 * This is a heuristic for discarding uninteresting results.
 */
predicate containsLetters(RegExpTerm term) {
  exists(RegExpConstant constant |
    constant = term.getAChild*() and
    constant.getValue().regexpMatch(".*[a-zA-Z].*")
  )
}
/**
 * Holds if the sequence `term` has exactly two children, with an anchor
 * and a group among them. The left alternative of `^(foo|bar)|baz` is
 * one example.
 *
 * In that shape the anchor's precedence is probably deliberate, because
 * the parentheses would serve no purpose otherwise.
 */
predicate isAnchoredGroup(RegExpSequence term) {
  exists(RegExpAnchor anchor, RegExpGroup grp |
    anchor = term.getAChild() and
    grp = term.getAChild()
  ) and
  term.getNumChild() = 2
}
/**
 * Holds if some alternative of `alt` is an anchored group, as in
 * `^(foo|bar)|baz`, while no alternative of `alt` is itself a bare
 * group, as happens in `^(foo)|(bar)`.
 */
predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
  // No alternative may be a bare group.
  not exists(RegExpGroup grp | grp = alt.getAChild()) and
  // At least one alternative must be an anchor paired with a group.
  exists(RegExpSequence anchored |
    anchored = alt.getAChild() and
    isAnchoredGroup(anchored)
  )
}
/**
 * Holds if the root of the pattern `src` is an alternation whose first
 * alternative starts with `^`, or whose last alternative ends with `$`,
 * and `msg` describes the likely precedence mistake.
 *
 * The textbook case is `^a|b|c`, which groups as `(^a)|(b)|(c)`, so
 * the caret applies to `a` alone.
 *
 * A pattern is not reported when it contains an interior anchor, has an
 * empty alternative, shows explicit anchor precedence, or flows to the
 * first argument of a `replace` call. Both the anchored alternative and at
 * least one of the others must contain letters, and none of the others may
 * carry a pseudo-anchor on the side in question.
 */
predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
  exists(RegExpAlt alt, RegExpSequence anchored, string side |
    alt = src.getRegExpTerm() and
    containsLetters(anchored) and
    // Filters on the alternation as a whole.
    not containsInteriorAnchor(alt) and
    not exists(RegExpSequence blank | blank = alt.getAChild() | isEmpty(blank)) and
    not hasExplicitAnchorPrecedence(alt) and
    // Patterns that flow to the first argument of `replace` are skipped.
    not exists(DataFlow::MethodCallNode call |
      src.getARegExpObject().flowsTo(call.getArgument(0)) and
      call.getMethodName() = "replace"
    ) and
    (
      // Case 1: `^` opens the first alternative.
      side = "beginning" and
      anchored = alt.getChild(0) and
      anchored.getChild(0) instanceof RegExpCaret and
      exists(RegExpTerm other | other = alt.getChild([1 .. alt.getNumChild() - 1]) |
        containsLetters(other)
      ) and
      not exists(RegExpSequence other | other = alt.getChild([1 .. alt.getNumChild() - 1]) |
        containsLeadingPseudoAnchor(other)
      )
      or
      // Case 2: `$` closes the last alternative.
      side = "end" and
      anchored = alt.getLastChild() and
      anchored.getLastChild() instanceof RegExpDollar and
      exists(RegExpTerm other | other = alt.getChild([0 .. alt.getNumChild() - 2]) |
        containsLetters(other)
      ) and
      not exists(RegExpSequence other | other = alt.getChild([0 .. alt.getNumChild() - 2]) |
        containsTrailingPseudoAnchor(other)
      )
    ) and
    msg =
      "Misleading operator precedence. The subexpression '" + anchored.getRawValue() +
        "' is anchored at the " + side +
        ", but the other parts of this regular expression are not"
  )
}
/**
 * Holds if `term` is a final term, meaning that no term matches anything after it.
 *
 * The root term is final. Below a final parent:
 * - if the parent is a sequence, only its last child is final;
 * - if the parent is neither a sequence nor a quantifier, every child is final;
 * - if the parent is a quantifier, none of its children is final.
 */
predicate isFinalRegExpTerm(RegExpTerm term) {
  term.isRootTerm()
  or
  exists(RegExpTerm parent | isFinalRegExpTerm(parent) |
    // Only the tail of a sequence can end the match.
    parent instanceof RegExpSequence and
    term = parent.getLastChild()
    or
    // Any child of another non-quantifier parent can end the match.
    not parent instanceof RegExpSequence and
    not parent instanceof RegExpQuantifier and
    term = parent.getAChild()
  )
}
/**
 * Holds if `src` contains a hostname pattern that starts with `^` but
 * has nothing after its top-level domain, and `msg` explains the problem.
 *
 * Patterns already reported by `hasMisleadingAnchorPrecedence` are
 * excluded, so each pattern is reported at most once.
 */
predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
  exists(RegExpTerm root, RegExpSequence seq, int tldIndex | root = src.getRegExpTerm() |
    seq = root.getAChild*() and
    // The sequence opens with a caret...
    seq.getChild(0) instanceof RegExpCaret and
    // ...and has a top-level domain ending at `tldIndex`, with nothing matched after it.
    hasTopLevelDomainEnding(seq, tldIndex) and
    isFinalRegExpTerm(seq.getChild(tldIndex)) and
    not isConstantInvalidInsideOrigin(root.getAChild*()) and
    msg =
      "This hostname pattern may match any domain name, as it is missing a '$' or '/' at the end."
  ) and
  // Avoid reporting the same pattern twice.
  not hasMisleadingAnchorPrecedence(src, _)
}
/**
 * Holds if `src` is a URL pattern that contains no anchor at all, and
 * `msg` explains the mistake.
 *
 * A pattern is not reported when it is used for capture or replacement:
 * - an `exec` call on the regexp object whose result has a property read;
 * - the first argument of a `replace` call;
 * - the first argument of a `match` call whose result has a property read.
 */
predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
  exists(RegExpTerm root, RegExpSequence seq | root = src.getRegExpTerm() |
    alwaysMatchesHostname(root) and
    seq = root.getAChild*() and
    hasTopLevelDomainEnding(seq) and
    // No anchor anywhere in the pattern.
    not exists(RegExpAnchor anchor | anchor = root.getAChild*()) and
    not isConstantInvalidInsideOrigin(root.getAChild*()) and
    // Not used for capture or replace.
    not exists(DataFlow::MethodCallNode call |
      // The pattern reaches argument 0 of `replace`, or of a `match` whose result is inspected.
      exists(DataFlow::Node firstArg |
        firstArg = call.getArgument(0) and
        (
          src.getAParse() = firstArg
          or
          src.getARegExpObject().flowsTo(firstArg)
        )
      |
        call.getMethodName() = "match" and exists(call.getAPropertyRead())
        or
        call.getMethodName() = "replace"
      )
      or
      // `exec` is called on the regexp and its result is inspected.
      call.getMethodName() = "exec" and
      call = src.getARegExpObject().getAMethodCall() and
      exists(call.getAPropertyRead())
    ) and
    msg =
      "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
  )
}
from DataFlow::Node nd, string msg
where
  // `^a|b|c`-style precedence mistakes
  hasMisleadingAnchorPrecedence(nd, msg)
  or
  // hostname patterns that start with `^` but have nothing after the top-level domain
  isSemiAnchoredHostnameRegExp(nd, msg)
  or
  // hostname patterns with no anchor at all
  isUnanchoredHostnameRegExp(nd, msg)
select nd, msg