codeql/python/ql/test/query-tests/Expressions/Regex/test.py at 0df3dd68d6eef76e97625883a4ce08b3aaf87f20 · github/codeql · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import re

#Unmatchable caret: without MULTILINE, `^` matches only at the start of the input, so it cannot follow a consumed character
re.compile(b' ^abc')  # a literal space must be consumed before `^`
re.compile(b"(?s) ^abc")  # (?s) is DOTALL, not MULTILINE, so `^` is still start-only
re.compile(b"\[^123]")  # `\[` is an escaped literal `[`, so `^` is an anchor here, not a class negation

#Likely false positives for unmatchable caret: in each pattern below the `^` is matchable or is not an anchor at all
re.compile(b"[^123]")  # leading `^` negates the character class
re.compile(b"[123^]")  # non-leading `^` is a literal member of the class
re.sub(b'(?m)^(?!$)', indent*' ', s)  # only the zero-width (?m) flag precedes `^`; NOTE(review): `indent` and `s` are not defined in the visible file -- confirm this file is only analyzed, never executed
re.compile(b"()^abc")  # the empty group consumes nothing before `^`
re.compile(b"(?:(?:\n\r?)|^)( *)\S")  # `^` is one alternative of the leading group
re.compile(b"^diff (?:-r [0-9a-f]+ ){1,2}(.*)$")  # `^` is the first item of the pattern

#Backspace escape: inside a character class `\b` means the backspace character; outside one it is a word boundary
re.compile(br"[\b\t ]") # Should warn: `\b` is inside a character class
re.compile(br"E\d+\b.*") # Fine: `\b` is outside any character class
re.compile(br"E\d+\b[ \b\t]") #Both: the first `\b` is outside a class, the second is inside one

#Missing part in named group: `(P<name>...)` lacks the `?` of `(?P<name>...)`, so it is a plain group starting with the literal text `P<name>`
re.compile(br'(P<name>[\w]+)')  # `?` missing after the opening parenthesis
re.compile(br'(_(P<name>[\w]+)|)')  # same omission, nested inside an alternation
#This is OK... (well-formed `(?P<name>...)` named group)
re.compile(br'(?P<name>\w+)')


#Unmatchable dollar: without MULTILINE, `$` matches only at the end of the input (or just before a final newline), so a required non-newline character cannot follow it
re.compile(b"abc$ ")  # a literal space is required after `$`
re.compile(b"abc$ (?s)")  # (?s) is DOTALL, not MULTILINE; NOTE(review): Python 3.11+ rejects global flags that are not at the start of the pattern -- confirm the targeted versions
re.compile(b"\[$]  ")  # `\[` is an escaped literal `[`, so `$` is an anchor followed by a literal `]` and two spaces

#Not unmatchable dollar: each `$` below is either not an anchor or is followed only by items that can match the empty string
re.match(b"[$]  ", b"$  ")  # `$` inside a character class is a literal
re.match(b"\$  ", b"$  ")  # `\$` is an escaped literal dollar sign
re.match(b"abc$(?m)", b"abc")  # only a flag group follows `$`; NOTE(review): Python 3.11+ rejects global flags that are not at the start of the pattern -- confirm the targeted versions
re.match(b"abc$()", b"abc")  # only an empty group follows `$`
re.match(b"((a$)|b)*", b"bba")  # `$` can match in the final repetition
re.match(b"((a$)|b){4}", b"bbba") # Inspired by FP report here: https://github.com/github/codeql/issues/2403
re.match(b"((a$).*)", b"a")  # `.*` after `$` may match the empty string
re.match("(\Aab$|\Aba$)$\Z", "ab")  # only the zero-width `$` and `\Z` follow the inner `$` anchors
re.match(b"((a$\Z)|b){4}", b"bbba")  # `$\Z` can match in the final repetition
re.match(b"(a){00}b", b"b")  # `{00}` repeats the group zero times, so only `b` is required

#Duplicate character in set: each class below lists the same literal character more than once
re.compile(b"[AA]")  # `A` appears twice
re.compile(b"[000]")  # `0` appears three times
re.compile(b"[-0-9-]")  # leading and trailing `-` are both literal hyphens; the middle one forms the range 0-9

#Possible false positives: patterns that resemble the duplicate-character and unmatchable-anchor cases but are well-formed
re.compile(b"[S\S]")  # literal `S` and the class escape `\S` are different items
re.compile(b"[0\000]")  # in this non-raw bytes literal `\000` is an octal escape (NUL), so the class is `0` plus NUL
re.compile(b"[\0000]")  # `\000` (NUL) followed by a literal `0`
re.compile(b"[^^]")  # first `^` negates the class, second is a literal
re.compile(b"[-0-9]")  # a single leading literal `-` followed by the range 0-9
re.compile(b"[]]")  # `]` directly after `[` is a literal member
re.compile(b"^^^x.*")  # repeated `^` anchors are all zero-width at the start
re.compile(b".*x$$$")  # repeated `$` anchors are all zero-width at the end
re.compile(b"x*^y")  # `x*` may match the empty string, so `^` can still be at the start
re.compile(b"x$y*")  # `y*` may match the empty string, so `$` can still be at the end

# False positive for unmatchable caret: the negative lookaheads before `^` are zero-width, so `^` can still match at the start
re.compile(br'(?!DEFAULT_PREFS)(?!CAN_SET_ANON)^[A-Z_]+$')

#Equivalent for unmatchable dollar: only zero-width negative lookaheads sit between the last consumed character and `$`
re.compile(br'^[A-Z_]+(?!DEFAULT_PREFS)(?!CAN_SET_ANON)$')

#And for negative look-behind assertions (also zero-width), placed before `^` and before `$` respectively
re.compile(br'(?<!DEFAULT_PREFS)(?<!CAN_SET_ANON)^[A-Z_]+$')
re.compile(br'^[A-Z_]+(?<!DEFAULT_PREFS)(?<!CAN_SET_ANON)$')


#OK: the positive assertion looks toward the side of the anchor where input can exist
re.compile(br'(?=foo)^\w+')  # lookahead before `^` inspects text after the start
re.compile(br'\w+$(?<=foo)')  # lookbehind after `$` inspects text before the end


#Not OK: the positive assertion requires `foo` on the side of the anchor where no such text can exist (MULTILINE is not set)
re.compile(br'(?<=foo)^\w+')  # needs `foo` before the start of the input
re.compile(br'\w+$(?=foo)')  # needs `foo` after the end of the input


#OK -- ODASA-ODASA-3968 (NOTE(review): the ticket prefix looks duplicated -- confirm the intended id)
re.compile('(?:[^%]|^)?%\((\w*)\)[a-z]')  # `^` is one branch of an optional leading alternation, so it can match at the start

#ODASA-3985
#Half Surrogate pairs
re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')  # high-surrogate range followed by low-surrogate range
#Outside BMP
re.compile(u'[\U00010000-\U0010ffff]')  # range from U+10000 to U+10FFFF

#ODASA-6394  -- Flags defined by keyword; in VERBOSE mode unescaped pattern whitespace is ignored, so the spaces before `^` consume nothing
REGEX0 = re.compile(r'''   ^\s*    ''', re.VERBOSE)  # VERBOSE passed positionally

REGEX1 = re.compile(r'''   ^\s*    ''', flags=re.VERBOSE)  # same pattern, VERBOSE passed via the `flags` keyword

REGEX2 = re.compile(r'''
            ^\s*
            (?P<modifier>[+-]?)
            (?: (?P<week>   \d+ (?:\.\d*)? ) \s* [wW]  )? \s*
            (?: (?P<day>    \d+ (?:\.\d*)? ) \s* [dD]  )? \s*
            (?: (?P<hour>   \d+ (?:\.\d*)? ) \s* [hH]  )? \s*
            (?: (?P<minute> \d+ (?:\.\d*)? ) \s* [mM]  )? \s*
            (?: (?P<second> \d+ (?:\.\d*)? ) \s* [sS]  )? \s*
            $
            ''',
            flags=re.VERBOSE)  # multi-line pattern with VERBOSE given by keyword; layout whitespace around `^` and `$` is ignored by the engine

REGEX3 = re.compile(r'''
            ^\s*
            (?P<modifier>[+-]?)
            (?: (?P<week>   \d+ (?:\.\d*)? ) \s* [wW]  )? \s*
            (?: (?P<day>    \d+ (?:\.\d*)? ) \s* [dD]  )? \s*
            (?: (?P<hour>   \d+ (?:\.\d*)? ) \s* [hH]  )? \s*
            (?: (?P<minute> \d+ (?:\.\d*)? ) \s* [mM]  )? \s*
            (?: (?P<second> \d+ (?:\.\d*)? ) \s* [sS]  )? \s*
            $
            ''',
            re.VERBOSE)  # multi-line pattern with VERBOSE given positionally; layout whitespace around `^` and `$` is ignored by the engine

#ODASA-6780 -- VERBOSE is enabled by the inline (?x) flag at the start of the pattern instead of a `flags` argument
DYLIB_RE = re.compile(r"""(?x)
(?P<location>^.*)(?:^|/)
(?P<name>
    (?P<shortname>\w+?)
    (?:\.(?P<version>[^._]+))?
    (?:_(?P<suffix>[^._]+))?
    \.dylib$
)
""")  # the newlines and indentation inside the pattern are layout only under (?x)

#ODASA-6786 -- the pattern text is bound to a name first and only later compiled with re.VERBOSE
VERBOSE_REGEX = r"""
        \[                                 # [
        (?P<header>[^]]+)                  # very permissive!
        \]                                 # ]
        """  # the `#` comments inside the literal are part of the pattern text and are ignored only when compiled with VERBOSE

# Compiled regular expression marking it as verbose (the flag is supplied here, not where the pattern string is defined)
ODASA_6786 = re.compile(VERBOSE_REGEX, re.VERBOSE)

#Named group with caret and empty choice.
re.compile(r'(?:(?P<n1>^(?:|x)))')  # only group openers precede `^`; `(?:|x)` has an empty first alternative

#Potentially mis-parsed character set
re.compile(r"\[(?P<txt>[^[]*)\]\((?P<uri>[^)]*)")  # `[^[]` and `[^)]` are negated classes; `\[`, `\]` and `\(` are escaped literals

#Allow unicode in raw strings: the escape reaches the regex engine unexpanded, and `re` interprets `\u`/`\U` itself in str patterns
re.compile(r"[\U00010000-\U0010FFFF]")
re.compile(r"[\u0000-\uFFFF]")

#Allow unicode names: `\N{...}` named-character escapes (handled by `re` in str patterns since Python 3.8)
re.compile(r"[\N{degree sign}\N{EM DASH}]")