/** * @name Incomplete URL substring sanitization * @description Security checks on the substrings of an unparsed URL are often vulnerable to bypassing. * @kind problem * @problem.severity warning * @precision high * @id py/incomplete-url-substring-sanitization * @tags correctness * security * external/cwe/cwe-20 */ import python import semmle.python.regex private string commonTopLevelDomainRegex() { result = "com|org|edu|gov|uk|net|io" } predicate looksLikeUrl(StrConst s) { exists(string text | text = s.getText() | text.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+(" + commonTopLevelDomainRegex() +")(:[0-9]+)?/?") or // target is a HTTP URL to a domain on any TLD text.regexpMatch("(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?") ) } predicate incomplete_sanitization(Expr sanitizer, StrConst url) { looksLikeUrl(url) and ( sanitizer.(Compare).compares(url, any(In i), _) or call_to_startswith(sanitizer, url) or unsafe_call_to_endswith(sanitizer, url) ) } predicate call_to_startswith(Call sanitizer, StrConst url) { sanitizer.getFunc().(Attribute).getName() = "startswith" and sanitizer.getArg(0) = url } predicate unsafe_call_to_endswith(Call sanitizer, StrConst url) { sanitizer.getFunc().(Attribute).getName() = "endswith" and sanitizer.getArg(0) = url and not url.getText().regexpMatch("(?i)\\.([a-z0-9-]+)(\\.[a-z0-9-]+)+") } from Expr sanitizer, StrConst url where incomplete_sanitization(sanitizer, url) select sanitizer, "'$@' may be at an arbitrary position in the sanitized URL.", url, url.getText()