From 0354ae8514baeccbdfda6c47163792365669d596 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 20 Jun 2017 17:20:37 +0200 Subject: [PATCH] bpo-30713: Reject newline in urllib.parse The splittype(), splitport() and splithost() functions of the urllib.parse module now reject URLs which contain a newline character. --- Lib/test/test_urlparse.py | 27 +++++++++++++++++++++++++++ Lib/urllib/parse.py | 12 ++++++------ Misc/NEWS | 3 +++ 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index e5f6130e4a075e..6beaa3d4f3b86b 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -981,6 +981,15 @@ def test_splittype(self): self.assertEqual(splittype('type:'), ('type', '')) self.assertEqual(splittype('type:opaque:string'), ('type', 'opaque:string')) + # bpo-30713: The newline character U+000A is invalid in URLs + for url in ( + '\ntype:string', + 'ty\npe:string', + 'type:str\ning', + 'type:string\n', + ): + self.assertEqual(splittype(url), (None, url)) + def test_splithost(self): splithost = urllib.parse.splithost self.assertEqual(splithost('//www.example.org:80/foo/bar/baz.html'), @@ -1010,6 +1019,15 @@ def test_splithost(self): self.assertEqual(splithost("//example.net/file#"), ('example.net', '/file#')) + # bpo-30713: The newline character U+000A is invalid in URLs + for url in ( + '\n//hostname/url', + '//host\nname/url', + '//hostname/u\nrl', + '//hostname/url\n', + ): + self.assertEqual(splithost(url), (None, url)) + def test_splituser(self): splituser = urllib.parse.splituser self.assertEqual(splituser('User:Pass@www.python.org:080'), @@ -1052,6 +1070,15 @@ def test_splitport(self): self.assertEqual(splitport('[::1]'), ('[::1]', None)) self.assertEqual(splitport(':88'), ('', '88')) + # bpo-30713: The newline character U+000A is invalid in URLs + for url in ( + '\nparrot:88', + 'par\nrot:88', + 'parrot:8\n8', + 'parrot:88\n', + ): + self.assertEqual(splitport(url), (url, None)) + def test_splitnport(self): splitnport = urllib.parse.splitnport self.assertEqual(splitnport('parrot:88'), ('parrot', 88)) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 01eb54906c8a53..65bf7215e00d89 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -934,9 +934,9 @@ def splittype(url): """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" global _typeprog if _typeprog is None: - _typeprog = re.compile('([^/:]+):(.*)', re.DOTALL) + _typeprog = re.compile('([^/:\n]+):(.*)') - match = _typeprog.match(url) + match = _typeprog.fullmatch(url) if match: scheme, data = match.groups() return scheme.lower(), data @@ -947,9 +947,9 @@ def splithost(url): """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" global _hostprog if _hostprog is None: - _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL) + _hostprog = re.compile('//([^/#?\n]*)(.*)') - match = _hostprog.match(url) + match = _hostprog.fullmatch(url) if match: host_port, path = match.groups() if path and path[0] != '/': @@ -973,9 +973,9 @@ def splitport(host): """splitport('host:port') --> 'host', 'port'.""" global _portprog if _portprog is None: - _portprog = re.compile('(.*):([0-9]*)$', re.DOTALL) + _portprog = re.compile('(.*):([0-9]*)') - match = _portprog.match(host) + match = _portprog.fullmatch(host) if match: host, port = match.groups() if port: diff --git a/Misc/NEWS b/Misc/NEWS index 88b1e3e22f7118..807e9bace6262e 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -368,6 +368,9 @@ Extension Modules Library ------- +- [Security] bpo-30713: The splittype(), splitport() and splithost() functions + of the urllib.parse module now reject URLs which contain a newline character. + - bpo-29755: Fixed the lgettext() family of functions in the gettext module. They now always return bytes.