Enable Javascript in your browser and then refresh this page, for a much enhanced experience.
Regular Expressions solution in Uncategorized category for URL Normalization by PositronicLlama
"""Normalize a URL."""
import re
import string
# Unfortunately, urllib.parse is not available for this mission, and neither is
# os (for os.path.normpath). So we'll have to do things with regular expressions.
# Splits an absolute URL into its components. Named groups:
#   scheme - letters before "://"
#   host   - everything up to the first "/" or ":"
#   port   - optional decimal digits after ":" (None when absent)
#   path   - optional text after the first "/" following the authority,
#            without that leading "/" (None when absent)
# The pattern is not anchored at the end; callers that need the whole
# string to be consumed must check the match end themselves.
RE_URL = re.compile(
    r'(?P<scheme>[a-zA-Z]+)://'
    r'(?P<host>[^/:]+)'
    r'(?::(?P<port>[0-9]+))?'
    r'(?:/(?P<path>.*))?'
)
# One percent-encoded octet; group 1 holds the two hex digits.
RE_OCTET = re.compile(r'%([0-9a-fA-F]{2})')
# Any single character that is not a letter, digit, "-", "_", "~" or "."
# (the RFC 3986 "unreserved" set); such characters get percent-encoded.
RE_UNSAFE = re.compile(r'([^-_~.a-zA-Z0-9])')
# Port number that is left out of the normalized URL.
DEFAULT_PORT = 80
# The path component of a URI is case-sensitive unless specified otherwise
# by a particular scheme. It is incorrect to reduce it to lower case, but
# this mission requires doing so. See:
#
# http://tools.ietf.org/html/rfc3986#section-6.2.2.1
# "The other generic syntax components are assumed to be case-sensitive."
#
PATH_LOWER = True
# Translation table for str.translate that maps A-Z to a-z and leaves every
# other character (including non-ASCII letters) untouched.
ASCII_LOWER = str.maketrans(string.ascii_uppercase, string.ascii_lowercase)
def normalize_segment(segment):
    """Return a normalized version of a single path segment.

    Percent-encoded octets that stand for unreserved characters are decoded,
    ASCII letters are lower-cased when PATH_LOWER is set, and every other
    character is emitted as an upper-case ``%XX`` escape.

    Raw non-ASCII characters are escaped as the octets of their UTF-8
    encoding, so the result never contains a malformed escape such as
    ``%20AC``.
    """
    # Rewrite raw non-ASCII characters as UTF-8 percent-octets first. After
    # this step every character that reaches the re-encoding pass below has a
    # code point under 0x100, so "%{:02X}" always yields exactly two digits.
    # 'surrogatepass' keeps lone surrogates from raising UnicodeEncodeError.
    segment = ''.join(
        char if ord(char) < 0x80 else ''.join(
            '%{:02X}'.format(byte)
            for byte in char.encode('utf-8', 'surrogatepass')
        )
        for char in segment
    )
    # Decode every escape to the character with that octet value.
    segment = RE_OCTET.sub(lambda m: chr(int(m.group(1), 16)), segment)
    if PATH_LOWER:
        # Only A-Z are affected; decoded high octets stay as they are.
        segment = segment.translate(ASCII_LOWER)
    # Re-escape everything outside the unreserved set with upper-case hex.
    segment = RE_UNSAFE.sub(
        lambda m: '%{:02X}'.format(ord(m.group(1))), segment)
    return segment
def normalize_path(path):
    """Return a normalized version of path.

    ``path`` is the text after the authority's leading "/" or None. A None
    path yields an empty string. "." segments are dropped, ".." removes the
    previously kept segment (if any), and every other segment is passed
    through normalize_segment before the pieces are re-joined with "/".
    """
    if path is None:
        return ''
    kept = []
    for part in path.split('/'):
        if part == '.':
            # Current-directory marker: contributes nothing.
            continue
        if part != '..':
            kept.append(normalize_segment(part))
        elif kept:
            # Parent-directory marker: discard the last kept segment.
            kept.pop()
    return '/'.join(kept)
def checkio(url):
    """Return a normalized version of url.

    The scheme and host are lower-cased, the port is dropped when it is the
    default for the scheme, and the path is passed through normalize_path.
    A URL that RE_URL cannot consume completely is returned unchanged.
    """
    m = RE_URL.match(url)
    # RE_URL is not anchored at the end, so a partial match (for example
    # "http://host:abc/x") would otherwise silently drop the unparsed tail.
    if m is None or m.end() != len(url):
        return url
    scheme = m.group('scheme').lower()
    host = m.group('host').lower()
    path = normalize_path(m.group('path'))
    # Only strip a port that really is the scheme's default; removing ":80"
    # from an https URL would change the port the URL refers to. Schemes
    # without a known default keep any explicit port.
    default_port = {'http': DEFAULT_PORT, 'https': 443}.get(scheme)
    port_text = m.group('port')
    authority = host
    if port_text is not None:
        port = int(port_text)  # int() also strips leading zeros
        if port != default_port:
            authority += ':{}'.format(port)
    url = "{}://{}".format(scheme, authority)
    if path:
        url += "/{}".format(path)
    return url
if __name__ == '__main__':
    # Self-check: (input URL, expected normalized URL, failure message).
    cases = (
        ("Http://Www.Checkio.org",
         "http://www.checkio.org", "1st rule"),
        ("http://www.checkio.org/%cc%b1bac",
         "http://www.checkio.org/%CC%B1bac", "2nd rule"),
        ("http://www.checkio.org/task%5F%31",
         "http://www.checkio.org/task_1", "3rd rule"),
        ("http://www.checkio.org:80/home/",
         "http://www.checkio.org/home/", "4th rule"),
        ("http://www.checkio.org/task/./1/../2/././name",
         "http://www.checkio.org/task/2/name", "5th rule"),
    )
    for raw, expected, rule in cases:
        assert checkio(raw) == expected, rule
Sept. 3, 2013
Comments: