Enable Javascript in your browser and then refresh this page, for a much enhanced experience.
First solution in Clear category for URL Normalization by simleo
import string, re
## import os.path
## import urllib.parse as urlparse
#--- cut & paste from stdlib to fix missing modules ---
from collections import namedtuple
def normpath(path):
"""Normalize path, eliminating double slashes, etc."""
if isinstance(path, bytes):
sep = b'/'
empty = b''
dot = b'.'
dotdot = b'..'
else:
sep = '/'
empty = ''
dot = '.'
dotdot = '..'
if path == empty:
return dot
initial_slashes = path.startswith(sep)
# POSIX allows one or two initial slashes, but treats three or more
# as single slash.
if (initial_slashes and
path.startswith(sep*2) and not path.startswith(sep*3)):
initial_slashes = 2
comps = path.split(sep)
new_comps = []
for comp in comps:
if comp in (empty, dot):
continue
if (comp != dotdot or (not initial_slashes and not new_comps) or
(new_comps and new_comps[-1] == dotdot)):
new_comps.append(comp)
elif new_comps:
new_comps.pop()
comps = new_comps
path = sep.join(comps)
if initial_slashes:
path = sep*initial_slashes + path
return path or dot
class _NetlocResultMixinBase(object):
"""Shared methods for the parsed result objects containing a netloc element"""
__slots__ = ()
@property
def username(self):
return self._userinfo[0]
@property
def password(self):
return self._userinfo[1]
@property
def hostname(self):
hostname = self._hostinfo[0]
if not hostname:
hostname = None
elif hostname is not None:
hostname = hostname.lower()
return hostname
@property
def port(self):
port = self._hostinfo[1]
if port is not None:
port = int(port, 10)
return port
# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
"""Standard approach to encoding parsed results from str to bytes"""
__slots__ = ()
def encode(self, encoding='ascii', errors='strict'):
return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
__slots__ = ()
@property
def _userinfo(self):
netloc = self.netloc
userinfo, have_info, hostinfo = netloc.rpartition('@')
if have_info:
username, have_password, password = userinfo.partition(':')
if not have_password:
password = None
else:
username = password = None
return username, password
@property
def _hostinfo(self):
netloc = self.netloc
_, _, hostinfo = netloc.rpartition('@')
_, have_open_br, bracketed = hostinfo.partition('[')
if have_open_br:
hostname, _, port = bracketed.partition(']')
_, have_port, port = port.partition(':')
else:
hostname, have_port, port = hostinfo.partition(':')
if not have_port:
port = None
return hostname, port
_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
__slots__ = ()
def geturl(self):
return urlunsplit(self)
MAX_CACHE_SIZE = 20
_parse_cache = {}
def _splitnetloc(url, start=0):
delim = len(url) # position of end of domain part of url, default is end
for c in '/?#': # look for delimiters; the order is NOT important
wdelim = url.find(c, start) # find first of this delim
if wdelim >= 0: # if found
delim = min(delim, wdelim) # use earliest delim position
return url[start:delim], url[delim:] # return (domain, rest)
def _noop(obj):
return obj
def _coerce_args(*args):
# Invokes decode if necessary to create str args
# and returns the coerced inputs along with
# an appropriate result coercion function
# - noop for str inputs
# - encoding function otherwise
str_input = isinstance(args[0], str)
for arg in args[1:]:
# We special-case the empty string to support the
# "scheme=''" default argument to some functions
if arg and isinstance(arg, str) != str_input:
raise TypeError("Cannot mix str and non-str arguments")
if str_input:
return args + (_noop,)
return _decode_args(args) + (_encode_result,)
def urlsplit(url, scheme='', allow_fragments=True):
"""Parse a URL into 5 components:
:///?#
Return a 5-tuple: (scheme, netloc, path, query, fragment).
Note that we don't break the components up in smaller bits
(e.g. netloc is a single string) and we don't expand % escapes."""
url, scheme, _coerce_result = _coerce_args(url, scheme)
allow_fragments = bool(allow_fragments)
key = url, scheme, allow_fragments, type(url), type(scheme)
cached = _parse_cache.get(key, None)
if cached:
return _coerce_result(cached)
if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
clear_cache()
netloc = query = fragment = ''
i = url.find(':')
if i > 0:
if url[:i] == 'http': # optimize the common case
scheme = url[:i].lower()
url = url[i+1:]
if url[:2] == '//':
netloc, url = _splitnetloc(url, 2)
if (('[' in netloc and ']' not in netloc) or
(']' in netloc and '[' not in netloc)):
raise ValueError("Invalid IPv6 URL")
if allow_fragments and '#' in url:
url, fragment = url.split('#', 1)
if '?' in url:
url, query = url.split('?', 1)
v = SplitResult(scheme, netloc, url, query, fragment)
_parse_cache[key] = v
return _coerce_result(v)
for c in url[:i]:
if c not in scheme_chars:
break
else:
try:
# make sure "url" is not actually a port number (in which case
# "scheme" is really part of the path
_testportnum = int(url[i+1:])
except ValueError:
scheme, url = url[:i].lower(), url[i+1:]
if url[:2] == '//':
netloc, url = _splitnetloc(url, 2)
if (('[' in netloc and ']' not in netloc) or
(']' in netloc and '[' not in netloc)):
raise ValueError("Invalid IPv6 URL")
if allow_fragments and scheme in uses_fragment and '#' in url:
url, fragment = url.split('#', 1)
if scheme in uses_query and '?' in url:
url, query = url.split('?', 1)
v = SplitResult(scheme, netloc, url, query, fragment)
_parse_cache[key] = v
return _coerce_result(v)
def urlunsplit(components):
"""Combine the elements of a tuple as returned by urlsplit() into a
complete URL as a string. The data argument can be any five-item iterable.
This may result in a slightly different, but equivalent URL, if the URL that
was parsed originally had unnecessary delimiters (for example, a ? with an
empty query; the RFC states that these are equivalent)."""
scheme, netloc, url, query, fragment, _coerce_result = (
_coerce_args(*components))
if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
if url and url[:1] != '/': url = '/' + url
url = '//' + (netloc or '') + url
if scheme:
url = scheme + ':' + url
if query:
url = url + '?' + query
if fragment:
url = url + '#' + fragment
return _coerce_result(url)
#------------------------------------------------------
UNRESERVED = string.ascii_letters + string.digits + '-._~'
DECODE_MAP = dict(('%%%02X' % ord(c), c) for c in UNRESERVED)
ESCAPE = re.compile('%[a-z0-9]{2}', re.IGNORECASE)
def checkio(url):
def decode(m):
esc = m.group(0).upper()
try:
esc = DECODE_MAP[esc].lower()
except KeyError:
pass
return esc
url = re.sub(ESCAPE, decode, url.lower())
scheme, netloc, path, query, fragment = urlsplit(url)
try:
host, port = netloc.split(':')
except ValueError:
pass
else:
if port == '80':
netloc = host
if path:
path = normpath(path)
if url.endswith('/'):
path = '%s/' % path
return urlunsplit((scheme, netloc, path, query, fragment))
#These "asserts" using only for self-checking and not necessary for auto-testing
if __name__ == '__main__':
assert checkio("Http://Www.Checkio.org") == \
"http://www.checkio.org", "1st rule"
assert checkio("http://www.checkio.org/%cc%b1bac") == \
"http://www.checkio.org/%CC%B1bac", "2nd rule"
assert checkio("http://www.checkio.org/task%5F%31") == \
"http://www.checkio.org/task_1", "3rd rule"
assert checkio("http://www.checkio.org:80/home/") == \
"http://www.checkio.org/home/", "4th rule"
assert checkio("http://www.checkio.org:8080/home/") == \
"http://www.checkio.org:8080/home/", "4th rule again"
assert checkio("http://www.checkio.org/task/./1/../2/././name") == \
"http://www.checkio.org/task/2/name", "5th rule"
print('First set of tests done')
Jan. 20, 2014
Comments: