Enable JavaScript in your browser and then refresh this page for a much-enhanced experience.
Pedantic solution in Uncategorized category for URL Normalization by htamas
# Characters accepted as hexadecimal digits inside a percent-escape.
_HEXDIGITS = frozenset('0123456789ABCDEFabcdef')

# RFC 3986 "unreserved" characters; percent-escapes of these get decoded.
_UNRESERVED = frozenset(
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abcdefghijklmnopqrstuvwxyz'
    '0123456789-._~'
)

# Default port (as it appears in a URL) for each known scheme.
_DEFAULT_PORTS = {
    'ftp': '21', 'gopher': '70', 'http': '80', 'https': '443',
    'ldap': '389', 'ldaps': '636', 'mms': '1755', 'news': '119',
    'pop': '110', 'rlogin': '513', 'rsync': '873', 'rtsp': '554',
    'rtspu': '554', 'sip': '5060', 'sips': '5061', 'snews': '563',
    'ssh': '22', 'telnet': '23', 'tn3270': '23',
}


def _normalize_escapes(text, lowercase):
    """Normalize the percent-escapes found in one URL component.

    Escapes of unreserved characters are decoded (and lowercased when
    *lowercase* is true); every other valid escape gets upper-case hex
    digits. A '%' that is not followed by two hex digits is left as is.
    """
    head, *chunks = text.split('%')
    out = [head]
    for chunk in chunks:
        pair = chunk[:2]
        # The length check matters: a trailing '%' or '%a' yields a chunk
        # shorter than two characters, which is simply not an escape.
        if len(pair) == 2 and all(ch in _HEXDIGITS for ch in pair):
            char = chr(int(pair, 16))
            if char in _UNRESERVED:
                out.append((char.lower() if lowercase else char) + chunk[2:])
            else:
                out.append('%' + pair.upper() + chunk[2:])
        else:
            out.append('%' + chunk)
    return ''.join(out)


def _remove_dot_segments(segments, absolute):
    """Drop '.' segments and resolve '..' segments of a split path.

    For an absolute path the first element of *segments* is the empty
    string that stands for the leading '/'; '..' never removes it, so the
    path cannot climb above the root.
    """
    floor = 1 if absolute else 0
    kept = []
    for segment in segments:
        if segment == '.':
            continue
        if segment == '..':
            if len(kept) > floor:
                kept.pop()
            continue
        kept.append(segment)
    return kept


def checkio(url):
    """Return a normalized form of *url*.

    The URL is split into scheme, userinfo, host, port, path, query and
    fragment, then:

    * scheme and host are lowercased;
    * path segments are lowercased too -- RFC 3986 section 6.2.2.1 says
      they should not be, but the task evaluator requires it;
    * percent-escapes in userinfo, host and path are normalized (unreserved
      characters decoded, remaining escapes upper-cased);
    * the port is removed when it is the default one for the scheme;
    * '.' and '..' path segments are removed.

    Query and fragment are passed through unchanged; an empty query or
    fragment is dropped together with its delimiter.
    """
    # 1. Parse. Fragment and query are detached first (each at its FIRST
    # delimiter) so that a ':' or '/' inside them cannot be mistaken for
    # part of the scheme, authority or path.
    rest, _, fragment = url.partition('#')
    rest, _, query = rest.partition('?')

    # A scheme is a non-empty prefix ending at a ':' that precedes any '/'.
    head, colon, tail = rest.partition(':')
    if colon and head and '/' not in head:
        scheme, rest = head, tail
    else:
        scheme = ''

    # Remember whether an authority was present at all, so that an empty
    # one (as in "file:///etc/passwd") is still written back as "//".
    has_authority = rest.startswith('//')
    if has_authority:
        authority, slash, tail = rest[2:].partition('/')
        path = slash + tail
    else:
        authority, path = '', rest

    if '@' in authority:
        userinfo, authority = authority.split('@', 1)
    else:
        userinfo = ''

    # The port follows the LAST ':'; if a ']' comes after that colon, the
    # colon belongs to a bracketed IPv6 literal and there is no port.
    head, colon, tail = authority.rpartition(':')
    if colon and ']' not in tail:
        host, port = head, tail
    else:
        host, port = authority, ''

    # 2. Normalize.
    scheme = scheme.lower()
    userinfo = _normalize_escapes(userinfo, False)
    host = _normalize_escapes(host.lower(), True)
    segments = [
        _normalize_escapes(segment.lower(), True)
        for segment in path.split('/')
    ]

    if port == _DEFAULT_PORTS.get(scheme):
        port = ''

    segments = _remove_dot_segments(segments, path.startswith('/'))

    # 3. Recompose.
    authority = host
    if port:
        authority += ':' + port
    if userinfo:
        authority = userinfo + '@' + authority

    parts = []
    if scheme:
        parts.append(scheme + ':')
    if has_authority:
        parts.append('//' + authority)
    parts.append('/'.join(segments))
    if query:
        parts.append('?' + query)
    if fragment:
        parts.append('#' + fragment)
    return ''.join(parts)
Dec. 16, 2013
Comments: