# Enable JavaScript in your browser and then refresh this page for a much enhanced experience.
# "Documented and Relatively Extensible" solution in the Clear category for "URL Normalization" by alexgerdom
import string
import re
from collections import deque
# ============================================== Character Sets
# Character classes taken from the ABNF appendix of RFC 3986
# (https://tools.ietf.org/html/rfc3986).
ALPHA = set(string.ascii_lowercase) | set(string.ascii_uppercase)
DIGIT = set(string.digits)
HEXDIG = set(string.hexdigits)
# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
UNRESERVED = ALPHA.union(DIGIT, "-._~")

# ============================================== Regex
# Splits a uri into its main components; every component except the path
# is optional.
# source: https://tools.ietf.org/html/rfc3986#page-50
URI_RE = re.compile(
    r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
)

# Matches a single percent-encoded octet: '%' followed by two hex digits.
PERCENT_ENC_RE = re.compile(
    r'%[0123456789abcdefABCDEF]{2}'
)
# ============================================== Parsing functions
def parse(uri):
    """
    Split a uri string into its five top-level components.

    Returns a tuple (scheme, authority, path, query, fragment). The path
    is always a string (possibly empty); each of the other components is
    a string when present in the uri and None otherwise.

    NOTE: To make it easy to glue the normalized pieces back together,
    each component is returned together with its delimiter (e.g. the
    trailing ':' of the scheme, the leading '//' of the authority), even
    though the RFC does not consider the delimiter part of the component.
    """
    captured = URI_RE.match(uri).groups()
    # The even-indexed groups are the delimiter-inclusive captures:
    #   0 -> scheme     ex: 'http:' | 'https:' | 'mailto:'
    #   2 -> authority  ex: '//www.ics.uci.edu'
    #   4 -> path       ex: '/pub/ietf/uri/'
    #   6 -> query      ex: '?SomeQuery=True'
    #   8 -> fragment   ex: '#page50'
    scheme, authority, path, query, fragment = captured[0::2]
    return scheme, authority, path, query, fragment
def decompose_authority(authority):
    """
    Split the authority component of a uri into its subcomponents.

    Takes the authority component as a string (or None when the uri has no
    authority) and returns a tuple (userinfo, host, port). Each entry is a
    string when found, otherwise None.

    Delimiters stay attached to the pieces so that concatenating the
    non-None entries reproduces the input: userinfo keeps everything up to
    and including the '@', and port keeps its leading ':'.

    Ex:
    >>> decompose_authority('//user@example.com:8080')
    ('//user@', 'example.com', ':8080')
    """
    # ===================================================================
    # Relevant part of the ABNF from the RFC
    # authority = [ userinfo "@" ] host [ ":" port ]
    # userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
    # host = IP-literal / IPv4address / reg-name
    # port = *DIGIT
    # ===================================================================
    userinfo = host = port = None
    # A uri without an authority (e.g. 'mailto:...') has nothing to split.
    if authority is None:
        return userinfo, host, port
    # Extract userinfo: everything up to and including the first '@'.
    if '@' in authority:
        split_at = authority.index('@') + 1
        userinfo, authority = authority[:split_at], authority[split_at:]
    # Extract host and port: a trailing ':<digits>' is the port.
    portmatch = re.match(r"(^.+)(:\d+)$", authority)
    if portmatch:
        host, port = portmatch.groups()
    else:
        host = authority
    return userinfo, host, port
# ============================================== Normalization Rules and Helpers
def decode_unreserved_characters(octet_match):
    """
    Substitution callback for percent-encoded octets.

    Takes a regex match object whose matched text is a percent-encoded
    octet ('%' followed by two hex digits). Returns the decoded character
    when it belongs to UNRESERVED; otherwise returns the matched text
    unchanged.
    """
    encoded = octet_match.group()
    # Skip the leading '%' and read the two hex digits as a code point.
    character = chr(int(encoded[1:], 16))
    return character if character in UNRESERVED else encoded
def fix_percent_encoding(urlsegment):
    """
    Decode the percent-encoded unreserved characters in a uri.

    Takes part (or all) of a uri as a string and returns the same text with
    every percent-encoded octet passed through
    decode_unreserved_characters.
    """
    decoded = re.sub(PERCENT_ENC_RE, decode_unreserved_characters, urlsegment)
    return decoded
# Adapted from https://tools.ietf.org/html/rfc3986#section-5.2.4
def remove_dot_segments(path):
    """
    Resolve the '.' and '..' segments of a path.

    Takes a path as a string and returns, as a string, the equivalent path
    with occurrences of '.' and '..' segments resolved, following
    RFC 3986 section 5.2.4 for absolute paths:

    * '..' segments that would climb above the root of an absolute path
      are dropped, so the leading '/' is always preserved.
    * a trailing '.' or '..' leaves a trailing '/' on the result.

    Raises NotImplementedError for a relative path (no leading '/') whose
    '..' segments climb above its starting point.

    Ex:
    >>> remove_dot_segments('/a/b/../c/./d.html')
    '/a/c/d.html'
    >>> remove_dot_segments('/../bar')
    '/bar'
    >>> remove_dot_segments('/a/b/..')
    '/a/'
    """
    if not path:
        return path
    is_absolute = path.startswith('/')
    segments = path.split('/')
    if is_absolute:
        # Drop the empty string produced by the leading '/', so that a '..'
        # can never pop the root itself.
        segments = segments[1:]
    last_index = len(segments) - 1
    resolved = []
    for index, segment in enumerate(segments):
        if segment == '..':
            if resolved:
                resolved.pop()
            elif not is_absolute:
                m = ('Abnormal path specified:[{}]\n'
                     'remove_dot_segments is not intended to handle '
                     'relative paths that climb above their starting '
                     'point, such as ../bar')
                raise NotImplementedError(m.format(path))
            # For an absolute path already at the root, '..' is a no-op.
        elif segment != '.':
            resolved.append(segment)
            continue
        # A dot segment in final position still names a directory, so keep
        # the trailing '/' by appending an empty last segment.
        if index == last_index:
            resolved.append('')
    prefix = '/' if is_absolute else ''
    return prefix + '/'.join(resolved)
def checkio(url):
    """
    Normalize a url.

    Takes a url as a string and returns the normalized url as a string.
    The following rules are applied:

    1. The url is converted to lower case.
    2. Letters in percent-encoded escape sequences are capitalized.
    3. Percent-encoded unreserved characters are decoded.
    4. The port is removed when it is the default port of the scheme
       (80 for http and for scheme-less urls, 443 for https).
    5. Dot segments are removed from the path.
    """
    # ============== Handle percent encoded octets
    # [Rule 3: Decode any unreserved characters]
    # Decode first in case we have an upper case percent-encoded octet
    uri = fix_percent_encoding(url)
    # [Rule 1: Convert the scheme and host to lower case]
    # Though the task specifies that the scheme and host need to be
    # converted to lowercase, it's actually intended for the path to be
    # normalized as well, so the whole uri is lower-cased. This may need
    # refactoring to handle uris that contain a userinfo subcomponent
    # in the authority segment. (Need to check if userinfo is case sensitive)
    uri = uri.lower()
    # [Rule 2: Capitalize letters in escape sequences]
    # Runs after lower() so the escapes that survived rule 3 end up upper case.
    uri = PERCENT_ENC_RE.sub(lambda m: m.group().upper(), uri)
    # ============== Start breaking down and processing the url
    scheme, authority, path, query, fragment = parse(uri)
    if authority is None:
        # No authority component (e.g. 'mailto:...'): nothing to decompose.
        userinfo = host = port = None
    else:
        userinfo, host, port = decompose_authority(authority)
    # [Rule 4: If using the default port, remove it from the url]
    # The default port depends on the scheme; keys and values carry the
    # same delimiters that parse() and decompose_authority() leave attached.
    default_ports = {None: ':80', 'http:': ':80', 'https:': ':443'}
    if port is not None and port == default_ports.get(scheme):
        port = ''
    # [Rule 5: Remove dot segments from the path]
    path = remove_dot_segments(path)
    # Recombine segments, treating missing (None) components as empty.
    segments = (scheme, userinfo, host, port, path, query, fragment)
    return "".join(seg or '' for seg in segments)
# July 22, 2016