Documented and Relatively Extensible clear solution for URL Normalization by alexgerdom - python coding challenges

Enable Javascript in your browser and then refresh this page, for a much enhanced experience.
Documented and Relatively Extensible solution in Clear category for URL Normalization by alexgerdom
import string
import re
from collections import deque

# ============================================== Character Sets
# Character classes adapted from the appendix of
# https://tools.ietf.org/html/rfc3986
ALPHA = set(string.ascii_lowercase) | set(string.ascii_uppercase)
DIGIT = set(string.digits)
HEXDIG = set(string.hexdigits)
# Characters that are decoded when they show up percent-encoded
# (see decode_unreserved_characters).
UNRESERVED = set().union(ALPHA, DIGIT, "-._~")

# ============================================== Regex
# Splits a uri into its main components. Capture groups 1, 3, 5, 7 and 9
# hold scheme, authority, path, query and fragment *with* their delimiters;
# the groups nested inside them hold the same components without delimiters.
# source: https://tools.ietf.org/html/rfc3986#page-50
URI_RE = re.compile(
    r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
)

# Matches one percent-encoded octet: '%' followed by two hex digits.
PERCENT_ENC_RE = re.compile(r'%[0123456789abcdefABCDEF]{2}')


# ============================================== Parsing functions
def parse(uri):
    """
    Split a uri string into its five top-level components.

    Returns the tuple (scheme, authority, path, query, fragment). A
    component that is absent from the uri is returned as None; the path
    pattern can match the empty string, so the path is always a
    (possibly empty) string.

    NOTE: Each component is returned together with its delimiter, even
    though the delimiter is not formally part of the component. This lets
    the normalized pieces be concatenated straight back into a url:
        scheme    -> 'http:'
        authority -> '//www.ics.uci.edu'
        path      -> '/pub/ietf/uri/'
        query     -> '?SomeQuery=True'
        fragment  -> '#page50'
    """
    # The odd-numbered groups of URI_RE are the delimiter-inclusive ones.
    components = URI_RE.match(uri).group(1, 3, 5, 7, 9)
    return components


def decompose_authority(authority):
    """
    Split the authority component of a uri into (userinfo, host, port).

    Each subcomponent is a string if it is found, otherwise None. The
    input is cut into consecutive pieces and delimiters are kept, so the
    non-None pieces concatenate back to the original authority:
      * userinfo includes its trailing '@'
      * port includes its leading ':'
      * a leading '//' (if the caller passes one) stays on whichever
        piece comes first.

    If authority is None (the uri had no authority component at all),
    (None, None, None) is returned.

    Ex:
        >>> decompose_authority('//user@example.com:8080')
        ('//user@', 'example.com', ':8080')
    """
    # ===================================================================
    # Relevant part of the ABNF from the RFC
    #    authority     = [ userinfo "@" ] host [ ":" port ]
    #    userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
    #    host          = IP-literal / IPv4address / reg-name
    #    port          = *DIGIT
    # ===================================================================
    if authority is None:
        return None, None, None

    userinfo = port = None
    hostport = authority

    # Extract userinfo. A well-formed authority has at most one '@';
    # splitting on the last one keeps the host intact if the userinfo
    # contains a stray unencoded '@'.
    if '@' in authority:
        cut = authority.rindex('@') + 1
        userinfo, hostport = authority[:cut], authority[cut:]

    # Extract host and port. Only a non-empty, all-digit port is split
    # off; anything else is left as part of the host.
    portmatch = re.match(r"(^.+)(:\d+)$", hostport)
    if portmatch:
        host, port = portmatch.groups()
    else:
        host = hostport
    return userinfo, host, port


# ============================================== Normalization Rules and Helpers

def decode_unreserved_characters(octet_match):
    """
    Substitution callback that decodes a percent-encoded octet when safe.

    Takes a regex match object whose matched text is one percent-encoded
    octet ('%' followed by two hex digits).
    Returns the character the octet stands for if that character is in
    UNRESERVED; otherwise returns the matched text unchanged.
    """
    encoded = octet_match.group()
    # Drop the leading '%' and read the remaining two characters as hex.
    character = chr(int(encoded[1:], 16))
    return character if character in UNRESERVED else encoded


def fix_percent_encoding(urlsegment):
    """
    Decode the percent-encoded unreserved characters in a uri fragment.

    Takes part (or all) of a uri as a string. Every percent-encoded octet
    found by PERCENT_ENC_RE is passed to decode_unreserved_characters;
    the resulting string is returned.
    """
    decoded_segment = PERCENT_ENC_RE.sub(
        decode_unreserved_characters,
        urlsegment,
    )
    return decoded_segment


# Adapted from https://tools.ietf.org/html/rfc3986#section-5.2.4
def remove_dot_segments(path):
    """
    Takes a path as a string.
    Returns as a string the equivalent path obtained by resolving
    occurrences of '.' and '..' segments.

    A '.' or '..' that is the final segment leaves a trailing '/', as in
    the RFC algorithm ('/a/b/..' -> '/a/', '/a/.' -> '/a/').

    Raises NotImplementedError for abnormal paths, i.e. when a '..' has
    no preceding segment left to remove (it would climb above the start
    of the path, e.g. '/../bar' or '../bar').

    Ex:
        >>> remove_dot_segments('/a/b/../c/./d.html')
        '/a/c/d.html'
    """
    segments = path.split('/')
    # For an absolute path the first split entry is the empty string in
    # front of the leading '/'. '..' must never remove it, otherwise the
    # result silently turns into a relative path.
    protected = 1 if path.startswith('/') else 0
    last_index = len(segments) - 1

    kept = []
    for index, segment in enumerate(segments):
        if segment not in ('.', '..'):
            kept.append(segment)
            continue
        if segment == '..':
            if len(kept) <= protected:
                m = ('Abnormal path specified:[{}]\n'
                     'remove_dot_segments is not intended to handle '
                     'abnormal paths such as in www.example.com/../../bar/')
                raise NotImplementedError(m.format(path))
            kept.pop()
        if index == last_index:
            # A dot segment at the very end still denotes a directory,
            # so keep the trailing slash.
            kept.append('')
    return '/'.join(kept)


def checkio(url):
    """
    Normalize a url.

    Takes the url as a string and returns the normalized url as a string.
    The following rules are applied:
        1. The whole url (not just scheme and host) is lower-cased.
        2. Letters in percent-encoded escape sequences are upper-cased.
        3. Percent-encoded unreserved characters are decoded.
        4. A ':80' port is dropped.
        5. Dot segments are removed from the path.

    Urls without an authority component (no '//host' part) are accepted;
    they simply have no userinfo, host or port to normalize.
    """
    # ============== Handle percent encoded octets
    # [Rule 3: Decode any unreserved characters]
    # Decode first in case we have an upper case percent-encoded octet
    uri = fix_percent_encoding(url)

    # [Rule 1: Convert the scheme and host to lower case]
    # Though the task specifies that the scheme and host need to be
    # converted to lowercase, it's actually intended for the path to be
    # normalized as well. This may need refactoring if uris with a
    # userinfo subcomponent must be handled.
    # NOTE(review): need to check whether userinfo is case sensitive.
    uri = uri.lower()

    # [Rule 2: Capitalize letters in escape sequences]
    # Must run after lower(), which also lower-cased the hex digits.
    uri = PERCENT_ENC_RE.sub(lambda m: m.group().upper(), uri)

    # ============== Start breaking down and processing the url
    scheme, authority, path, query, fragment = parse(uri)
    if authority is None:
        # No '//...' part in the uri: nothing to decompose.
        userinfo = host = port = None
    else:
        userinfo, host, port = decompose_authority(authority)

    # [Rule 4: If using the default port, remove it from the url]
    # NOTE(review): ':80' is treated as the default regardless of scheme.
    if port == ':80':
        port = ''

    # [Rule 5: Remove dot segments from the path]
    path = remove_dot_segments(path)

    # Recombine segments; missing (None) components contribute nothing.
    segments = (scheme, userinfo, host, port, path, query, fragment)
    return "".join(seg or '' for seg in segments)
July 22, 2016
Rock
0 %
Your ClassRoom Demo is Ready. With the new tool CheckiO ClassRoom, you will be able to analyze your students' progress, have your own leaderboard, change missions order on the map, and so on.