Enable Javascript in your browser and then refresh this page, for a much enhanced experience.
Third solution in Clear category for URL Normalization by mdigi
# migrated from python 2.7
import re
def _decode_unreserved(match):
    """Decode one ``%XX`` escape if it encodes an unreserved character.

    ``match`` is a regex match covering a single escape whose hex digits
    are already upper case.  Escapes for ASCII letters, digits, ``-``,
    ``.``, ``_`` and ``~`` are replaced by the character itself (letters
    come out lower case, because the whole URL is lower-cased by rule 1).
    Every other escape is returned unchanged.
    """
    char = chr(int(match.group(0)[1:], 16)).lower()
    # Explicit ASCII whitelist: str.isalnum() would also accept code points
    # such as 0xCC, which must stay percent-encoded.
    if char in 'abcdefghijklmnopqrstuvwxyz0123456789-._~':
        return char
    return match.group(0)


def _remove_dot_segments(path):
    """Resolve ``.`` and ``..`` segments of a path that is empty or starts with ``/``.

    A ``.`` segment is dropped; a ``..`` segment drops itself and the
    segment before it.  ``..`` never climbs above the root.  A dot-segment
    at the very end is removed together with its separator
    (``/a/b/..`` -> ``/a``).
    """
    kept = []
    for segment in path.split('/'):
        if segment == '..':
            # kept[0] is the empty string in front of the leading "/";
            # popping it would turn an absolute path into a relative one.
            if len(kept) > 1:
                kept.pop()
        elif segment != '.':
            kept.append(segment)
    return '/'.join(kept)


def checkio(url):
    """Return ``url`` normalized by five rules.

    1. The whole URL is converted to lower case.
    2. Hex digits of percent-escapes are converted to upper case.
    3. Percent-escapes of unreserved characters (ASCII letters, digits,
       ``-``, ``.``, ``_``, ``~``) are decoded.
    4. The default port ``:80`` is removed from the authority part.
    5. Dot-segments (``.`` and ``..``) are removed from the path.

    Rules 4 and 5 are applied to the authority and the path respectively;
    a query string or fragment is passed through untouched.

    Args:
        url: the URL to normalize, as a string.

    Returns:
        The normalized URL string.
    """
    # rule 1 -> lower case
    url = url.lower()

    # rule 2 -> upper case for escape sequences (two hex digits only, so
    # something like "%zz" is left alone)
    url = re.sub(r'%[0-9a-f]{2}', lambda m: m.group(0).upper(), url)

    # rule 3 -> decode escapes of unreserved characters in a single pass,
    # so text produced by decoding is never decoded a second time
    url = re.sub(r'%[0-9A-F]{2}', _decode_unreserved, url)

    # Split into (optional scheme + authority), path, and query/fragment.
    # Every part may be empty, so the match always succeeds.
    head, path, tail = re.match(
        r'((?:[a-z][a-z0-9+.\-]*://)?[^/?#]*)([^?#]*)(.*)', url, re.S
    ).groups()

    # rule 4 -> remove default port, only where it ends the authority
    head = re.sub(r':80$', '', head)

    # rule 5 -> remove dot-segments from the path only
    return head + _remove_dot_segments(path) + tail
# Self-check only: these cases are not required by the automatic grader.
if __name__ == '__main__':
    # Each entry is (input URL, expected normalized URL, assertion message).
    cases = (
        ("Http://Www.Checkio.org",
         "http://www.checkio.org", "1st rule"),
        ("http://www.checkio.org/%cc%b1bac",
         "http://www.checkio.org/%CC%B1bac", "2nd rule"),
        ("http://www.checkio.org/task%5F%31",
         "http://www.checkio.org/task_1", "3rd rule"),
        ("http://www.checkio.org:80/home/",
         "http://www.checkio.org/home/", "4th rule"),
        ("http://www.checkio.org:8080/home/",
         "http://www.checkio.org:8080/home/", "4th rule again"),
        ("http://www.checkio.org/task/./1/../2/././name",
         "http://www.checkio.org/task/2/name", "5th rule"),
    )
    # Checked in order, so the first failing rule is the one reported.
    for raw, expected, label in cases:
        assert checkio(raw) == expected, label
    print('First set of tests done')
March 12, 2014
Comments: