Enable JavaScript in your browser and then refresh this page for a much-enhanced experience.
First solution in Clear category for URL Normalization by wo.tomasz
import re
# Port suffixes (a colon followed by the port number) that RemovePort strips
# from a URL. Presumably "dp" stands for "default ports".
dp_set = {
    ':' + str(port_number)
    for port_number in (
        20, 21, 22, 23, 25, 53, 67, 68, 80,
        110, 119, 123, 143, 161, 194, 443, 546, 547,
    )
}
def CapitalizePercentEncodingTriplet(url):
    """Upper-case the hex digits of every percent-encoded triplet in *url*.

    A triplet is ``%`` followed by exactly two hexadecimal digits, in either
    case. Anything else that merely starts with ``%`` (for example ``%zz``)
    is not a percent-encoding and is left untouched.

    Args:
        url: The URL string to process.

    Returns:
        The URL with each ``%xx`` triplet rewritten as ``%XX``.
    """
    # A single regex pass: only genuine hex triplets are matched, and each one
    # is replaced by its upper-cased form.
    return re.sub(
        r'%[0-9a-fA-F]{2}',
        lambda triplet: triplet.group().upper(),
        url,
    )
def DecodingPercentEncodedOctets(url):
    """Decode percent-encoded triplets that stand for unreserved characters.

    Only the RFC 3986 "unreserved" set is decoded: letters (``%41``-``%5A``,
    ``%61``-``%7A``), digits (``%30``-``%39``), hyphen (``%2D``), period
    (``%2E``), underscore (``%5F``) and tilde (``%7E``). Triplets for any
    other character (``%40`` "@", ``%5B`` "[", ``%2F`` "/", ...) are kept
    encoded, because decoding them could change the meaning of the URL.

    Triplets are expected to be already capitalized (``%5F``, not ``%5f``);
    lower-case triplets are left as they are. Decoded letters are emitted in
    lower case.

    Args:
        url: The URL string to process.

    Returns:
        The URL with unreserved triplets replaced by their literal character.
    """
    unreserved_triplet = re.compile(
        r'%(?:2[DE]'        # - .
        r'|3[0-9]'          # 0-9
        r'|4[1-9A-F]'       # A-O
        r'|5[0-9AF]'        # P-Z and _
        r'|6[1-9A-F]'       # a-o
        r'|7[0-9AE])'       # p-z and ~
    )
    # One pass over the input: text produced by a replacement is never
    # scanned again, so a decoded character cannot combine with its
    # neighbours into a new triplet and get decoded a second time.
    return unreserved_triplet.sub(
        lambda triplet: chr(int(triplet.group()[1:], 16)).lower(),
        url,
    )
def RemovePort(url, default_ports=None):
    """Strip the port from the authority part of *url* if it is a default port.

    Only the port that directly follows the host is considered, i.e. a
    ``:digits`` run that ends the authority (it is followed by ``/``, ``?``,
    ``#`` or the end of the string). Look-alike text elsewhere in the URL
    -- in the path, the query or the userinfo -- is never touched, and a
    longer port such as ``:8080`` is never partially removed.

    Args:
        url: The URL string to process. A leading ``scheme://`` is optional.
        default_ports: Optional collection of removable ports, each written
            with its leading colon (e.g. ``{':80'}``). When omitted, the
            module-level ``dp_set`` is used.

    Returns:
        The URL without its port if that port is in *default_ports*,
        otherwise the URL unchanged.
    """
    if default_ports is None:
        default_ports = dp_set

    # group(1): optional "scheme://" plus everything of the authority before
    #           the port (lazy, and unable to cross into the path/query).
    # group(2): the ":port" itself, which must end the authority.
    authority_port = re.match(
        r'((?:[a-z][a-z0-9+.\-]*://)?[^/?#]*?)(:\d+)(?=[/?#]|$)',
        url,
        re.IGNORECASE,
    )
    if authority_port and authority_port.group(2) in default_ports:
        return authority_port.group(1) + url[authority_port.end():]
    return url
def RemoveDotSegments(url):
    """Resolve ``.`` and ``..`` segments in the path of *url*.

    ``.`` segments are dropped; each ``..`` segment removes the segment
    before it. Only the path is rewritten: the scheme, the authority (host)
    and any query string or fragment are passed through unchanged, so ``..``
    can never climb into the host name. A ``..`` with nothing left to remove
    is simply discarded.

    When the path ends in a dot segment (``/..`` or ``/.``), the result has
    no trailing slash, e.g. ``/a/b/..`` becomes ``/a``.

    Args:
        url: The URL string to process.

    Returns:
        The URL with the dot segments of its path resolved.
    """
    # Locate the path: it starts after the optional "scheme:" and
    # "//authority" and stops at the first "?" or "#". Every part of this
    # pattern is optional, so the match always succeeds.
    path_start = re.match(r'(?:[^:/?#]+:)?(?://[^/?#]*)?', url).end()
    suffix = re.search(r'[?#]', url[path_start:])
    path_end = path_start + suffix.start() if suffix else len(url)
    path = url[path_start:path_end]

    # Treat a trailing dot segment like an inner one by closing it with "/",
    # then take that slash off again once the segments are resolved.
    ends_with_dot_segment = path.endswith(('/.', '/..'))
    if ends_with_dot_segment:
        path += '/'

    # For an absolute path the first split item is the empty string before
    # the leading "/"; it marks the root and must never be popped.
    protected = 1 if path.startswith('/') else 0
    kept = []
    for segment in path.split('/'):
        if segment == '.':
            continue
        if segment == '..':
            if len(kept) > protected:
                kept.pop()
            continue
        kept.append(segment)
    path = '/'.join(kept)

    if ends_with_dot_segment:
        path = path[:-1]
    return url[:path_start] + path + url[path_end:]
def checkio(url):
    """Return the normalized form of *url*.

    The URL is lower-cased first and then passed through the normalization
    steps in a fixed order: capitalize percent-encoded triplets, decode
    percent-encoded octets, remove the port, remove dot segments.
    """
    pipeline = (
        CapitalizePercentEncodingTriplet,
        DecodingPercentEncodedOctets,
        RemovePort,
        RemoveDotSegments,
    )
    normalized = url.lower()
    for step in pipeline:
        normalized = step(normalized)
    return normalized
# These assertions are only a local self-check; they are not necessary for
# auto-testing.
if __name__ == '__main__':
    # Each entry: (input URL, expected normalized URL, assertion message).
    self_checks = (
        ("Http://Www.Checkio.org",
         "http://www.checkio.org", "1st rule"),
        ("http://www.checkio.org/%cc%b1bac",
         "http://www.checkio.org/%CC%B1bac", "2nd rule"),
        ("http://www.checkio.org/task%5F%31",
         "http://www.checkio.org/task_1", "3rd rule"),
        ("http://www.checkio.org:80/home/",
         "http://www.checkio.org/home/", "4th rule"),
        ("http://www.checkio.org:8080/home/",
         "http://www.checkio.org:8080/home/", "4th rule again"),
        ("http://www.checkio.org/task/./1/../2/././name",
         "http://www.checkio.org/task/2/name", "5th rule"),
    )
    for raw_url, expected, message in self_checks:
        assert checkio(raw_url) == expected, message
    print('First set of tests done')
Jan. 25, 2022
Comments: