mirror of https://github.com/CIRCL/AIL-framework
fix: [Crawler] force domains/subdomains lower case (rfc4343)
parent
cc61c99290
commit
a4c03b4ba4
|
@ -43,25 +43,49 @@ def unpack_url(url):
|
|||
to_crawl = {}
|
||||
faup.decode(url)
|
||||
url_unpack = faup.get()
|
||||
to_crawl['domain'] = url_unpack['domain'].decode()
|
||||
# # FIXME: # TODO: remove me
|
||||
try:
|
||||
to_crawl['domain'] = url_unpack['domain'].decode()
|
||||
except:
|
||||
to_crawl['domain'] = url_unpack['domain']
|
||||
to_crawl['domain'] = to_crawl['domain'].lower()
|
||||
|
||||
|
||||
# force lower case domain/subdomain (rfc4343)
|
||||
# # FIXME: # TODO: remove me
|
||||
try:
|
||||
url_host = url_unpack['host'].decode()
|
||||
except:
|
||||
url_host = url_unpack['host']
|
||||
|
||||
new_url_host = url_host.lower()
|
||||
url_lower_case = url.replace(url_host, new_url_host, 1)
|
||||
|
||||
if url_unpack['scheme'] is None:
|
||||
to_crawl['scheme'] = 'http'
|
||||
url= 'http://{}'.format(url_unpack['url'].decode())
|
||||
url= 'http://{}'.format(url_lower_case)
|
||||
else:
|
||||
scheme = url_unpack['scheme'].decode()
|
||||
# # FIXME: # TODO: remove me
|
||||
try:
|
||||
scheme = url_unpack['scheme'].decode()
|
||||
except Exception as e:
|
||||
scheme = url_unpack['scheme']
|
||||
if scheme in default_proto_map:
|
||||
to_crawl['scheme'] = scheme
|
||||
url = url_unpack['url'].decode()
|
||||
url = url_lower_case
|
||||
else:
|
||||
redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_unpack['url'].decode()))
|
||||
redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case))
|
||||
to_crawl['scheme'] = 'http'
|
||||
url= 'http://{}'.format(url_unpack['url'].decode().replace(scheme, '', 1))
|
||||
url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1))
|
||||
|
||||
if url_unpack['port'] is None:
|
||||
to_crawl['port'] = default_proto_map[to_crawl['scheme']]
|
||||
else:
|
||||
port = url_unpack['port'].decode()
|
||||
# # FIXME: # TODO: remove me
|
||||
try:
|
||||
port = url_unpack['port'].decode()
|
||||
except:
|
||||
port = url_unpack['port']
|
||||
# Verify port number #################### make function to verify/correct port number
|
||||
try:
|
||||
int(port)
|
||||
|
@ -80,12 +104,16 @@ def unpack_url(url):
|
|||
|
||||
to_crawl['url'] = url
|
||||
if to_crawl['port'] == 80:
|
||||
to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode())
|
||||
to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host)
|
||||
else:
|
||||
to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])
|
||||
to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port'])
|
||||
|
||||
# # FIXME: # TODO: remove me
|
||||
try:
|
||||
to_crawl['tld'] = url_unpack['tld'].decode()
|
||||
except:
|
||||
to_crawl['tld'] = url_unpack['tld']
|
||||
|
||||
to_crawl['tld'] = url_unpack['tld'].decode()
|
||||
return to_crawl
|
||||
|
||||
# get url, paste and service_type to crawl
|
||||
|
|
|
@ -224,7 +224,11 @@ if __name__ == "__main__":
|
|||
|
||||
faup.decode(url)
|
||||
url_unpack = faup.get()
|
||||
domain = url_unpack['domain'].decode()
|
||||
## TODO: # FIXME: remove me
|
||||
try:
|
||||
domain = url_unpack['domain'].decode().lower()
|
||||
except Exception as e:
|
||||
domain = url_unpack['domain'].lower()
|
||||
|
||||
## TODO: blacklist by port ?
|
||||
# check blacklist
|
||||
|
@ -233,7 +237,7 @@ if __name__ == "__main__":
|
|||
|
||||
subdomain = re.findall(url_regex, url)
|
||||
if len(subdomain) > 0:
|
||||
subdomain = subdomain[0][4]
|
||||
subdomain = subdomain[0][4].lower()
|
||||
else:
|
||||
continue
|
||||
|
||||
|
|
Loading…
Reference in New Issue