fix: [Crawler] force domains/subdomains lower case (rfc4343)

pull/350/head
Terrtia 2019-05-06 11:46:20 +02:00
parent cc61c99290
commit a4c03b4ba4
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
2 changed files with 44 additions and 12 deletions
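
The commit normalizes the host part of crawled URLs to lower case because DNS names are case-insensitive (RFC 4343), while the path and query keep their original case. A minimal standalone sketch of the same idea, using urllib.parse instead of the project's faup bindings (the function name normalize_host_case and the sample URL are illustrative only, not part of the commit):

from urllib.parse import urlsplit

def normalize_host_case(url):
    # Lower-case only the host of a URL (RFC 4343); path and query keep their case.
    host = urlsplit(url).netloc.rsplit('@', 1)[-1].split(':', 1)[0]
    if not host:
        return url
    # same trick as the patch: replace only the first occurrence of the host
    return url.replace(host, host.lower(), 1)

print(normalize_host_case('http://WWW.Example.COM/CaseSensitivePath?Q=1'))
# -> http://www.example.com/CaseSensitivePath?Q=1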


@@ -43,25 +43,49 @@ def unpack_url(url):
     to_crawl = {}
     faup.decode(url)
     url_unpack = faup.get()
-    to_crawl['domain'] = url_unpack['domain'].decode()
+    # # FIXME: # TODO: remove me
+    try:
+        to_crawl['domain'] = url_unpack['domain'].decode()
+    except:
+        to_crawl['domain'] = url_unpack['domain']
+    to_crawl['domain'] = to_crawl['domain'].lower()
+
+    # force lower case domain/subdomain (rfc4343)
+    # # FIXME: # TODO: remove me
+    try:
+        url_host = url_unpack['host'].decode()
+    except:
+        url_host = url_unpack['host']
+    new_url_host = url_host.lower()
+    url_lower_case = url.replace(url_host, new_url_host, 1)
+
     if url_unpack['scheme'] is None:
         to_crawl['scheme'] = 'http'
-        url= 'http://{}'.format(url_unpack['url'].decode())
+        url= 'http://{}'.format(url_lower_case)
     else:
-        scheme = url_unpack['scheme'].decode()
+        # # FIXME: # TODO: remove me
+        try:
+            scheme = url_unpack['scheme'].decode()
+        except Exception as e:
+            scheme = url_unpack['scheme']
         if scheme in default_proto_map:
             to_crawl['scheme'] = scheme
-            url = url_unpack['url'].decode()
+            url = url_lower_case
         else:
-            redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_unpack['url'].decode()))
+            redis_crawler.sadd('new_proto', '{} {}'.format(scheme, url_lower_case))
             to_crawl['scheme'] = 'http'
-            url= 'http://{}'.format(url_unpack['url'].decode().replace(scheme, '', 1))
+            url= 'http://{}'.format(url_lower_case.replace(scheme, '', 1))
     if url_unpack['port'] is None:
         to_crawl['port'] = default_proto_map[to_crawl['scheme']]
     else:
-        port = url_unpack['port'].decode()
+        # # FIXME: # TODO: remove me
+        try:
+            port = url_unpack['port'].decode()
+        except:
+            port = url_unpack['port']
         # Verify port number #################### make function to verify/correct port number
         try:
             int(port)
@@ -80,12 +104,16 @@ def unpack_url(url):
     to_crawl['url'] = url
     if to_crawl['port'] == 80:
-        to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], url_unpack['host'].decode())
+        to_crawl['domain_url'] = '{}://{}'.format(to_crawl['scheme'], new_url_host)
     else:
-        to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], url_unpack['host'].decode(), to_crawl['port'])
+        to_crawl['domain_url'] = '{}://{}:{}'.format(to_crawl['scheme'], new_url_host, to_crawl['port'])
-    to_crawl['tld'] = url_unpack['tld'].decode()
+    # # FIXME: # TODO: remove me
+    try:
+        to_crawl['tld'] = url_unpack['tld'].decode()
+    except:
+        to_crawl['tld'] = url_unpack['tld']
     return to_crawl

 # get url, paste and service_type to crawl
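
Every faup field access in the patched unpack_url() is wrapped in try/except so the code works whether faup.get() returns the fields as bytes or as str; the FIXME comments mark this as a temporary shim. A hypothetical helper that factors out the repeated pattern could look like the sketch below (the name _to_str and its use are illustrative only, not part of the commit):

def _to_str(value):
    # Return a str for a faup field that may come back as bytes or str.
    if value is None:
        return None
    return value.decode() if isinstance(value, bytes) else value

# usage, mirroring the patched code:
# to_crawl['domain'] = _to_str(url_unpack['domain']).lower()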


@@ -224,7 +224,11 @@ if __name__ == "__main__":
                faup.decode(url)
                url_unpack = faup.get()
-               domain = url_unpack['domain'].decode()
+               ## TODO: # FIXME: remove me
+               try:
+                   domain = url_unpack['domain'].decode().lower()
+               except Exception as e:
+                   domain = url_unpack['domain'].lower()

                ## TODO: blackilst by port ?
                # check blacklist
@@ -233,7 +237,7 @@ if __name__ == "__main__":
                subdomain = re.findall(url_regex, url)
                if len(subdomain) > 0:
-                   subdomain = subdomain[0][4]
+                   subdomain = subdomain[0][4].lower()
                else:
                    continue
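
Lower-casing the extracted domain and subdomain also matters for deduplication: without it, the same service written with different letter cases would be queued and stored as distinct entries. A tiny illustration with a plain Python set standing in for the Redis sets the crawler uses (the values are made up for the example):

seen = set()
for candidate in ('Example.ONION', 'example.onion', 'EXAMPLE.onion'):
    seen.add(candidate.lower())   # normalized: all three collapse to one entry
print(seen)   # {'example.onion'}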