mirror of https://github.com/CIRCL/AIL-framework
Fix the Onion, credit-card (cc) and DomainClassifier modules
parent
e983c839ad
commit
f017680365
|
@ -29,12 +29,12 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
# Source: http://www.richardsramblings.com/regex/credit-card-numbers/
|
# Source: http://www.richardsramblings.com/regex/credit-card-numbers/
|
||||||
cards = [
|
cards = [
|
||||||
r'4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}', # 16-digit VISA, with separators
|
r'\b4\d{3}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b', # 16-digit VISA, with separators
|
||||||
r'5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}', # 16 digits MasterCard
|
r'\b5[1-5]\d{2}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b', # 16 digits MasterCard
|
||||||
r'6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}', # Discover Card
|
r'\b6(?:011|22(?:(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b', # Discover Card
|
||||||
r'35(?:2[89]|[3-8]\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}', # Japan Credit Bureau (JCB)
|
r'\b35(?:2[89]|[3-8]\d)(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}(?:[\ \-]?)\d{4}\b', # Japan Credit Bureau (JCB)
|
||||||
r'3[47]\d\d(?:[\ \-]?)\d{6}(?:[\ \-]?)\d{5}', # American Express
|
r'\b3[47]\d\d(?:[\ \-]?)\d{6}(?:[\ \-]?)\d{5}\b', # American Express
|
||||||
r'(?:5[0678]\d\d|6304|6390|67\d\d)\d{8,15}', # Maestro
|
r'\b(?:5[0678]\d\d|6304|6390|67\d\d)\d{8,15}\b', # Maestro
|
||||||
]
|
]
|
||||||
|
|
||||||
regex = re.compile('|'.join(cards))
|
regex = re.compile('|'.join(cards))
|
||||||
|
|
|
@ -27,6 +27,7 @@ def main():
|
||||||
|
|
||||||
publisher.info("""ZMQ DomainClassifier is Running""")
|
publisher.info("""ZMQ DomainClassifier is Running""")
|
||||||
|
|
||||||
|
c = DomainClassifier.domainclassifier.Extract(rawtext="")
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
message = p.get_from_set()
|
message = p.get_from_set()
|
||||||
|
@ -40,7 +41,7 @@ def main():
|
||||||
paste = PST.get_p_content()
|
paste = PST.get_p_content()
|
||||||
mimetype = PST._get_p_encoding()
|
mimetype = PST._get_p_encoding()
|
||||||
if mimetype == "text/plain":
|
if mimetype == "text/plain":
|
||||||
c = DomainClassifier.domainclassifier.Extract(rawtext=paste)
|
c.text(rawtext=paste)
|
||||||
c.potentialdomain()
|
c.potentialdomain()
|
||||||
c.validdomain(rtype=['A'], extended=True)
|
c.validdomain(rtype=['A'], extended=True)
|
||||||
localizeddomains = c.include(expression=r'\.lu$')
|
localizeddomains = c.include(expression=r'\.lu$')
|
||||||
|
|
12
bin/Onion.py
12
bin/Onion.py
|
@ -36,6 +36,8 @@ from Helper import Process
|
||||||
|
|
||||||
def fetch(p, r_cache, urls, domains, path):
|
def fetch(p, r_cache, urls, domains, path):
|
||||||
failed = []
|
failed = []
|
||||||
|
downloaded = []
|
||||||
|
print len(urls), 'Urls to fetch.'
|
||||||
for url, domain in zip(urls, domains):
|
for url, domain in zip(urls, domains):
|
||||||
if r_cache.exists(url) or url in failed:
|
if r_cache.exists(url) or url in failed:
|
||||||
continue
|
continue
|
||||||
|
@ -47,10 +49,11 @@ def fetch(p, r_cache, urls, domains, path):
|
||||||
|
|
||||||
if process.returncode == 0:
|
if process.returncode == 0:
|
||||||
r_cache.setbit(url, 0, 1)
|
r_cache.setbit(url, 0, 1)
|
||||||
r_cache.expire(url, 3600)
|
r_cache.expire(url, 360000)
|
||||||
|
downloaded.append(url)
|
||||||
tempfile = process.stdout.read().strip()
|
tempfile = process.stdout.read().strip()
|
||||||
with open(tempfile, 'r') as f:
|
with open(tempfile, 'r') as f:
|
||||||
filename = path + domain
|
filename = path + domain + '.gz'
|
||||||
content = base64.standard_b64decode(f.read())
|
content = base64.standard_b64decode(f.read())
|
||||||
save_path = os.path.join(os.environ['AIL_HOME'],
|
save_path = os.path.join(os.environ['AIL_HOME'],
|
||||||
p.config.get("Directories", "pastes"),
|
p.config.get("Directories", "pastes"),
|
||||||
|
@ -65,9 +68,12 @@ def fetch(p, r_cache, urls, domains, path):
|
||||||
yield url
|
yield url
|
||||||
os.unlink(tempfile)
|
os.unlink(tempfile)
|
||||||
else:
|
else:
|
||||||
|
r_cache.setbit(url, 0, 0)
|
||||||
|
r_cache.expire(url, 3600)
|
||||||
failed.append(url)
|
failed.append(url)
|
||||||
print 'Failed at downloading', url
|
print 'Failed at downloading', url
|
||||||
print process.stdout.read()
|
print process.stdout.read()
|
||||||
|
print 'Failed:', len(failed), 'Downloaded:', len(downloaded)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -121,8 +127,6 @@ if __name__ == "__main__":
|
||||||
# Saving the list of extracted onion domains.
|
# Saving the list of extracted onion domains.
|
||||||
PST.__setattr__(channel, domains_list)
|
PST.__setattr__(channel, domains_list)
|
||||||
PST.save_attribute_redis(channel, domains_list)
|
PST.save_attribute_redis(channel, domains_list)
|
||||||
pprint.pprint(domains_list)
|
|
||||||
print PST.p_path
|
|
||||||
to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date,
|
to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date,
|
||||||
PST.p_name)
|
PST.p_name)
|
||||||
if len(domains_list) > 0:
|
if len(domains_list) > 0:
|
||||||
|
|
|
@ -10,6 +10,9 @@ import base64
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
|
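# Max download size, nominally in MB; NOTE: get_page() reads at most max_size * 100000 bytes, i.e. 500 kB when max_size = 5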
|
||||||
|
max_size = 5
|
||||||
|
|
||||||
def create_connection(address, timeout=None, source_address=None):
|
def create_connection(address, timeout=None, source_address=None):
|
||||||
sock = socks.socksocket()
|
sock = socks.socksocket()
|
||||||
sock.connect(address)
|
sock.connect(address)
|
||||||
|
@ -21,7 +24,7 @@ def get_page(url, torclient_host='127.0.0.1', torclient_port=9050):
|
||||||
request = urllib2.Request(url)
|
request = urllib2.Request(url)
|
||||||
# UA of the Tor browser bundle
|
# UA of the Tor browser bundle
|
||||||
request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0')
|
request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0')
|
||||||
return urllib2.urlopen(request).read()
|
return urllib2.urlopen(request, timeout=5).read(max_size * 100000)
|
||||||
|
|
||||||
|
|
||||||
def makegzip64(s):
|
def makegzip64(s):
|
||||||
|
|
Loading…
Reference in New Issue