mirror of https://github.com/CIRCL/AIL-framework
parent
b7c9e489c9
commit
fca00beed9
|
@ -0,0 +1,58 @@
|
|||
#!/usr/bin/env python2
|
||||
# -*-coding:UTF-8 -*
|
||||
|
||||
"""
|
||||
The DomClassifier Module
|
||||
============================
|
||||
|
||||
The DomClassifier module fetches each paste from its input queue, extracts the
|
||||
domain names found in plain-text pastes and prints those matching .lu or country code LU.
|
||||
|
||||
"""
|
||||
import time
|
||||
from packages import Paste
|
||||
from pubsublogger import publisher
|
||||
|
||||
import DomainClassifier.domainclassifier
|
||||
from Helper import Process
|
||||
|
||||
|
||||
def main():
    """Run the DomClassifier worker loop; never returns.

    Configures the pubsublogger publisher (port 6380, channel "Script"),
    creates a ``Process`` for the 'DomClassifier' config section and then
    loops forever:

    * pull one message from the process queue; when the queue is empty,
      sleep one second and retry;
    * load the message as a ``Paste`` and, when its reported encoding is
      "text/plain", run the DomainClassifier extractor on its content;
    * print the domains matching ``\\.lu$`` and the domains returned by
      ``localizedomain(cc='LU')`` when those results are non-empty.

    An ``IOError`` raised while loading or analysing a paste is printed and
    published as an error; the loop then moves on to the next message.
    """
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'DomClassifier'

    p = Process(config_section)

    publisher.info("""ZMQ DomainClassifier is Running""")

    while True:
        # Queue access stays outside the try block: a failure here is not a
        # paste read error and must not be reported as a CRC failure.
        message = p.get_from_set()

        if message is None:
            # The message now states the real sleep duration (it used to
            # claim 10s while sleeping for 1s).
            publisher.debug("Script DomClassifier is idling 1s")
            time.sleep(1)
            continue

        # Reset on every iteration so the error handler never reports a
        # paste left over from a previous message (or hits an unbound name).
        PST = None
        try:
            PST = Paste.Paste(message)
            paste = PST.get_p_content()
            mimetype = PST._get_p_encoding()
            if mimetype != "text/plain":
                continue

            c = DomainClassifier.domainclassifier.Extract(rawtext=paste)
            # NOTE(review): presumably these two calls collect candidate
            # domains and keep the ones resolving to an A record - confirm
            # against the DomainClassifier library documentation.
            c.potentialdomain()
            c.validdomain(rtype=['A'], extended=True)

            localizeddomains = c.include(expression=r'\.lu$')
            if localizeddomains:
                print(localizeddomains)

            localizeddomains = c.localizedomain(cc='LU')
            if localizeddomains:
                print(localizeddomains)
        except IOError:
            if PST is None:
                # The Paste object itself could not be built, so only the
                # raw queue message is available for the report.
                print("CRC Checksum Failed on : {}".format(message))
                publisher.error(
                    'DomClassifier;Unable to load paste {}'.format(message))
                continue
            # Single-argument print() emits the same text under Python 2
            # and Python 3.
            print("CRC Checksum Failed on : {}".format(PST.p_path))
            # NOTE(review): the 'Duplicate' prefix looks copied from another
            # module; kept unchanged in case log consumers depend on it.
            publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
                PST.p_source, PST.p_date, PST.p_name))
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -108,6 +108,8 @@ function launching_scripts {
|
|||
sleep 0.1
|
||||
screen -S "Script" -X screen -t "Line" bash -c './Line.py; read x'
|
||||
sleep 0.1
|
||||
screen -S "Script" -X screen -t "DomainClassifier" bash -c './DomClassifier.py; read x'
|
||||
sleep 0.1
|
||||
screen -S "Script" -X screen -t "Categ" bash -c './Categ.py; read x'
|
||||
sleep 0.1
|
||||
screen -S "Script" -X screen -t "Tokenize" bash -c './Tokenize.py; read x'
|
||||
|
|
|
@ -10,16 +10,10 @@ host = localhost
|
|||
port = 6379
|
||||
db = 0
|
||||
|
||||
[Redis_Log]
|
||||
host = localhost
|
||||
port = 6380
|
||||
db = 0
|
||||
|
||||
[Redis_Queues]
|
||||
host = localhost
|
||||
port = 6381
|
||||
db_sub = 0
|
||||
db_pub = 1
|
||||
db = 0
|
||||
|
||||
[Redis_Data_Merging]
|
||||
host = localhost
|
||||
|
@ -37,35 +31,7 @@ host = localhost
|
|||
port = 2013
|
||||
db = 1
|
||||
|
||||
# PUB / SUB : ZMQ
|
||||
[Feed]
|
||||
address = tcp://crf.circl.lu:5556
|
||||
topicfilter = 102
|
||||
|
||||
[PubSub_Global]
|
||||
address = tcp://127.0.0.1:5000
|
||||
channel = filelist
|
||||
|
||||
[PubSub_Longlines]
|
||||
address = tcp://127.0.0.1:5001
|
||||
channel_0 = Longlines
|
||||
channel_1 = Shortlines
|
||||
|
||||
[PubSub_Words]
|
||||
address = tcp://127.0.0.1:5002
|
||||
channel_0 = words
|
||||
|
||||
[PubSub_Categ]
|
||||
address = tcp://127.0.0.1:5003
|
||||
channel_0 = creditcard_categ
|
||||
channel_1 = mails_categ
|
||||
channel_2 = onion_categ
|
||||
channel_3 = web_categ
|
||||
|
||||
[PubSub_Url]
|
||||
address = tcp://127.0.0.1:5004
|
||||
channel = urls
|
||||
# country code logged as critical
|
||||
[Url]
|
||||
cc_critical = DE
|
||||
|
||||
# Indexer configuration
|
||||
|
|
|
@ -15,6 +15,9 @@ subscribe = Redis_Global
|
|||
subscribe = Redis_Global
|
||||
publish = Redis_LinesShort,Redis_LinesLong
|
||||
|
||||
[DomClassifier]
|
||||
subscribe = Redis_Global
|
||||
|
||||
[Tokenize]
|
||||
subscribe = Redis_LinesShort
|
||||
publish = Redis_Words
|
||||
|
|
|
@ -35,6 +35,8 @@ pycountry
|
|||
# To fetch Onion urls
|
||||
PySocks
|
||||
|
||||
DomainClassifier
|
||||
|
||||
#ASN lookup requirements
|
||||
http://adns-python.googlecode.com/files/adns-python-1.2.1.tar.gz
|
||||
https://github.com/trolldbois/python-cymru-services/archive/master.zip
|
||||
|
|
Loading…
Reference in New Issue