Add Domain Classifier module.

Cleanup in the config files.
pull/38/head
Raphaël Vinot 2014-09-05 10:41:00 +02:00
parent b7c9e489c9
commit fca00beed9
5 changed files with 67 additions and 36 deletions

58
bin/DomClassifier.py Executable file
View File

@ -0,0 +1,58 @@
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
The DomClassifier Module
============================
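The DomClassifier module fetches pastes from its input queue and, for each
plain-text paste, extracts the domain names it contains with the
DomainClassifier library. It then prints the domains ending in ".lu" and
the domains the library localizes to the country code LU.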
"""
import time
from packages import Paste
from pubsublogger import publisher
import DomainClassifier.domainclassifier
from Helper import Process
def main():
    """Run the DomClassifier worker loop; this function never returns.

    Configures the pubsublogger publisher (port 6380, channel "Script"),
    creates the ``Process`` helper for the ``DomClassifier`` config section
    and then loops forever:

    * pull the next message from the process' input set;
    * when there is none, log an idle message, sleep one second and retry;
    * otherwise wrap the message in a ``Paste`` and, if the paste is
      reported as ``text/plain``, run the DomainClassifier extraction on
      its content and print the domains matching ``.lu`` as well as the
      domains localized to the country code ``LU``.

    An ``IOError`` raised while reading or classifying a paste is reported
    on stdout and through the publisher, then the loop moves on to the next
    message.
    """
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'DomClassifier'
    p = Process(config_section)

    publisher.info("""ZMQ DomainClassifier is Running""")

    while True:
        message = p.get_from_set()
        if message is None:
            # The log text now matches the actual one-second sleep below
            # (it previously claimed 10s).
            publisher.debug("Script DomClassifier is idling 1s")
            time.sleep(1)
            continue

        # Built before the ``try`` so the IOError handler below always
        # refers to the paste currently being processed, never to an
        # unbound name or to the paste of a previous iteration.
        PST = Paste.Paste(message)
        try:
            paste = PST.get_p_content()
            mimetype = PST._get_p_encoding()
            if mimetype == "text/plain":
                c = DomainClassifier.domainclassifier.Extract(rawtext=paste)
                # presumably these two calls populate the extractor's
                # internal candidate/validated domain lists used by the
                # queries below -- confirm against the DomainClassifier docs
                c.potentialdomain()
                c.validdomain(rtype=['A'], extended=True)
                localizeddomains = c.include(expression=r'\.lu$')
                if localizeddomains:
                    print(localizeddomains)
                localizeddomains = c.localizedomain(cc='LU')
                if localizeddomains:
                    print(localizeddomains)
        except IOError:
            # Single-argument print(): same output as the former Python 2
            # print statement, and also valid syntax under Python 3.
            print("CRC Checksum Failed on : {}".format(PST.p_path))
            # NOTE(review): the 'Duplicate' tag looks inherited from another
            # module; kept unchanged because log consumers may rely on it.
            publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
                PST.p_source, PST.p_date, PST.p_name))


if __name__ == "__main__":
    main()

View File

@ -108,6 +108,8 @@ function launching_scripts {
sleep 0.1
screen -S "Script" -X screen -t "Line" bash -c './Line.py; read x'
sleep 0.1
screen -S "Script" -X screen -t "DomainClassifier" bash -c './DomClassifier.py; read x'
sleep 0.1
screen -S "Script" -X screen -t "Categ" bash -c './Categ.py; read x'
sleep 0.1
screen -S "Script" -X screen -t "Tokenize" bash -c './Tokenize.py; read x'

View File

@ -10,16 +10,10 @@ host = localhost
port = 6379
db = 0
[Redis_Log]
host = localhost
port = 6380
db = 0
[Redis_Queues]
host = localhost
port = 6381
db_sub = 0
db_pub = 1
db = 0
[Redis_Data_Merging]
host = localhost
@ -37,35 +31,7 @@ host = localhost
port = 2013
db = 1
# PUB / SUB : ZMQ
[Feed]
address = tcp://crf.circl.lu:5556
topicfilter = 102
[PubSub_Global]
address = tcp://127.0.0.1:5000
channel = filelist
[PubSub_Longlines]
address = tcp://127.0.0.1:5001
channel_0 = Longlines
channel_1 = Shortlines
[PubSub_Words]
address = tcp://127.0.0.1:5002
channel_0 = words
[PubSub_Categ]
address = tcp://127.0.0.1:5003
channel_0 = creditcard_categ
channel_1 = mails_categ
channel_2 = onion_categ
channel_3 = web_categ
[PubSub_Url]
address = tcp://127.0.0.1:5004
channel = urls
# country code logged as critical
[Url]
cc_critical = DE
# Indexer configuration

View File

@ -15,6 +15,9 @@ subscribe = Redis_Global
subscribe = Redis_Global
publish = Redis_LinesShort,Redis_LinesLong
[DomClassifier]
subscribe = Redis_Global
[Tokenize]
subscribe = Redis_LinesShort
publish = Redis_Words

View File

@ -35,6 +35,8 @@ pycountry
# To fetch Onion urls
PySocks
DomainClassifier
#ASN lookup requirements
http://adns-python.googlecode.com/files/adns-python-1.2.1.tar.gz
https://github.com/trolldbois/python-cymru-services/archive/master.zip