mirror of https://github.com/CIRCL/AIL-framework
commit
f229e2fbee
|
@ -0,0 +1,90 @@
|
||||||
|
#!/usr/bin/env python2
|
||||||
|
# -*-coding:UTF-8 -*
|
||||||
|
|
||||||
|
"""
|
||||||
|
The ZMQ_Sub_DomainClassifier Module
|
||||||
|
============================
|
||||||
|
|
||||||
|
The ZMQ_Sub_DomainClassifier module fetches the list of files to be processed
|
||||||
|
and extracts the domain names found in each text file, printing those that match the configured TLD expression or country code.
|
||||||
|
|
||||||
|
"""
|
||||||
|
import redis
|
||||||
|
import ConfigParser
|
||||||
|
import time
|
||||||
|
from packages import Paste
|
||||||
|
from packages import ZMQ_PubSub
|
||||||
|
from pubsublogger import publisher
|
||||||
|
|
||||||
|
import DomainClassifier.domainclassifier
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Path of the shared configuration file, relative to the working directory
# the script is started from; read by main() and also handed to ZMQSub.
configfile = './packages/config.cfg'
|
||||||
|
|
||||||
|
|
||||||
|
def unwrap_regex_literal(value):
    """Return *value* without a surrounding Python raw-string wrapper.

    ConfigParser hands option values back verbatim, so a config line such as
    ``cc_tld = r'\\.de$'`` yields the text ``r'\\.de$'`` -- including the ``r``
    prefix and the quotes -- which is not the regular expression the author
    meant and can never match a domain name.  Values that are not wrapped
    this way are returned unchanged (apart from surrounding whitespace).

    :param value: option value as read from the configuration file
    :return: the bare regular expression text
    """
    value = value.strip()
    if (len(value) >= 3 and value[0] in "rR" and value[1] in "'\""
            and value[-1] == value[1]):
        return value[2:-1]
    return value


def main():
    """Main Function

    Read the configuration, subscribe to the global ZMQ channel through the
    Redis queue and, for every paste announced on it whose encoding is
    reported as ``text/plain``, run the DomainClassifier extractor and print
    the domains matching the configured TLD expression (``cc_tld``) and
    country code (``cc``).

    The loop ends when the ``DomainClassifier`` member is found in the Redis
    set ``SHUTDOWN_FLAGS``.
    """

    # CONFIG #
    cfg = ConfigParser.ConfigParser()
    cfg.read(configfile)

    # Redis
    r_serv1 = redis.StrictRedis(
        host=cfg.get("Redis_Queues", "host"),
        port=cfg.getint("Redis_Queues", "port"),
        db=cfg.getint("Redis_Queues", "db"))

    # LOGGING #
    publisher.channel = "Script"

    # ZMQ #
    # Subscriber
    channel = cfg.get("PubSub_Global", "channel")
    subscriber_name = "DomainClassifier"
    subscriber_config_section = "PubSub_Global"

    cc = cfg.get("PubSub_DomainClassifier", "cc")
    # The shipped configuration writes the expression as r'...'; strip that
    # wrapper so the value is usable as a regular expression.
    cc_tld = unwrap_regex_literal(cfg.get("PubSub_DomainClassifier", "cc_tld"))

    sub = ZMQ_PubSub.ZMQSub(configfile, subscriber_config_section, channel,
                            subscriber_name)

    # FUNCTIONS #
    publisher.info("""ZMQ DomainClassifier is Running""")
    c = DomainClassifier.domainclassifier.Extract(rawtext="")

    # Pause between two polls of an empty queue; the debug message below is
    # derived from it so that the two cannot drift apart.
    idle_seconds = 1

    while True:
        # Reset per iteration so the error handler never reports a paste
        # left over from a previous message.
        PST = None
        paste_path = None
        try:
            message = sub.get_msg_from_queue(r_serv1)

            if message is None:
                # Check this module's own flag: testing another module's
                # name would consume that module's shutdown request.
                if r_serv1.sismember("SHUTDOWN_FLAGS", subscriber_name):
                    r_serv1.srem("SHUTDOWN_FLAGS", subscriber_name)
                    publisher.warning("Shutdown Flag Up: Terminating.")
                    break
                publisher.debug(
                    "Script DomainClassifier is idling {}s".format(idle_seconds))
                time.sleep(idle_seconds)
                continue

            # The paste path is the last space-separated field of the message.
            paste_path = message.split(" ", -1)[-1]
            PST = Paste.Paste(paste_path)
            paste = PST.get_p_content()
            mimetype = PST._get_p_encoding()
            if mimetype != "text/plain":
                continue

            c.text(rawtext=paste)
            c.potentialdomain()
            # NOTE(review): presumably resolves candidates through DNS 'A'
            # lookups -- confirm against the DomainClassifier documentation.
            c.validdomain(rtype=['A'], extended=True)

            localizeddomains = c.include(expression=cc_tld)
            if localizeddomains:
                print(localizeddomains)

            localizeddomains = c.localizedomain(cc=cc)
            if localizeddomains:
                print(localizeddomains)

        except IOError:
            # PST is still None when the failure happened before or while the
            # Paste object was being built; fall back to the raw path then.
            if PST is not None:
                print("CRC Checksum Failed on : {}".format(PST.p_path))
                # NOTE(review): the 'Duplicate' prefix looks copied from
                # another module; kept as is because log consumers may
                # depend on it.
                publisher.error(
                    'Duplicate;{};{};{};CRC Checksum Failed'.format(
                        PST.p_source, PST.p_date, PST.p_name))
            else:
                print("CRC Checksum Failed on : {}".format(paste_path))
                publisher.error(
                    'Duplicate;;;{};CRC Checksum Failed'.format(paste_path))
|
||||||
|
|
||||||
|
|
||||||
|
# Start the subscriber loop only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
|
@ -67,6 +67,10 @@ channel = urls
|
||||||
# country code logged as critical
|
# country code logged as critical
|
||||||
cc_critical = DE
|
cc_critical = DE
|
||||||
|
|
||||||
|
[PubSub_DomainClassifier]
|
||||||
|
cc = DE
|
||||||
|
cc_tld = r'\.de$'
|
||||||
|
|
||||||
# Indexer configuration
|
# Indexer configuration
|
||||||
[Indexer]
|
[Indexer]
|
||||||
type = whoosh
|
type = whoosh
|
||||||
|
|
|
@ -26,6 +26,8 @@ ipython
|
||||||
flask
|
flask
|
||||||
texttable
|
texttable
|
||||||
|
|
||||||
|
#DomainClassifier
|
||||||
|
DomainClassifier
|
||||||
#Indexer requirements
|
#Indexer requirements
|
||||||
whoosh
|
whoosh
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue