#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
|
|
|
|
The ZMQ_Sub_Indexer Module
|
|
|
|
============================
|
|
|
|
|
|
|
|
The ZMQ_Sub_Indexer modules is fetching the list of files to be processed
|
|
|
|
and index each file with a full-text indexer (Whoosh until now).
|
|
|
|
|
|
|
|
"""
import os
import time

from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID

from pubsublogger import publisher

from packages import Paste
import Helper

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    # Subscriber
    sub_config_section = 'PubSub_Global'
    sub_name = 'indexer'

    config_section = 'PubSub_Global'
    config_channel = 'channel'
    subscriber_name = 'indexer'

    h = Helper.Redis_Queues(config_section, config_channel, subscriber_name)
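
    # Redis_Queues (from Helper.py) provides everything this script needs from
    # the queueing layer: h.config (the parsed configuration file),
    # h.redis_rpop() to pop one queued message, and h.redis_queue_shutdown()
    # to check whether the queue asked the script to stop.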

    # Indexer configuration - index dir and schema setup
    indexpath = h.config.get("Indexer", "path")
    indexertype = h.config.get("Indexer", "type")
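    # The two values above come from the [Indexer] section of the
    # configuration file; a minimal example (values are illustrative only):
    #   [Indexer]
    #   type = whoosh
    #   path = indexdir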
    if indexertype == "whoosh":
        schema = Schema(title=TEXT(stored=True),
                        path=ID(stored=True, unique=True),
                        content=TEXT)
        if not os.path.exists(indexpath):
            os.mkdir(indexpath)
        if not exists_in(indexpath):
            ix = create_in(indexpath, schema)
        else:
            ix = open_dir(indexpath)
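
    # For reference, the resulting index can be queried elsewhere with a
    # minimal Whoosh sketch along these lines (indexpath and the search term
    # are placeholders):
    #   from whoosh.qparser import QueryParser
    #   ix = open_dir(indexpath)
    #   with ix.searcher() as searcher:
    #       query = QueryParser("content", ix.schema).parse(u"some term")
    #       results = searcher.search(query)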

    # LOGGING #
    publisher.info("ZMQ Indexer is Running")
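    # pubsublogger publishes these messages through Redis using the port and
    # channel set at the top of the main block (6380 / "Script"), where a log
    # subscriber process can pick them up.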

    while True:
        try:
            message = h.redis_rpop()

            if message is not None:
                PST = Paste.Paste(message.split(" ", -1)[-1])
            else:
                if h.redis_queue_shutdown():
                    break
                publisher.debug("Script Indexer is idling 1s")
                time.sleep(1)
                continue
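
            # The last whitespace-separated field of the queued message is the
            # on-disk path of the paste; the same value feeds both the Paste
            # object above and the document path stored in the index.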
            docpath = message.split(" ", -1)[-1]
            paste = PST.get_p_content()
            print "Indexing :", docpath

            if indexertype == "whoosh":
                indexwriter = ix.writer()
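                # Whoosh's update_document() first deletes any stored document
                # whose unique "path" field matches, then adds the new one, so
                # re-indexing the same paste does not create duplicates.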
                indexwriter.update_document(
                    title=unicode(docpath, errors='ignore'),
                    path=unicode(docpath, errors='ignore'),
                    content=unicode(paste, errors='ignore'))
                indexwriter.commit()

        except IOError:
            print "CRC Checksum Failed on :", PST.p_path
            publisher.error('Duplicate;{};{};{};CRC Checksum Failed'.format(
                PST.p_source, PST.p_date, PST.p_name))