diff --git a/bin/Indexer.py b/bin/Indexer.py
index 39f0fe51..29990bfd 100755
--- a/bin/Indexer.py
+++ b/bin/Indexer.py
@@ -15,10 +15,34 @@ from pubsublogger import publisher
 
 from whoosh.index import create_in, exists_in, open_dir
 from whoosh.fields import Schema, TEXT, ID
+import shutil
 import os
+from os.path import join, getsize
 
 from Helper import Process
 
+# Config variable
+TIME_WAIT = 60*15 #sec
+
+# return in bytes
+def check_index_size(baseindexpath, indexname):
+    """Sum the size of every file found under baseindexpath/indexname."""
+    the_index_name = join(baseindexpath, indexname)
+    cur_sum = 0
+    for root, dirs, files in os.walk(the_index_name):
+        cur_sum += sum(getsize(join(root, name)) for name in files)
+    return cur_sum
+
+def move_index_into_old_index_folder(baseindexpath):
+    """Move every entry of baseindexpath, except 'old_index', into 'old_index'."""
+    old_index_path = join(baseindexpath, "old_index")
+    for cur_file in os.listdir(baseindexpath):
+        if not cur_file == "old_index":
+            # the destination folder has to exist before moving into it
+            if not os.path.isdir(old_index_path):
+                os.mkdir(old_index_path)
+            shutil.move(join(baseindexpath, cur_file), join(old_index_path, cur_file))
+
 
 if __name__ == "__main__":
     publisher.port = 6380
@@ -29,19 +53,54 @@ if __name__ == "__main__":
     p = Process(config_section)
 
     # Indexer configuration - index dir and schema setup
-    indexpath = os.path.join(os.environ['AIL_HOME'],
+    baseindexpath = join(os.environ['AIL_HOME'],
                              p.config.get("Indexer", "path"))
+    indexRegister_path = join(os.environ['AIL_HOME'],
+                              p.config.get("Indexer", "register"))
     indexertype = p.config.get("Indexer", "type")
+    INDEX_SIZE_THRESHOLD = int(p.config.get("Indexer", "index_max_size"))
     if indexertype == "whoosh":
         schema = Schema(title=TEXT(stored=True), path=ID(stored=True,
                                                          unique=True),
                         content=TEXT)
-        if not os.path.exists(indexpath):
-            os.mkdir(indexpath)
-        if not exists_in(indexpath):
-            ix = create_in(indexpath, schema)
-        else:
-            ix = open_dir(indexpath)
+        if not os.path.exists(baseindexpath):
+            os.mkdir(baseindexpath)
+
+        # create the index register if not present
+        time_now = int(time.time())
+        if not os.path.isfile(indexRegister_path): #index are not organised
+            print("Indexes are not organized")
+            print("moving all files in folder 'old_index' ")
+            #move all files to old_index folder
+            move_index_into_old_index_folder(baseindexpath)
+            print("Creating new index")
+            #create all_index.txt
+            with open(indexRegister_path, 'w') as f:
+                f.write(str(time_now))
+            #create dir
+            os.mkdir(join(baseindexpath, str(time_now)))
+
+        with open(indexRegister_path, "r") as f:
+            allIndex = f.read()
+            allIndex = allIndex.split() # format [time1\ntime2]
+            allIndex.sort()
+
+            try:
+                indexname = allIndex[-1].strip('\n\r')
+            except IndexError as e:
+                # empty register: keep a string, indexname is concatenated when printing
+                indexname = str(time_now)
+
+            indexpath = join(baseindexpath, str(indexname))
+            # the index folder has to exist before an index is created in it
+            if not os.path.exists(indexpath):
+                os.mkdir(indexpath)
+            if not exists_in(indexpath):
+                ix = create_in(indexpath, schema)
+            else:
+                ix = open_dir(indexpath)
+
+        last_refresh = time_now
 
     # LOGGING #
     publisher.info("ZMQ Indexer is Running")
@@ -58,7 +117,23 @@ if __name__ == "__main__":
                 continue
             docpath = message.split(" ", -1)[-1]
             paste = PST.get_p_content()
-            print "Indexing :", docpath
+            print "Indexing - "+indexname+" :", docpath
+
+            if time.time() - last_refresh > TIME_WAIT: #avoid calculating the index's size at each message
+                last_refresh = time.time()
+                if check_index_size(baseindexpath, indexname) >= INDEX_SIZE_THRESHOLD*(1000*1000):
+                    timestamp = int(time.time())
+                    print("Creating new index", timestamp)
+                    indexpath = join(baseindexpath, str(timestamp))
+                    indexname = str(timestamp)
+                    #update all_index
+                    # one name per line: the register is read back with split()
+                    with open(indexRegister_path, "a") as f:
+                        f.write("\n" + str(timestamp))
+                    #create new dir
+                    os.mkdir(indexpath)
+                    ix = create_in(indexpath, schema)
+
             if indexertype == "whoosh":
                 indexwriter = ix.writer()
                 indexwriter.update_document(
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index f2e8285d..a634e4f1 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -123,6 +123,10 @@ cc_tld = r'\.de$'
 [Indexer]
 type = whoosh
 path = indexdir
+#file listing the name of every index, one per line
+register = indexdir/all_index.txt
+#size in MB (1 MB = 1000*1000 bytes)
+index_max_size = 2000
 
 ###############################################################################
 
diff --git a/var/www/Flasks/Flask_search.py b/var/www/Flasks/Flask_search.py
index b5c60898..a89f011c 100644
--- a/var/www/Flasks/Flask_search.py
+++ b/var/www/Flasks/Flask_search.py
@@ -7,10 +7,14 @@
 import redis
 import json
 import os
+import datetime
 import flask
 from flask import Flask, render_template, jsonify, request
 
 import Paste
+from whoosh import index
+from whoosh.fields import Schema, TEXT, ID
+from whoosh.qparser import QueryParser
 
 # ============ VARIABLES ============
 import Flask_config
@@ -20,7 +24,83 @@ cfg = Flask_config.cfg
 r_serv_pasteName = Flask_config.r_serv_pasteName
 max_preview_char = Flask_config.max_preview_char
 max_preview_modal = Flask_config.max_preview_modal
+
+
+baseindexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path"))
+indexRegister_path = os.path.join(os.environ['AIL_HOME'],
+                         cfg.get("Indexer", "register"))
+
 # ============ FUNCTIONS ============
+def get_current_index():
+    """Return the path of the last index (once sorted) listed in the register file."""
+    with open(indexRegister_path, "r") as f:
+        allIndex = f.read()
+        allIndex = allIndex.split() # format [time1\ntime2]
+        allIndex.sort()
+        try:
+            indexname = allIndex[-1].strip('\n\r')
+        except IndexError as e:
+            indexname = "no-index"
+        indexpath = os.path.join(baseindexpath, indexname)
+    return indexpath
+
+def get_index_path(index_name):
+    """Return the folder of the index to search in.
+
+    index_name comes from the request: it is only used when it names a
+    directory located directly in baseindexpath, otherwise (missing, "0" or
+    unknown value such as "../..") the current index is returned.
+    """
+    if index_name is not None and index_name != "0":
+        indexpath = os.path.join(baseindexpath, index_name)
+        if index_name in os.listdir(baseindexpath) and os.path.isdir(indexpath):
+            return indexpath
+    return get_current_index()
+
+def get_index_list(selected_index=""):
+    """Return one [value, label, selected] entry per folder of baseindexpath.
+
+    Entries are sorted by name in descending order; 'old_index', when present, comes last.
+    """
+    temp = []
+    index_list = []
+    for dirs in os.listdir(baseindexpath):
+        if os.path.isdir(os.path.join(baseindexpath, dirs)):
+            value = dirs
+            name = to_iso_date(dirs) + " - " + \
+                    str(get_dir_size(dirs) / (1000*1000)) + " Mb " + \
+                    "(" + str(get_item_count(dirs)) + " Items" + ")"
+            flag = dirs==selected_index.split('/')[-1]
+            if dirs == "old_index":
+                temp = [value, name, flag]
+            else:
+                index_list.append([value, name, flag])
+
+    index_list.sort(reverse=True, key=lambda x: x[0])
+    if len(temp) != 0:
+        index_list.append(temp)
+    return index_list
+
+def get_dir_size(directory):
+    """Return the cumulated size, in bytes, of the files under baseindexpath/directory."""
+    cur_sum = 0
+    for root, subdirs, files in os.walk(os.path.join(baseindexpath,directory)):
+        try:
+            cur_sum += sum(os.path.getsize(os.path.join(root, name)) for name in files)
+        except OSError as e: #File disappeared
+            pass
+    return cur_sum
+
+def get_item_count(dirs):
+    """Return the document count of the index stored in baseindexpath/dirs."""
+    ix = index.open_dir(os.path.join(baseindexpath, dirs))
+    return ix.doc_count_all()
+
+def to_iso_date(timestamp):
+    """Return the date part (YYYY-MM-DD) of a unix timestamp; 'old_index' is kept as is."""
+    if timestamp == "old_index":
+        return "old_index"
+    return str(datetime.datetime.fromtimestamp(int(timestamp))).split()[0]
 
 # ============ ROUTES ============
 
@@ -34,8 +114,12 @@ def search():
     c = [] #preview of the paste content
     paste_date = []
     paste_size = []
+    index_name = request.form.get('index_name')
     num_elem_to_get = 50
 
+    # select correct index
+    selected_index = get_index_path(index_name)
+
     # Search filename
     for path in r_serv_pasteName.smembers(q[0]):
         r.append(path)
@@ -49,13 +133,9 @@ def search():
         paste_size.append(paste._get_p_size())
 
     # Search full line
-    from whoosh import index
-    from whoosh.fields import Schema, TEXT, ID
     schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
 
-    indexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path"))
-    ix = index.open_dir(indexpath)
-    from whoosh.qparser import QueryParser
+    ix = index.open_dir(selected_index)
     with ix.searcher() as searcher:
         query = QueryParser("content", ix.schema).parse(" ".join(q))
         results = searcher.search_page(query, 1, pagelen=num_elem_to_get)
@@ -72,7 +152,16 @@ def search():
         results = searcher.search(query)
         num_res = len(results)
 
-    return render_template("search.html", r=r, c=c, query=request.form['query'], paste_date=paste_date, paste_size=paste_size, char_to_display=max_preview_modal, num_res=num_res)
+    # build the list once: each call walks and opens every index folder
+    index_list = get_index_list(selected_index)
+    index_min = 1
+    index_max = len(index_list)
+    return render_template("search.html", r=r, c=c,
+           query=request.form['query'], paste_date=paste_date,
+           paste_size=paste_size, char_to_display=max_preview_modal,
+           num_res=num_res, index_min=index_min, index_max=index_max,
+           index_list=index_list
+           )
 
 
 @app.route("/get_more_search_result", methods=['POST'])
@@ -81,20 +170,20 @@ def get_more_search_result():
     q = []
     q.append(query)
     page_offset = int(request.form['page_offset'])
+    index_name = request.form.get('index_name')
     num_elem_to_get = 50
 
+    # select correct index
+    selected_index = get_index_path(index_name)
+
     path_array = []
     preview_array = []
     date_array = []
     size_array = []
 
-    from whoosh import index
-    from whoosh.fields import Schema, TEXT, ID
     schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
 
-    indexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path"))
-    ix = index.open_dir(indexpath)
-    from whoosh.qparser import QueryParser
+    ix = index.open_dir(selected_index)
     with ix.searcher() as searcher:
         query = QueryParser("content", ix.schema).parse(" ".join(q))
         results = searcher.search_page(query, page_offset, num_elem_to_get)
@@ -113,7 +202,6 @@ def get_more_search_result():
     to_return["preview_array"] = preview_array
     to_return["date_array"] = date_array
     to_return["size_array"] = size_array
-    print "len(path_array)="+str(len(path_array))
     if len(path_array) < num_elem_to_get: #pagelength
         to_return["moreData"] = False
     else:
diff --git a/var/www/templates/search.html b/var/www/templates/search.html
index 9b43967d..727d30a3 100644
--- a/var/www/templates/search.html
+++ b/var/www/templates/search.html
@@ -89,6 +89,16 @@
+
+
+ Index: + +
+
@@ -100,16 +110,14 @@ - {% set i = 0 %} {% for path in r %} - - - - - + + + + + - {% set i = i + 1 %} {% endfor %}
{{ i + 1 }} {{ path }}{{ paste_date[i] }}{{ paste_size[i] }}

{{ loop.index0 + 1 }} {{ path }}{{ paste_date[loop.index0] }}{{ paste_size[loop.index0] }}

@@ -157,6 +165,28 @@ if (init_num_of_elements_in_table == pagelen) { $("#load_more_json_button1").show(); } + + $('#index_name').on('change', function() { + var form = document.createElement('form'); + form.setAttribute("method", 'post'); + form.setAttribute("action", "{{ url_for('search') }}"); + + var input1 = document.createElement('input'); + input1.setAttribute("type", "hidden"); + input1.setAttribute("name", "index_name"); + input1.setAttribute("value", this.value); + form.appendChild(input1); + + var input2 = document.createElement('input'); + input2.setAttribute("type", "hidden"); + input2.setAttribute("name", "query"); + input2.setAttribute("value", "{{ query }}"); + form.appendChild(input2); + + document.body.appendChild(form); + form.submit(); + }) + }); @@ -171,7 +201,7 @@ } function load_search_50_data() { - var options = { query: query, page_offset: page_offset }; + var options = { query: query, page_offset: page_offset, index_name: $("#index_name").val() }; $.post( "{{ url_for('get_more_search_result') }}", options).done(function( data ) { for(i=0; i