From 248469d61eeac3c6c90e9e562ff677baf04a9091 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Tue, 14 Mar 2017 10:37:31 +0100 Subject: [PATCH 01/10] Indexer now create an index_dir when it became too large (search in index not updated yet) --- bin/Indexer.py | 58 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/bin/Indexer.py b/bin/Indexer.py index 39f0fe51..70ca598a 100755 --- a/bin/Indexer.py +++ b/bin/Indexer.py @@ -16,9 +16,24 @@ from pubsublogger import publisher from whoosh.index import create_in, exists_in, open_dir from whoosh.fields import Schema, TEXT, ID import os +from os.path import join, getsize from Helper import Process +# Config variable +INDEX_SIZE_THRESHOLD = 500 #Mb +TIME_WAIT = 1.0 #sec + +# return in bytes +def check_index_size(indexnum): + global baseindexpath + the_index_name = "index_"+str(indexnum) if indexnum != 0 else "old_index" + the_index_name = os.path.join(baseindexpath, the_index_name) + cur_sum = 0 + for root, dirs, files in os.walk(the_index_name): + cur_sum += sum(getsize(join(root, name)) for name in files) + return cur_sum + if __name__ == "__main__": publisher.port = 6380 @@ -29,19 +44,36 @@ if __name__ == "__main__": p = Process(config_section) # Indexer configuration - index dir and schema setup - indexpath = os.path.join(os.environ['AIL_HOME'], + baseindexpath = os.path.join(os.environ['AIL_HOME'], p.config.get("Indexer", "path")) + indexRegister_path = os.path.join(os.environ['AIL_HOME'], + p.config.get("Indexer", "register")) indexertype = p.config.get("Indexer", "type") if indexertype == "whoosh": schema = Schema(title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT) - if not os.path.exists(indexpath): - os.mkdir(indexpath) - if not exists_in(indexpath): - ix = create_in(indexpath, schema) - else: - ix = open_dir(indexpath) + if not os.path.exists(baseindexpath): + os.mkdir(baseindexpath) + + # create the index register if not present + if not os.path.isfile(indexRegister_path): + with open(indexRegister_path, 'w') as f: + f.write("1") + + with open(indexRegister_path, "r") as f: + allIndex = f.read() + allIndex = allIndex.split(',') + allIndex.sort() + indexnum = int(allIndex[-1]) + + indexpath = os.path.join(baseindexpath, "index_"+str(indexnum)) + if not exists_in(indexpath): + ix = create_in(indexpath, schema) + else: + ix = open_dir(indexpath) + + last_refresh = time.time() # LOGGING # publisher.info("ZMQ Indexer is Running") @@ -59,6 +91,18 @@ if __name__ == "__main__": docpath = message.split(" ", -1)[-1] paste = PST.get_p_content() print "Indexing :", docpath + + + if time.time() - last_refresh > TIME_WAIT: #avoid calculating the index's size at each message + last_refresh = time.time() + if check_index_size(indexnum) > INDEX_SIZE_THRESHOLD*(1000*1000): + indexpath = os.path.join(baseindexpath, "index_"+str(indexnum+1)) + ix = create_in(indexpath, schema, indexname=str(indexnum+1)) + ## Correctly handle the file + with open(indexRegister_path, "a") as f: + f.write(","+str(indexnum)) + + if indexertype == "whoosh": indexwriter = ix.writer() indexwriter.update_document( From e4757f5ceb78378a79fd5f1183e994a0dd92b612 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Wed, 15 Mar 2017 09:39:48 +0100 Subject: [PATCH 02/10] Added possibility to choose the index in search + Updated search page with jinja2 iter0 --- var/www/Flasks/Flask_search.py | 57 ++++++++++++++++++++++++++++---- var/www/templates/search.html | 46 +++++++++++++++++++++----- var/www/templates/searchbox.html | 1 + 3 files changed, 90 insertions(+), 14 deletions(-) diff --git a/var/www/Flasks/Flask_search.py b/var/www/Flasks/Flask_search.py index b5c60898..3d28b4cc 100644 --- a/var/www/Flasks/Flask_search.py +++ b/var/www/Flasks/Flask_search.py @@ -20,7 +20,34 @@ cfg = Flask_config.cfg r_serv_pasteName = Flask_config.r_serv_pasteName max_preview_char = Flask_config.max_preview_char max_preview_modal = Flask_config.max_preview_modal + + +baseindexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path")) +indexRegister_path = os.path.join(os.environ['AIL_HOME'], + cfg.get("Indexer", "register")) + # ============ FUNCTIONS ============ +def get_current_index(): + with open(indexRegister_path, "r") as f: + allIndex = f.read() + allIndex = allIndex.split(',') + allIndex.sort() + indexnum = int(allIndex[-1]) + indexpath = os.path.join(baseindexpath, "index_"+str(indexnum)) + return indexpath + +def get_index_list(selected_index=""): + index_list = [] + for dirs in os.listdir(baseindexpath): + if os.path.isdir(os.path.join(baseindexpath, dirs)): + index_list.append([ dirs, dirs + " - " + str(get_dir_size(dirs) / (1000*1000)) + " Mb", dirs==selected_index.split('/')[-1]]) + return index_list + +def get_dir_size(directory): + cur_sum = 0 + for directory, subdirs, files in os.walk(os.path.join(baseindexpath,directory)): + cur_sum += sum(os.path.getsize(os.path.join(directory, name)) for name in files) + return cur_sum # ============ ROUTES ============ @@ -34,8 +61,15 @@ def search(): c = [] #preview of the paste content paste_date = [] paste_size = [] + index_num = request.form['index_num'] num_elem_to_get = 50 + # select correct index + if index_num is None or index_num == "0": + selected_index = get_current_index() + else: + selected_index = os.path.join(baseindexpath, index_num) + # Search filename for path in r_serv_pasteName.smembers(q[0]): r.append(path) @@ -53,8 +87,7 @@ def search(): from whoosh.fields import Schema, TEXT, ID schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT) - indexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path")) - ix = index.open_dir(indexpath) + ix = index.open_dir(selected_index) from whoosh.qparser import QueryParser with ix.searcher() as searcher: query = QueryParser("content", ix.schema).parse(" ".join(q)) @@ -72,7 +105,14 @@ def search(): results = searcher.search(query) num_res = len(results) - return render_template("search.html", r=r, c=c, query=request.form['query'], paste_date=paste_date, paste_size=paste_size, char_to_display=max_preview_modal, num_res=num_res) + index_min = 1 + index_max = len(get_index_list()) + return render_template("search.html", r=r, c=c, + query=request.form['query'], paste_date=paste_date, + paste_size=paste_size, char_to_display=max_preview_modal, + num_res=num_res, index_min=index_min, index_max=index_max, + index_list=get_index_list(selected_index) + ) @app.route("/get_more_search_result", methods=['POST']) @@ -81,8 +121,15 @@ def get_more_search_result(): q = [] q.append(query) page_offset = int(request.form['page_offset']) + index_num = request.form['index_num'] num_elem_to_get = 50 + # select correct index + if index_num is None or index_num == "0": + selected_index = get_current_index() + else: + selected_index = os.path.join(baseindexpath, index_num) + path_array = [] preview_array = [] date_array = [] @@ -92,8 +139,7 @@ def get_more_search_result(): from whoosh.fields import Schema, TEXT, ID schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT) - indexpath = os.path.join(os.environ['AIL_HOME'], cfg.get("Indexer", "path")) - ix = index.open_dir(indexpath) + ix = index.open_dir(selected_index) from whoosh.qparser import QueryParser with ix.searcher() as searcher: query = QueryParser("content", ix.schema).parse(" ".join(q)) @@ -113,7 +159,6 @@ def get_more_search_result(): to_return["preview_array"] = preview_array to_return["date_array"] = date_array to_return["size_array"] = size_array - print "len(path_array)="+str(len(path_array)) if len(path_array) < num_elem_to_get: #pagelength to_return["moreData"] = False else: diff --git a/var/www/templates/search.html b/var/www/templates/search.html index 9b43967d..7d78c83e 100644 --- a/var/www/templates/search.html +++ b/var/www/templates/search.html @@ -89,6 +89,16 @@
+
+
+ Index: + +
+
@@ -100,16 +110,14 @@ - {% set i = 0 %} {% for path in r %} - - - - - + + + + + - {% set i = i + 1 %} {% endfor %}
{{ i + 1 }} {{ path }}{{ paste_date[i] }}{{ paste_size[i] }}

{{ loop.index0 + 1 }} {{ path }}{{ paste_date[loop.index0] }}{{ paste_size[loop.index0] }}

@@ -157,6 +165,28 @@ if (init_num_of_elements_in_table == pagelen) { $("#load_more_json_button1").show(); } + + $('#index_num').on('change', function() { + var form = document.createElement('form'); + form.setAttribute("method", 'post'); + form.setAttribute("action", "{{ url_for('search') }}"); + + var input1 = document.createElement('input'); + input1.setAttribute("type", "hidden"); + input1.setAttribute("name", "index_num"); + input1.setAttribute("value", this.value); + form.appendChild(input1); + + var input2 = document.createElement('input'); + input2.setAttribute("type", "hidden"); + input2.setAttribute("name", "query"); + input2.setAttribute("value", "{{ query }}"); + form.appendChild(input2); + + document.body.appendChild(form); + form.submit(); + }) + }); @@ -171,7 +201,7 @@ } function load_search_50_data() { - var options = { query: query, page_offset: page_offset }; + var options = { query: query, page_offset: page_offset, index_num: $("#index_num").val() }; $.post( "{{ url_for('get_more_search_result') }}", options).done(function( data ) { for(i=0; i