Added DomainTrending; it seems to be working.

Started the search feature with its related HTML pages; not finished yet.
pull/57/head
Mokaddem 2016-07-05 16:53:03 +02:00
parent 8c1eeea6e6
commit 7ff9b9a583
8 changed files with 269 additions and 18 deletions

View File

@ -10,19 +10,72 @@ import re
import redis
import os
from packages import lib_words
from packages.Date import Date
from pubsublogger import publisher
from packages import Paste
from Helper import Process
from pyfaup.faup import Faup
def analyse(field_name):
# Config Var
threshold_need_to_look = 50
range_to_look = 10
threshold_to_plot = 1 #500%
to_plot = set()
clean_frequency = 10 #minutes
def analyse(server, field_name):
field = url_parsed[field_name]
if field is not None:
prev_score = r_serv1.hget(field, date)
prev_score = server.hget(field, date)
if prev_score is not None:
r_serv1.hset(field, date, int(prev_score) + 1)
server.hset(field, date, int(prev_score) + 1)
else:
r_serv1.hset(field, date, 1)
server.hset(field, date, 1)
def analyse_and_progression(server, field_name):
    """Count one more occurrence of a parsed URL field and track its trend.

    The field value is read from the module-level ``url_parsed`` mapping and
    its counter for the module-level ``date`` is stored in the hash named
    after that value.

    :param server: -- connection exposing hget/hset for the counters
    :param field_name: -- key to look up in ``url_parsed`` (e.g. 'domain')

    When the incremented counter exceeds ``threshold_need_to_look`` and
    check_for_progression() agrees, the value is added to ``to_plot``.
    """
    field = url_parsed[field_name]
    if field is None:
        return

    prev_score = server.hget(field, date)
    if prev_score is None:
        # First occurrence of this value for the day.
        server.hset(field, date, 1)
        return

    print(field + ' prev_score:' + prev_score)
    new_score = int(prev_score) + 1
    server.hset(field, date, new_score)

    # The threshold filters out rarely seen values (false positives) before
    # the more expensive progression check is made.
    if new_score > threshold_need_to_look and check_for_progression(server, field, date):
        to_plot.add(field)
def check_for_progression(server, field, date):
    """Tell whether ``field`` qualifies for plotting from its recent counters.

    Starting at ``date`` and going back one day at a time (at most
    ``range_to_look`` days), the per-day counters stored in the hash
    ``field`` are summed; the walk stops at the first day without an entry.
    The field qualifies when that sum, integer-divided by the counter of
    ``date`` itself, reaches ``threshold_to_plot``.

    :param server: -- connection exposing hget for the counters
    :param field: -- name of the hash holding the per-day counters
    :param date: -- day to start from, handed to ``Date(...)``
                    (presumably a 'YYYYMMDD' string -- TODO confirm)
    :return: -- True if the field qualifies, False otherwise; False is also
                returned when ``date`` has no counter or a counter of 0
    """
    tot_sum = 0
    today_val = None
    for i in range(range_to_look):
        curr_value = server.hget(field, Date(date).substract_day(i))
        if curr_value is None:  # no further data
            break
        curr_value = int(curr_value)
        tot_sum += curr_value
        if i == 0:
            today_val = curr_value

    print('totsum=' + str(tot_sum))
    # Without a usable value for the starting day there is nothing to divide
    # by: the ratio is undefined, so the field cannot qualify.
    if not today_val:
        return False

    # Floor division keeps the integer semantics the ratio had under Python 2.
    ratio = tot_sum // today_val
    print('div=' + str(ratio))
    return ratio >= threshold_to_plot
def clean_to_plot(server=None):
    """Remove from ``to_plot`` the fields that no longer show a progression.

    Every element of the module-level ``to_plot`` set is re-checked with
    check_for_progression() for today's date and discarded when the check
    fails.

    :param server: -- connection holding the counters; when omitted the
                      module-level ``r_serv2`` is used
    """
    if server is None:
        # NOTE(review): the main loop records domain counters through
        # r_serv2, hence this default -- confirm it is the intended server.
        server = r_serv2

    # Zero-padded YYYYMMDD string, the same shape substract_day() produces.
    today = datetime.date.today().strftime("%Y%m%d")

    stale = [elem for elem in to_plot
             if not check_for_progression(server, elem, today)]

    # Update the set in place: rebinding ``to_plot`` inside this function
    # would turn it into a local name and make reading it fail.
    to_plot.difference_update(stale)
if __name__ == '__main__':
# If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
@ -45,6 +98,11 @@ if __name__ == '__main__':
host=p.config.get("Redis_Level_DB", "host"),
port=p.config.get("Redis_Level_DB", "port"),
db=p.config.get("Redis_Level_DB", "db"))
r_serv2 = redis.StrictRedis(
host=p.config.get("Redis_Level_DB_Domain", "host"),
port=p.config.get("Redis_Level_DB_Domain", "port"),
db=p.config.get("Redis_Level_DB_Domain", "db"))
# FILE CURVE SECTION #
csv_path_proto = os.path.join(os.environ['AIL_HOME'],
@ -57,6 +115,10 @@ if __name__ == '__main__':
tldsfile_path = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "tldsfile"))
csv_path_domain = os.path.join(os.environ['AIL_HOME'],
p.config.get("Directories", "domainstrending_csv"))
faup = Faup()
generate_new_graph = False
# Endless loop getting messages from the input queue
@ -71,17 +133,22 @@ if __name__ == '__main__':
today = datetime.date.today()
year = today.year
month = today.month
print 'b1'
lib_words.create_curve_with_word_file(r_serv1, csv_path_proto,
protocolsfile_path, year,
month)
print 'b2'
lib_words.create_curve_with_word_file(r_serv1, csv_path_tld,
tldsfile_path, year,
month)
print 'b3'
lib_words.create_curve_with_list(r_serv2, csv_path_domain,
to_plot, year,
month)
print 'end building'
publisher.debug("{} queue is empty, waiting".format(config_section))
time.sleep(1)
print 'sleeping'
time.sleep(5)
continue
else:
@ -91,5 +158,8 @@ if __name__ == '__main__':
faup.decode(url)
url_parsed = faup.get()
analyse('scheme') #Scheme analysis
analyse('tld') #Tld analysis
analyse(r_serv1, 'scheme') #Scheme analysis
analyse(r_serv1, 'tld') #Tld analysis
analyse_and_progression(r_serv2, 'domain') #Domain analysis
print "to_plot:"
print to_plot

View File

@ -30,3 +30,12 @@ class Date(object):
def _set_day(self, day):
self.day = day
def substract_day(self, numDay):
    """Return the date ``numDay`` days before this one.

    The result is a string made of the year followed by the month and the
    day, each zero-padded to two digits (e.g. '20160229').

    :param numDay: -- (integer) number of days to go back
    """
    import datetime

    current = datetime.date(int(self.year), int(self.month), int(self.day))
    earlier = current - datetime.timedelta(days=numDay)
    return "{0}{1:02d}{2:02d}".format(earlier.year, earlier.month, earlier.day)

View File

@ -186,7 +186,9 @@ class Paste(object):
if the paste doesn't contain any human dictionnary words
..seealso: git@github.com:saffsd/langid.py.git
"""
FIXME: This procedure is using more than 20% of CPU
"""
identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
return identifier.classify(self.get_p_content())

View File

@ -81,13 +81,14 @@ def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month
to keep the timeline of the curve correct.
"""
threshold = 50
first_day = date(year, month, 01)
last_day = date(year, month, calendar.monthrange(year, month)[1])
words = []
with open(feederfilename, 'rb') as f:
# words of the files
words = sorted([word.strip() for word in f])
words = sorted([word.strip() for word in f if word.strip()[0:2]!='//' ])
headers = ['Date'] + words
with open(csvfilename+'.csv', 'wb') as f:
@ -102,6 +103,47 @@ def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month
# from the 1srt day to the last of the list
for word in words:
value = r_serv.hget(word, curdate)
if value is None:
row.append(0)
else:
# if the word have a value for the day
# FIXME Due to performance issues (too many tlds, leads to more than 7s to perform this procedure), I added a threshold
if value >= threshold:
row.append(value)
writer.writerow(row)
def create_curve_with_list(server, csvfilename, to_plot, year, month):
"""Create a csv file used with dygraph.
:param r_serv: -- connexion to redis database
:param csvfilename: -- the path to the .csv file created
:param to_plot: -- the list which contain a words to plot.
:param year: -- (integer) The year to process
:param month: -- (integer) The month to process
This function create a .csv file using datas in redis.
It's checking if the words contained in to_plot and
their respectives values by days exists.
"""
first_day = date(year, month, 01)
last_day = date(year, month, calendar.monthrange(year, month)[1])
words = sorted(to_plot)
headers = ['Date'] + words
with open(csvfilename+'.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerow(headers)
# for each days
for dt in rrule(DAILY, dtstart=first_day, until=last_day):
row = []
curdate = dt.strftime("%Y%m%d")
row.append(curdate)
# from the 1srt day to the last of the list
for word in words:
value = server.hget(word, curdate)
if value is None:
row.append(0)
else:

View File

@ -7,7 +7,9 @@ import json
from flask import Flask, render_template, jsonify, request
import flask
import os
import sys
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
import Paste
# CONFIG #
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@ -18,6 +20,7 @@ if not os.path.exists(configfile):
cfg = ConfigParser.ConfigParser()
cfg.read(configfile)
max_preview_char = 500
# REDIS #
r_serv = redis.StrictRedis(
@ -49,6 +52,10 @@ def get_queues(r):
r.hgetall("queues").iteritems()]
def list_len(s):
    """Return the number of items in ``s`` (exposed to templates as a filter)."""
    size = len(s)
    return size
app.jinja_env.filters['list_len'] = list_len
@app.route("/_logs")
def logs():
return flask.Response(event_stream(), mimetype="text/event-stream")
@ -65,6 +72,7 @@ def search():
q = []
q.append(query)
r = []
c = []
# Search
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
@ -78,7 +86,10 @@ def search():
results = searcher.search(query, limit=None)
for x in results:
r.append(x.items()[0][1])
return render_template("search.html", r=r)
content = Paste.Paste(x.items()[0][1]).get_p_content()
content_range = max_preview_char if len(content)>max_preview_char else len(content)-1
c.append(content[0:content_range])
return render_template("search.html", r=r, c=c)
@app.route("/")
def index():
@ -104,6 +115,10 @@ def protocolstrending():
def tldstrending():
return render_template("Tldstrending.html")
@app.route("/showsavedpaste/")
def showsavedpaste():
    """Render the page showing the information of a saved paste."""
    page = render_template("show_saved_paste.html")
    return page
if __name__ == "__main__":
app.run(host='0.0.0.0', port=7000, threaded=True)

View File

@ -130,7 +130,7 @@
<!-- instanciate and plot graphs -->
<script type="text/javascript">
var graph_tld = new Graph("TldsTrending", "../static//csv/tldstrendingdata.csv");
var graph_domain = new Graph("DomainTrending", "../static//csv/tldstrendingdata.csv");
var graph_domain = new Graph("DomainTrending", "../static//csv/domainstrendingdata.csv");
</script>
</div>
<script src="{{ url_for('static', filename='js/bootstrap.min.js') }}"></script>

View File

@ -16,6 +16,16 @@
<script type="text/javascript" src="{{ url_for('static', filename='js/dygraph-combined.js') }}"></script>
<script src="{{ url_for('static', filename='js/jquery-1.4.2.js') }}"></script>
<script language="javascript" src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<style>
.tooltip-inner {
text-align: left;
height: 200%;
width: 200%;
max-width: 500px;
max-height: 500px;
font-size: 13px;
}
</style>
</head>
<body>
@ -39,6 +49,26 @@
</div>
<!-- /.navbar-static-side -->
</nav>
<!-- Modal -->
<div id="mymodal" class="modal fade" role="dialog">
<div class="modal-dialog modal-lg">
<!-- Modal content-->
<div id="mymodalcontent" class="modal-content">
<div id="mymodalbody" class="modal-body">
<p>Some text in the modal.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
<div id="page-wrapper">
<!-- /.row -->
<div class="row"> </div>
@ -53,10 +83,26 @@
</div>
<!-- /.panel-heading -->
<div class="panel-body">
<table class="table">
{% for result in r %}
<tr><td>{{ result }}</td></tr>
<table class="table table-hover">
<thead>
<tr>
<th>#</th>
<th>Path</th>
<th>Action</th>
</tr>
</thead>
<tbody>
{# A variable assigned with "set" inside a for loop does not carry over to
   the next iteration, so the loop's own counters (loop.index0 / loop.index)
   are used to number the rows and to pick the matching preview in c. #}
{% for path in r %}
{% set prev_content = c[loop.index0] %}
<tr>
<td>{{ loop.index }}</td>
<td><a target="_blank" href="{{ url_for('showsavedpaste', paste=path) }}"> {{ path }}</a></td>
<td><p><span class="glyphicon glyphicon-info-sign" data-toggle="tooltip" data-placement="left" title="{{ prev_content }}"></span> <button type="button" class="btn-link" data-toggle="modal" data-target="#mymodal" data-url="{{ url_for('showsavedpaste', paste=path) }}"><span class="fa fa-search-plus"></span></button></p></td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<!-- /.panel-body -->
@ -69,4 +115,23 @@
<script src="{{ url_for('static', filename='js/bootstrap.min.js') }}"></script>
</body>
<!-- enable tooltip -->
<script>
$(document).ready(function(){
$('[data-toggle="tooltip"]').tooltip();
});
</script>
<!-- Dynamically update the modal -->
<script type="text/javascript">
// On click, fetch the HTML found at the element's data-url and inject it
// into the body of the shared modal (#mymodalbody).
$("[data-toggle='modal']").on("click", function (event) {
    event.preventDefault();
    var url = $(this).attr('data-url');
    $.get(url, function (data) {
        $("#mymodalbody").html(data);
    });
});
</script>
</html>

View File

@ -0,0 +1,48 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>Paste information</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<h2> Paste: </h2>
<!-- Path of the paste, taken from the "paste" query-string argument -->
<h3> {{ request.args.get('paste') }} </h3>
<hr><br>
<table class="table table-condensed">
<thead>
<tr>
<th>Date</th>
<th>Source</th>
<th>Encoding</th>
<th>Language</th>
<th>Size</th>
<th>Mime</th>
<th>Number of line</th>
</tr>
</thead>
<tbody>
<!-- NOTE(review): the rows below look like placeholder sample data (3 cells
     per row against 7 header columns); presumably they are to be replaced
     by the metadata of the requested paste. -->
<tr>
<td>John</td>
<td>Doe</td>
<td>john@example.com</td>
</tr>
<tr>
<td>Mary</td>
<td>Moe</td>
<td>mary@example.com</td>
</tr>
<tr>
<td>July</td>
<td>Dooley</td>
<td>july@example.com</td>
</tr>
</tbody>
</table>
</body>
</html>