mirror of https://github.com/CIRCL/lookyloo
chg: Properly use archived indexes for stats
parent
858bff544a
commit
18fbbb4e7b
|
@ -1426,10 +1426,11 @@ class Lookyloo():
|
||||||
today = date.today()
|
today = date.today()
|
||||||
calendar_week = today.isocalendar()[1]
|
calendar_week = today.isocalendar()[1]
|
||||||
|
|
||||||
stats_dict = {'submissions': 0, 'submissions_with_redirects': 0, 'redirects': 0}
|
stats_dict = {'submissions': 0, 'redirects': 0}
|
||||||
stats: Dict[int, Dict[int, Dict[str, Any]]] = {}
|
stats: Dict[int, Dict[int, Dict[str, Any]]] = {}
|
||||||
weeks_stats: Dict[int, Dict] = {}
|
weeks_stats: Dict[int, Dict] = {}
|
||||||
|
|
||||||
|
# Only recent captures that are not archived
|
||||||
for cache in self.sorted_capture_cache():
|
for cache in self.sorted_capture_cache():
|
||||||
if not hasattr(cache, 'timestamp'):
|
if not hasattr(cache, 'timestamp'):
|
||||||
continue
|
continue
|
||||||
|
@ -1443,7 +1444,6 @@ class Lookyloo():
|
||||||
stats[date_submission.year][date_submission.month]['submissions'] += 1
|
stats[date_submission.year][date_submission.month]['submissions'] += 1
|
||||||
stats[date_submission.year][date_submission.month]['uniq_urls'].add(cache.url)
|
stats[date_submission.year][date_submission.month]['uniq_urls'].add(cache.url)
|
||||||
if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
|
if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
|
||||||
stats[date_submission.year][date_submission.month]['submissions_with_redirects'] += 1
|
|
||||||
stats[date_submission.year][date_submission.month]['redirects'] += len(cache.redirects)
|
stats[date_submission.year][date_submission.month]['redirects'] += len(cache.redirects)
|
||||||
stats[date_submission.year][date_submission.month]['uniq_urls'].update(cache.redirects)
|
stats[date_submission.year][date_submission.month]['uniq_urls'].update(cache.redirects)
|
||||||
|
|
||||||
|
@ -1455,10 +1455,19 @@ class Lookyloo():
|
||||||
weeks_stats[date_submission.isocalendar()[1]]['submissions'] += 1
|
weeks_stats[date_submission.isocalendar()[1]]['submissions'] += 1
|
||||||
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].add(cache.url)
|
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].add(cache.url)
|
||||||
if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
|
if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
|
||||||
weeks_stats[date_submission.isocalendar()[1]]['submissions_with_redirects'] += 1
|
|
||||||
weeks_stats[date_submission.isocalendar()[1]]['redirects'] += len(cache.redirects)
|
weeks_stats[date_submission.isocalendar()[1]]['redirects'] += len(cache.redirects)
|
||||||
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].update(cache.redirects)
|
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].update(cache.redirects)
|
||||||
|
|
||||||
|
# Build limited stats based on archived captures and the indexes
|
||||||
|
archives_stats: Dict[int, Dict[int, int]] = defaultdict(lambda: defaultdict(int))
|
||||||
|
for _, capture_path in self.redis.hscan_iter('lookup_dirs_archived'):
|
||||||
|
capture_ts = datetime.fromisoformat(capture_path.rsplit('/', 1)[-1])
|
||||||
|
if capture_ts.year not in stats:
|
||||||
|
stats[capture_ts.year] = {}
|
||||||
|
if capture_ts.month not in stats[capture_ts.year]:
|
||||||
|
stats[capture_ts.year][capture_ts.month] = {'submissions': 0}
|
||||||
|
archives_stats[capture_ts.year][capture_ts.month] += 1
|
||||||
|
|
||||||
statistics: Dict[str, List] = {'weeks': [], 'years': []}
|
statistics: Dict[str, List] = {'weeks': [], 'years': []}
|
||||||
for week_number in sorted(weeks_stats.keys()):
|
for week_number in sorted(weeks_stats.keys()):
|
||||||
week_stat = weeks_stats[week_number]
|
week_stat = weeks_stats[week_number]
|
||||||
|
@ -1472,15 +1481,21 @@ class Lookyloo():
|
||||||
year_stats: Dict[str, Union[int, List]] = {'year': year, 'months': [], 'yearly_submissions': 0, 'yearly_redirects': 0}
|
year_stats: Dict[str, Union[int, List]] = {'year': year, 'months': [], 'yearly_submissions': 0, 'yearly_redirects': 0}
|
||||||
for month in sorted(stats[year].keys()):
|
for month in sorted(stats[year].keys()):
|
||||||
month_stats = stats[year][month]
|
month_stats = stats[year][month]
|
||||||
|
if len(month_stats) == 1:
|
||||||
|
# archived captures, missing many values
|
||||||
|
month_stats['month_number'] = month
|
||||||
|
else:
|
||||||
urls = month_stats.pop('uniq_urls')
|
urls = month_stats.pop('uniq_urls')
|
||||||
month_stats['month_number'] = month
|
month_stats['month_number'] = month
|
||||||
month_stats['uniq_urls'] = len(urls)
|
month_stats['uniq_urls'] = len(urls)
|
||||||
month_stats['uniq_domains'] = len(uniq_domains(urls))
|
month_stats['uniq_domains'] = len(uniq_domains(urls))
|
||||||
year_stats['months'].append(month_stats) # type: ignore
|
|
||||||
|
|
||||||
|
year_stats['months'].append(month_stats) # type: ignore
|
||||||
year_stats['yearly_submissions'] += month_stats['submissions']
|
year_stats['yearly_submissions'] += month_stats['submissions']
|
||||||
|
if 'redirects' in month_stats:
|
||||||
year_stats['yearly_redirects'] += month_stats['redirects']
|
year_stats['yearly_redirects'] += month_stats['redirects']
|
||||||
statistics['years'].append(year_stats)
|
statistics['years'].append(year_stats)
|
||||||
|
|
||||||
return statistics
|
return statistics
|
||||||
|
|
||||||
def store_capture(self, uuid: str, is_public: bool,
|
def store_capture(self, uuid: str, is_public: bool,
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
"redirect.png": "PAjzlPV97rEFvH55mG1ZC9wRl98be3yMeX/nENuFkJcds6/AXgSR2ig/QyPULgobSnNgiYieLVWY/oqsgeywrQ==",
|
"redirect.png": "PAjzlPV97rEFvH55mG1ZC9wRl98be3yMeX/nENuFkJcds6/AXgSR2ig/QyPULgobSnNgiYieLVWY/oqsgeywrQ==",
|
||||||
"secure.svg": "H8ni7t0d60nCJDVGuZpuxC+RBy/ipAjWT627D12HlZGg6LUmjSwPTQTUekm3UJupEP7TUkhXyq6WHc5gy7QBjg==",
|
"secure.svg": "H8ni7t0d60nCJDVGuZpuxC+RBy/ipAjWT627D12HlZGg6LUmjSwPTQTUekm3UJupEP7TUkhXyq6WHc5gy7QBjg==",
|
||||||
"stats.css": "/kY943FwWBTne4IIyf7iBROSfbGd82TeBicEXqKkRwawMVRIvM/Pk5MRa7okUyGIxaDjFQGmV/U1vy+PhN6Jbw==",
|
"stats.css": "/kY943FwWBTne4IIyf7iBROSfbGd82TeBicEXqKkRwawMVRIvM/Pk5MRa7okUyGIxaDjFQGmV/U1vy+PhN6Jbw==",
|
||||||
"stats_graph.js": "0OEouA6NAxLG2wMd7D2vtGoMrXKna7My98Euc6ecyfdO4/6mIJS87vzISOS4zSZ8u4ehpa+p7E0nWhsXXE7H/Q==",
|
"stats_graph.js": "Sxmr0yKWu5oSu3tUvpjN0nbFs0J/cHOyiOB+yOTcDr2+5L39bd6bqmD73JsSL4g/Z4B42IZrSq4iqF9M0jiYCA==",
|
||||||
"tree.css": "R/pWQnE8kMacDrzGy/NpA1tJoHAZpOjFiX6dqtBe+PqAnhYMn1CIQzODh8Ifvh6hBTPLRWX3bsabfEvfaI7Z6A==",
|
"tree.css": "R/pWQnE8kMacDrzGy/NpA1tJoHAZpOjFiX6dqtBe+PqAnhYMn1CIQzODh8Ifvh6hBTPLRWX3bsabfEvfaI7Z6A==",
|
||||||
"tree.js": "yutMvl9vXhN0htkl40tRHF3r5LCDsH+MJplB6ihMENIsMryEyVoTr4Zq1UadgMG5tP+W2/jqm2zfXh414Kk8AA==",
|
"tree.js": "yutMvl9vXhN0htkl40tRHF3r5LCDsH+MJplB6ihMENIsMryEyVoTr4Zq1UadgMG5tP+W2/jqm2zfXh414Kk8AA==",
|
||||||
"up.jpg": "d1ljZJ9f5JekyM6RLFFH2Ua44j6neiQBdUIXOenRTjGppQr3JaeglpQIH6BjPCJL177+TH52U3UIRNS5YAyKIg==",
|
"up.jpg": "d1ljZJ9f5JekyM6RLFFH2Ua44j6neiQBdUIXOenRTjGppQr3JaeglpQIH6BjPCJL177+TH52U3UIRNS5YAyKIg==",
|
||||||
|
|
|
@ -14,7 +14,9 @@ d3.json('/json/stats').then(json => {
|
||||||
unique_urls_year.x.push(month.month_number)
|
unique_urls_year.x.push(month.month_number)
|
||||||
|
|
||||||
submissions_year.y.push(month.submissions)
|
submissions_year.y.push(month.submissions)
|
||||||
|
if (month.uniq_urls != null) {
|
||||||
unique_urls_year.y.push(month.uniq_urls)
|
unique_urls_year.y.push(month.uniq_urls)
|
||||||
|
}
|
||||||
});
|
});
|
||||||
datasets.push(submissions_year)
|
datasets.push(submissions_year)
|
||||||
datasets.push(unique_urls_year)
|
datasets.push(unique_urls_year)
|
||||||
|
@ -71,7 +73,10 @@ d3.json('/json/stats').then(json => {
|
||||||
data_lines.append("text")
|
data_lines.append("text")
|
||||||
.datum((d, i) => { return {name: datasets[i].label, final: d[d.length-1]}; })
|
.datum((d, i) => { return {name: datasets[i].label, final: d[d.length-1]}; })
|
||||||
.attr("transform", d => {
|
.attr("transform", d => {
|
||||||
return ( `translate(${x_scale(d.final[0])}, ${y_scale(d.final[1])})` ) ; })
|
if (d.final != null) {
|
||||||
|
return ( `translate(${x_scale(d.final[0])}, ${y_scale(d.final[1])})` ) ;
|
||||||
|
}
|
||||||
|
})
|
||||||
.attr("x", 3)
|
.attr("x", 3)
|
||||||
.attr("dy", ".35em")
|
.attr("dy", ".35em")
|
||||||
.attr("fill", (_, i) =>{ return d3.schemeCategory10[i]; })
|
.attr("fill", (_, i) =>{ return d3.schemeCategory10[i]; })
|
||||||
|
|
|
@ -11,7 +11,6 @@
|
||||||
<thead>
|
<thead>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Submissions</th>
|
<th>Submissions</th>
|
||||||
<th>Submissions with redirects</th>
|
|
||||||
<th>Redirects</th>
|
<th>Redirects</th>
|
||||||
<th>Unique urls (including redirects)</th>
|
<th>Unique urls (including redirects)</th>
|
||||||
<th>Unique domains (including redirects)</th>
|
<th>Unique domains (including redirects)</th>
|
||||||
|
@ -20,7 +19,6 @@
|
||||||
<tbody>
|
<tbody>
|
||||||
<tr>
|
<tr>
|
||||||
<td> {{ week['submissions'] }} </td>
|
<td> {{ week['submissions'] }} </td>
|
||||||
<td> {{ week['submissions_with_redirects'] }} </td>
|
|
||||||
<td> {{ week['redirects'] }} </td>
|
<td> {{ week['redirects'] }} </td>
|
||||||
<td> {{ week['uniq_urls'] }} </td>
|
<td> {{ week['uniq_urls'] }} </td>
|
||||||
<td> {{ week['uniq_domains'] }} </td>
|
<td> {{ week['uniq_domains'] }} </td>
|
||||||
|
@ -45,7 +43,6 @@
|
||||||
<tr>
|
<tr>
|
||||||
<th>Month</th>
|
<th>Month</th>
|
||||||
<th>Submissions</th>
|
<th>Submissions</th>
|
||||||
<th>Submissions with redirects</th>
|
|
||||||
<th>Redirects</th>
|
<th>Redirects</th>
|
||||||
<th>Unique urls (including redirects)</th>
|
<th>Unique urls (including redirects)</th>
|
||||||
<th>Unique domains (including redirects)</th>
|
<th>Unique domains (including redirects)</th>
|
||||||
|
@ -56,7 +53,6 @@
|
||||||
<tr>
|
<tr>
|
||||||
<td> {{ month_name(month['month_number']) }} </td>
|
<td> {{ month_name(month['month_number']) }} </td>
|
||||||
<td> {{ month['submissions'] }} </td>
|
<td> {{ month['submissions'] }} </td>
|
||||||
<td> {{ month['submissions_with_redirects'] }} </td>
|
|
||||||
<td> {{ month['redirects'] }} </td>
|
<td> {{ month['redirects'] }} </td>
|
||||||
<td> {{ month['uniq_urls'] }} </td>
|
<td> {{ month['uniq_urls'] }} </td>
|
||||||
<td> {{ month['uniq_domains'] }} </td>
|
<td> {{ month['uniq_domains'] }} </td>
|
||||||
|
|
Loading…
Reference in New Issue