chg: Properly use archived indexes for stats

pull/860/head
Raphaël Vinot 2024-01-08 12:32:52 +01:00
parent 858bff544a
commit 18fbbb4e7b
4 changed files with 32 additions and 16 deletions

View File

@ -1426,10 +1426,11 @@ class Lookyloo():
today = date.today() today = date.today()
calendar_week = today.isocalendar()[1] calendar_week = today.isocalendar()[1]
stats_dict = {'submissions': 0, 'submissions_with_redirects': 0, 'redirects': 0} stats_dict = {'submissions': 0, 'redirects': 0}
stats: Dict[int, Dict[int, Dict[str, Any]]] = {} stats: Dict[int, Dict[int, Dict[str, Any]]] = {}
weeks_stats: Dict[int, Dict] = {} weeks_stats: Dict[int, Dict] = {}
# Only recent captures that are not archived
for cache in self.sorted_capture_cache(): for cache in self.sorted_capture_cache():
if not hasattr(cache, 'timestamp'): if not hasattr(cache, 'timestamp'):
continue continue
@ -1443,7 +1444,6 @@ class Lookyloo():
stats[date_submission.year][date_submission.month]['submissions'] += 1 stats[date_submission.year][date_submission.month]['submissions'] += 1
stats[date_submission.year][date_submission.month]['uniq_urls'].add(cache.url) stats[date_submission.year][date_submission.month]['uniq_urls'].add(cache.url)
if hasattr(cache, 'redirects') and len(cache.redirects) > 0: if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
stats[date_submission.year][date_submission.month]['submissions_with_redirects'] += 1
stats[date_submission.year][date_submission.month]['redirects'] += len(cache.redirects) stats[date_submission.year][date_submission.month]['redirects'] += len(cache.redirects)
stats[date_submission.year][date_submission.month]['uniq_urls'].update(cache.redirects) stats[date_submission.year][date_submission.month]['uniq_urls'].update(cache.redirects)
@ -1455,10 +1455,19 @@ class Lookyloo():
weeks_stats[date_submission.isocalendar()[1]]['submissions'] += 1 weeks_stats[date_submission.isocalendar()[1]]['submissions'] += 1
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].add(cache.url) weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].add(cache.url)
if hasattr(cache, 'redirects') and len(cache.redirects) > 0: if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
weeks_stats[date_submission.isocalendar()[1]]['submissions_with_redirects'] += 1
weeks_stats[date_submission.isocalendar()[1]]['redirects'] += len(cache.redirects) weeks_stats[date_submission.isocalendar()[1]]['redirects'] += len(cache.redirects)
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].update(cache.redirects) weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].update(cache.redirects)
# Build limited stats based on archived captures and the indexes
archives_stats: Dict[int, Dict[int, int]] = defaultdict(lambda: defaultdict(int))
for _, capture_path in self.redis.hscan_iter('lookup_dirs_archived'):
capture_ts = datetime.fromisoformat(capture_path.rsplit('/', 1)[-1])
if capture_ts.year not in stats:
stats[capture_ts.year] = {}
if capture_ts.month not in stats[capture_ts.year]:
stats[capture_ts.year][capture_ts.month] = {'submissions': 0}
archives_stats[capture_ts.year][capture_ts.month] += 1
statistics: Dict[str, List] = {'weeks': [], 'years': []} statistics: Dict[str, List] = {'weeks': [], 'years': []}
for week_number in sorted(weeks_stats.keys()): for week_number in sorted(weeks_stats.keys()):
week_stat = weeks_stats[week_number] week_stat = weeks_stats[week_number]
@ -1472,15 +1481,21 @@ class Lookyloo():
year_stats: Dict[str, Union[int, List]] = {'year': year, 'months': [], 'yearly_submissions': 0, 'yearly_redirects': 0} year_stats: Dict[str, Union[int, List]] = {'year': year, 'months': [], 'yearly_submissions': 0, 'yearly_redirects': 0}
for month in sorted(stats[year].keys()): for month in sorted(stats[year].keys()):
month_stats = stats[year][month] month_stats = stats[year][month]
if len(month_stats) == 1:
# archived captures, missing many values
month_stats['month_number'] = month
else:
urls = month_stats.pop('uniq_urls') urls = month_stats.pop('uniq_urls')
month_stats['month_number'] = month month_stats['month_number'] = month
month_stats['uniq_urls'] = len(urls) month_stats['uniq_urls'] = len(urls)
month_stats['uniq_domains'] = len(uniq_domains(urls)) month_stats['uniq_domains'] = len(uniq_domains(urls))
year_stats['months'].append(month_stats) # type: ignore
year_stats['months'].append(month_stats) # type: ignore
year_stats['yearly_submissions'] += month_stats['submissions'] year_stats['yearly_submissions'] += month_stats['submissions']
if 'redirects' in month_stats:
year_stats['yearly_redirects'] += month_stats['redirects'] year_stats['yearly_redirects'] += month_stats['redirects']
statistics['years'].append(year_stats) statistics['years'].append(year_stats)
return statistics return statistics
def store_capture(self, uuid: str, is_public: bool, def store_capture(self, uuid: str, is_public: bool,

View File

@ -34,7 +34,7 @@
"redirect.png": "PAjzlPV97rEFvH55mG1ZC9wRl98be3yMeX/nENuFkJcds6/AXgSR2ig/QyPULgobSnNgiYieLVWY/oqsgeywrQ==", "redirect.png": "PAjzlPV97rEFvH55mG1ZC9wRl98be3yMeX/nENuFkJcds6/AXgSR2ig/QyPULgobSnNgiYieLVWY/oqsgeywrQ==",
"secure.svg": "H8ni7t0d60nCJDVGuZpuxC+RBy/ipAjWT627D12HlZGg6LUmjSwPTQTUekm3UJupEP7TUkhXyq6WHc5gy7QBjg==", "secure.svg": "H8ni7t0d60nCJDVGuZpuxC+RBy/ipAjWT627D12HlZGg6LUmjSwPTQTUekm3UJupEP7TUkhXyq6WHc5gy7QBjg==",
"stats.css": "/kY943FwWBTne4IIyf7iBROSfbGd82TeBicEXqKkRwawMVRIvM/Pk5MRa7okUyGIxaDjFQGmV/U1vy+PhN6Jbw==", "stats.css": "/kY943FwWBTne4IIyf7iBROSfbGd82TeBicEXqKkRwawMVRIvM/Pk5MRa7okUyGIxaDjFQGmV/U1vy+PhN6Jbw==",
"stats_graph.js": "0OEouA6NAxLG2wMd7D2vtGoMrXKna7My98Euc6ecyfdO4/6mIJS87vzISOS4zSZ8u4ehpa+p7E0nWhsXXE7H/Q==", "stats_graph.js": "Sxmr0yKWu5oSu3tUvpjN0nbFs0J/cHOyiOB+yOTcDr2+5L39bd6bqmD73JsSL4g/Z4B42IZrSq4iqF9M0jiYCA==",
"tree.css": "R/pWQnE8kMacDrzGy/NpA1tJoHAZpOjFiX6dqtBe+PqAnhYMn1CIQzODh8Ifvh6hBTPLRWX3bsabfEvfaI7Z6A==", "tree.css": "R/pWQnE8kMacDrzGy/NpA1tJoHAZpOjFiX6dqtBe+PqAnhYMn1CIQzODh8Ifvh6hBTPLRWX3bsabfEvfaI7Z6A==",
"tree.js": "yutMvl9vXhN0htkl40tRHF3r5LCDsH+MJplB6ihMENIsMryEyVoTr4Zq1UadgMG5tP+W2/jqm2zfXh414Kk8AA==", "tree.js": "yutMvl9vXhN0htkl40tRHF3r5LCDsH+MJplB6ihMENIsMryEyVoTr4Zq1UadgMG5tP+W2/jqm2zfXh414Kk8AA==",
"up.jpg": "d1ljZJ9f5JekyM6RLFFH2Ua44j6neiQBdUIXOenRTjGppQr3JaeglpQIH6BjPCJL177+TH52U3UIRNS5YAyKIg==", "up.jpg": "d1ljZJ9f5JekyM6RLFFH2Ua44j6neiQBdUIXOenRTjGppQr3JaeglpQIH6BjPCJL177+TH52U3UIRNS5YAyKIg==",

View File

@ -14,7 +14,9 @@ d3.json('/json/stats').then(json => {
unique_urls_year.x.push(month.month_number) unique_urls_year.x.push(month.month_number)
submissions_year.y.push(month.submissions) submissions_year.y.push(month.submissions)
if (month.uniq_urls != null) {
unique_urls_year.y.push(month.uniq_urls) unique_urls_year.y.push(month.uniq_urls)
}
}); });
datasets.push(submissions_year) datasets.push(submissions_year)
datasets.push(unique_urls_year) datasets.push(unique_urls_year)
@ -71,7 +73,10 @@ d3.json('/json/stats').then(json => {
data_lines.append("text") data_lines.append("text")
.datum((d, i) => { return {name: datasets[i].label, final: d[d.length-1]}; }) .datum((d, i) => { return {name: datasets[i].label, final: d[d.length-1]}; })
.attr("transform", d => { .attr("transform", d => {
return ( `translate(${x_scale(d.final[0])}, ${y_scale(d.final[1])})` ) ; }) if (d.final != null) {
return ( `translate(${x_scale(d.final[0])}, ${y_scale(d.final[1])})` ) ;
}
})
.attr("x", 3) .attr("x", 3)
.attr("dy", ".35em") .attr("dy", ".35em")
.attr("fill", (_, i) =>{ return d3.schemeCategory10[i]; }) .attr("fill", (_, i) =>{ return d3.schemeCategory10[i]; })

View File

@ -11,7 +11,6 @@
<thead> <thead>
<tr> <tr>
<th>Submissions</th> <th>Submissions</th>
<th>Submissions with redirects</th>
<th>Redirects</th> <th>Redirects</th>
<th>Unique urls (including redirects)</th> <th>Unique urls (including redirects)</th>
<th>Unique domains (including redirects)</th> <th>Unique domains (including redirects)</th>
@ -20,7 +19,6 @@
<tbody> <tbody>
<tr> <tr>
<td> {{ week['submissions'] }} </td> <td> {{ week['submissions'] }} </td>
<td> {{ week['submissions_with_redirects'] }} </td>
<td> {{ week['redirects'] }} </td> <td> {{ week['redirects'] }} </td>
<td> {{ week['uniq_urls'] }} </td> <td> {{ week['uniq_urls'] }} </td>
<td> {{ week['uniq_domains'] }} </td> <td> {{ week['uniq_domains'] }} </td>
@ -45,7 +43,6 @@
<tr> <tr>
<th>Month</th> <th>Month</th>
<th>Submissions</th> <th>Submissions</th>
<th>Submissions with redirects</th>
<th>Redirects</th> <th>Redirects</th>
<th>Unique urls (including redirects)</th> <th>Unique urls (including redirects)</th>
<th>Unique domains (including redirects)</th> <th>Unique domains (including redirects)</th>
@ -56,7 +53,6 @@
<tr> <tr>
<td> {{ month_name(month['month_number']) }} </td> <td> {{ month_name(month['month_number']) }} </td>
<td> {{ month['submissions'] }} </td> <td> {{ month['submissions'] }} </td>
<td> {{ month['submissions_with_redirects'] }} </td>
<td> {{ month['redirects'] }} </td> <td> {{ month['redirects'] }} </td>
<td> {{ month['uniq_urls'] }} </td> <td> {{ month['uniq_urls'] }} </td>
<td> {{ month['uniq_domains'] }} </td> <td> {{ month['uniq_domains'] }} </td>