mirror of https://github.com/CIRCL/lookyloo
				
				
				
			chg: Properly use archived indexes for stats
							parent
							
								
									858bff544a
								
							
						
					
					
						commit
						18fbbb4e7b
					
				| 
						 | 
				
			
			@ -1426,10 +1426,11 @@ class Lookyloo():
 | 
			
		|||
        today = date.today()
 | 
			
		||||
        calendar_week = today.isocalendar()[1]
 | 
			
		||||
 | 
			
		||||
        stats_dict = {'submissions': 0, 'submissions_with_redirects': 0, 'redirects': 0}
 | 
			
		||||
        stats_dict = {'submissions': 0, 'redirects': 0}
 | 
			
		||||
        stats: Dict[int, Dict[int, Dict[str, Any]]] = {}
 | 
			
		||||
        weeks_stats: Dict[int, Dict] = {}
 | 
			
		||||
 | 
			
		||||
        # Only recent captures that are not archived
 | 
			
		||||
        for cache in self.sorted_capture_cache():
 | 
			
		||||
            if not hasattr(cache, 'timestamp'):
 | 
			
		||||
                continue
 | 
			
		||||
| 
						 | 
				
			
			@ -1443,7 +1444,6 @@ class Lookyloo():
 | 
			
		|||
            stats[date_submission.year][date_submission.month]['submissions'] += 1
 | 
			
		||||
            stats[date_submission.year][date_submission.month]['uniq_urls'].add(cache.url)
 | 
			
		||||
            if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
 | 
			
		||||
                stats[date_submission.year][date_submission.month]['submissions_with_redirects'] += 1
 | 
			
		||||
                stats[date_submission.year][date_submission.month]['redirects'] += len(cache.redirects)
 | 
			
		||||
                stats[date_submission.year][date_submission.month]['uniq_urls'].update(cache.redirects)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -1455,10 +1455,19 @@ class Lookyloo():
 | 
			
		|||
                weeks_stats[date_submission.isocalendar()[1]]['submissions'] += 1
 | 
			
		||||
                weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].add(cache.url)
 | 
			
		||||
                if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
 | 
			
		||||
                    weeks_stats[date_submission.isocalendar()[1]]['submissions_with_redirects'] += 1
 | 
			
		||||
                    weeks_stats[date_submission.isocalendar()[1]]['redirects'] += len(cache.redirects)
 | 
			
		||||
                    weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].update(cache.redirects)
 | 
			
		||||
 | 
			
		||||
        # Build limited stats based on archived captures and the indexes
 | 
			
		||||
        archives_stats: Dict[int, Dict[int, int]] = defaultdict(lambda: defaultdict(int))
 | 
			
		||||
        for _, capture_path in self.redis.hscan_iter('lookup_dirs_archived'):
 | 
			
		||||
            capture_ts = datetime.fromisoformat(capture_path.rsplit('/', 1)[-1])
 | 
			
		||||
            if capture_ts.year not in stats:
 | 
			
		||||
                stats[capture_ts.year] = {}
 | 
			
		||||
            if capture_ts.month not in stats[capture_ts.year]:
 | 
			
		||||
                stats[capture_ts.year][capture_ts.month] = {'submissions': 0}
 | 
			
		||||
            archives_stats[capture_ts.year][capture_ts.month] += 1
 | 
			
		||||
 | 
			
		||||
        statistics: Dict[str, List] = {'weeks': [], 'years': []}
 | 
			
		||||
        for week_number in sorted(weeks_stats.keys()):
 | 
			
		||||
            week_stat = weeks_stats[week_number]
 | 
			
		||||
| 
						 | 
				
			
			@ -1472,15 +1481,21 @@ class Lookyloo():
 | 
			
		|||
            year_stats: Dict[str, Union[int, List]] = {'year': year, 'months': [], 'yearly_submissions': 0, 'yearly_redirects': 0}
 | 
			
		||||
            for month in sorted(stats[year].keys()):
 | 
			
		||||
                month_stats = stats[year][month]
 | 
			
		||||
                urls = month_stats.pop('uniq_urls')
 | 
			
		||||
                month_stats['month_number'] = month
 | 
			
		||||
                month_stats['uniq_urls'] = len(urls)
 | 
			
		||||
                month_stats['uniq_domains'] = len(uniq_domains(urls))
 | 
			
		||||
                year_stats['months'].append(month_stats)  # type: ignore
 | 
			
		||||
                if len(month_stats) == 1:
 | 
			
		||||
                    # archived captures, missing many values
 | 
			
		||||
                    month_stats['month_number'] = month
 | 
			
		||||
                else:
 | 
			
		||||
                    urls = month_stats.pop('uniq_urls')
 | 
			
		||||
                    month_stats['month_number'] = month
 | 
			
		||||
                    month_stats['uniq_urls'] = len(urls)
 | 
			
		||||
                    month_stats['uniq_domains'] = len(uniq_domains(urls))
 | 
			
		||||
 | 
			
		||||
                year_stats['months'].append(month_stats)  # type: ignore
 | 
			
		||||
                year_stats['yearly_submissions'] += month_stats['submissions']
 | 
			
		||||
                year_stats['yearly_redirects'] += month_stats['redirects']
 | 
			
		||||
                if 'redirects' in month_stats:
 | 
			
		||||
                    year_stats['yearly_redirects'] += month_stats['redirects']
 | 
			
		||||
            statistics['years'].append(year_stats)
 | 
			
		||||
 | 
			
		||||
        return statistics
 | 
			
		||||
 | 
			
		||||
    def store_capture(self, uuid: str, is_public: bool,
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -34,7 +34,7 @@
 | 
			
		|||
    "redirect.png": "PAjzlPV97rEFvH55mG1ZC9wRl98be3yMeX/nENuFkJcds6/AXgSR2ig/QyPULgobSnNgiYieLVWY/oqsgeywrQ==",
 | 
			
		||||
    "secure.svg": "H8ni7t0d60nCJDVGuZpuxC+RBy/ipAjWT627D12HlZGg6LUmjSwPTQTUekm3UJupEP7TUkhXyq6WHc5gy7QBjg==",
 | 
			
		||||
    "stats.css": "/kY943FwWBTne4IIyf7iBROSfbGd82TeBicEXqKkRwawMVRIvM/Pk5MRa7okUyGIxaDjFQGmV/U1vy+PhN6Jbw==",
 | 
			
		||||
    "stats_graph.js": "0OEouA6NAxLG2wMd7D2vtGoMrXKna7My98Euc6ecyfdO4/6mIJS87vzISOS4zSZ8u4ehpa+p7E0nWhsXXE7H/Q==",
 | 
			
		||||
    "stats_graph.js": "Sxmr0yKWu5oSu3tUvpjN0nbFs0J/cHOyiOB+yOTcDr2+5L39bd6bqmD73JsSL4g/Z4B42IZrSq4iqF9M0jiYCA==",
 | 
			
		||||
    "tree.css": "R/pWQnE8kMacDrzGy/NpA1tJoHAZpOjFiX6dqtBe+PqAnhYMn1CIQzODh8Ifvh6hBTPLRWX3bsabfEvfaI7Z6A==",
 | 
			
		||||
    "tree.js": "yutMvl9vXhN0htkl40tRHF3r5LCDsH+MJplB6ihMENIsMryEyVoTr4Zq1UadgMG5tP+W2/jqm2zfXh414Kk8AA==",
 | 
			
		||||
    "up.jpg": "d1ljZJ9f5JekyM6RLFFH2Ua44j6neiQBdUIXOenRTjGppQr3JaeglpQIH6BjPCJL177+TH52U3UIRNS5YAyKIg==",
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -14,7 +14,9 @@ d3.json('/json/stats').then(json => {
 | 
			
		|||
            unique_urls_year.x.push(month.month_number)
 | 
			
		||||
 | 
			
		||||
            submissions_year.y.push(month.submissions)
 | 
			
		||||
            unique_urls_year.y.push(month.uniq_urls)
 | 
			
		||||
            if (month.uniq_urls != null) {
 | 
			
		||||
                unique_urls_year.y.push(month.uniq_urls)
 | 
			
		||||
            }
 | 
			
		||||
        });
 | 
			
		||||
        datasets.push(submissions_year)
 | 
			
		||||
        datasets.push(unique_urls_year)
 | 
			
		||||
| 
						 | 
				
			
			@ -71,7 +73,10 @@ d3.json('/json/stats').then(json => {
 | 
			
		|||
    data_lines.append("text")
 | 
			
		||||
               .datum((d, i) => { return {name: datasets[i].label, final: d[d.length-1]}; })
 | 
			
		||||
               .attr("transform", d => {
 | 
			
		||||
                   return ( `translate(${x_scale(d.final[0])}, ${y_scale(d.final[1])})` ) ; })
 | 
			
		||||
                   if (d.final != null) {
 | 
			
		||||
                       return ( `translate(${x_scale(d.final[0])}, ${y_scale(d.final[1])})` ) ;
 | 
			
		||||
                   }
 | 
			
		||||
               })
 | 
			
		||||
               .attr("x", 3)
 | 
			
		||||
               .attr("dy", ".35em")
 | 
			
		||||
               .attr("fill", (_, i) =>{ return d3.schemeCategory10[i]; })
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -11,7 +11,6 @@
 | 
			
		|||
    <thead>
 | 
			
		||||
    <tr>
 | 
			
		||||
      <th>Submissions</th>
 | 
			
		||||
      <th>Submissions with redirects</th>
 | 
			
		||||
      <th>Redirects</th>
 | 
			
		||||
      <th>Unique urls (including redirects)</th>
 | 
			
		||||
      <th>Unique domains (including redirects)</th>
 | 
			
		||||
| 
						 | 
				
			
			@ -20,7 +19,6 @@
 | 
			
		|||
    <tbody>
 | 
			
		||||
    <tr>
 | 
			
		||||
      <td> {{ week['submissions'] }} </td>
 | 
			
		||||
      <td> {{ week['submissions_with_redirects'] }} </td>
 | 
			
		||||
      <td> {{ week['redirects'] }} </td>
 | 
			
		||||
      <td> {{ week['uniq_urls'] }} </td>
 | 
			
		||||
      <td> {{ week['uniq_domains'] }} </td>
 | 
			
		||||
| 
						 | 
				
			
			@ -45,7 +43,6 @@
 | 
			
		|||
      <tr>
 | 
			
		||||
        <th>Month</th>
 | 
			
		||||
        <th>Submissions</th>
 | 
			
		||||
        <th>Submissions with redirects</th>
 | 
			
		||||
        <th>Redirects</th>
 | 
			
		||||
        <th>Unique urls (including redirects)</th>
 | 
			
		||||
        <th>Unique domains (including redirects)</th>
 | 
			
		||||
| 
						 | 
				
			
			@ -56,7 +53,6 @@
 | 
			
		|||
        <tr>
 | 
			
		||||
          <td> {{ month_name(month['month_number']) }} </td>
 | 
			
		||||
          <td> {{ month['submissions'] }} </td>
 | 
			
		||||
          <td> {{ month['submissions_with_redirects'] }} </td>
 | 
			
		||||
          <td> {{ month['redirects'] }} </td>
 | 
			
		||||
          <td> {{ month['uniq_urls'] }} </td>
 | 
			
		||||
          <td> {{ month['uniq_domains'] }} </td>
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue