import calendar
import datetime

from typing import Any, Dict, List, Set, Union
from urllib.parse import urlparse

from lookyloo import Lookyloo
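
# Lookyloo instance used below to read the local capture cache.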
lookyloo = Lookyloo()
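
# Per-month statistics, indexed by year and then month.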
stats: Dict[Union[str, int], Any] = {}
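
# Current date and ISO week number; the weekly summary below covers this week and the previous one.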
today = datetime.date.today()
calendar_week = today.isocalendar()[1]
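
# Per-week counters for the previous and current ISO week.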
weeks_stats: Dict[int, Dict[str, Union[int, Set[str]]]] = \
    {calendar_week - 1: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()},
     calendar_week: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()}}
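

# Return the set of unique hostnames found in a list of URLs.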
def uniq_domains(uniq_urls: List[str]) -> Set[str]:
    domains = set()
    for url in uniq_urls:
        splitted = urlparse(url)
        if splitted.hostname:
            domains.add(splitted.hostname)
    return domains
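

# Walk every capture in the cache and update the monthly and weekly counters.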
for cache in lookyloo.sorted_capture_cache():
    date = cache.timestamp
    if date.year not in stats:
        stats[date.year] = {}
    if date.month not in stats[date.year]:
        stats[date.year][date.month] = {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()}
    stats[date.year][date.month]['analysis'] += 1
    if len(cache.redirects) > 0:
        stats[date.year][date.month]['analysis_with_redirects'] += 1
        stats[date.year][date.month]['redirects'] += len(cache.redirects)
        stats[date.year][date.month]['uniq_urls'].update(cache.redirects)
    stats[date.year][date.month]['uniq_urls'].add(cache.url)

    if date.isocalendar()[1] in weeks_stats:
        weeks_stats[date.isocalendar()[1]]['analysis'] += 1  # type: ignore
        if len(cache.redirects) > 0:
            weeks_stats[date.isocalendar()[1]]['analysis_with_redirects'] += 1  # type: ignore
            weeks_stats[date.isocalendar()[1]]['redirects'] += len(cache.redirects)  # type: ignore
            weeks_stats[date.isocalendar()[1]]['uniq_urls'].update(cache.redirects)  # type: ignore
        weeks_stats[date.isocalendar()[1]]['uniq_urls'].add(cache.url)  # type: ignore
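
# Weekly summary: previous and current ISO week.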
print('Statistics for the last two weeks:')
for week_number, week_stat in weeks_stats.items():
    print(f'Week {week_number}:')
    print(' Number of analyses:', week_stat['analysis'])
    print(' Number of analyses with redirects:', week_stat['analysis_with_redirects'])
    print(' Number of redirects:', week_stat['redirects'])
    print(' Number of unique URLs:', len(week_stat['uniq_urls']))  # type: ignore
    domains = uniq_domains(week_stat['uniq_urls'])  # type: ignore[arg-type]
    print(' Number of unique domains:', len(domains))
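
# Monthly breakdown per year, with yearly totals.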
for year, data in stats.items():
    print('Year:', year)
    yearly_analysis = 0
    yearly_redirects = 0
    for month in sorted(data.keys()):
        month_stats = data[month]  # avoid shadowing the top-level stats dict
        print(' ', calendar.month_name[month])
        print('\tNumber of analyses:', month_stats['analysis'])
        print('\tNumber of analyses with redirects:', month_stats['analysis_with_redirects'])
        print('\tNumber of redirects:', month_stats['redirects'])
        print('\tNumber of unique URLs:', len(month_stats['uniq_urls']))
        domains = uniq_domains(month_stats['uniq_urls'])
        print('\tNumber of unique domains:', len(domains))
        yearly_analysis += month_stats['analysis']
        yearly_redirects += month_stats['redirects']
    print(' Sum of analyses:', yearly_analysis)
    print(' Sum of redirects:', yearly_redirects)