2021-08-24 18:32:54 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2024-01-12 17:15:41 +01:00
|
|
|
from __future__ import annotations
|
|
|
|
|
2021-08-24 18:32:54 +02:00
|
|
|
import json
|
2023-09-01 16:00:45 +02:00
|
|
|
import time
|
2021-08-24 18:32:54 +02:00
|
|
|
import logging
|
2022-11-23 15:54:22 +01:00
|
|
|
import logging.config
|
2021-08-24 18:32:54 +02:00
|
|
|
from collections import Counter
|
2021-09-07 12:59:31 +02:00
|
|
|
from datetime import date, timedelta
|
2024-01-13 01:24:32 +01:00
|
|
|
from typing import Any
|
2021-08-24 18:32:54 +02:00
|
|
|
|
2024-01-13 01:24:32 +01:00
|
|
|
from lacuscore import CaptureStatus as CaptureStatusCore
|
|
|
|
from lookyloo import Lookyloo
|
2022-11-01 18:10:20 +01:00
|
|
|
from lookyloo.default import AbstractManager, get_config, get_homedir, safe_create_dir
|
2022-08-23 17:44:48 +02:00
|
|
|
from lookyloo.helpers import ParsedUserAgent, serialize_to_json
|
2024-01-16 00:27:43 +01:00
|
|
|
from pylacus import CaptureStatus as CaptureStatusPy
|
2021-08-24 18:32:54 +02:00
|
|
|
|
2022-11-23 15:54:22 +01:00
|
|
|
# Configure the logging module at import time from the 'logging' entry
# returned by get_config, so every logger created below uses it.
logging.config.dictConfig(get_config('logging'))
|
2021-08-24 18:32:54 +02:00
|
|
|
|
|
|
|
|
|
|
|
class Processing(AbstractManager):
    '''Background manager doing two housekeeping jobs on every round:

    * build yesterday's user-agent file (only when enabled in the config),
    * re-submit to Lacus the captures that could not be enqueued.
    '''

    # A capture reported as UNKNOWN by Lacus is re-checked this many times,
    # waiting this many seconds before each check, before being re-enqueued.
    _UNKNOWN_STATUS_RETRIES = 3
    _UNKNOWN_STATUS_RETRY_DELAY = 1

    def __init__(self, loglevel: int | None=None):
        '''Set up the manager.

        :param loglevel: forwarded as-is to AbstractManager.__init__.
        '''
        super().__init__(loglevel)
        self.script_name = 'processing'
        self.lookyloo = Lookyloo()

        # When truthy, _to_run_forever also generates the user-agent file.
        self.use_own_ua = get_config('generic', 'use_user_agents_users')

    def _to_run_forever(self) -> None:
        '''Run one round of processing.

        NOTE(review): presumably called in a loop by AbstractManager.run - confirm.
        '''
        if self.use_own_ua:
            self._build_ua_file()
        self._retry_failed_enqueue()

    def _build_ua_file(self) -> None:
        '''Build a file in a format compatible with the capture page.

        The file is written to
        <homedir>/own_user_agents/<year>/<month>/<yesterday>.json and contains:

        * 'by_frequency': a list of {'os', 'browser', 'useragent'} dicts, most
          frequent user-agent first,
        * one key per platform (with its version when known), mapping each
          browser (with its version when known) to the user-agent strings seen.

        Nothing is done if the file already exists or if no user-agent was
        recorded in Redis for yesterday. Once the file is written, the Redis
        key holding yesterday's entries is deleted.
        '''
        yesterday = date.today() - timedelta(days=1)
        redis_key = f'user_agents|{yesterday.isoformat()}'

        ua_dir = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
        safe_create_dir(ua_dir)
        ua_file = ua_dir / f'{yesterday.isoformat()}.json'
        if ua_file.exists():
            self.logger.debug(f'User-agent file for {yesterday} already exists.')
            return

        self.logger.info(f'Generating user-agent file for {yesterday}')
        entries = self.lookyloo.redis.zrevrange(redis_key, 0, -1)
        if not entries:
            self.logger.info(f'No User-agent file for {yesterday} to generate.')
            return

        # Each entry is expected to look like '<prefix>|<user-agent>'; only the
        # part after the first '|' is counted.
        uas: Counter[str] = Counter()
        for entry in entries:
            _, separator, ua = entry.partition('|')
            if not separator:
                # A malformed entry must not abort the generation of the whole file.
                self.logger.warning(f'Unexpected user-agent entry for {yesterday}, skipping it.')
                continue
            uas[ua] += 1

        to_store: dict[str, Any] = {'by_frequency': []}
        for ua, _ in uas.most_common():
            parsed_ua = ParsedUserAgent(ua)
            if not parsed_ua.platform or not parsed_ua.browser:
                # Not enough information to classify this user-agent.
                continue

            platform_key = parsed_ua.platform
            if parsed_ua.platform_version:
                platform_key = f'{platform_key} {parsed_ua.platform_version}'
            browser_key = parsed_ua.browser
            if parsed_ua.version:
                browser_key = f'{browser_key} {parsed_ua.version}'

            to_store.setdefault(platform_key, {}).setdefault(browser_key, set()).add(parsed_ua.string)
            to_store['by_frequency'].append({'os': platform_key,
                                             'browser': browser_key,
                                             'useragent': parsed_ua.string})

        # Write to a temporary sibling and rename it: a failure while dumping
        # must not leave a truncated .json file behind, because the exists()
        # check above would then prevent it from ever being regenerated.
        tmp_file = ua_file.with_name(f'{ua_file.name}.tmp')
        try:
            with tmp_file.open('w') as f:
                # The sets of user-agent strings are handled by serialize_to_json.
                json.dump(to_store, f, indent=2, default=serialize_to_json)
            tmp_file.replace(ua_file)
        except Exception:
            tmp_file.unlink(missing_ok=True)
            raise

        # Remove the UA / IP mapping.
        self.lookyloo.redis.delete(redis_key)
        self.logger.info(f'User-agent file for {yesterday} generated.')

    def _is_unknown_to_lacus(self, uuid: str) -> bool:
        '''Return True if Lacus reports the UNKNOWN status for this capture UUID.'''
        # The status may come from either pylacus or lacuscore, so both enums are checked.
        return self.lookyloo.lacus.get_capture_status(uuid) in [CaptureStatusPy.UNKNOWN, CaptureStatusCore.UNKNOWN]

    def _captures_to_requeue(self) -> list[str]:
        '''List the UUIDs from the 'to_capture' sorted set that must be enqueued again.

        A UUID is selected when its hash has a 'not_queued' field, or when
        Lacus still reports it as UNKNOWN after a few re-checks.
        '''
        to_requeue: list[str] = []
        for uuid, _ in self.lookyloo.redis.zscan_iter('to_capture'):
            if self.lookyloo.redis.hexists(uuid, 'not_queued'):
                # The capture is marked as not queued
                to_requeue.append(uuid)
                continue
            if not self._is_unknown_to_lacus(uuid):
                continue

            # The capture is unknown on lacus side. It might be a race condition.
            # Let's retry a few times.
            for _ in range(self._UNKNOWN_STATUS_RETRIES):
                time.sleep(self._UNKNOWN_STATUS_RETRY_DELAY)
                if not self._is_unknown_to_lacus(uuid):
                    # Was a race condition, the UUID has been or is being processed by Lacus
                    self.logger.info(f'UUID {uuid} was only temporary unknown')
                    break
            else:
                # UUID is still unknown
                self.logger.info(f'UUID {uuid} is still unknown')
                to_requeue.append(uuid)
        return to_requeue

    def _enqueue_in_lacus(self, uuid: str, query: dict[str, Any]) -> str:
        '''Submit the stored capture settings to Lacus under the given UUID.

        :param uuid: UUID the capture was created with locally.
        :param query: capture settings, as loaded from the Redis hash of the capture.
        :return: the UUID returned by Lacus.
        '''
        # depth, force and recapture_interval are deliberately not forwarded.
        return self.lookyloo.lacus.enqueue(
            url=query.get('url'),
            document_name=query.get('document_name'),
            document=query.get('document'),
            browser=query.get('browser'),
            device_name=query.get('device_name'),
            user_agent=query.get('user_agent'),
            proxy=query.get('proxy'),
            general_timeout_in_sec=query.get('general_timeout_in_sec'),
            cookies=query.get('cookies'),
            headers=query.get('headers'),
            http_credentials=query.get('http_credentials'),
            viewport=query.get('viewport'),
            referer=query.get('referer'),
            rendered_hostname_only=query.get('rendered_hostname_only', True),
            priority=query.get('priority', 0),
            uuid=uuid
        )

    def _retry_failed_enqueue(self) -> None:
        '''Enqueue again the captures listed in 'to_capture' that Lacus doesn't have.

        When enqueuing fails, the capture settings are kept in a Redis hash
        named after a locally generated UUID, and that UUID is in the
        'to_capture' sorted set. Those settings are re-submitted here. The
        round stops at the first enqueue failure; the remaining captures are
        retried on the next round.
        '''
        for uuid in self._captures_to_requeue():
            if self.lookyloo.redis.zscore('to_capture', uuid) is None:
                # The capture has been captured in the meantime.
                continue
            self.logger.info(f'Found a non-queued capture ({uuid}), retrying now.')
            # This capture couldn't be queued and we created the uuid locally
            query = self.lookyloo.redis.hgetall(uuid)
            try:
                new_uuid = self._enqueue_in_lacus(uuid, query)
            except Exception as e:
                self.logger.warning(f'Still unable to enqueue capture: {e}')
                break

            if new_uuid != uuid:
                # somehow, between the check and queuing, the UUID isn't UNKNOWN anymore, just checking that
                self.logger.warning(f'Had to change the capture UUID (duplicate). Old: {uuid} / New: {new_uuid}')
            self.lookyloo.redis.hdel(uuid, 'not_queued')
            self.logger.info(f'{uuid} enqueued.')
|
2022-11-01 18:10:20 +01:00
|
|
|
|
2021-08-24 18:32:54 +02:00
|
|
|
|
2024-01-12 17:15:41 +01:00
|
|
|
def main() -> None:
    '''Create the Processing manager and call its run() with sleep_in_sec=30.'''
    Processing().run(sleep_in_sec=30)
|
2021-08-24 18:32:54 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Start the manager only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()
|