lookyloo/bin/background_processing.py

117 lines
5.3 KiB
Python
Executable File

#!/usr/bin/env python3
import json
import logging
from collections import Counter
from datetime import date, timedelta
from typing import Any, Dict
from lookyloo.lookyloo import Lookyloo
from lookyloo.default import AbstractManager, get_config, get_homedir, safe_create_dir
from lookyloo.helpers import ParsedUserAgent, serialize_to_json
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
level=logging.INFO)
class Processing(AbstractManager):
def __init__(self, loglevel: int=logging.INFO):
super().__init__(loglevel)
self.script_name = 'processing'
self.lookyloo = Lookyloo()
self.use_own_ua = get_config('generic', 'use_user_agents_users')
def _to_run_forever(self):
if self.use_own_ua:
self._build_ua_file()
self._retry_failed_enqueue()
def _build_ua_file(self):
'''Build a file in a format compatible with the capture page'''
yesterday = (date.today() - timedelta(days=1))
self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
safe_create_dir(self_generated_ua_file_path)
self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
if self_generated_ua_file.exists():
self.logger.debug(f'User-agent file for {yesterday} already exists.')
return
self.logger.info(f'Generating user-agent file for {yesterday}')
entries = self.lookyloo.redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
if not entries:
self.logger.info(f'No User-agent file for {yesterday} to generate.')
return
to_store: Dict[str, Any] = {'by_frequency': []}
uas = Counter([entry.split('|', 1)[1] for entry in entries])
for ua, _ in uas.most_common():
parsed_ua = ParsedUserAgent(ua)
if not parsed_ua.platform or not parsed_ua.browser:
continue
platform_key = parsed_ua.platform
if parsed_ua.platform_version:
platform_key = f'{platform_key} {parsed_ua.platform_version}'
browser_key = parsed_ua.browser
if parsed_ua.version:
browser_key = f'{browser_key} {parsed_ua.version}'
if platform_key not in to_store:
to_store[platform_key] = {}
if browser_key not in to_store[platform_key]:
to_store[platform_key][browser_key] = set()
to_store[platform_key][browser_key].add(parsed_ua.string)
to_store['by_frequency'].append({'os': platform_key,
'browser': browser_key,
'useragent': parsed_ua.string})
with self_generated_ua_file.open('w') as f:
json.dump(to_store, f, indent=2, default=serialize_to_json)
# Remove the UA / IP mapping.
self.lookyloo.redis.delete(f'user_agents|{yesterday.isoformat()}')
self.logger.info(f'User-agent file for {yesterday} generated.')
def _retry_failed_enqueue(self):
'''If enqueuing failed, the settings are added, with a UUID in the 'to_capture key', and they have a UUID'''
for uuid in self.lookyloo.redis.zrevrangebyscore('to_capture', 'Inf', '-Inf'):
if self.lookyloo.redis.hexists(uuid, 'not_queued'):
self.logger.info(f'Found a non-queued capture ({uuid}), retrying now.')
# This capture couldn't be queued and we created the uuid locally
query = self.lookyloo.redis.hgetall(uuid)
try:
self.lookyloo.lacus.enqueue(
url=query.get('url', None),
document_name=query.get('document_name', None),
document=query.get('document', None),
# depth=query.get('depth', 0),
browser=query.get('browser', None),
device_name=query.get('device_name', None),
user_agent=query.get('user_agent', None),
proxy=query.get('proxy', None),
general_timeout_in_sec=query.get('general_timeout_in_sec', None),
cookies=query.get('cookies', None),
headers=query.get('headers', None),
http_credentials=query.get('http_credentials', None),
viewport=query.get('viewport', None),
referer=query.get('referer', None),
rendered_hostname_only=query.get('rendered_hostname_only', True),
# force=query.get('force', False),
# recapture_interval=query.get('recapture_interval', 300),
priority=query.get('priority', None),
uuid=uuid
)
except Exception as e:
self.logger.warning(f'Still unable to enqueue capture: {e}')
break
else:
self.lookyloo.redis.hdel(uuid, 'not_queued')
self.logger.info(f'{uuid} enqueued.')
def main():
p = Processing()
p.run(sleep_in_sec=30)
if __name__ == '__main__':
main()