mirror of https://github.com/CIRCL/lookyloo
chg: Improve async processing
parent
da3d1fe392
commit
12b8e4f949
|
@ -5,7 +5,7 @@ from pathlib import Path
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from lookyloo.abstractmanager import AbstractManager
|
from lookyloo.abstractmanager import AbstractManager
|
||||||
from lookyloo.helpers import get_homedir, set_running, unset_running
|
from lookyloo.helpers import get_homedir, set_running, unset_running, shutdown_requested
|
||||||
from lookyloo.lookyloo import Lookyloo
|
from lookyloo.lookyloo import Lookyloo
|
||||||
|
|
||||||
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
|
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
|
||||||
|
@ -22,7 +22,10 @@ class AsyncScraper(AbstractManager):
|
||||||
|
|
||||||
def _to_run_forever(self):
|
def _to_run_forever(self):
|
||||||
set_running('async_scrape')
|
set_running('async_scrape')
|
||||||
self.lookyloo.process_scrape_queue()
|
while True:
|
||||||
|
url = self.lookyloo.process_scrape_queue()
|
||||||
|
if url is None or shutdown_requested():
|
||||||
|
break
|
||||||
unset_running('async_scrape')
|
unset_running('async_scrape')
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -113,11 +113,14 @@ class Lookyloo():
|
||||||
def process_scrape_queue(self):
|
def process_scrape_queue(self):
|
||||||
uuid = self.redis.spop('to_scrape')
|
uuid = self.redis.spop('to_scrape')
|
||||||
if not uuid:
|
if not uuid:
|
||||||
return
|
return None
|
||||||
to_scrape = self.redis.hgetall(uuid)
|
to_scrape = self.redis.hgetall(uuid)
|
||||||
self.redis.delete(uuid)
|
self.redis.delete(uuid)
|
||||||
to_scrape['perma_uuid'] = uuid
|
to_scrape['perma_uuid'] = uuid
|
||||||
self.scrape(**to_scrape)
|
if self.scrape(**to_scrape):
|
||||||
|
self.logger.info(f'Processed {to_scrape["url"]}')
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def load_tree(self, report_dir: Path):
|
def load_tree(self, report_dir: Path):
|
||||||
har_files = sorted(report_dir.glob('*.har'))
|
har_files = sorted(report_dir.glob('*.har'))
|
||||||
|
@ -152,7 +155,7 @@ class Lookyloo():
|
||||||
items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
|
items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
|
||||||
if not items:
|
if not items:
|
||||||
# broken
|
# broken
|
||||||
pass
|
return False
|
||||||
if not perma_uuid:
|
if not perma_uuid:
|
||||||
perma_uuid = str(uuid4())
|
perma_uuid = str(uuid4())
|
||||||
width = len(str(len(items)))
|
width = len(str(len(items)))
|
||||||
|
|
Loading…
Reference in New Issue