mirror of https://github.com/CIRCL/lookyloo
chg: Improve async processing
parent
da3d1fe392
commit
12b8e4f949
|
@ -5,7 +5,7 @@ from pathlib import Path
|
|||
import logging
|
||||
|
||||
from lookyloo.abstractmanager import AbstractManager
|
||||
from lookyloo.helpers import get_homedir, set_running, unset_running
|
||||
from lookyloo.helpers import get_homedir, set_running, unset_running, shutdown_requested
|
||||
from lookyloo.lookyloo import Lookyloo
|
||||
|
||||
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
|
||||
|
@ -22,7 +22,10 @@ class AsyncScraper(AbstractManager):
|
|||
|
||||
def _to_run_forever(self):
|
||||
set_running('async_scrape')
|
||||
self.lookyloo.process_scrape_queue()
|
||||
while True:
|
||||
url = self.lookyloo.process_scrape_queue()
|
||||
if url is None or shutdown_requested():
|
||||
break
|
||||
unset_running('async_scrape')
|
||||
|
||||
|
||||
|
|
|
@ -113,11 +113,14 @@ class Lookyloo():
|
|||
def process_scrape_queue(self):
|
||||
uuid = self.redis.spop('to_scrape')
|
||||
if not uuid:
|
||||
return
|
||||
return None
|
||||
to_scrape = self.redis.hgetall(uuid)
|
||||
self.redis.delete(uuid)
|
||||
to_scrape['perma_uuid'] = uuid
|
||||
self.scrape(**to_scrape)
|
||||
if self.scrape(**to_scrape):
|
||||
self.logger.info(f'Processed {to_scrape["url"]}')
|
||||
return True
|
||||
return False
|
||||
|
||||
def load_tree(self, report_dir: Path):
|
||||
har_files = sorted(report_dir.glob('*.har'))
|
||||
|
@ -152,7 +155,7 @@ class Lookyloo():
|
|||
items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
|
||||
if not items:
|
||||
# broken
|
||||
pass
|
||||
return False
|
||||
if not perma_uuid:
|
||||
perma_uuid = str(uuid4())
|
||||
width = len(str(len(items)))
|
||||
|
|
Loading…
Reference in New Issue