fix: Do not index private captures on public instance

pull/81/head
Raphaël Vinot 2020-07-20 13:39:08 +02:00
parent b217e4cd9a
commit 1c5f4f5710
3 changed files with 23 additions and 4 deletions
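
In short: the commit introduces a public_instance configuration flag and, when it is set, keeps any capture whose cache entry carries a no_index marker out of the cookie and body-hash indexes. The same guard lands in two places, the index-rebuild script in the first file and Lookyloo.cache_tree() in the third. A minimal sketch of the shared pattern (should_index is a hypothetical helper, not part of the commit):

def should_index(lookyloo, capture_uuid: str) -> bool:
    # Private instances index everything; on a public instance, a
    # capture whose cache entry carries 'no_index' is skipped.
    if not lookyloo.is_public_instance:
        return True
    cache = lookyloo.capture_cache(capture_uuid)
    return cache.get('no_index') is None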


@@ -23,11 +23,19 @@ if __name__ == '__main__':
     indexing = Indexing()
     indexing.clear_indexes()
     for capture_uuid in lookyloo.capture_uuids:
+        index = True
         try:
             tree = lookyloo.get_crawled_tree(capture_uuid)
         except Exception as e:
             print(capture_uuid, e)
             continue
+
+        if lookyloo.is_public_instance:
+            cache = lookyloo.capture_cache(capture_uuid)
+            if cache.get('no_index') is not None:
+                index = False
+
         # NOTE: these two methods do nothing if we just generated the pickle
-        indexing.index_cookies_capture(tree)
-        indexing.index_body_hashes_capture(tree)
+        if index:
+            indexing.index_cookies_capture(tree)
+            indexing.index_body_hashes_capture(tree)
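
The NOTE in the hunk above is what keeps this loop cheap: get_crawled_tree() may already have generated the pickle and indexed the capture, in which case the two index_* calls are no-ops, so the rebuild is safe to re-run. A rough sketch of that idempotency pattern using a Redis set (illustrative only, the real Indexing class may track this differently):

import redis

r = redis.Redis(decode_responses=True)

def index_once(capture_uuid: str) -> None:
    # Skip captures that are already indexed, so calling this twice
    # for the same capture does nothing the second time.
    if r.sismember('indexed_captures', capture_uuid):
        return
    r.sadd('indexed_captures', capture_uuid)
    # ... the actual cookie / body-hash indexing would go here ...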


@@ -2,6 +2,7 @@
     "loglevel": "INFO",
     "splash_loglevel": "WARNING",
     "only_global_lookups": true,
+    "public_instance": false,
     "splash_url": "http://127.0.0.1:8050",
     "default_user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
     "cache_clean_user": {},
@@ -24,6 +25,7 @@
     "_notes": {
         "loglevel": "(lookyloo) Can be one of the value listed here: https://docs.python.org/3/library/logging.html#levels",
         "splash_loglevel": "(Splash) INFO is *very* verbose.",
+        "public_instance": "true means disabling features deemed unsafe on a public instance (such as indexing private captures)",
         "only_global_lookups": "Set it to True if your instance is publicly available so users aren't able to scan your internal network",
         "splash_url": "URL to connect to splash",
         "default_user_agent": "Ultimate fallback if the capture form, or the asynchronous submission, don't provide a UA",

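public_instance defaults to false, so existing private deployments keep indexing every capture. Inside Lookyloo the flag is read through get_config() (next file); as a standalone illustration, here is the same check done straight from the JSON file (the path is an assumption, Lookyloo resolves its configs through its own load_configs() helper):

import json
from pathlib import Path

# Config path assumed for illustration only.
config = json.loads(Path('config/generic.json').read_text())
if config.get('public_instance', False):
    print('public instance: captures marked no_index stay out of the indexes')
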

@@ -154,6 +154,7 @@ class Lookyloo():
         self.configs: Dict[str, Dict[str, Any]] = load_configs()
         self.logger.setLevel(self.get_config('loglevel'))
         self.indexing = Indexing()
+        self.is_public_instance = self.get_config('public_instance')
         self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
         self.scrape_dir: Path = get_homedir() / 'scraped'
@@ -219,6 +220,7 @@
             json.dump(to_store, f, indent=2)

     def cache_tree(self, capture_uuid: str) -> None:
+        '''Generate the pickle, add capture in the indexes'''
         capture_dir = self.lookup_capture_dir(capture_uuid)
         if not capture_dir:
             raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
@@ -226,10 +228,17 @@
         with open((capture_dir / 'uuid'), 'r') as f:
             uuid = f.read()
         har_files = sorted(capture_dir.glob('*.har'))
+        # NOTE: We only index the public captures
+        index = True
         try:
             ct = CrawledTree(har_files, uuid)
-            self.indexing.index_cookies_capture(ct)
-            self.indexing.index_body_hashes_capture(ct)
+            if self.is_public_instance:
+                cache = self.capture_cache(capture_uuid)
+                if cache.get('no_index') is not None:
+                    index = False
+            if index:
+                self.indexing.index_cookies_capture(ct)
+                self.indexing.index_body_hashes_capture(ct)
         except Har2TreeError as e:
             raise NoValidHarFile(e.message)
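
End to end, cache_tree() still builds the pickle for every capture; on a public instance it only leaves flagged captures out of the cookie and body-hash indexes. A hypothetical usage sketch (the import path is assumed and the UUID is a placeholder for an existing capture):

from lookyloo.lookyloo import Lookyloo  # import path assumed

lookyloo = Lookyloo()
# Builds the pickle; the indexing step, not the pickle, is skipped
# for captures flagged 'no_index' on a public instance.
lookyloo.cache_tree(capture_uuid='<existing-capture-uuid>')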