mirror of https://github.com/CIRCL/lookyloo
fix: Do not index private captures on public instance
parent
b217e4cd9a
commit
1c5f4f5710
|
@ -23,11 +23,19 @@ if __name__ == '__main__':
|
|||
indexing = Indexing()
|
||||
indexing.clear_indexes()
|
||||
for capture_uuid in lookyloo.capture_uuids:
|
||||
index = True
|
||||
try:
|
||||
tree = lookyloo.get_crawled_tree(capture_uuid)
|
||||
except Exception as e:
|
||||
print(capture_uuid, e)
|
||||
continue
|
||||
|
||||
if lookyloo.is_public_instance:
|
||||
cache = lookyloo.capture_cache(capture_uuid)
|
||||
if cache.get('no_index') is not None:
|
||||
index = False
|
||||
|
||||
# NOTE: these two methods do nothing if we just generated the pickle
|
||||
indexing.index_cookies_capture(tree)
|
||||
indexing.index_body_hashes_capture(tree)
|
||||
if index:
|
||||
indexing.index_cookies_capture(tree)
|
||||
indexing.index_body_hashes_capture(tree)
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
"loglevel": "INFO",
|
||||
"splash_loglevel": "WARNING",
|
||||
"only_global_lookups": true,
|
||||
"public_instance": false,
|
||||
"splash_url": "http://127.0.0.1:8050",
|
||||
"default_user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
|
||||
"cache_clean_user": {},
|
||||
|
@ -24,6 +25,7 @@
|
|||
"_notes": {
|
||||
"loglevel": "(lookyloo) Can be one of the values listed here: https://docs.python.org/3/library/logging.html#levels",
|
||||
"splash_loglevel": "(Splash) INFO is *very* verbose.",
|
||||
"public_instance": "Set it to true to disable features deemed unsafe on a public instance (such as indexing private captures)",
|
||||
"only_global_lookups": "Set it to True if your instance is publicly available so users aren't able to scan your internal network",
|
||||
"splash_url": "URL to connect to splash",
|
||||
"default_user_agent": "Ultimate fallback if the capture form, or the asynchronous submission, don't provide a UA",
|
||||
|
|
|
@ -154,6 +154,7 @@ class Lookyloo():
|
|||
self.configs: Dict[str, Dict[str, Any]] = load_configs()
|
||||
self.logger.setLevel(self.get_config('loglevel'))
|
||||
self.indexing = Indexing()
|
||||
self.is_public_instance = self.get_config('public_instance')
|
||||
|
||||
self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
|
||||
self.scrape_dir: Path = get_homedir() / 'scraped'
|
||||
|
@ -219,6 +220,7 @@ class Lookyloo():
|
|||
json.dump(to_store, f, indent=2)
|
||||
|
||||
def cache_tree(self, capture_uuid: str) -> None:
|
||||
'''Generate the pickle, add capture in the indexes'''
|
||||
capture_dir = self.lookup_capture_dir(capture_uuid)
|
||||
if not capture_dir:
|
||||
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
|
||||
|
@ -226,10 +228,17 @@ class Lookyloo():
|
|||
with open((capture_dir / 'uuid'), 'r') as f:
|
||||
uuid = f.read()
|
||||
har_files = sorted(capture_dir.glob('*.har'))
|
||||
# NOTE: On a public instance, we only index the public captures
|
||||
index = True
|
||||
try:
|
||||
ct = CrawledTree(har_files, uuid)
|
||||
self.indexing.index_cookies_capture(ct)
|
||||
self.indexing.index_body_hashes_capture(ct)
|
||||
if self.is_public_instance:
|
||||
cache = self.capture_cache(capture_uuid)
|
||||
if cache.get('no_index') is not None:
|
||||
index = False
|
||||
if index:
|
||||
self.indexing.index_cookies_capture(ct)
|
||||
self.indexing.index_body_hashes_capture(ct)
|
||||
except Har2TreeError as e:
|
||||
raise NoValidHarFile(e.message)
|
||||
|
||||
|
|
Loading…
Reference in New Issue