mirror of https://github.com/CIRCL/lookyloo
fix: Do not index private captures on public instance
parent
b217e4cd9a
commit
1c5f4f5710
|
@ -23,11 +23,19 @@ if __name__ == '__main__':
|
||||||
indexing = Indexing()
|
indexing = Indexing()
|
||||||
indexing.clear_indexes()
|
indexing.clear_indexes()
|
||||||
for capture_uuid in lookyloo.capture_uuids:
|
for capture_uuid in lookyloo.capture_uuids:
|
||||||
|
index = True
|
||||||
try:
|
try:
|
||||||
tree = lookyloo.get_crawled_tree(capture_uuid)
|
tree = lookyloo.get_crawled_tree(capture_uuid)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(capture_uuid, e)
|
print(capture_uuid, e)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if lookyloo.is_public_instance:
|
||||||
|
cache = lookyloo.capture_cache(capture_uuid)
|
||||||
|
if cache.get('no_index') is not None:
|
||||||
|
index = False
|
||||||
|
|
||||||
# NOTE: these two methods do nothing if we just generated the pickle
|
# NOTE: these two methods do nothing if we just generated the pickle
|
||||||
indexing.index_cookies_capture(tree)
|
if index:
|
||||||
indexing.index_body_hashes_capture(tree)
|
indexing.index_cookies_capture(tree)
|
||||||
|
indexing.index_body_hashes_capture(tree)
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
"loglevel": "INFO",
|
"loglevel": "INFO",
|
||||||
"splash_loglevel": "WARNING",
|
"splash_loglevel": "WARNING",
|
||||||
"only_global_lookups": true,
|
"only_global_lookups": true,
|
||||||
|
"public_instance": false,
|
||||||
"splash_url": "http://127.0.0.1:8050",
|
"splash_url": "http://127.0.0.1:8050",
|
||||||
"default_user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
|
"default_user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
|
||||||
"cache_clean_user": {},
|
"cache_clean_user": {},
|
||||||
|
@ -24,6 +25,7 @@
|
||||||
"_notes": {
|
"_notes": {
|
||||||
"loglevel": "(lookyloo) Can be one of the value listed here: https://docs.python.org/3/library/logging.html#levels",
|
"loglevel": "(lookyloo) Can be one of the value listed here: https://docs.python.org/3/library/logging.html#levels",
|
||||||
"splash_loglevel": "(Splash) INFO is *very* verbose.",
|
"splash_loglevel": "(Splash) INFO is *very* verbose.",
|
||||||
|
"public_instance": "true means disabling features deemed unsafe on a public instance (such as indexing private captures)",
|
||||||
"only_global_lookups": "Set it to True if your instance is publicly available so users aren't able to scan your internal network",
|
"only_global_lookups": "Set it to True if your instance is publicly available so users aren't able to scan your internal network",
|
||||||
"splash_url": "URL to connect to splash",
|
"splash_url": "URL to connect to splash",
|
||||||
"default_user_agent": "Ultimate fallback if the capture form, or the asynchronous submission, don't provide a UA",
|
"default_user_agent": "Ultimate fallback if the capture form, or the asynchronous submission, don't provide a UA",
|
||||||
|
|
|
@ -154,6 +154,7 @@ class Lookyloo():
|
||||||
self.configs: Dict[str, Dict[str, Any]] = load_configs()
|
self.configs: Dict[str, Dict[str, Any]] = load_configs()
|
||||||
self.logger.setLevel(self.get_config('loglevel'))
|
self.logger.setLevel(self.get_config('loglevel'))
|
||||||
self.indexing = Indexing()
|
self.indexing = Indexing()
|
||||||
|
self.is_public_instance = self.get_config('public_instance')
|
||||||
|
|
||||||
self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
|
self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
|
||||||
self.scrape_dir: Path = get_homedir() / 'scraped'
|
self.scrape_dir: Path = get_homedir() / 'scraped'
|
||||||
|
@ -219,6 +220,7 @@ class Lookyloo():
|
||||||
json.dump(to_store, f, indent=2)
|
json.dump(to_store, f, indent=2)
|
||||||
|
|
||||||
def cache_tree(self, capture_uuid: str) -> None:
|
def cache_tree(self, capture_uuid: str) -> None:
|
||||||
|
'''Generate the pickle, add capture in the indexes'''
|
||||||
capture_dir = self.lookup_capture_dir(capture_uuid)
|
capture_dir = self.lookup_capture_dir(capture_uuid)
|
||||||
if not capture_dir:
|
if not capture_dir:
|
||||||
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
|
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
|
||||||
|
@ -226,10 +228,17 @@ class Lookyloo():
|
||||||
with open((capture_dir / 'uuid'), 'r') as f:
|
with open((capture_dir / 'uuid'), 'r') as f:
|
||||||
uuid = f.read()
|
uuid = f.read()
|
||||||
har_files = sorted(capture_dir.glob('*.har'))
|
har_files = sorted(capture_dir.glob('*.har'))
|
||||||
|
# NOTE: We only index the public captures
|
||||||
|
index = True
|
||||||
try:
|
try:
|
||||||
ct = CrawledTree(har_files, uuid)
|
ct = CrawledTree(har_files, uuid)
|
||||||
self.indexing.index_cookies_capture(ct)
|
if self.is_public_instance:
|
||||||
self.indexing.index_body_hashes_capture(ct)
|
cache = self.capture_cache(capture_uuid)
|
||||||
|
if cache.get('no_index') is not None:
|
||||||
|
index = False
|
||||||
|
if index:
|
||||||
|
self.indexing.index_cookies_capture(ct)
|
||||||
|
self.indexing.index_body_hashes_capture(ct)
|
||||||
except Har2TreeError as e:
|
except Har2TreeError as e:
|
||||||
raise NoValidHarFile(e.message)
|
raise NoValidHarFile(e.message)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue