mirror of https://github.com/CIRCL/AIL-framework
chg: [crawler har] compress HAR
parent
c719990125
commit
28c647d370
|
@ -268,7 +268,7 @@ def extract_author_from_html(html):
|
|||
|
||||
def create_har_id(date, item_id):
    """Build the relative HAR identifier for a crawled item.

    Args:
        date: Leading path component of the identifier.
        item_id: Item identifier; only the part after its last '/' is used.

    Returns:
        ``date`` joined (``os.path.join``) with ``<item basename>.json.gz``.
    """
    item_id = item_id.split('/')[-1]
    # The id carries the gzip suffix; a leftover plain '.json' return used to
    # shadow this line and made the compressed name unreachable.
    return os.path.join(date, f'{item_id}.json.gz')
|
||||
|
||||
def save_har(har_id, har_content):
|
||||
# create dir
|
||||
|
@ -277,8 +277,8 @@ def save_har(har_id, har_content):
|
|||
os.makedirs(har_dir)
|
||||
# save HAR
|
||||
filename = os.path.join(get_har_dir(), har_id)
|
||||
with open(filename, 'w') as f:
|
||||
f.write(json.dumps(har_content))
|
||||
with gzip.open(filename, 'wb') as f:
|
||||
f.write(json.dumps(har_content).encode())
|
||||
|
||||
def get_all_har_ids():
|
||||
har_ids = []
|
||||
|
@ -308,11 +308,15 @@ def get_all_har_ids():
|
|||
|
||||
def get_har_content(har_id):
    """Load and parse a gzip-compressed HAR file.

    Args:
        har_id: Path of the HAR file relative to ``HAR_DIR``.

    Returns:
        The decoded JSON document, or ``{}`` when the file cannot be opened or
        decompressed, or when its content is not valid JSON.
    """
    har_path = os.path.join(HAR_DIR, har_id)
    try:
        # gzip.open defaults to binary mode; json.loads accepts the bytes.
        with gzip.open(har_path) as f:
            return json.loads(f.read())
    except json.decoder.JSONDecodeError:
        # Readable archive, but the payload is not JSON.
        return {}
    except Exception as e:
        # Deliberate best-effort: a missing or corrupt file yields an empty
        # HAR instead of an exception for the caller.
        print(e)  # TODO LOGS
        return {}
|
||||
|
||||
def extract_cookies_names_from_har(har):
|
||||
cookies = set()
|
||||
|
@ -362,6 +366,22 @@ def _reprocess_all_hars_etag():
|
|||
etag = Etags.create(etag_content)
|
||||
etag.add(date, domain)
|
||||
|
||||
def _gzip_all_hars():
    """Gzip every HAR under ``HAR_DIR`` that is not compressed yet.

    For each id returned by ``get_all_har_ids()`` that does not end in
    ``.gz``: write ``<path>.gz`` unless it already exists or the source file
    is empty, then delete the uncompressed file once the compressed copy is
    present on disk.
    """
    for har_id in get_all_har_ids():
        if har_id.endswith('.gz'):
            # Already compressed, nothing to migrate.
            continue

        har_path = os.path.join(HAR_DIR, har_id)
        new_id = f'{har_path}.gz'

        if not os.path.exists(new_id):
            with open(har_path, 'rb') as src:
                content = src.read()
            if content:
                try:
                    with gzip.open(new_id, 'wb') as dst:
                        r = dst.write(content)
                        print(r)
                except BaseException:
                    # Do not leave a truncated archive behind: the check
                    # below (now or on a later run) would treat it as a valid
                    # copy and delete the only intact original.
                    if os.path.exists(new_id):
                        os.remove(new_id)
                    raise

        # Drop the uncompressed file only once its compressed copy exists.
        if os.path.exists(new_id) and os.path.exists(har_path):
            os.remove(har_path)
            print('delete:', har_path)
|
||||
|
||||
# # # - - # # #
|
||||
|
||||
################################################################################
|
||||
|
@ -1944,3 +1964,4 @@ load_blacklist()
|
|||
# print(r)
|
||||
# _reprocess_all_hars_cookie_name()
|
||||
# _reprocess_all_hars_etag()
|
||||
# _gzip_all_hars()
|
||||
|
|
|
@ -129,7 +129,7 @@ def get_item_url(item_id):
|
|||
|
||||
def get_item_har(item_id):
|
||||
har = '/'.join(item_id.rsplit('/')[-4:])
|
||||
har = f'{har}.json'
|
||||
har = f'{har}.json.gz'
|
||||
path = os.path.join(ConfigLoader.get_hars_dir(), har)
|
||||
if os.path.isfile(path):
|
||||
return har
|
||||
|
|
|
@ -389,7 +389,7 @@ class Domain(AbstractObject):
|
|||
har = get_item_har(item_id)
|
||||
if har:
|
||||
print(har)
|
||||
_write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json')
|
||||
_write_in_zip_buffer(zf, os.path.join(hars_dir, har), f'{basename}.json.gz')
|
||||
# Screenshot
|
||||
screenshot = self._get_external_correlation('item', '', item_id, 'screenshot')
|
||||
if screenshot and screenshot['screenshot']:
|
||||
|
|
|
@ -131,7 +131,7 @@ class Mixer(AbstractModule):
|
|||
|
||||
self.last_refresh = time.time()
|
||||
self.clear_feeders_stat()
|
||||
time.sleep(0.5)
|
||||
time.sleep(0.5)
|
||||
|
||||
def computeNone(self):
|
||||
self.refresh_stats()
|
||||
|
|
Loading…
Reference in New Issue