mirror of https://github.com/CIRCL/AIL-framework
fix: [crawler] fix down domain + domain redirection history
parent
77e8cc6c02
commit
fe2769308b
|
@ -60,6 +60,7 @@ class Crawler(AbstractModule):
|
||||||
self.root_item = None
|
self.root_item = None
|
||||||
self.date = None
|
self.date = None
|
||||||
self.items_dir = None
|
self.items_dir = None
|
||||||
|
self.original_domain = None
|
||||||
self.domain = None
|
self.domain = None
|
||||||
|
|
||||||
# TODO Replace with warning list ???
|
# TODO Replace with warning list ???
|
||||||
|
@ -190,6 +191,7 @@ class Crawler(AbstractModule):
|
||||||
print(domain)
|
print(domain)
|
||||||
|
|
||||||
self.domain = Domain(domain)
|
self.domain = Domain(domain)
|
||||||
|
self.original_domain = Domain(domain)
|
||||||
|
|
||||||
epoch = int(time.time())
|
epoch = int(time.time())
|
||||||
parent_id = task.get_parent()
|
parent_id = task.get_parent()
|
||||||
|
@ -212,12 +214,20 @@ class Crawler(AbstractModule):
|
||||||
# Origin + History + tags
|
# Origin + History + tags
|
||||||
if self.root_item:
|
if self.root_item:
|
||||||
self.domain.set_last_origin(parent_id)
|
self.domain.set_last_origin(parent_id)
|
||||||
self.domain.add_history(epoch, root_item=self.root_item)
|
|
||||||
# Tags
|
# Tags
|
||||||
for tag in task.get_tags():
|
for tag in task.get_tags():
|
||||||
self.domain.add_tag(tag)
|
self.domain.add_tag(tag)
|
||||||
elif self.domain.was_up():
|
self.domain.add_history(epoch, root_item=self.root_item)
|
||||||
self.domain.add_history(epoch, root_item=epoch)
|
|
||||||
|
if self.domain != self.original_domain:
|
||||||
|
self.original_domain.update_daterange(self.date.replace('/', ''))
|
||||||
|
if self.root_item:
|
||||||
|
self.original_domain.set_last_origin(parent_id)
|
||||||
|
# Tags
|
||||||
|
for tag in task.get_tags():
|
||||||
|
self.domain.add_tag(tag)
|
||||||
|
self.original_domain.add_history(epoch, root_item=self.root_item)
|
||||||
|
crawlers.update_last_crawled_domain(self.original_domain.get_domain_type(), self.original_domain.id, epoch)
|
||||||
|
|
||||||
crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
|
crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
|
||||||
print('capture:', capture.uuid, 'completed')
|
print('capture:', capture.uuid, 'completed')
|
||||||
|
|
Loading…
Reference in New Issue