From 92d192238bdd8871f43b4b9076fd7db39ae6a58b Mon Sep 17 00:00:00 2001 From: Terrtia Date: Tue, 29 Jan 2019 17:04:45 +0100 Subject: [PATCH] fix: [Crawler] change max page crawled --- bin/Crawler.py | 8 ++++---- bin/torcrawler/TorSplashCrawler.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/Crawler.py b/bin/Crawler.py index a59926f6..e6b61a99 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -209,16 +209,16 @@ if __name__ == '__main__': date_month = datetime.datetime.now().strftime("%Y%m") if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain): + # first seen + if not r_onion.hexists('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen'): + r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date) + # last_father r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste) # last check r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date) - # first seen - if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)): - r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date) - crawl_onion(url, domain, date, date_month, message) if url != domain_url: print(url) diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index 1b77c6ef..99a4f3b3 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -42,7 +42,7 @@ class TorSplashCrawler(): 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', 'HTTPERROR_ALLOW_ALL': True, 'RETRY_TIMES': 2, - 'CLOSESPIDER_PAGECOUNT': 1000, + 'CLOSESPIDER_PAGECOUNT': 50, 'DEPTH_LIMIT': crawler_depth_limit })