diff --git a/known_content/legitimate.json b/known_content/legitimate.json new file mode 100644 index 0000000..90bc684 --- /dev/null +++ b/known_content/legitimate.json @@ -0,0 +1,42 @@ +{ + "f766df685b673657bdf57551354c149be2024385102854d2ca351e976684bb88361eae848f11f714e6e5973c061440831ea6f5be995b89fd5bd2d4559a0dc4a6": { + "domain": [], + "description": "jQuery v1.12.4 - WordPress 2019-05-16" + }, + "9c9616ccbc9765f4e825f6b57fba35e57b97b5ef5f51e88a5fe6d44bf22edbee1a52975f3311fe25d2ca65837b34dcb51cc2e00f02410c54a3aeee6a2c17e255": { + "domain": [], + "description": "Google SafeFrame Container" + }, + "cf69087b8f92f7b81efa788c3eb0b8a551405cdc7fa137e09a918349617359715ad5ef833f901e8d6e80c9ff20f63091710b492224e2ad23848673995dff5610": { + "domain": [], + "description": "Wordpress - embed - auto generated" + }, + "21047fea5269fee75a2a187aa09316519e35068cb2f2f76cfaf371e5224445e9d5c98497bd76fb9608d2b73e9dac1a3f5bfadfdc4623c479d53ecf93d81d3c9f": { + "domain": [], + "description": "Nginx - 301 - HTML" + }, + "0344c6b2757d4d787ed4a31ec7043c9dc9bf57017e451f60cecb9ad8f5febf64acf2a6c996346ae4b23297623ebf747954410aee27ee3c2f3c6ccd15a15d0f2d": { + "domain": [], + "description": "Nginx - 301 - HTML" + }, + "e423354c2083d0c889a488186322c5bf045f0e5dfa04db55d1625d21a0b4862a1d357aed0463b5e9d2659f7a8427c2c78da4084c1c741a5db7ab4742f8b55304": { + "domain": [], + "description": "jQuery UI CSS Framework 1.8.20" + }, + "b828576537cff413f37461f6a10bf6fc97cfcd256afb2f65d07ae552bbc8a639de1d84ed55fcade3682996da960d3f44e086ac56aa5f596b8607d9d118bb47ef": { + "domain": [], + "description": "Transparent PNG" + }, + "22142edb5016c6d74fef35af858439a3d314021ea7822bd65a00bcf35bed39576e490fb74dc2c04d32250178eb228db9a2ceeee290cf63aacb4f03741ad45949": { + "domain": [], + "description": "1px PNG" + }, + "43de6d36c775ce0f23813bc8ca401633762d0d7abd1945d4f8490f81ff7623d49ef423f4d63362c4ea57d58038d8edf3ad2d06869f4c4fc9d88c0e64c4a19470": { + "domain": [], + "description": "Gravatar unknown image" + }, + 
"c99bf4f1351efb28a74fa2504429875d9a63eb2d6a145a060ed487f83ff3a42b6c85d94165b960edca90aceec58d16a6ed37b25f44452bbacd7f5204c15c23cc": { + "domain": [], + "description": "Nginx - 302 - HTML" + } +} \ No newline at end of file diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 9a2d29a..9c750d2 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -115,8 +115,8 @@ class Indexing(): return self.redis.zcard(f'bh|{h}') def body_hash_fequency(self, body_hash: str) -> Dict[str, float]: - return {'hash_freq': self.redis.zscore('body_hashes', body_hash), - 'hash_domains_freq': self.redis.zcard(f'bh|{body_hash}')} + return {'hash_freq': int(self.redis.zscore('body_hashes', body_hash)), + 'hash_domains_freq': int(self.redis.zcard(f'bh|{body_hash}'))} def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None: if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid): @@ -143,17 +143,23 @@ class Indexing(): return capture_uuid, urlnode_uuid, hostnode_uuid def get_body_hash_captures(self, body_hash: str, filter_url: Optional[str]=None, + filter_capture_uuid: Optional[str]=None, limit: int=20) -> Tuple[int, List[Tuple[str, str, str, bool]]]: to_return: List[Tuple[str, str, str, bool]] = [] all_captures = self.redis.smembers(f'bh|{body_hash}|captures') + len_captures = len(all_captures) for capture_uuid in list(all_captures)[:limit]: + if capture_uuid == filter_capture_uuid: + # Used to skip hits in current capture + len_captures -= 1 + continue for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1): url_uuid, hostnode_uuid, url = entry.split('|', 2) if filter_url: to_return.append((capture_uuid, hostnode_uuid, urlsplit(url).hostname, url == filter_url)) else: to_return.append((capture_uuid, hostnode_uuid, urlsplit(url).hostname, False)) - return len(all_captures), to_return + return len_captures, to_return def get_body_hash_domains(self, body_hash: str) -> List[Tuple[str, float]]: return 
self.redis.zrevrange(f'bh|{body_hash}', 0, -1, withscores=True) @@ -1070,11 +1076,8 @@ class Lookyloo(): def hash_lookup(self, blob_hash: str, url: str, capture_uuid: str) -> Tuple[int, Dict[str, List[Tuple[str, str, str, str, str]]]]: captures_list: Dict[str, List[Tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []} - total_captures, details = self.indexing.get_body_hash_captures(blob_hash, url) + total_captures, details = self.indexing.get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid) for h_capture_uuid, url_uuid, url_hostname, same_url in details: - if h_capture_uuid == capture_uuid: - # Skip self. - continue cache = self.capture_cache(h_capture_uuid) if cache: if same_url: diff --git a/poetry.lock b/poetry.lock index 6a4c39e..45298a9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -948,7 +948,7 @@ description = "Traitlets Python configuration system" name = "traitlets" optional = false python-versions = ">=3.7" -version = "5.0.0" +version = "5.0.3" [package.dependencies] ipython-genutils = "*" @@ -1639,8 +1639,8 @@ toml = [ {file = "toml-0.10.1.tar.gz", hash = "sha256:926b612be1e5ce0634a2ca03470f95169cf16f939018233a670519cb4ac58b0f"}, ] traitlets = [ - {file = "traitlets-5.0.0-py3-none-any.whl", hash = "sha256:62a037f12ccb823fb05823afbe35fe0273bc18fa3202d0cf0ea8f24e97e464be"}, - {file = "traitlets-5.0.0.tar.gz", hash = "sha256:0d9c4005506b306b0a99551e96174b8bedc675c2dd048f92b3bbbb7d86ac93a9"}, + {file = "traitlets-5.0.3-py3-none-any.whl", hash = "sha256:8bdadb17a04c844f444cdefaa3dee47a12ff14cf6277b9eeda29bfa0659d5987"}, + {file = "traitlets-5.0.3.tar.gz", hash = "sha256:a2e91709a0330b6c5d497ed470b2feb1ed8da5c9dd807c6daab41f727b9391c9"}, ] twisted = [ {file = "Twisted-20.3.0-cp27-cp27m-macosx_10_6_intel.whl", hash = "sha256:cdbc4c7f0cd7a2218b575844e970f05a1be1861c607b0e048c9bceca0c4d42f7"}, diff --git a/website/web/templates/hostname_popup.html b/website/web/templates/hostname_popup.html index ee92c52..419311f 100644 --- 
a/website/web/templates/hostname_popup.html +++ b/website/web/templates/hostname_popup.html @@ -43,7 +43,6 @@ + + +{% endblock %} + +{% block styles %} +{{ super() }} + +{% endblock %} + + +{% block content %} +
+ + + + + + + + + + + {% for h, freq, number_domains, context, capture_uuid, urlnode_uuid, hostnode_uuid in ressources %} + + + + + + + {% endfor %} + +
SHA 512FrequencyNumber unique domainsContext
+ {{ shorten_string(h, 20) }}
+ {{ get_ressource_button(capture_uuid, urlnode_uuid, h, 'Download sample') }} +
{{ freq }}{{ number_domains }} {{ context['type'] }} - {{ context['details'] }}
+ {{ context_form(capture_uuid, urlnode_uuid, hostnode_uuid, h, 'ressources') }} +
+
+{% endblock %}