diff --git a/tools/generate-chrome-crux-1m.py b/tools/generate-chrome-crux-1m.py
new file mode 100755
index 0000000..1ea7497
--- /dev/null
+++ b/tools/generate-chrome-crux-1m.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import csv
+from urllib.parse import urlparse
+
+from generator import download_to_file, get_version, write_to_file, get_abspath_source_file
+
+
+def process(files, dst):
+    warninglist = {
+        'description': "Cached Chrome Top Million Websites - top 1 million",
+        'version': get_version(),
+        'name': "google-chrome-crux-1million",
+        'type': 'string',
+        'list': [],
+        'matching_attributes': ['hostname', 'domain', 'uri', 'url']
+    }
+
+    for file in files:
+        with open(get_abspath_source_file(file)) as csv_file:
+            csv_reader = csv.reader(csv_file, delimiter=',')
+            next(csv_reader, None)  # skip the CSV header row
+            for row in csv_reader:
+                # Each origin looks like "https://example.com"; keep only the hostname.
+                warninglist['list'].append(urlparse(row[0]).hostname)
+
+    write_to_file(warninglist, dst)
+
+
+if __name__ == '__main__':
+    crux_domains_url = "https://github.com/zakird/crux-top-lists/raw/main/data/global/current.csv.gz"
+    crux_domains_file = "crux-top-1m.csv.gz"
+    dst = 'google-chrome-crux-1million'
+
+    download_to_file(crux_domains_url, crux_domains_file, gzip_enable=True)
+    process([crux_domains_file], dst)
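
For reference, a minimal standalone sketch of what this generator does, using only the standard library in place of the repo's generator helpers (download_to_file, write_to_file, get_abspath_source_file). The in-memory decompression and the printed sample at the end are illustrative assumptions for testing, not part of the tool itself:

import csv
import gzip
import io
import urllib.request
from urllib.parse import urlparse

CRUX_URL = "https://github.com/zakird/crux-top-lists/raw/main/data/global/current.csv.gz"

# Download and decompress the current CrUX snapshot in memory.
with urllib.request.urlopen(CRUX_URL) as resp:
    text = gzip.decompress(resp.read()).decode("utf-8")

reader = csv.reader(io.StringIO(text))
next(reader, None)  # skip the CSV header row
hosts = [urlparse(row[0]).hostname for row in reader]

print(len(hosts), hosts[:5])  # quick sanity check of the extracted hostnames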