#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Generate the 'google-chrome-crux-1million' warninglist.

Source data is the Cached Chrome Top Million Websites list (CrUX),
published at https://github.com/zakird/crux-top-lists as a gzipped CSV
with an "origin,rank" header where each origin is a full URL
(e.g. "https://example.com").
"""

import csv
import gzip
from urllib.parse import urlparse

from generator import download_to_file, get_version, write_to_file, get_abspath_source_file


def process(files, dst):
    """Build the warninglist from the downloaded CrUX CSV file(s) and write it.

    :param files: iterable of source-file names (resolved via
        ``get_abspath_source_file``), each a CSV with an "origin,rank" header.
    :param dst: destination warninglist name passed to ``write_to_file``.
    """
    warninglist = {
        'description': "Cached Chrome Top Million Websites - top 1 million",
        'version': get_version(),
        'name': "google-chrome-crux-1million",
        'type': 'string',
        'list': [],
        'matching_attributes': ['hostname', 'domain', 'uri', 'url'],
    }

    for file in files:
        with open(get_abspath_source_file(file)) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            # Skip the "origin,rank" header row (was a flag + stray
            # debug print(True) in the original).
            next(csv_reader, None)
            for row in csv_reader:
                # Each origin is a full URL; keep only the hostname part.
                host = urlparse(row[0]).hostname
                # Guard: hostname is None for a malformed/scheme-less row;
                # never append None to the list.
                if host:
                    warninglist['list'].append(host)

    write_to_file(warninglist, dst)


if __name__ == '__main__':
    crux_domains_url = "https://github.com/zakird/crux-top-lists/raw/main/data/global/current.csv.gz"
    crux_domains_file = "crux-top-1m.csv.gz"
    dst = 'google-chrome-crux-1million'

    # download_to_file decompresses the .gz payload (gzip_enable=True),
    # so process() reads a plain-text CSV.
    download_to_file(crux_domains_url, crux_domains_file, gzip_enable=True)
    process([crux_domains_file], dst)