new: [generate-chrome-crux-1m] New generator added for the Cached Chrome Top Million Websites

"Recent research showed that the top million most popular websites published by Google Chrome via their UX Report (CrUX) is significantly more accurate than other top lists like the Alexa Top Million and Tranco Top Million. This repository caches a CSV version of the Chrome top sites, queried from the CrUX data in Google BigQuery. You can browse all of the cached lists here. The most up-to-date top million global websites can be downloaded directly at: https://raw.githubusercontent.com/zakird/crux-top-lists/main/data/global/current.csv.gz." Ref: https://github.com/zakird/crux-top-lists
2023-01-02 11:18:35 +01:00 · 2023-01-02 11:18:35 +01:00 · fe923ca1d7
parent 811869148f
commit fe923ca1d7
1 changed files with 49 additions and 0 deletions
--- a/tools/generate-chrome-crux-1m.py
+++ b/tools/generate-chrome-crux-1m.py
@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import csv
+import gzip
+from urllib.parse import urlparse
+
+from generator import download_to_file, get_version, write_to_file, get_abspath_source_file
+
+
+def process(files, dst):
+
+    warninglist = {
+        'description': "Cached Chrome Top Million Websites - top 1 million",
+        'version': get_version(),
+        'name': "google-chrome-crux-1million",
+        'type': 'string',
+        'list': [],
+        'matching_attributes': ['hostname', 'domain', 'uri', 'url']
+    }
+
+    flag = True
+
+    for file in files:
+        with open(get_abspath_source_file(file)) as csv_file:
+            csv_reader = csv.reader(csv_file, delimiter=',')
+            for row in csv_reader:
+                if flag:
+                    flag = False
+                    print(True)
+                    continue
+                v = row[0]
+                p = urlparse(v)
+                host = p.hostname
+                warninglist['list'].append(host)
+
+    write_to_file(warninglist, dst)
+
+
+if __name__ == '__main__':
+    crux_domains_url = "https://github.com/zakird/crux-top-lists/raw/main/data/global/current.csv.gz"
+
+    crux_domains_file = "crux-top-1m.csv.gz"
+
+    dst = 'google-chrome-crux-1million'
+
+    download_to_file(crux_domains_url, crux_domains_file, gzip_enable=True)
+
+    process([crux_domains_file], dst)