#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Generate the 'google-chrome-crux-1million' warninglist.

Source data is the Cached Chrome Top Million Websites list (CrUX),
published at https://github.com/zakird/crux-top-lists as a gzipped CSV
with an "origin,rank" header where each origin is a full URL
(e.g. "https://example.com").
"""

import csv
import gzip
from urllib.parse import urlparse

from generator import download_to_file, get_version, write_to_file, get_abspath_source_file


def process(files, dst):
    """Build the warninglist from the downloaded CrUX CSV file(s) and write it.

    :param files: iterable of source-file names (resolved via
        ``get_abspath_source_file``), each a CSV with an "origin,rank" header.
    :param dst: destination warninglist name passed to ``write_to_file``.
    """
    warninglist = {
        'description': "Cached Chrome Top Million Websites - top 1 million",
        'version': get_version(),
        'name': "google-chrome-crux-1million",
        'type': 'string',
        'list': [],
        'matching_attributes': ['hostname', 'domain', 'uri', 'url'],
    }

    for file in files:
        with open(get_abspath_source_file(file)) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            # Skip the "origin,rank" header row (was a flag + stray
            # debug print(True) in the original).
            next(csv_reader, None)
            for row in csv_reader:
                # Each origin is a full URL; keep only the hostname part.
                host = urlparse(row[0]).hostname
                # Guard: hostname is None for a malformed/scheme-less row;
                # never append None to the list.
                if host:
                    warninglist['list'].append(host)

    write_to_file(warninglist, dst)


if __name__ == '__main__':
    crux_domains_url = "https://github.com/zakird/crux-top-lists/raw/main/data/global/current.csv.gz"
    crux_domains_file = "crux-top-1m.csv.gz"
    dst = 'google-chrome-crux-1million'

    # download_to_file decompresses the .gz payload (gzip_enable=True),
    # so process() reads a plain-text CSV.
    download_to_file(crux_domains_url, crux_domains_file, gzip_enable=True)
    process([crux_domains_file], dst)