new: [generate-chrome-crux-1m] New generator added for the Cached Chrome Top Million Websites
"Recent research showed that the top million most popular websites published by Google Chrome via their UX Report (CrUX) is significantly more accurate than other top lists like the Alexa Top Million and Tranco Top Million. This repository caches a CSV version of the Chrome top sites, queried from the CrUX data in Google BigQuery. You can browse all of the cached lists here. The most up-to-date top million global websites can be downloaded directly at: https://raw.githubusercontent.com/zakird/crux-top-lists/main/data/global/current.csv.gz." Ref: https://github.com/zakird/crux-top-lists (pull/243/head)
parent
811869148f
commit
fe923ca1d7
|
@ -0,0 +1,49 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import csv
|
||||
import gzip
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from generator import download_to_file, get_version, write_to_file, get_abspath_source_file
|
||||
|
||||
|
||||
def process(files, dst):
    """Build the Chrome CrUX top-million warninglist and hand it to write_to_file().

    Args:
        files: Iterable of CSV file names; each one is resolved through
            get_abspath_source_file() before being opened.
        dst: Destination identifier passed unchanged to write_to_file().

    The first row of every file is treated as a header and skipped. For each
    remaining row, the first column is parsed as a URL and its hostname is
    added to the warninglist. Empty rows and values without a parseable
    hostname are ignored instead of being stored as ``None``.
    """
    warninglist = {
        'description': "Cached Chrome Top Million Websites - top 1 million",
        'version': get_version(),
        'name': "google-chrome-crux-1million",
        'type': 'string',
        'list': [],
        'matching_attributes': ['hostname', 'domain', 'uri', 'url']
    }

    hosts = warninglist['list']
    for file in files:
        # newline='' lets the csv module handle line endings itself.
        with open(get_abspath_source_file(file), newline='') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            # Skip the header row of *each* file, not only the first one.
            next(csv_reader, None)
            for row in csv_reader:
                if not row:
                    # Blank line in the CSV: nothing to parse.
                    continue
                # hostname is None when the value has no scheme/netloc part.
                host = urlparse(row[0]).hostname
                if host:
                    hosts.append(host)

    write_to_file(warninglist, dst)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: fetch the cached CrUX list, then generate the
    # warninglist from the downloaded file.
    source_url = "https://github.com/zakird/crux-top-lists/raw/main/data/global/current.csv.gz"
    local_csv = "crux-top-1m.csv.gz"
    warninglist_name = 'google-chrome-crux-1million'

    # NOTE(review): gzip_enable=True presumably makes the helper decompress
    # the archive so that process() can read plain CSV text - confirm in
    # generator.download_to_file.
    download_to_file(source_url, local_csv, gzip_enable=True)

    process([local_csv], warninglist_name)
|
Loading…
Reference in New Issue