misp-warninglists/tools/generate-chrome-crux-1m.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import csv
import gzip
from urllib.parse import urlparse

from generator import download_to_file, get_version, write_to_file, get_abspath_source_file


def process(files, dst):

    warninglist = {
        'description': "Cached Chrome Top Million Websites - top 1 million",
        'version': get_version(),
        'name': "google-chrome-crux-1million",
        'type': 'string',
        'list': [],
        'matching_attributes': ['hostname', 'domain', 'uri', 'url']
    }

    flag = True

    for file in files:
        with open(get_abspath_source_file(file)) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                if flag:
                    flag = False
                    continue
                v = row[0]
                p = urlparse(v)
                host = p.hostname
                warninglist['list'].append(host)

    write_to_file(warninglist, dst)


if __name__ == '__main__':
    crux_domains_url = "https://github.com/zakird/crux-top-lists/raw/main/data/global/current.csv.gz"

    crux_domains_file = "crux-top-1m.csv.gz"

    dst = 'google-chrome-crux-1million'

    download_to_file(crux_domains_url, crux_domains_file, gzip_enable=True)

    process([crux_domains_file], dst)
new: [generate-chrome-crux-1m] New generator added for the Cached Chrome Top Million Websites "Recent research showed that the top million most popular websites published by Google Chrome via their UX Report (CrUX) is significantly more accurate than other top lists like the Alexa Top Million and Tranco Top Million. This repository caches a CSV version of the Chrome top sites, queried from the CrUX data in Google BigQuery. You can browse all of the cached lists here. The most up-to-date top million global websites can be downloaded directly at: https://raw.githubusercontent.com/zakird/crux-top-lists/main/data/global/current.csv.gz." Ref: https://github.com/zakird/crux-top-lists 2023-01-02 11:18:35 +01:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`

			`import csv`
			`import gzip`
			`from urllib.parse import urlparse`

			`from generator import download_to_file, get_version, write_to_file, get_abspath_source_file`


			`def process(files, dst):`

			`warninglist = {`
			`'description': "Cached Chrome Top Million Websites - top 1 million",`
			`'version': get_version(),`
			`'name': "google-chrome-crux-1million",`
			`'type': 'string',`
			`'list': [],`
			`'matching_attributes': ['hostname', 'domain', 'uri', 'url']`
			`}`

			`flag = True`

			`for file in files:`
			`with open(get_abspath_source_file(file)) as csv_file:`
			`csv_reader = csv.reader(csv_file, delimiter=',')`
			`for row in csv_reader:`
			`if flag:`
			`flag = False`
			`continue`
			`v = row[0]`
			`p = urlparse(v)`
			`host = p.hostname`
			`warninglist['list'].append(host)`

			`write_to_file(warninglist, dst)`


			`if __name__ == '__main__':`
			`crux_domains_url = "https://github.com/zakird/crux-top-lists/raw/main/data/global/current.csv.gz"`

			`crux_domains_file = "crux-top-1m.csv.gz"`

			`dst = 'google-chrome-crux-1million'`

			`download_to_file(crux_domains_url, crux_domains_file, gzip_enable=True)`

			`process([crux_domains_file], dst)`