#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import csv
import gzip
from urllib.parse import urlparse

from generator import download_to_file, get_version, write_to_file, get_abspath_source_file


def process(files, dst):
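    """Build the google-chrome-crux-1million warninglist from the downloaded CSV files and write it to dst."""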
    warninglist = {
        'description': "Cached Chrome Top Million Websites - top 1 million",
        'version': get_version(),
        'name': "google-chrome-crux-1million",
        'type': 'string',
        'list': [],
        'matching_attributes': ['hostname', 'domain', 'uri', 'url']
    }
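
    # The first CSV row is a header line; 'flag' is used to skip it.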
    flag = True

    for file in files:
        with open(get_abspath_source_file(file)) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                if flag:
                    flag = False
                    continue
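                # Each remaining row starts with an origin URL; keep only its hostname.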
                v = row[0]
                p = urlparse(v)
                host = p.hostname
                warninglist['list'].append(host)

    write_to_file(warninglist, dst)


if __name__ == '__main__':
    crux_domains_url = "https://github.com/zakird/crux-top-lists/raw/main/data/global/current.csv.gz"
    crux_domains_file = "crux-top-1m.csv.gz"
    dst = 'google-chrome-crux-1million'

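    # Download the gzipped CrUX top-1M CSV; gzip_enable=True flags the .gz payload for the shared download helper.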
    download_to_file(crux_domains_url, crux_domains_file, gzip_enable=True)

    process([crux_domains_file], dst)