2019-04-23 11:00:07 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
import csv
|
|
|
|
|
2020-07-27 10:44:30 +02:00
|
|
|
from generator import download_to_file, get_version, write_to_file, get_abspath_source_file
|
|
|
|
|
2019-04-24 03:23:40 +02:00
|
|
|
|
2020-07-27 10:44:30 +02:00
|
|
|
# TODO: Include MozRank
|
|
|
|
def process(files, dst):
    """Build the Moz top-500 warninglist from CSV files and write it out.

    Every entry of *files* is resolved with ``get_abspath_source_file`` and
    parsed as comma-separated values. The first row of each file is skipped
    (presumably the CSV column header); from every remaining row the second
    column is taken, stripped of trailing whitespace and trailing slashes,
    and appended to the warninglist's ``list``. The finished warninglist is
    handed to ``write_to_file`` together with *dst*.

    Args:
        files: Iterable of source file names to read.
        dst: Destination identifier passed through to ``write_to_file``.
    """
    warninglist = {
        'description': "Event contains one or more entries from the top 500 of the most used domains from Moz.",
        'version': get_version(),
        'name': "Top 500 domains and pages from https://moz.com/top500",
        'type': 'string',
        'list': [],
        'matching_attributes': ['hostname', 'domain', 'uri', 'url']
    }

    for source in files:
        # newline='' lets the csv module do its own line-ending handling,
        # as its documentation requires for files passed to csv.reader.
        with open(get_abspath_source_file(source), newline='') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')

            # Skip the first row of *this* file. A single flag shared across
            # all files would only skip the first file's first row and let
            # the first row of every later file end up in the list.
            next(csv_reader, None)

            for row in csv_reader:
                # csv.reader yields [] for blank lines; rows without a
                # second column carry no value to collect.
                if len(row) < 2:
                    continue
                warninglist['list'].append(row[1].rstrip().rstrip('/'))

    write_to_file(warninglist, dst)
|
2019-04-23 11:00:07 +02:00
|
|
|
|
|
|
|
|
2020-07-27 10:44:30 +02:00
|
|
|
if __name__ == '__main__':
    # Name under which the generated warninglist is written by process().
    moz_dst = 'moz-top500'

    # Domains list: the remote CSV export and the local file name that are
    # both passed to download_to_file().
    moz_domains_url = "https://moz.com/top-500/download/?table=top500Domains"
    moz_domains_file = "moz-top500.domains.csv"

    # The pages list is currently disabled; its settings and calls are kept
    # below for reference.
    #moz_pages_url = "https://moz.com/top500/pages/csv"
    #moz_pages_file = "moz-top500.pages.csv"
    #download_to_file(moz_pages_url, moz_pages_file)
    #process([moz_domains_file, moz_pages_file], moz_dst)

    download_to_file(moz_domains_url, moz_domains_file)
    process([moz_domains_file], moz_dst)
|