diff --git a/lists/mozilla-top500/README.md b/lists/mozilla-top500/README.md
index 5ffffd7..4ddb41a 100644
--- a/lists/mozilla-top500/README.md
+++ b/lists/mozilla-top500/README.md
@@ -6,6 +6,5 @@ Contains a list of the top 500 web pages ranked by the number of linking root do
 ## Update list
 
 ```bash
-wget https://moz.com/top500/pages/csv
-
+../../tools/generate_mozilla-top500.py |jq . > list.json
 ```
diff --git a/tools/generate_mozilla-top500.py b/tools/generate_mozilla-top500.py
index 1afec96..9feebc7 100755
--- a/tools/generate_mozilla-top500.py
+++ b/tools/generate_mozilla-top500.py
@@ -5,14 +5,23 @@ import requests
 import datetime
 import json
 import csv
+import os
 
 # TODO: Include Top500 pages
 # TODO: Include MozRank
-#moz_url = "https://moz.com/top500/pages/csv"
-moz_url = "https://moz.com/top500/domains/csv"
-moz_file = "csv"
+
+moz_url_domains = "https://moz.com/top500/domains/csv"
+moz_url_pages = "https://moz.com/top500/pages/csv"
+
+moz_file_domains = "/tmp/top500.domains.csv"
+moz_file_pages = "/tmp/top500.pages.csv"
+
 user_agent = {"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}
-r = requests.get(moz_url, headers=user_agent)
+
+rDomains = requests.get(moz_url_domains, headers=user_agent)
+rPages = requests.get(moz_url_pages, headers=user_agent)
+open(moz_file_domains, 'wb').write(rDomains.content)
+open(moz_file_pages, 'wb').write(rPages.content)
 
 moz_warninglist = {}
 version = int(datetime.date.today().strftime('%Y%m%d'))
@@ -25,7 +34,7 @@ moz_warninglist['type'] = 'hostname'
 moz_warninglist['list'] = []
 moz_warninglist['matching_attributes'] = ['hostname', 'domain']
 
-with open(moz_file) as csv_file:
+with open(moz_file_domains) as csv_file:
     csv_reader = csv.reader(csv_file, delimiter=',')
     line_count = 0
     for row in csv_reader:
@@ -40,3 +49,9 @@ with open(moz_file) as csv_file:
 
 moz_warninglist['list'] = sorted(set(moz_warninglist['list']))
 print(json.dumps(moz_warninglist))
+
+try:
+    os.remove(moz_file_domains)
+    os.remove(moz_file_pages)
+except:
+    print(f'Perhaps {moz_file_domains}/{moz_file_pages} does not exist.')