chg: [moz500] Added info on how to regenerate the list; added provisional URLs/files
to topPages.
pull/104/head
parent
9e0b2ebc75
commit
f41f976ce6
|
@ -6,6 +6,5 @@ Contains a list of the top 500 web pages ranked by the number of linking root do
|
||||||
## Update list
|
## Update list
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
wget https://moz.com/top500/pages/csv
|
../../tools/generate_mozilla-top500.py |jq . > list.json
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
|
@ -5,14 +5,23 @@ import requests
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import csv
|
import csv
|
||||||
|
import os
|
||||||
|
|
||||||
# TODO: Include Top500 pages
|
# TODO: Include Top500 pages
|
||||||
# TODO: Include MozRank
|
# TODO: Include MozRank
|
||||||
#moz_url = "https://moz.com/top500/pages/csv"
|
|
||||||
moz_url = "https://moz.com/top500/domains/csv"
|
moz_url_domains = "https://moz.com/top500/domains/csv"
|
||||||
moz_file = "csv"
|
moz_url_pages = "https://moz.com/top500/pages/csv"
|
||||||
|
|
||||||
|
moz_file_domains = "/tmp/top500.domains.csv"
|
||||||
|
moz_file_pages = "/tmp/top500.pages.csv"
|
||||||
|
|
||||||
user_agent = {"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}
|
user_agent = {"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}
|
||||||
r = requests.get(moz_url, headers=user_agent)
|
|
||||||
|
# Fetch both Moz Top 500 CSV exports (domains and pages) and store them in
# the temporary files that the parsing step reads afterwards.
# A timeout is set because `requests` otherwise waits indefinitely on a
# stalled connection; `raise_for_status()` stops the run on an HTTP error
# instead of writing an error page to disk and parsing it as CSV.
rDomains = requests.get(moz_url_domains, headers=user_agent, timeout=60)
rDomains.raise_for_status()
rPages = requests.get(moz_url_pages, headers=user_agent, timeout=60)
rPages.raise_for_status()

# Context managers guarantee the data is flushed and the handles are closed
# before the files are reopened for reading.
with open(moz_file_domains, 'wb') as domains_out:
    domains_out.write(rDomains.content)
with open(moz_file_pages, 'wb') as pages_out:
    pages_out.write(rPages.content)
|
||||||
|
|
||||||
moz_warninglist = {}
|
moz_warninglist = {}
|
||||||
version = int(datetime.date.today().strftime('%Y%m%d'))
|
version = int(datetime.date.today().strftime('%Y%m%d'))
|
||||||
|
@ -25,7 +34,7 @@ moz_warninglist['type'] = 'hostname'
|
||||||
moz_warninglist['list'] = []
|
moz_warninglist['list'] = []
|
||||||
moz_warninglist['matching_attributes'] = ['hostname', 'domain']
|
moz_warninglist['matching_attributes'] = ['hostname', 'domain']
|
||||||
|
|
||||||
with open(moz_file) as csv_file:
|
with open(moz_file_domains) as csv_file:
|
||||||
csv_reader = csv.reader(csv_file, delimiter=',')
|
csv_reader = csv.reader(csv_file, delimiter=',')
|
||||||
line_count = 0
|
line_count = 0
|
||||||
for row in csv_reader:
|
for row in csv_reader:
|
||||||
|
@ -40,3 +49,9 @@ with open(moz_file) as csv_file:
|
||||||
|
|
||||||
moz_warninglist['list'] = sorted(set(moz_warninglist['list']))
|
moz_warninglist['list'] = sorted(set(moz_warninglist['list']))
|
||||||
print(json.dumps(moz_warninglist))
|
print(json.dumps(moz_warninglist))
|
||||||
|
|
||||||
|
# Best-effort removal of the temporary CSV downloads.
# Each file is handled on its own so that a failure on one does not leave
# the other behind. Diagnostics go to stderr because stdout carries the
# JSON document, which is meant to be piped into another tool.
import sys  # local import: only needed for the diagnostics below

for tmp_path in (moz_file_domains, moz_file_pages):
    try:
        os.remove(tmp_path)
    except FileNotFoundError:
        print(f'Perhaps {tmp_path} does not exist.', file=sys.stderr)
    except OSError as err:
        # Permission problems, path is a directory, etc.
        print(f'Could not remove {tmp_path}: {err}', file=sys.stderr)
|
||||||
|
|
Loading…
Reference in New Issue