From 8f1fe94b4957f302ba0ea759f302a7e16a818bfe Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Tue, 23 Apr 2019 18:00:07 +0900 Subject: [PATCH 1/4] new: [tool] Generate The Moz top 500 Domain list from https://moz.com/top500 --- tools/generate_mozilla-top500.py | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100755 tools/generate_mozilla-top500.py diff --git a/tools/generate_mozilla-top500.py b/tools/generate_mozilla-top500.py new file mode 100755 index 0000000..04f426c --- /dev/null +++ b/tools/generate_mozilla-top500.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import requests +import datetime +import json +import csv + +# TODO: Include Top500 pages +# TODO: Include MozRank +#moz_url = "https://moz.com/top500/pages/csv" +moz_url = "https://moz.com/top500/domains/csv" +moz_file = "csv" +user_agent = {"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"} +r = requests.get(moz_url, headers=user_agent) + +moz_warninglist = {} +version = int(datetime.date.today().strftime('%Y%m%d')) + +moz_warninglist['description'] = "Event contains one or more entries from the top 500 of the most used domains (Mozilla)." +d = datetime.datetime.now() +moz_warninglist['version'] = version +moz_warninglist['name'] = "Top 500 domains and pages from Mozilla" +moz_warninglist['type'] = 'hostname' +moz_warninglist['list'] = [] +moz_warninglist['matching_attributes'] = ['hostname', 'domain'] + +with open(moz_file) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + line_count = 0 + for row in csv_reader: + if line_count == 0: + #print(f'Column names are {", ".join(row)}') + line_count += 1 + else: + #print(f'\t{row[0]}. 
{row[1]}, MozTrust: {row[5]}.') + v = str(row).split(',')[1] + moz_warninglist['list'].append(v.rstrip().lstrip('/')) + line_count += 1 + +moz_warninglist['list'] = sorted(set(moz_warninglist['list'])) +print(json.dumps(moz_warninglist)) From 9e0b2ebc752dff7b3ec1ebf078dc99052bb533b2 Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Wed, 24 Apr 2019 09:45:56 +0900 Subject: [PATCH 2/4] new: [list] Added Mozilla Top 500 domains --- lists/mozilla-top500/README.md | 11 + lists/mozilla-top500/list.json | 512 +++++++++++++++++++++++++++++++ tools/generate_mozilla-top500.py | 4 +- 3 files changed, 525 insertions(+), 2 deletions(-) create mode 100644 lists/mozilla-top500/README.md create mode 100644 lists/mozilla-top500/list.json diff --git a/lists/mozilla-top500/README.md b/lists/mozilla-top500/README.md new file mode 100644 index 0000000..5ffffd7 --- /dev/null +++ b/lists/mozilla-top500/README.md @@ -0,0 +1,11 @@ +# The Moz Top 500 +Moz's list of the top 500 domains and pages on the web. + +Contains a list of the top 500 web pages ranked by the number of linking root domains. This data is sourced from the Mozscape web index of 818 Billion domains and 6 Trillion pages. 
+ +## Update list + +```bash +wget https://moz.com/top500/pages/csv + +``` diff --git a/lists/mozilla-top500/list.json b/lists/mozilla-top500/list.json new file mode 100644 index 0000000..39560f2 --- /dev/null +++ b/lists/mozilla-top500/list.json @@ -0,0 +1,512 @@ +{ + "description": "Event contains one or more entries from the top 500 of the most used domains (Mozilla).", + "version": 20190424, + "name": "Top 500 domains and pages from Mozilla", + "type": "hostname", + "list": [ + "163.com", + "1688.com", + "1and1.com", + "1and1.fr", + "1und1.de", + "360.cn", + "4.cn", + "51.la", + "a8.net", + "aarp.org", + "abc.net.au", + "about.com", + "aboutads.info", + "aboutcookies.org", + "accuweather.com", + "acm.org", + "addthis.com", + "addtoany.com", + "admin.ch", + "adobe.com", + "adweek.com", + "alexa.com", + "alibaba.com", + "aliyun.com", + "allaboutcookies.org", + "amazon.co.jp", + "amazon.co.uk", + "amazon.com", + "amazon.de", + "amazon.fr", + "amazonaws.com", + "ameblo.jp", + "amzn.to", + "android.com", + "aol.com", + "apache.org", + "apple.com", + "archive.org", + "arstechnica.com", + "artisteer.com", + "arxiv.org", + "athemes.com", + "att.com", + "azurewebsites.net", + "baidu.com", + "bandcamp.com", + "barnesandnoble.com", + "bbb.org", + "bbc.co.uk", + "bbc.com", + "behance.net", + "beian.gov.cn", + "berkeley.edu", + "bigcartel.com", + "bigcommerce.com", + "bing.com", + "biomedcentral.com", + "bit.ly", + "bitbucket.org", + "bizjournals.com", + "blackberry.com", + "blogger.com", + "blogspot.co.uk", + "blogspot.com", + "blogspot.com.es", + "blogspot.jp", + "bloomberg.com", + "bls.gov", + "bluehost.com", + "bmj.com", + "booking.com", + "box.com", + "bund.de", + "businessinsider.com", + "businesswire.com", + "buydomains.com", + "buzzfeed.com", + "ca.gov", + "cam.ac.uk", + "canada.ca", + "cbc.ca", + "cbslocal.com", + "cbsnews.com", + "cdc.gov", + "census.gov", + "change.org", + "chicagotribune.com", + "cisco.com", + "clickbank.net", + "cloudfront.net", + "cmu.edu", + 
"cnbc.com", + "cnet.com", + "cnn.com", + "colorlib.com", + "columbia.edu", + "congress.gov", + "constantcontact.com", + "cornell.edu", + "cpanel.com", + "cpanel.net", + "creativecommons.org", + "cryoutcreations.eu", + "dailymail.co.uk", + "dailymotion.com", + "debian.org", + "dedecms.com", + "delicious.com", + "deloitte.com", + "deviantart.com", + "dhs.gov", + "dictionary.com", + "digg.com", + "disqus.com", + "dmca.com", + "doi.org", + "dol.gov", + "domainactive.co", + "domainname.de", + "domainnameshop.com", + "domainretailing.com", + "domeneshop.no", + "dot.gov", + "doubleclick.net", + "dreamhost.com", + "dribbble.com", + "dropbox.com", + "drupal.org", + "duke.edu", + "e-recht24.de", + "ebay.com", + "economist.com", + "ed.gov", + "eepurl.com", + "eff.org", + "elegantthemes.com", + "elsevier.com", + "enable-javascript.com", + "ename.com.cn", + "engadget.com", + "entrepreneur.com", + "epa.gov", + "etracker.de", + "etsy.com", + "europa.eu", + "eventbrite.co.uk", + "eventbrite.com", + "ewebdevelopment.com", + "example.com", + "exblog.jp", + "facebook.com", + "fao.org", + "fastcompany.com", + "fb.com", + "fb.me", + "fbcdn.net", + "fc2.com", + "fcc.gov", + "fda.gov", + "feedburner.com", + "flickr.com", + "forbes.com", + "fortune.com", + "foursquare.com", + "foxnews.com", + "free.fr", + "ft.com", + "ftc.gov", + "g.co", + "gartner.com", + "geocities.jp", + "gesetze-im-internet.de", + "getpocket.com", + "giphy.com", + "github.com", + "github.io", + "globo.com", + "gnu.org", + "go.com", + "godaddy.com", + "gofundme.com", + "goo.gl", + "goo.ne.jp", + "goodreads.com", + "google.be", + "google.ca", + "google.ch", + "google.co.in", + "google.co.jp", + "google.co.uk", + "google.com", + "google.com.au", + "google.com.br", + "google.de", + "google.es", + "google.fr", + "google.it", + "google.nl", + "google.pl", + "google.ru", + "googleapis.com", + "googleusercontent.com", + "gotowebinar.com", + "gpo.gov", + "gravatar.com", + "guardian.co.uk", + "harvard.edu", + "hatena.ne.jp", + 
"hbr.org", + "hhs.gov", + "hibu.com", + "hilton.com", + "histats.com", + "hollywoodreporter.com", + "home.pl", + "homestead.com", + "hostgator.com", + "hostnet.nl", + "house.gov", + "houzz.com", + "hp.com", + "hubspot.com", + "huffingtonpost.com", + "ibm.com", + "icann.org", + "icio.us", + "ieee.org", + "ietf.org", + "ifeng.com", + "illinois.edu", + "imdb.com", + "imgur.com", + "inc.com", + "independent.co.uk", + "indiatimes.com", + "instagram.com", + "intel.com", + "irs.gov", + "iso.org", + "issuu.com", + "iubenda.com", + "japanpost.jp", + "java.com", + "jiathis.com", + "jimdo.com", + "joomla.org", + "jugem.jp", + "justgiving.com", + "justice.gov", + "kickstarter.com", + "latimes.com", + "libsyn.com", + "line.me", + "linkedin.com", + "list-manage.com", + "list-manage1.com", + "live.com", + "livedoor.jp", + "livejournal.com", + "loc.gov", + "loopia.com", + "loopia.se", + "macromedia.com", + "mail.ru", + "mailchimp.com", + "mapquest.com", + "marriott.com", + "mashable.com", + "medium.com", + "meetup.com", + "mhlw.go.jp", + "microsoft.com", + "miibeian.gov.cn", + "miitbeian.gov.cn", + "mijndomein.nl", + "mit.edu", + "mlb.com", + "mlit.go.jp", + "moodle.org", + "moz.com", + "mozilla.com", + "mozilla.org", + "msdn.com", + "msn.com", + "mynavi.jp", + "myshopify.com", + "myspace.com", + "mysql.com", + "namejet.com", + "nasa.gov", + "nationalgeographic.com", + "nature.com", + "naver.com", + "nazwa.pl", + "nbcnews.com", + "netflix.com", + "netscape.com", + "networkadvertising.org", + "networksolutions.com", + "newyorker.com", + "nginx.com", + "nginx.org", + "nhk.or.jp", + "nifty.com", + "nih.gov", + "nist.gov", + "noaa.gov", + "npr.org", + "nps.gov", + "ny.gov", + "nytimes.com", + "nyu.edu", + "ocn.ne.jp", + "oecd.org", + "office.com", + "ok.ru", + "one.com", + "opencart.com", + "opensource.org", + "opera.com", + "oracle.com", + "oreilly.com", + "oup.com", + "ow.ly", + "ox.ac.uk", + "parallels.com", + "paypal.com", + "pbs.org", + "phoca.cz", + "photobucket.com", + 
"php.net", + "phpbb.com", + "pinterest.com", + "playstation.com", + "plesk.com", + "plos.org", + "prestashop.com", + "prnewswire.com", + "psu.edu", + "psychologytoday.com", + "python.org", + "qq.com", + "quantcast.com", + "rakuten.co.jp", + "rambler.ru", + "redcross.org", + "reddit.com", + "reference.com", + "researchgate.net", + "reuters.com", + "rs6.net", + "sagepub.com", + "sakura.ne.jp", + "samsung.com", + "sciencedirect.com", + "sciencemag.org", + "scientificamerican.com", + "scribd.com", + "sec.gov", + "secureserver.net", + "sedo.com", + "sedoparking.com", + "senate.gov", + "shinystat.com", + "shop-pro.jp", + "shopify.com", + "si.edu", + "sina.com.cn", + "siteorigin.com", + "skype.com", + "slate.com", + "slideshare.net", + "snapchat.com", + "sogou.com", + "sohu.com", + "soundcloud.com", + "sourceforge.net", + "spotify.com", + "springer.com", + "squarespace.com", + "squareup.com", + "ssa.gov", + "stackoverflow.com", + "stanford.edu", + "starwoodhotels.com", + "statcounter.com", + "state.gov", + "steampowered.com", + "storify.com", + "studiopress.com", + "stumbleupon.com", + "sun.com", + "surveymonkey.com", + "symantec.com", + "t.co", + "t.me", + "tandfonline.com", + "taobao.com", + "teamviewer.com", + "techcrunch.com", + "ted.com", + "telegram.me", + "telegraph.co.uk", + "theatlantic.com", + "theguardian.com", + "thehill.com", + "themeforest.net", + "themegrill.com", + "thenextweb.com", + "theverge.com", + "ticketmaster.com", + "time.com", + "tmall.com", + "today.com", + "tripadvisor.co.uk", + "tripadvisor.com", + "trustpilot.com", + "tucowsdomains.com", + "tumblr.com", + "twitch.tv", + "twitter.com", + "typeform.com", + "typepad.com", + "uchicago.edu", + "ucl.ac.uk", + "ucla.edu", + "umblr.com", + "umich.edu", + "umn.edu", + "un.org", + "unesco.org", + "unicef.org", + "unsplash.com", + "uol.com.br", + "upenn.edu", + "usa.gov", + "usatoday.com", + "usc.edu", + "usda.gov", + "usgs.gov", + "usnews.com", + "ustream.tv", + "utexas.edu", + "va.gov", + 
"variety.com", + "venturebeat.com", + "vice.com", + "vimeo.com", + "visma.com", + "vk.com", + "vkontakte.ru", + "w3.org", + "w3schools.com", + "warnerbros.com", + "washington.edu", + "washingtonpost.com", + "web.de", + "webmd.com", + "webs.com", + "weebly.com", + "weibo.com", + "whatsapp.com", + "whitehouse.gov", + "who.int", + "wikia.com", + "wikihow.com", + "wikimedia.org", + "wikipedia.org", + "wiley.com", + "windowsphone.com", + "wired.com", + "wisc.edu", + "wix.com", + "wixsite.com", + "wordpress.com", + "wordpress.org", + "worldbank.org", + "wp.com", + "wp.me", + "wsimg.com", + "wsj.com", + "wufoo.com", + "wunderground.com", + "www.gov.uk", + "www.nhs.uk", + "xing.com", + "xinhuanet.com", + "xiti.com", + "yahoo.co.jp", + "yahoo.com", + "yale.edu", + "yandex.ru", + "yelp.com", + "youku.com", + "youronlinechoices.com", + "youtu.be", + "youtube.com", + "zdnet.com", + "zendesk.com", + "zenfolio.com" + ], + "matching_attributes": [ + "hostname", + "domain" + ] +} diff --git a/tools/generate_mozilla-top500.py b/tools/generate_mozilla-top500.py index 04f426c..1afec96 100755 --- a/tools/generate_mozilla-top500.py +++ b/tools/generate_mozilla-top500.py @@ -34,8 +34,8 @@ with open(moz_file) as csv_file: line_count += 1 else: #print(f'\t{row[0]}. {row[1]}, MozTrust: {row[5]}.') - v = str(row).split(',')[1] - moz_warninglist['list'].append(v.rstrip().lstrip('/')) + v = row[1] + moz_warninglist['list'].append(v.rstrip().rstrip('/')) line_count += 1 moz_warninglist['list'] = sorted(set(moz_warninglist['list'])) From f41f976ce6eecd7bdacf32a490cae70ae45390bc Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Wed, 24 Apr 2019 10:23:40 +0900 Subject: [PATCH 3/4] chg: [moz500] Added info how to regenerate, added provisional urls/files to topPages. 
--- lists/mozilla-top500/README.md | 3 +-- tools/generate_mozilla-top500.py | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/lists/mozilla-top500/README.md b/lists/mozilla-top500/README.md index 5ffffd7..4ddb41a 100644 --- a/lists/mozilla-top500/README.md +++ b/lists/mozilla-top500/README.md @@ -6,6 +6,5 @@ Contains a list of the top 500 web pages ranked by the number of linking root do ## Update list ```bash -wget https://moz.com/top500/pages/csv - +../../tools/generate_mozilla-top500.py |jq . > list.json ``` diff --git a/tools/generate_mozilla-top500.py b/tools/generate_mozilla-top500.py index 1afec96..9feebc7 100755 --- a/tools/generate_mozilla-top500.py +++ b/tools/generate_mozilla-top500.py @@ -5,14 +5,23 @@ import requests import datetime import json import csv +import os # TODO: Include Top500 pages # TODO: Include MozRank -#moz_url = "https://moz.com/top500/pages/csv" -moz_url = "https://moz.com/top500/domains/csv" -moz_file = "csv" + +moz_url_domains = "https://moz.com/top500/domains/csv" +moz_url_pages = "https://moz.com/top500/pages/csv" + +moz_file_domains = "/tmp/top500.domains.csv" +moz_file_pages = "/tmp/top500.pages.csv" + user_agent = {"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"} -r = requests.get(moz_url, headers=user_agent) + +rDomains = requests.get(moz_url_domains, headers=user_agent) +rPages = requests.get(moz_url_pages, headers=user_agent) +open(moz_file_domains, 'wb').write(rDomains.content) +open(moz_file_pages, 'wb').write(rPages.content) moz_warninglist = {} version = int(datetime.date.today().strftime('%Y%m%d')) @@ -25,7 +34,7 @@ moz_warninglist['type'] = 'hostname' moz_warninglist['list'] = [] moz_warninglist['matching_attributes'] = ['hostname', 'domain'] -with open(moz_file) as csv_file: +with open(moz_file_domains) as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') line_count = 0 for row in csv_reader: @@ -40,3 +49,9 @@ with 
open(moz_file) as csv_file: moz_warninglist['list'] = sorted(set(moz_warninglist['list'])) print(json.dumps(moz_warninglist)) + +try: + os.remove(moz_file_domains) + os.remove(moz_file_pages) +except: + print(f'Perhaps {moz_file_domains}/{moz_file_pages} does not exist.') From 190312cf0f88f34bcadd1363d00d42eb14d806b6 Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Wed, 24 Apr 2019 10:36:22 +0900 Subject: [PATCH 4/4] chg: [moz500] Added Pages too. Updated list --- lists/mozilla-top500/list.json | 460 ++++++++++++++++++++++++++++++- tools/generate_mozilla-top500.py | 15 +- 2 files changed, 473 insertions(+), 2 deletions(-) diff --git a/lists/mozilla-top500/list.json b/lists/mozilla-top500/list.json index 39560f2..cf06db4 100644 --- a/lists/mozilla-top500/list.json +++ b/lists/mozilla-top500/list.json @@ -4,6 +4,7 @@ "name": "Top 500 domains and pages from Mozilla", "type": "hostname", "list": [ + "123-reg-expired.co.uk", "163.com", "1688.com", "1and1.com", @@ -15,11 +16,15 @@ "a8.net", "aarp.org", "abc.net.au", + "abcnews.go.com", "about.com", "aboutads.info", "aboutcookies.org", + "account.1und1.de", + "accounts.google.com/ServiceLogin?service=jotspot", "accuweather.com", "acm.org", + "add.my.yahoo.com/content", "addthis.com", "addtoany.com", "admin.ch", @@ -40,20 +45,26 @@ "android.com", "aol.com", "apache.org", + "api.whatsapp.com", "apple.com", + "apple.com/mac", "archive.org", "arstechnica.com", "artisteer.com", "arxiv.org", "athemes.com", + "athemes.com/theme/sydney", "att.com", + "automattic.com", "azurewebsites.net", + "b.hatena.ne.jp/entry", "baidu.com", "bandcamp.com", "barnesandnoble.com", "bbb.org", "bbc.co.uk", "bbc.com", + "bbs.dedecms.com", "behance.net", "beian.gov.cn", "berkeley.edu", @@ -64,6 +75,7 @@ "bit.ly", "bitbucket.org", "bizjournals.com", + "bizvektor.com", "blackberry.com", "blogger.com", "blogspot.co.uk", @@ -76,14 +88,18 @@ "bmj.com", "booking.com", "box.com", + "br.wordpress.org", + "brokercheck.finra.org", "bund.de", 
"businessinsider.com", "businesswire.com", "buydomains.com", "buzzfeed.com", "ca.gov", + "calendar.google.com/calendar/render", "cam.ac.uk", "canada.ca", + "catchthemes.com", "cbc.ca", "cbslocal.com", "cbsnews.com", @@ -93,31 +109,52 @@ "chicagotribune.com", "cisco.com", "clickbank.net", + "cloud.feedly.com", "cloudfront.net", "cmu.edu", + "cn.wordpress.org", "cnbc.com", "cnet.com", "cnn.com", + "codex.wordpress.org", "colorlib.com", "columbia.edu", "congress.gov", + "connect.mail.ru/share", "constantcontact.com", "cornell.edu", "cpanel.com", "cpanel.net", "creativecommons.org", + "creativecommons.org/licenses/by-nc-sa/3.0", + "creativecommons.org/licenses/by-sa/2.0", + "creativecommons.org/licenses/by-sa/3.0", + "creativecommons.org/licenses/by/2.0", + "creativecommons.org/licenses/by/3.0", + "creativecommons.org/licenses/by/4.0", "cryoutcreations.eu", + "cyberchimps.com/responsive-theme", "dailymail.co.uk", "dailymotion.com", + "de-de.facebook.com/policy.php", + "de.wordpress.org", "debian.org", "dedecms.com", + "del.icio.us/post", "delicious.com", "deloitte.com", + "devblog.plesk.com", + "developers.facebook.com/docs/plugins", + "developers.google.com/analytics/devguides/collection/analyticsjs/cookie-usage", + "developers.google.com/analytics/devguides/collection/analyticsjs/cookie-usage?hl=es&csw=1", "deviantart.com", "dhs.gov", "dictionary.com", "digg.com", + "digg.com/submit", + "discuz.qq.com/service/security", "disqus.com", + "disqus.com/?ref_noscript", "dmca.com", "doi.org", "dol.gov", @@ -135,6 +172,8 @@ "duke.edu", "e-recht24.de", "ebay.com", + "ec.europa.eu/consumers/odr", + "ec.europa.eu/info/departments/justice-and-consumers_en", "economist.com", "ed.gov", "eepurl.com", @@ -146,6 +185,7 @@ "engadget.com", "entrepreneur.com", "epa.gov", + "es.wordpress.org", "etracker.de", "etsy.com", "europa.eu", @@ -155,6 +195,7 @@ "example.com", "exblog.jp", "facebook.com", + "facebook.com/sharer.php", "fao.org", "fastcompany.com", "fb.com", @@ -164,6 +205,9 @@ 
"fcc.gov", "fda.gov", "feedburner.com", + "feedburner.google.com", + "feedjit.com", + "feedly.com/index.html", "flickr.com", "forbes.com", "fortune.com", @@ -174,15 +218,26 @@ "ftc.gov", "g.co", "gartner.com", + "generatepress.com", "geocities.jp", "gesetze-im-internet.de", + "get.adobe.com/de/reader", + "get.adobe.com/flashplayer", + "get.adobe.com/jp/reader", + "get.adobe.com/reader", + "getbootstrap.com", "getpocket.com", + "getpocket.com/save", "giphy.com", "github.com", "github.io", "globo.com", + "gmail.com", + "gmpg.org/xfn", "gnu.org", "go.com", + "go.cpanel.net/cleardnscache", + "go.microsoft.com/fwlink/?linkid=66138&clcid=0x409", "godaddy.com", "gofundme.com", "goo.gl", @@ -209,22 +264,32 @@ "gotowebinar.com", "gpo.gov", "gravatar.com", + "gravatar.com/site/signup", + "gtranslate.net", "guardian.co.uk", "harvard.edu", "hatena.ne.jp", "hbr.org", + "help.opera.com/Windows/10.00/it/cookies.html", "hhs.gov", "hibu.com", "hilton.com", "histats.com", "hollywoodreporter.com", "home.pl", + "home.pl/kontakt", + "homeads.home.pl/ads/www/delivery/ck.php?n=f90e22f", "homestead.com", + "hootsuite.com", "hostgator.com", + "hostingmanager.secureserver.net", "hostnet.nl", "house.gov", "houzz.com", "hp.com", + "html5up.net", + "httpd.apache.org", + "httpd.apache.org/docs/2.4/mod/mod_userdir.html", "hubspot.com", "huffingtonpost.com", "ibm.com", @@ -244,36 +309,54 @@ "irs.gov", "iso.org", "issuu.com", + "it.wordpress.org", "iubenda.com", + "ja.wordpress.org", + "jalbum.net", + "jalbum.net/en", "japanpost.jp", "java.com", "jiathis.com", + "jigsaw.w3.org/css-validator", + "jigsaw.w3.org/css-validator/check/referer", + "jigsaw.w3.org/css-validator/check/referer?profile=css3", "jimdo.com", + "joomla-extensions.kubik-rubik.de", "joomla.org", + "jquery.com", "jugem.jp", "justgiving.com", "justice.gov", + "kb.plesk.com", "kickstarter.com", "latimes.com", + "lazaworx.com", "libsyn.com", + "lifestream.aol.com", "line.me", "linkedin.com", "list-manage.com", "list-manage1.com", + 
"listings.homestead.com", "live.com", "livedoor.jp", "livejournal.com", "loc.gov", + "logc204.xiti.com/go.click?xts=453041&s2=14&p=homepage::kundendefault::index::button-mehr-info&clic=N&type=click", "loopia.com", "loopia.se", "macromedia.com", + "mail.google.com/mail", "mail.ru", "mailchimp.com", "mapquest.com", + "maps.google.com", + "maps.google.com/maps?f=d&source=s_d&daddr=&saddr=&hl=en&geocode=&mra=ls&sll=37.0625,-95.677068&sspn=49.176833,114.257812&ie=UTF8&t=h&z=12", "marriott.com", "mashable.com", "medium.com", "meetup.com", + "megagroup.ru", "mhlw.go.jp", "microsoft.com", "miibeian.gov.cn", @@ -282,6 +365,7 @@ "mit.edu", "mlb.com", "mlit.go.jp", + "mobirise.com", "moodle.org", "moz.com", "mozilla.com", @@ -291,6 +375,7 @@ "mynavi.jp", "myshopify.com", "myspace.com", + "myspace.com/Modules/PostTo/Pages", "mysql.com", "namejet.com", "nasa.gov", @@ -299,7 +384,9 @@ "naver.com", "nazwa.pl", "nbcnews.com", + "netcn.console.aliyun.com/core/host/list2", "netflix.com", + "netscape.aol.com", "netscape.com", "networkadvertising.org", "networksolutions.com", @@ -310,6 +397,7 @@ "nifty.com", "nih.gov", "nist.gov", + "nl.wordpress.org", "noaa.gov", "npr.org", "nps.gov", @@ -323,12 +411,16 @@ "one.com", "opencart.com", "opensource.org", + "opensource.org/licenses/gpl-license.php", "opera.com", + "optout.networkadvertising.org", "oracle.com", "oreilly.com", "oup.com", + "outlook.live.com/owa", "ow.ly", "ox.ac.uk", + "panel.dreamhost.com", "parallels.com", "paypal.com", "pbs.org", @@ -337,9 +429,22 @@ "php.net", "phpbb.com", "pinterest.com", + "pinterest.com/pin/create/button", + "pinterest.com/pin/create/button/?description=", + "pinterest.com/pin/create/button/?media=", + "pixabay.com", + "pl.wordpress.org", + "planet.wordpress.org", "playstation.com", "plesk.com", "plos.org", + "plus.google.com", + "plus.google.com/communities/109881979300958500728", + "plus.google.com/share", + "plus.google.com/share?url=", + "plusone.google.com/_/+1/confirm?hl=en", + 
"presscustomizr.com", + "presscustomizr.com/customizr", "prestashop.com", "prnewswire.com", "psu.edu", @@ -351,10 +456,13 @@ "rambler.ru", "redcross.org", "reddit.com", + "reddit.com/submit", "reference.com", "researchgate.net", "reuters.com", "rs6.net", + "ru.wordpress.org", + "safeharbor.export.gov/companyinfo.aspx?id=16626", "sagepub.com", "sakura.ne.jp", "samsung.com", @@ -373,9 +481,11 @@ "si.edu", "sina.com.cn", "siteorigin.com", + "sites.google.com", "skype.com", "slate.com", "slideshare.net", + "smallbusiness.yahoo.com/webhosting", "snapchat.com", "sogou.com", "sohu.com", @@ -390,34 +500,64 @@ "stanford.edu", "starwoodhotels.com", "statcounter.com", + "statcounter.com/free-hit-counter", + "statcounter.com/free-web-stats", + "statcounter.com/shopify", + "statcounter.com/tumblr", "state.gov", "steampowered.com", "storify.com", "studiopress.com", "stumbleupon.com", "sun.com", + "support.apple.com/it-it/HT201265", + "support.apple.com/kb/PH5042", + "support.apple.com/kb/ph5042", + "support.google.com/analytics/answer/6004245?hl=de", + "support.google.com/answer/23852", + "support.google.com/chrome/answer/95647?hl=es", + "support.google.com/chrome/answer/95647?hl=it", + "support.google.com/chrome/bin/answer.py?hl=es&answer=95647", + "support.microsoft.com/help/17442", + "support.microsoft.com/windows", + "support.mozilla.org/es/kb/habilitar-y-deshabilitar-cookies-que-los-sitios-we", + "support.mozilla.org/it/kb/Attivare%20e%20disattivare%20i%20cookie", + "support.plesk.com", + "support.plesk.com/hc", "surveymonkey.com", "symantec.com", "t.co", "t.me", + "talk.plesk.com", "tandfonline.com", "taobao.com", "teamviewer.com", "techcrunch.com", + "technorati.com/faves", "ted.com", "telegram.me", "telegraph.co.uk", + "templated.co", "theatlantic.com", "theguardian.com", "thehill.com", + "theme-fusion.com", "themeforest.net", "themegrill.com", + "themegrill.com/themes/colormag", + "themegrill.com/themes/spacious", "thenextweb.com", "theverge.com", "ticketmaster.com", 
"time.com", "tmall.com", "today.com", + "tools.google.com/dlpage/gaoptout", + "tools.google.com/dlpage/gaoptout?hl=de", + "tools.google.com/dlpage/gaoptout?hl=en", + "tools.google.com/dlpage/gaoptout?hl=it", + "top100.rambler.ru/top100", + "translate.google.com", "tripadvisor.co.uk", "tripadvisor.com", "trustpilot.com", @@ -425,6 +565,15 @@ "tumblr.com", "twitch.tv", "twitter.com", + "twitter.com/Plesk", + "twitter.com/account/settings", + "twitter.com/home", + "twitter.com/intent/tweet", + "twitter.com/intent/tweet?text=", + "twitter.com/onecom", + "twitter.com/privacy", + "twitter.com/share", + "twitter.com/share?text=", "typeform.com", "typepad.com", "uchicago.edu", @@ -448,13 +597,21 @@ "ustream.tv", "utexas.edu", "va.gov", + "validator.w3.org", + "validator.w3.org/check", + "validator.w3.org/check/referer", + "validator.w3.org/check?uri=referer", "variety.com", "venturebeat.com", "vice.com", "vimeo.com", + "vinaora.com", "visma.com", "vk.com", + "vk.com/login?act=vkcomredirect&to=c2hhcmUucGhw", + "vk.com/share.php", "vkontakte.ru", + "vkontakte.ru/share.php", "w3.org", "w3schools.com", "warnerbros.com", @@ -473,22 +630,320 @@ "wikimedia.org", "wikipedia.org", "wiley.com", + "windows.microsoft.com/en-us/internet-explorer/products/ie/home", + "windows.microsoft.com/es-es/windows7/how-to-manage-cookies-in-internet-explorer-9", + "windows.microsoft.com/it-it/windows-vista/block-or-allow-cookies", "windowsphone.com", "wired.com", "wisc.edu", "wix.com", "wixsite.com", + "woocommerce.com", "wordpress.com", + "wordpress.com/?ref=footer_blog", + "wordpress.com/?ref=footer_website", + "wordpress.com/themes", "wordpress.org", + "wordpress.org/extend/ideas", + "wordpress.org/extend/plugins", + "wordpress.org/extend/themes", + "wordpress.org/news", + "wordpress.org/plugins", + "wordpress.org/plugins/asesor-cookies-para-la-ley-en-espana", + "wordpress.org/support", + "wordpress.org/support/forum/requests-and-feedback", + "wordpress.org/themes", "worldbank.org", + 
"wowslider.com", "wp.com", "wp.me", + "wpfr.net", "wsimg.com", "wsj.com", "wufoo.com", "wunderground.com", + "www-redirect.ext.hp.com", + "www.000webhost.com/migrate?static=true", + "www.163.com", + "www.1und1.de", + "www.22.cn", + "www.4.cn/company/contactus", + "www.51.la/?19089091", + "www.aboutads.info/choices", + "www.aboutcookies.org", + "www.addthis.com/bookmark.php", + "www.addthis.com/bookmark.php?v=20", + "www.addthis.com/bookmark.php?v=250", + "www.addtoany.com/share", + "www.addtoany.com/share_save", + "www.adobe.com", + "www.adobe.com/go/getflash", + "www.adobe.com/jp/products/acrobat/readstep2.html", + "www.adobe.com/products/acrobat/readstep.html", + "www.adobe.com/products/acrobat/readstep2.html", + "www.adobe.com/shockwave/download/download.cgi?P1_Prod_Version=ShockwaveFlash&promoid=BIOW", + "www.alipay.com", + "www.allaboutcookies.org", + "www.amazon.com", + "www.andersnoren.se", + "www.aol.com", + "www.apache.org", + "www.apache.org/licenses/LICENSE-2.0", + "www.apple.com", + "www.apple.com/mac", + "www.apple.com/safari", + "www.artisteer.com/?p=joomla_templates", + "www.authorize.net", + "www.axs.com", + "www.baidu.com", + "www.bbc.co.uk", + "www.bing.com", + "www.blogger.com", + "www.bluehost.com", + "www.booking.com", + "www.cdc.gov", + "www.chronoengine.com", + "www.cia.gov/redirects/ciaredirect.html", + "www.cisco.com", + "www.cnn.com", + "www.comsenz.com", + "www.cryoutcreations.eu", + "www.dedecms.com", + "www.discuz.net", + "www.domainname.de", + "www.domainnameshop.com", + "www.domainnameshop.com/whois", + "www.domeneshop.no", + "www.dreamhost.com", + "www.dropbox.com", + "www.drupal.org", + "www.e-recht24.de", + "www.e-recht24.de/artikel/datenschutz/6590-facebook-like-button-datenschutz-disclaimer.html", + "www.e-recht24.de/artikel/datenschutz/6635-datenschutz-rechtliche-risiken-bei-der-nutzung-von-google-analytics-und-googleadsense.html", + "www.e-recht24.de/impressum-generator.html", + 
"www.e-recht24.de/muster-datenschutzerklaerung.html", + "www.e-recht24.de/muster-disclaimer.htm", + "www.e-recht24.de/muster-disclaimer.html", + "www.ebay.com", + "www.elegantthemes.com", + "www.enable-javascript.com", + "www.ename.com.cn/custompage/custompagestyle", + "www.enom.com/help/Default.aspx", + "www.epa.gov", + "www.example.com", + "www.facebook.com", + "www.facebook.com/Onecom", + "www.facebook.com/Plesk", + "www.facebook.com/about/privacy", + "www.facebook.com/business/dashboard", + "www.facebook.com/facebook", + "www.facebook.com/help/cookies", + "www.facebook.com/home.php", + "www.facebook.com/policy.php", + "www.facebook.com/share.php", + "www.facebook.com/sharer.php", + "www.facebook.com/sharer.php?t=", + "www.facebook.com/sharer/sharer.php", + "www.facebook.com/sharer/sharer.php?src=sdkpreparse", + "www.facebook.com/sharer/sharer.php?u=", + "www.fda.gov", + "www.finra.org", + "www.flickr.com", + "www.format.com/l/your_new_portfolio", + "www.forpsi.com", + "www.freecsstemplates.org", + "www.gimp.org", + "www.gmail.com", + "www.gnu.org", + "www.gnu.org/copyleft/gpl.html", + "www.gnu.org/licenses/gpl-2.0.html", + "www.gnu.org/licenses/gpl.html", + "www.godaddy.com", + "www.godaddy.com/hosting/website-builder.aspx?isc=wscfwst304", + "www.godaddy.com/websites/website-builder", + "www.godaddy.com/websites/website-builder?cvosrc=assets.wsb_badge.wsb_badge", + "www.google.co.jp", + "www.google.co.uk", + "www.google.com", + "www.google.com/a/UniversalLogin?service=jotspot", + "www.google.com/analytics", + "www.google.com/analytics/learn/privacy.html", + "www.google.com/analytics/terms/de.html", + "www.google.com/calendar/render", + "www.google.com/chrome", + "www.google.com/gmail", + "www.google.com/intl/de/+/policy/+1button.html", + "www.google.com/intl/de/analytics/privacyoverview.html", + "www.google.com/intl/de/policies/privacy", + "www.google.com/intl/en/policies/privacy", + "www.google.com/intl/it/policies/privacy", + 
"www.google.com/policies/privacy", + "www.google.com/policies/privacy/ads", + "www.google.com/policies/technologies/cookies", + "www.google.com/privacy_ads.html", + "www.google.com/search?q=whois", + "www.google.com/support/bin/answer.py?answer=23852", + "www.google.de", + "www.google.de/intl/de/policies/privacy", + "www.google.it/intl/it/policies/privacy", + "www.gosuslugi.ru", + "www.gov.cn", "www.gov.uk", + "www.graphene-theme.com", + "www.haosou.com", + "www.histats.com", + "www.homestead.com", + "www.hostgator.com", + "www.hotmail.com", + "www.hp.com", + "www.huffingtonpost.com", + "www.hupso.com/share", + "www.ibm.com", + "www.ifeng.com", + "www.imdb.com", + "www.instagram.com", + "www.iqiyi.com", + "www.irs.gov", + "www.ispconfig.org", + "www.jd.com", + "www.jiathis.com/share", + "www.joomla.org", + "www.joomlatune.com", + "www.joomshaper.com", + "www.jssor.com", + "www.kickstarter.com", + "www.kriesi.at", + "www.linkedin.com", + "www.linkedin.com/cws/share", + "www.linkedin.com/legal/privacy-policy", + "www.linkedin.com/shareArticle?mini=true", + "www.linkwithin.com", + "www.liveinternet.ru/click", + "www.loopia.se", + "www.luminate.com/webhosting", + "www.lycos.com", + "www.macromedia.com/go/getflashplayer", + "www.mapquest.com", + "www.mapy.cz", + "www.mediawiki.org", + "www.mhthemes.com", + "www.microsoft.com", + "www.microsoft.com/en-us/windows", + "www.miibeian.gov.cn", + "www.miitbeian.gov.cn", + "www.mijndomein.nl", + "www.mijndomein.nl/producten", + "www.mijndomein.nl/producten/websitemaker", + "www.mozilla.org", + "www.mozilla.org/en-US", + "www.mozilla.org/en-US/firefox/new", + "www.mozilla.org/firefox/new", + "www.msn.com", + "www.myspace.com", + "www.myspace.com/Modules/PostTo/Pages", + "www.mysql.com", + "www.nasa.gov", + "www.netvibes.com/subscribe.php", + "www.networkadvertising.org/choices", + "www.networkadvertising.org/managing/opt_out.asp", + "www.networksolutions.com", + "www.nginx.com", "www.nhs.uk", + "www.nytimes.com", + 
"www.odin.com", + "www.olark.com/?welcome", + "www.one.com/en", + "www.opencart.com", + "www.opera.com", + "www.oracle.com/index.html", + "www.ovh.com", + "www.pagesjaunes.fr", + "www.parallels.com", + "www.parallels.com/intro", + "www.parallels.com/plesk", + "www.parallels.com/products/automation/intro", + "www.parallels.com/products/containers/intro", + "www.parallels.com/products/desktop/intro", + "www.parallels.com/products/desktop/pd4wl/intro", + "www.parallels.com/products/panel/intro", + "www.parallels.com/products/server/intro", + "www.paypal.com", + "www.people.com.cn", + "www.phoca.cz", + "www.phoca.cz/phocadownload", + "www.phoca.cz/phocagallery", + "www.php.net", + "www.phpbb.com", + "www.phpbb.com/ideas", + "www.pinterest.com", + "www.pinterest.com/pin/create/button", + "www.pinterest.com/pin/create/button/?url=&media=&description=", + "www.plesk.com", + "www.plesk.com/blog", + "www.prestashop.com", + "www.python.org", + "www.qq.com", + "www.redcross.org", + "www.reddit.com", + "www.reuters.com", + "www.safenames.net/?ref=lndrdr", + "www.shinystat.com", + "www.shinystat.com/it", + "www.simplemachines.org", + "www.simplemachines.org/about/smf/license.php", + "www.sina.com.cn", + "www.sipc.org", + "www.skype.com", + "www.slideshare.net", + "www.so.com", + "www.sogou.com", + "www.sohu.com", + "www.statcounter.com", + "www.studiopress.com", + "www.stumbleupon.com", + "www.stumbleupon.com/submit", + "www.taobao.com", + "www.toplist.cz", + "www.tradeindia.com", + "www.tripadvisor.co.uk", + "www.tripadvisor.com", + "www.tucows.com", + "www.tucowsdomains.com", + "www.tumblr.com", + "www.tumblr.com/share/link", + "www.twitter.com", + "www.twitter.com/share", + "www.ubuntu.com", + "www.ucoz.ru", + "www.usatoday.com", + "www.ustream.tv", + "www.value-domain.com", + "www.vektor-inc.co.jp", + "www.visma.com", + "www.w3.org", + "www.washingtonpost.com", + "www.webmd.com", + "www.weebly.com", + "www.whitehouse.gov", + "www.who.int/en", + "www.wikipedia.org", + 
"www.woothemes.com", + "www.wordpress-fr.net", + "www.wordpress.com", + "www.wordpress.org", + "www.wsj.com", + "www.xing.com/app/share?op=data_protection", + "www.xinhuanet.com", + "www.xml-sitemaps.com", + "www.yahoo.co.jp", + "www.yahoo.com", + "www.yelp.com", + "www.yootheme.com", + "www.youku.com", + "www.youronlinechoices.com", + "www.youronlinechoices.com/uk/your-ad-choices", + "www.youtube.com", + "www.zend.com", "xing.com", "xinhuanet.com", "xiti.com", @@ -497,6 +952,7 @@ "yale.edu", "yandex.ru", "yelp.com", + "yootheme.com", "youku.com", "youronlinechoices.com", "youtu.be", @@ -507,6 +963,8 @@ ], "matching_attributes": [ "hostname", - "domain" + "domain", + "uri", + "url" ] } diff --git a/tools/generate_mozilla-top500.py b/tools/generate_mozilla-top500.py index 9feebc7..a572191 100755 --- a/tools/generate_mozilla-top500.py +++ b/tools/generate_mozilla-top500.py @@ -32,7 +32,7 @@ moz_warninglist['version'] = version moz_warninglist['name'] = "Top 500 domains and pages from Mozilla" moz_warninglist['type'] = 'hostname' moz_warninglist['list'] = [] -moz_warninglist['matching_attributes'] = ['hostname', 'domain'] +moz_warninglist['matching_attributes'] = ['hostname', 'domain', 'uri', 'url'] with open(moz_file_domains) as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') @@ -47,6 +47,19 @@ with open(moz_file_domains) as csv_file: moz_warninglist['list'].append(v.rstrip().rstrip('/')) line_count += 1 +with open(moz_file_pages) as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + line_count = 0 + for row in csv_reader: + if line_count == 0: + #print(f'Column names are {", ".join(row)}') + line_count += 1 + else: + #print(f'\t{row[0]}. {row[1]}, MozTrust: {row[5]}.') + v = row[1] + moz_warninglist['list'].append(v.rstrip().rstrip('/')) + line_count += 1 + moz_warninglist['list'] = sorted(set(moz_warninglist['list'])) print(json.dumps(moz_warninglist))