From 9e0b2ebc752dff7b3ec1ebf078dc99052bb533b2 Mon Sep 17 00:00:00 2001 From: Steve Clement Date: Wed, 24 Apr 2019 09:45:56 +0900 Subject: [PATCH] new: [list] Added Mozilla Top 500 domains --- lists/mozilla-top500/README.md | 11 + lists/mozilla-top500/list.json | 512 +++++++++++++++++++++++++++++++ tools/generate_mozilla-top500.py | 4 +- 3 files changed, 525 insertions(+), 2 deletions(-) create mode 100644 lists/mozilla-top500/README.md create mode 100644 lists/mozilla-top500/list.json diff --git a/lists/mozilla-top500/README.md b/lists/mozilla-top500/README.md new file mode 100644 index 0000000..5ffffd7 --- /dev/null +++ b/lists/mozilla-top500/README.md @@ -0,0 +1,11 @@ +# The Moz Top 500 +Moz's list of the top 500 domains and pages on the web. + +Contains a list of the top 500 web pages ranked by the number of linking root domains. This data is sourced from the Mozscape web index of 818 Billion domains and 6 Trillion pages. + +## Update list + +```bash +wget https://moz.com/top500/pages/csv + +``` diff --git a/lists/mozilla-top500/list.json b/lists/mozilla-top500/list.json new file mode 100644 index 0000000..39560f2 --- /dev/null +++ b/lists/mozilla-top500/list.json @@ -0,0 +1,512 @@ +{ + "description": "Event contains one or more entries from the top 500 of the most used domains (Moz).", + "version": 20190424, + "name": "Top 500 domains and pages from Mozilla", + "type": "hostname", + "list": [ + "163.com", + "1688.com", + "1and1.com", + "1and1.fr", + "1und1.de", + "360.cn", + "4.cn", + "51.la", + "a8.net", + "aarp.org", + "abc.net.au", + "about.com", + "aboutads.info", + "aboutcookies.org", + "accuweather.com", + "acm.org", + "addthis.com", + "addtoany.com", + "admin.ch", + "adobe.com", + "adweek.com", + "alexa.com", + "alibaba.com", + "aliyun.com", + "allaboutcookies.org", + "amazon.co.jp", + "amazon.co.uk", + "amazon.com", + "amazon.de", + "amazon.fr", + "amazonaws.com", + "ameblo.jp", + "amzn.to", + "android.com", + "aol.com", + "apache.org", + 
"apple.com", + "archive.org", + "arstechnica.com", + "artisteer.com", + "arxiv.org", + "athemes.com", + "att.com", + "azurewebsites.net", + "baidu.com", + "bandcamp.com", + "barnesandnoble.com", + "bbb.org", + "bbc.co.uk", + "bbc.com", + "behance.net", + "beian.gov.cn", + "berkeley.edu", + "bigcartel.com", + "bigcommerce.com", + "bing.com", + "biomedcentral.com", + "bit.ly", + "bitbucket.org", + "bizjournals.com", + "blackberry.com", + "blogger.com", + "blogspot.co.uk", + "blogspot.com", + "blogspot.com.es", + "blogspot.jp", + "bloomberg.com", + "bls.gov", + "bluehost.com", + "bmj.com", + "booking.com", + "box.com", + "bund.de", + "businessinsider.com", + "businesswire.com", + "buydomains.com", + "buzzfeed.com", + "ca.gov", + "cam.ac.uk", + "canada.ca", + "cbc.ca", + "cbslocal.com", + "cbsnews.com", + "cdc.gov", + "census.gov", + "change.org", + "chicagotribune.com", + "cisco.com", + "clickbank.net", + "cloudfront.net", + "cmu.edu", + "cnbc.com", + "cnet.com", + "cnn.com", + "colorlib.com", + "columbia.edu", + "congress.gov", + "constantcontact.com", + "cornell.edu", + "cpanel.com", + "cpanel.net", + "creativecommons.org", + "cryoutcreations.eu", + "dailymail.co.uk", + "dailymotion.com", + "debian.org", + "dedecms.com", + "delicious.com", + "deloitte.com", + "deviantart.com", + "dhs.gov", + "dictionary.com", + "digg.com", + "disqus.com", + "dmca.com", + "doi.org", + "dol.gov", + "domainactive.co", + "domainname.de", + "domainnameshop.com", + "domainretailing.com", + "domeneshop.no", + "dot.gov", + "doubleclick.net", + "dreamhost.com", + "dribbble.com", + "dropbox.com", + "drupal.org", + "duke.edu", + "e-recht24.de", + "ebay.com", + "economist.com", + "ed.gov", + "eepurl.com", + "eff.org", + "elegantthemes.com", + "elsevier.com", + "enable-javascript.com", + "ename.com.cn", + "engadget.com", + "entrepreneur.com", + "epa.gov", + "etracker.de", + "etsy.com", + "europa.eu", + "eventbrite.co.uk", + "eventbrite.com", + "ewebdevelopment.com", + "example.com", + 
"exblog.jp", + "facebook.com", + "fao.org", + "fastcompany.com", + "fb.com", + "fb.me", + "fbcdn.net", + "fc2.com", + "fcc.gov", + "fda.gov", + "feedburner.com", + "flickr.com", + "forbes.com", + "fortune.com", + "foursquare.com", + "foxnews.com", + "free.fr", + "ft.com", + "ftc.gov", + "g.co", + "gartner.com", + "geocities.jp", + "gesetze-im-internet.de", + "getpocket.com", + "giphy.com", + "github.com", + "github.io", + "globo.com", + "gnu.org", + "go.com", + "godaddy.com", + "gofundme.com", + "goo.gl", + "goo.ne.jp", + "goodreads.com", + "google.be", + "google.ca", + "google.ch", + "google.co.in", + "google.co.jp", + "google.co.uk", + "google.com", + "google.com.au", + "google.com.br", + "google.de", + "google.es", + "google.fr", + "google.it", + "google.nl", + "google.pl", + "google.ru", + "googleapis.com", + "googleusercontent.com", + "gotowebinar.com", + "gpo.gov", + "gravatar.com", + "guardian.co.uk", + "harvard.edu", + "hatena.ne.jp", + "hbr.org", + "hhs.gov", + "hibu.com", + "hilton.com", + "histats.com", + "hollywoodreporter.com", + "home.pl", + "homestead.com", + "hostgator.com", + "hostnet.nl", + "house.gov", + "houzz.com", + "hp.com", + "hubspot.com", + "huffingtonpost.com", + "ibm.com", + "icann.org", + "icio.us", + "ieee.org", + "ietf.org", + "ifeng.com", + "illinois.edu", + "imdb.com", + "imgur.com", + "inc.com", + "independent.co.uk", + "indiatimes.com", + "instagram.com", + "intel.com", + "irs.gov", + "iso.org", + "issuu.com", + "iubenda.com", + "japanpost.jp", + "java.com", + "jiathis.com", + "jimdo.com", + "joomla.org", + "jugem.jp", + "justgiving.com", + "justice.gov", + "kickstarter.com", + "latimes.com", + "libsyn.com", + "line.me", + "linkedin.com", + "list-manage.com", + "list-manage1.com", + "live.com", + "livedoor.jp", + "livejournal.com", + "loc.gov", + "loopia.com", + "loopia.se", + "macromedia.com", + "mail.ru", + "mailchimp.com", + "mapquest.com", + "marriott.com", + "mashable.com", + "medium.com", + "meetup.com", + "mhlw.go.jp", + 
"microsoft.com", + "miibeian.gov.cn", + "miitbeian.gov.cn", + "mijndomein.nl", + "mit.edu", + "mlb.com", + "mlit.go.jp", + "moodle.org", + "moz.com", + "mozilla.com", + "mozilla.org", + "msdn.com", + "msn.com", + "mynavi.jp", + "myshopify.com", + "myspace.com", + "mysql.com", + "namejet.com", + "nasa.gov", + "nationalgeographic.com", + "nature.com", + "naver.com", + "nazwa.pl", + "nbcnews.com", + "netflix.com", + "netscape.com", + "networkadvertising.org", + "networksolutions.com", + "newyorker.com", + "nginx.com", + "nginx.org", + "nhk.or.jp", + "nifty.com", + "nih.gov", + "nist.gov", + "noaa.gov", + "npr.org", + "nps.gov", + "ny.gov", + "nytimes.com", + "nyu.edu", + "ocn.ne.jp", + "oecd.org", + "office.com", + "ok.ru", + "one.com", + "opencart.com", + "opensource.org", + "opera.com", + "oracle.com", + "oreilly.com", + "oup.com", + "ow.ly", + "ox.ac.uk", + "parallels.com", + "paypal.com", + "pbs.org", + "phoca.cz", + "photobucket.com", + "php.net", + "phpbb.com", + "pinterest.com", + "playstation.com", + "plesk.com", + "plos.org", + "prestashop.com", + "prnewswire.com", + "psu.edu", + "psychologytoday.com", + "python.org", + "qq.com", + "quantcast.com", + "rakuten.co.jp", + "rambler.ru", + "redcross.org", + "reddit.com", + "reference.com", + "researchgate.net", + "reuters.com", + "rs6.net", + "sagepub.com", + "sakura.ne.jp", + "samsung.com", + "sciencedirect.com", + "sciencemag.org", + "scientificamerican.com", + "scribd.com", + "sec.gov", + "secureserver.net", + "sedo.com", + "sedoparking.com", + "senate.gov", + "shinystat.com", + "shop-pro.jp", + "shopify.com", + "si.edu", + "sina.com.cn", + "siteorigin.com", + "skype.com", + "slate.com", + "slideshare.net", + "snapchat.com", + "sogou.com", + "sohu.com", + "soundcloud.com", + "sourceforge.net", + "spotify.com", + "springer.com", + "squarespace.com", + "squareup.com", + "ssa.gov", + "stackoverflow.com", + "stanford.edu", + "starwoodhotels.com", + "statcounter.com", + "state.gov", + "steampowered.com", + 
"storify.com", + "studiopress.com", + "stumbleupon.com", + "sun.com", + "surveymonkey.com", + "symantec.com", + "t.co", + "t.me", + "tandfonline.com", + "taobao.com", + "teamviewer.com", + "techcrunch.com", + "ted.com", + "telegram.me", + "telegraph.co.uk", + "theatlantic.com", + "theguardian.com", + "thehill.com", + "themeforest.net", + "themegrill.com", + "thenextweb.com", + "theverge.com", + "ticketmaster.com", + "time.com", + "tmall.com", + "today.com", + "tripadvisor.co.uk", + "tripadvisor.com", + "trustpilot.com", + "tucowsdomains.com", + "tumblr.com", + "twitch.tv", + "twitter.com", + "typeform.com", + "typepad.com", + "uchicago.edu", + "ucl.ac.uk", + "ucla.edu", + "umblr.com", + "umich.edu", + "umn.edu", + "un.org", + "unesco.org", + "unicef.org", + "unsplash.com", + "uol.com.br", + "upenn.edu", + "usa.gov", + "usatoday.com", + "usc.edu", + "usda.gov", + "usgs.gov", + "usnews.com", + "ustream.tv", + "utexas.edu", + "va.gov", + "variety.com", + "venturebeat.com", + "vice.com", + "vimeo.com", + "visma.com", + "vk.com", + "vkontakte.ru", + "w3.org", + "w3schools.com", + "warnerbros.com", + "washington.edu", + "washingtonpost.com", + "web.de", + "webmd.com", + "webs.com", + "weebly.com", + "weibo.com", + "whatsapp.com", + "whitehouse.gov", + "who.int", + "wikia.com", + "wikihow.com", + "wikimedia.org", + "wikipedia.org", + "wiley.com", + "windowsphone.com", + "wired.com", + "wisc.edu", + "wix.com", + "wixsite.com", + "wordpress.com", + "wordpress.org", + "worldbank.org", + "wp.com", + "wp.me", + "wsimg.com", + "wsj.com", + "wufoo.com", + "wunderground.com", + "www.gov.uk", + "www.nhs.uk", + "xing.com", + "xinhuanet.com", + "xiti.com", + "yahoo.co.jp", + "yahoo.com", + "yale.edu", + "yandex.ru", + "yelp.com", + "youku.com", + "youronlinechoices.com", + "youtu.be", + "youtube.com", + "zdnet.com", + "zendesk.com", + "zenfolio.com" + ], + "matching_attributes": [ + "hostname", + "domain" + ] +} diff --git a/tools/generate_mozilla-top500.py 
b/tools/generate_mozilla-top500.py index 04f426c..1afec96 100755 --- a/tools/generate_mozilla-top500.py +++ b/tools/generate_mozilla-top500.py @@ -34,8 +34,8 @@ with open(moz_file) as csv_file: line_count += 1 else: #print(f'\t{row[0]}. {row[1]}, MozTrust: {row[5]}.') - v = str(row).split(',')[1] - moz_warninglist['list'].append(v.rstrip().lstrip('/')) + v = row[1] + moz_warninglist['list'].append(v.rstrip().rstrip('/')) line_count += 1 moz_warninglist['list'] = sorted(set(moz_warninglist['list']))