new: [list] Added Mozilla Top 500 domains

pull/104/head
Steve Clement 2019-04-24 09:45:56 +09:00
parent 8f1fe94b49
commit 9e0b2ebc75
3 changed files with 525 additions and 2 deletions

View File

@ -0,0 +1,11 @@
# The Moz Top 500
Moz's list of the top 500 domains and pages on the web.
Contains a list of the top 500 web pages ranked by the number of linking root domains. This data is sourced from the Mozscape web index of 818 billion domains and 6 trillion pages.
## Update list
```bash
wget https://moz.com/top500/pages/csv
```

View File

@ -0,0 +1,512 @@
{
"description": "Event contains one or more entries from the top 500 of the most used domains (Moz).",
"version": 20190424,
"name": "Top 500 domains and pages from Moz",
"type": "hostname",
"list": [
"163.com",
"1688.com",
"1and1.com",
"1and1.fr",
"1und1.de",
"360.cn",
"4.cn",
"51.la",
"a8.net",
"aarp.org",
"abc.net.au",
"about.com",
"aboutads.info",
"aboutcookies.org",
"accuweather.com",
"acm.org",
"addthis.com",
"addtoany.com",
"admin.ch",
"adobe.com",
"adweek.com",
"alexa.com",
"alibaba.com",
"aliyun.com",
"allaboutcookies.org",
"amazon.co.jp",
"amazon.co.uk",
"amazon.com",
"amazon.de",
"amazon.fr",
"amazonaws.com",
"ameblo.jp",
"amzn.to",
"android.com",
"aol.com",
"apache.org",
"apple.com",
"archive.org",
"arstechnica.com",
"artisteer.com",
"arxiv.org",
"athemes.com",
"att.com",
"azurewebsites.net",
"baidu.com",
"bandcamp.com",
"barnesandnoble.com",
"bbb.org",
"bbc.co.uk",
"bbc.com",
"behance.net",
"beian.gov.cn",
"berkeley.edu",
"bigcartel.com",
"bigcommerce.com",
"bing.com",
"biomedcentral.com",
"bit.ly",
"bitbucket.org",
"bizjournals.com",
"blackberry.com",
"blogger.com",
"blogspot.co.uk",
"blogspot.com",
"blogspot.com.es",
"blogspot.jp",
"bloomberg.com",
"bls.gov",
"bluehost.com",
"bmj.com",
"booking.com",
"box.com",
"bund.de",
"businessinsider.com",
"businesswire.com",
"buydomains.com",
"buzzfeed.com",
"ca.gov",
"cam.ac.uk",
"canada.ca",
"cbc.ca",
"cbslocal.com",
"cbsnews.com",
"cdc.gov",
"census.gov",
"change.org",
"chicagotribune.com",
"cisco.com",
"clickbank.net",
"cloudfront.net",
"cmu.edu",
"cnbc.com",
"cnet.com",
"cnn.com",
"colorlib.com",
"columbia.edu",
"congress.gov",
"constantcontact.com",
"cornell.edu",
"cpanel.com",
"cpanel.net",
"creativecommons.org",
"cryoutcreations.eu",
"dailymail.co.uk",
"dailymotion.com",
"debian.org",
"dedecms.com",
"delicious.com",
"deloitte.com",
"deviantart.com",
"dhs.gov",
"dictionary.com",
"digg.com",
"disqus.com",
"dmca.com",
"doi.org",
"dol.gov",
"domainactive.co",
"domainname.de",
"domainnameshop.com",
"domainretailing.com",
"domeneshop.no",
"dot.gov",
"doubleclick.net",
"dreamhost.com",
"dribbble.com",
"dropbox.com",
"drupal.org",
"duke.edu",
"e-recht24.de",
"ebay.com",
"economist.com",
"ed.gov",
"eepurl.com",
"eff.org",
"elegantthemes.com",
"elsevier.com",
"enable-javascript.com",
"ename.com.cn",
"engadget.com",
"entrepreneur.com",
"epa.gov",
"etracker.de",
"etsy.com",
"europa.eu",
"eventbrite.co.uk",
"eventbrite.com",
"ewebdevelopment.com",
"example.com",
"exblog.jp",
"facebook.com",
"fao.org",
"fastcompany.com",
"fb.com",
"fb.me",
"fbcdn.net",
"fc2.com",
"fcc.gov",
"fda.gov",
"feedburner.com",
"flickr.com",
"forbes.com",
"fortune.com",
"foursquare.com",
"foxnews.com",
"free.fr",
"ft.com",
"ftc.gov",
"g.co",
"gartner.com",
"geocities.jp",
"gesetze-im-internet.de",
"getpocket.com",
"giphy.com",
"github.com",
"github.io",
"globo.com",
"gnu.org",
"go.com",
"godaddy.com",
"gofundme.com",
"goo.gl",
"goo.ne.jp",
"goodreads.com",
"google.be",
"google.ca",
"google.ch",
"google.co.in",
"google.co.jp",
"google.co.uk",
"google.com",
"google.com.au",
"google.com.br",
"google.de",
"google.es",
"google.fr",
"google.it",
"google.nl",
"google.pl",
"google.ru",
"googleapis.com",
"googleusercontent.com",
"gotowebinar.com",
"gpo.gov",
"gravatar.com",
"guardian.co.uk",
"harvard.edu",
"hatena.ne.jp",
"hbr.org",
"hhs.gov",
"hibu.com",
"hilton.com",
"histats.com",
"hollywoodreporter.com",
"home.pl",
"homestead.com",
"hostgator.com",
"hostnet.nl",
"house.gov",
"houzz.com",
"hp.com",
"hubspot.com",
"huffingtonpost.com",
"ibm.com",
"icann.org",
"icio.us",
"ieee.org",
"ietf.org",
"ifeng.com",
"illinois.edu",
"imdb.com",
"imgur.com",
"inc.com",
"independent.co.uk",
"indiatimes.com",
"instagram.com",
"intel.com",
"irs.gov",
"iso.org",
"issuu.com",
"iubenda.com",
"japanpost.jp",
"java.com",
"jiathis.com",
"jimdo.com",
"joomla.org",
"jugem.jp",
"justgiving.com",
"justice.gov",
"kickstarter.com",
"latimes.com",
"libsyn.com",
"line.me",
"linkedin.com",
"list-manage.com",
"list-manage1.com",
"live.com",
"livedoor.jp",
"livejournal.com",
"loc.gov",
"loopia.com",
"loopia.se",
"macromedia.com",
"mail.ru",
"mailchimp.com",
"mapquest.com",
"marriott.com",
"mashable.com",
"medium.com",
"meetup.com",
"mhlw.go.jp",
"microsoft.com",
"miibeian.gov.cn",
"miitbeian.gov.cn",
"mijndomein.nl",
"mit.edu",
"mlb.com",
"mlit.go.jp",
"moodle.org",
"moz.com",
"mozilla.com",
"mozilla.org",
"msdn.com",
"msn.com",
"mynavi.jp",
"myshopify.com",
"myspace.com",
"mysql.com",
"namejet.com",
"nasa.gov",
"nationalgeographic.com",
"nature.com",
"naver.com",
"nazwa.pl",
"nbcnews.com",
"netflix.com",
"netscape.com",
"networkadvertising.org",
"networksolutions.com",
"newyorker.com",
"nginx.com",
"nginx.org",
"nhk.or.jp",
"nifty.com",
"nih.gov",
"nist.gov",
"noaa.gov",
"npr.org",
"nps.gov",
"ny.gov",
"nytimes.com",
"nyu.edu",
"ocn.ne.jp",
"oecd.org",
"office.com",
"ok.ru",
"one.com",
"opencart.com",
"opensource.org",
"opera.com",
"oracle.com",
"oreilly.com",
"oup.com",
"ow.ly",
"ox.ac.uk",
"parallels.com",
"paypal.com",
"pbs.org",
"phoca.cz",
"photobucket.com",
"php.net",
"phpbb.com",
"pinterest.com",
"playstation.com",
"plesk.com",
"plos.org",
"prestashop.com",
"prnewswire.com",
"psu.edu",
"psychologytoday.com",
"python.org",
"qq.com",
"quantcast.com",
"rakuten.co.jp",
"rambler.ru",
"redcross.org",
"reddit.com",
"reference.com",
"researchgate.net",
"reuters.com",
"rs6.net",
"sagepub.com",
"sakura.ne.jp",
"samsung.com",
"sciencedirect.com",
"sciencemag.org",
"scientificamerican.com",
"scribd.com",
"sec.gov",
"secureserver.net",
"sedo.com",
"sedoparking.com",
"senate.gov",
"shinystat.com",
"shop-pro.jp",
"shopify.com",
"si.edu",
"sina.com.cn",
"siteorigin.com",
"skype.com",
"slate.com",
"slideshare.net",
"snapchat.com",
"sogou.com",
"sohu.com",
"soundcloud.com",
"sourceforge.net",
"spotify.com",
"springer.com",
"squarespace.com",
"squareup.com",
"ssa.gov",
"stackoverflow.com",
"stanford.edu",
"starwoodhotels.com",
"statcounter.com",
"state.gov",
"steampowered.com",
"storify.com",
"studiopress.com",
"stumbleupon.com",
"sun.com",
"surveymonkey.com",
"symantec.com",
"t.co",
"t.me",
"tandfonline.com",
"taobao.com",
"teamviewer.com",
"techcrunch.com",
"ted.com",
"telegram.me",
"telegraph.co.uk",
"theatlantic.com",
"theguardian.com",
"thehill.com",
"themeforest.net",
"themegrill.com",
"thenextweb.com",
"theverge.com",
"ticketmaster.com",
"time.com",
"tmall.com",
"today.com",
"tripadvisor.co.uk",
"tripadvisor.com",
"trustpilot.com",
"tucowsdomains.com",
"tumblr.com",
"twitch.tv",
"twitter.com",
"typeform.com",
"typepad.com",
"uchicago.edu",
"ucl.ac.uk",
"ucla.edu",
"umblr.com",
"umich.edu",
"umn.edu",
"un.org",
"unesco.org",
"unicef.org",
"unsplash.com",
"uol.com.br",
"upenn.edu",
"usa.gov",
"usatoday.com",
"usc.edu",
"usda.gov",
"usgs.gov",
"usnews.com",
"ustream.tv",
"utexas.edu",
"va.gov",
"variety.com",
"venturebeat.com",
"vice.com",
"vimeo.com",
"visma.com",
"vk.com",
"vkontakte.ru",
"w3.org",
"w3schools.com",
"warnerbros.com",
"washington.edu",
"washingtonpost.com",
"web.de",
"webmd.com",
"webs.com",
"weebly.com",
"weibo.com",
"whatsapp.com",
"whitehouse.gov",
"who.int",
"wikia.com",
"wikihow.com",
"wikimedia.org",
"wikipedia.org",
"wiley.com",
"windowsphone.com",
"wired.com",
"wisc.edu",
"wix.com",
"wixsite.com",
"wordpress.com",
"wordpress.org",
"worldbank.org",
"wp.com",
"wp.me",
"wsimg.com",
"wsj.com",
"wufoo.com",
"wunderground.com",
"www.gov.uk",
"www.nhs.uk",
"xing.com",
"xinhuanet.com",
"xiti.com",
"yahoo.co.jp",
"yahoo.com",
"yale.edu",
"yandex.ru",
"yelp.com",
"youku.com",
"youronlinechoices.com",
"youtu.be",
"youtube.com",
"zdnet.com",
"zendesk.com",
"zenfolio.com"
],
"matching_attributes": [
"hostname",
"domain"
]
}

View File

@ -34,8 +34,8 @@ with open(moz_file) as csv_file:
line_count += 1
else:
#print(f'\t{row[0]}. {row[1]}, MozTrust: {row[5]}.')
v = str(row).split(',')[1]
moz_warninglist['list'].append(v.rstrip().lstrip('/'))
v = row[1]
moz_warninglist['list'].append(v.rstrip().rstrip('/'))
line_count += 1
moz_warninglist['list'] = sorted(set(moz_warninglist['list']))