new: [list] The Moz Top 500 Domains and Pages (#104)

new: [list] The Moz Top 500 Domains and Pages
pull/107/head
Steve Clement 2019-04-24 10:40:11 +09:00 committed by GitHub
commit 284b5fa2c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 1050 additions and 0 deletions

View File

@ -0,0 +1,10 @@
# The Moz Top 500
Moz's list of the top 500 domains and pages on the web.
Contains a list of the top 500 web pages ranked by the number of linking root domains. This data is sourced from the Mozscape web index of 818 Billion domains and 6 Trillion pages.
## Update list
```bash
../../tools/generate_mozilla-top500.py |jq . > list.json
```

View File

@ -0,0 +1,970 @@
{
"description": "Event contains one or more entries from the top 500 of the most used domains (Mozilla).",
"version": 20190424,
"name": "Top 500 domains and pages from Mozilla",
"type": "hostname",
"list": [
"123-reg-expired.co.uk",
"163.com",
"1688.com",
"1and1.com",
"1and1.fr",
"1und1.de",
"360.cn",
"4.cn",
"51.la",
"a8.net",
"aarp.org",
"abc.net.au",
"abcnews.go.com",
"about.com",
"aboutads.info",
"aboutcookies.org",
"account.1und1.de",
"accounts.google.com/ServiceLogin?service=jotspot",
"accuweather.com",
"acm.org",
"add.my.yahoo.com/content",
"addthis.com",
"addtoany.com",
"admin.ch",
"adobe.com",
"adweek.com",
"alexa.com",
"alibaba.com",
"aliyun.com",
"allaboutcookies.org",
"amazon.co.jp",
"amazon.co.uk",
"amazon.com",
"amazon.de",
"amazon.fr",
"amazonaws.com",
"ameblo.jp",
"amzn.to",
"android.com",
"aol.com",
"apache.org",
"api.whatsapp.com",
"apple.com",
"apple.com/mac",
"archive.org",
"arstechnica.com",
"artisteer.com",
"arxiv.org",
"athemes.com",
"athemes.com/theme/sydney",
"att.com",
"automattic.com",
"azurewebsites.net",
"b.hatena.ne.jp/entry",
"baidu.com",
"bandcamp.com",
"barnesandnoble.com",
"bbb.org",
"bbc.co.uk",
"bbc.com",
"bbs.dedecms.com",
"behance.net",
"beian.gov.cn",
"berkeley.edu",
"bigcartel.com",
"bigcommerce.com",
"bing.com",
"biomedcentral.com",
"bit.ly",
"bitbucket.org",
"bizjournals.com",
"bizvektor.com",
"blackberry.com",
"blogger.com",
"blogspot.co.uk",
"blogspot.com",
"blogspot.com.es",
"blogspot.jp",
"bloomberg.com",
"bls.gov",
"bluehost.com",
"bmj.com",
"booking.com",
"box.com",
"br.wordpress.org",
"brokercheck.finra.org",
"bund.de",
"businessinsider.com",
"businesswire.com",
"buydomains.com",
"buzzfeed.com",
"ca.gov",
"calendar.google.com/calendar/render",
"cam.ac.uk",
"canada.ca",
"catchthemes.com",
"cbc.ca",
"cbslocal.com",
"cbsnews.com",
"cdc.gov",
"census.gov",
"change.org",
"chicagotribune.com",
"cisco.com",
"clickbank.net",
"cloud.feedly.com",
"cloudfront.net",
"cmu.edu",
"cn.wordpress.org",
"cnbc.com",
"cnet.com",
"cnn.com",
"codex.wordpress.org",
"colorlib.com",
"columbia.edu",
"congress.gov",
"connect.mail.ru/share",
"constantcontact.com",
"cornell.edu",
"cpanel.com",
"cpanel.net",
"creativecommons.org",
"creativecommons.org/licenses/by-nc-sa/3.0",
"creativecommons.org/licenses/by-sa/2.0",
"creativecommons.org/licenses/by-sa/3.0",
"creativecommons.org/licenses/by/2.0",
"creativecommons.org/licenses/by/3.0",
"creativecommons.org/licenses/by/4.0",
"cryoutcreations.eu",
"cyberchimps.com/responsive-theme",
"dailymail.co.uk",
"dailymotion.com",
"de-de.facebook.com/policy.php",
"de.wordpress.org",
"debian.org",
"dedecms.com",
"del.icio.us/post",
"delicious.com",
"deloitte.com",
"devblog.plesk.com",
"developers.facebook.com/docs/plugins",
"developers.google.com/analytics/devguides/collection/analyticsjs/cookie-usage",
"developers.google.com/analytics/devguides/collection/analyticsjs/cookie-usage?hl=es&csw=1",
"deviantart.com",
"dhs.gov",
"dictionary.com",
"digg.com",
"digg.com/submit",
"discuz.qq.com/service/security",
"disqus.com",
"disqus.com/?ref_noscript",
"dmca.com",
"doi.org",
"dol.gov",
"domainactive.co",
"domainname.de",
"domainnameshop.com",
"domainretailing.com",
"domeneshop.no",
"dot.gov",
"doubleclick.net",
"dreamhost.com",
"dribbble.com",
"dropbox.com",
"drupal.org",
"duke.edu",
"e-recht24.de",
"ebay.com",
"ec.europa.eu/consumers/odr",
"ec.europa.eu/info/departments/justice-and-consumers_en",
"economist.com",
"ed.gov",
"eepurl.com",
"eff.org",
"elegantthemes.com",
"elsevier.com",
"enable-javascript.com",
"ename.com.cn",
"engadget.com",
"entrepreneur.com",
"epa.gov",
"es.wordpress.org",
"etracker.de",
"etsy.com",
"europa.eu",
"eventbrite.co.uk",
"eventbrite.com",
"ewebdevelopment.com",
"example.com",
"exblog.jp",
"facebook.com",
"facebook.com/sharer.php",
"fao.org",
"fastcompany.com",
"fb.com",
"fb.me",
"fbcdn.net",
"fc2.com",
"fcc.gov",
"fda.gov",
"feedburner.com",
"feedburner.google.com",
"feedjit.com",
"feedly.com/index.html",
"flickr.com",
"forbes.com",
"fortune.com",
"foursquare.com",
"foxnews.com",
"free.fr",
"ft.com",
"ftc.gov",
"g.co",
"gartner.com",
"generatepress.com",
"geocities.jp",
"gesetze-im-internet.de",
"get.adobe.com/de/reader",
"get.adobe.com/flashplayer",
"get.adobe.com/jp/reader",
"get.adobe.com/reader",
"getbootstrap.com",
"getpocket.com",
"getpocket.com/save",
"giphy.com",
"github.com",
"github.io",
"globo.com",
"gmail.com",
"gmpg.org/xfn",
"gnu.org",
"go.com",
"go.cpanel.net/cleardnscache",
"go.microsoft.com/fwlink/?linkid=66138&clcid=0x409",
"godaddy.com",
"gofundme.com",
"goo.gl",
"goo.ne.jp",
"goodreads.com",
"google.be",
"google.ca",
"google.ch",
"google.co.in",
"google.co.jp",
"google.co.uk",
"google.com",
"google.com.au",
"google.com.br",
"google.de",
"google.es",
"google.fr",
"google.it",
"google.nl",
"google.pl",
"google.ru",
"googleapis.com",
"googleusercontent.com",
"gotowebinar.com",
"gpo.gov",
"gravatar.com",
"gravatar.com/site/signup",
"gtranslate.net",
"guardian.co.uk",
"harvard.edu",
"hatena.ne.jp",
"hbr.org",
"help.opera.com/Windows/10.00/it/cookies.html",
"hhs.gov",
"hibu.com",
"hilton.com",
"histats.com",
"hollywoodreporter.com",
"home.pl",
"home.pl/kontakt",
"homeads.home.pl/ads/www/delivery/ck.php?n=f90e22f",
"homestead.com",
"hootsuite.com",
"hostgator.com",
"hostingmanager.secureserver.net",
"hostnet.nl",
"house.gov",
"houzz.com",
"hp.com",
"html5up.net",
"httpd.apache.org",
"httpd.apache.org/docs/2.4/mod/mod_userdir.html",
"hubspot.com",
"huffingtonpost.com",
"ibm.com",
"icann.org",
"icio.us",
"ieee.org",
"ietf.org",
"ifeng.com",
"illinois.edu",
"imdb.com",
"imgur.com",
"inc.com",
"independent.co.uk",
"indiatimes.com",
"instagram.com",
"intel.com",
"irs.gov",
"iso.org",
"issuu.com",
"it.wordpress.org",
"iubenda.com",
"ja.wordpress.org",
"jalbum.net",
"jalbum.net/en",
"japanpost.jp",
"java.com",
"jiathis.com",
"jigsaw.w3.org/css-validator",
"jigsaw.w3.org/css-validator/check/referer",
"jigsaw.w3.org/css-validator/check/referer?profile=css3",
"jimdo.com",
"joomla-extensions.kubik-rubik.de",
"joomla.org",
"jquery.com",
"jugem.jp",
"justgiving.com",
"justice.gov",
"kb.plesk.com",
"kickstarter.com",
"latimes.com",
"lazaworx.com",
"libsyn.com",
"lifestream.aol.com",
"line.me",
"linkedin.com",
"list-manage.com",
"list-manage1.com",
"listings.homestead.com",
"live.com",
"livedoor.jp",
"livejournal.com",
"loc.gov",
"logc204.xiti.com/go.click?xts=453041&s2=14&p=homepage::kundendefault::index::button-mehr-info&clic=N&type=click",
"loopia.com",
"loopia.se",
"macromedia.com",
"mail.google.com/mail",
"mail.ru",
"mailchimp.com",
"mapquest.com",
"maps.google.com",
"maps.google.com/maps?f=d&source=s_d&daddr=&saddr=&hl=en&geocode=&mra=ls&sll=37.0625,-95.677068&sspn=49.176833,114.257812&ie=UTF8&t=h&z=12",
"marriott.com",
"mashable.com",
"medium.com",
"meetup.com",
"megagroup.ru",
"mhlw.go.jp",
"microsoft.com",
"miibeian.gov.cn",
"miitbeian.gov.cn",
"mijndomein.nl",
"mit.edu",
"mlb.com",
"mlit.go.jp",
"mobirise.com",
"moodle.org",
"moz.com",
"mozilla.com",
"mozilla.org",
"msdn.com",
"msn.com",
"mynavi.jp",
"myshopify.com",
"myspace.com",
"myspace.com/Modules/PostTo/Pages",
"mysql.com",
"namejet.com",
"nasa.gov",
"nationalgeographic.com",
"nature.com",
"naver.com",
"nazwa.pl",
"nbcnews.com",
"netcn.console.aliyun.com/core/host/list2",
"netflix.com",
"netscape.aol.com",
"netscape.com",
"networkadvertising.org",
"networksolutions.com",
"newyorker.com",
"nginx.com",
"nginx.org",
"nhk.or.jp",
"nifty.com",
"nih.gov",
"nist.gov",
"nl.wordpress.org",
"noaa.gov",
"npr.org",
"nps.gov",
"ny.gov",
"nytimes.com",
"nyu.edu",
"ocn.ne.jp",
"oecd.org",
"office.com",
"ok.ru",
"one.com",
"opencart.com",
"opensource.org",
"opensource.org/licenses/gpl-license.php",
"opera.com",
"optout.networkadvertising.org",
"oracle.com",
"oreilly.com",
"oup.com",
"outlook.live.com/owa",
"ow.ly",
"ox.ac.uk",
"panel.dreamhost.com",
"parallels.com",
"paypal.com",
"pbs.org",
"phoca.cz",
"photobucket.com",
"php.net",
"phpbb.com",
"pinterest.com",
"pinterest.com/pin/create/button",
"pinterest.com/pin/create/button/?description=",
"pinterest.com/pin/create/button/?media=",
"pixabay.com",
"pl.wordpress.org",
"planet.wordpress.org",
"playstation.com",
"plesk.com",
"plos.org",
"plus.google.com",
"plus.google.com/communities/109881979300958500728",
"plus.google.com/share",
"plus.google.com/share?url=",
"plusone.google.com/_/+1/confirm?hl=en",
"presscustomizr.com",
"presscustomizr.com/customizr",
"prestashop.com",
"prnewswire.com",
"psu.edu",
"psychologytoday.com",
"python.org",
"qq.com",
"quantcast.com",
"rakuten.co.jp",
"rambler.ru",
"redcross.org",
"reddit.com",
"reddit.com/submit",
"reference.com",
"researchgate.net",
"reuters.com",
"rs6.net",
"ru.wordpress.org",
"safeharbor.export.gov/companyinfo.aspx?id=16626",
"sagepub.com",
"sakura.ne.jp",
"samsung.com",
"sciencedirect.com",
"sciencemag.org",
"scientificamerican.com",
"scribd.com",
"sec.gov",
"secureserver.net",
"sedo.com",
"sedoparking.com",
"senate.gov",
"shinystat.com",
"shop-pro.jp",
"shopify.com",
"si.edu",
"sina.com.cn",
"siteorigin.com",
"sites.google.com",
"skype.com",
"slate.com",
"slideshare.net",
"smallbusiness.yahoo.com/webhosting",
"snapchat.com",
"sogou.com",
"sohu.com",
"soundcloud.com",
"sourceforge.net",
"spotify.com",
"springer.com",
"squarespace.com",
"squareup.com",
"ssa.gov",
"stackoverflow.com",
"stanford.edu",
"starwoodhotels.com",
"statcounter.com",
"statcounter.com/free-hit-counter",
"statcounter.com/free-web-stats",
"statcounter.com/shopify",
"statcounter.com/tumblr",
"state.gov",
"steampowered.com",
"storify.com",
"studiopress.com",
"stumbleupon.com",
"sun.com",
"support.apple.com/it-it/HT201265",
"support.apple.com/kb/PH5042",
"support.apple.com/kb/ph5042",
"support.google.com/analytics/answer/6004245?hl=de",
"support.google.com/answer/23852",
"support.google.com/chrome/answer/95647?hl=es",
"support.google.com/chrome/answer/95647?hl=it",
"support.google.com/chrome/bin/answer.py?hl=es&answer=95647",
"support.microsoft.com/help/17442",
"support.microsoft.com/windows",
"support.mozilla.org/es/kb/habilitar-y-deshabilitar-cookies-que-los-sitios-we",
"support.mozilla.org/it/kb/Attivare%20e%20disattivare%20i%20cookie",
"support.plesk.com",
"support.plesk.com/hc",
"surveymonkey.com",
"symantec.com",
"t.co",
"t.me",
"talk.plesk.com",
"tandfonline.com",
"taobao.com",
"teamviewer.com",
"techcrunch.com",
"technorati.com/faves",
"ted.com",
"telegram.me",
"telegraph.co.uk",
"templated.co",
"theatlantic.com",
"theguardian.com",
"thehill.com",
"theme-fusion.com",
"themeforest.net",
"themegrill.com",
"themegrill.com/themes/colormag",
"themegrill.com/themes/spacious",
"thenextweb.com",
"theverge.com",
"ticketmaster.com",
"time.com",
"tmall.com",
"today.com",
"tools.google.com/dlpage/gaoptout",
"tools.google.com/dlpage/gaoptout?hl=de",
"tools.google.com/dlpage/gaoptout?hl=en",
"tools.google.com/dlpage/gaoptout?hl=it",
"top100.rambler.ru/top100",
"translate.google.com",
"tripadvisor.co.uk",
"tripadvisor.com",
"trustpilot.com",
"tucowsdomains.com",
"tumblr.com",
"twitch.tv",
"twitter.com",
"twitter.com/Plesk",
"twitter.com/account/settings",
"twitter.com/home",
"twitter.com/intent/tweet",
"twitter.com/intent/tweet?text=",
"twitter.com/onecom",
"twitter.com/privacy",
"twitter.com/share",
"twitter.com/share?text=",
"typeform.com",
"typepad.com",
"uchicago.edu",
"ucl.ac.uk",
"ucla.edu",
"umblr.com",
"umich.edu",
"umn.edu",
"un.org",
"unesco.org",
"unicef.org",
"unsplash.com",
"uol.com.br",
"upenn.edu",
"usa.gov",
"usatoday.com",
"usc.edu",
"usda.gov",
"usgs.gov",
"usnews.com",
"ustream.tv",
"utexas.edu",
"va.gov",
"validator.w3.org",
"validator.w3.org/check",
"validator.w3.org/check/referer",
"validator.w3.org/check?uri=referer",
"variety.com",
"venturebeat.com",
"vice.com",
"vimeo.com",
"vinaora.com",
"visma.com",
"vk.com",
"vk.com/login?act=vkcomredirect&to=c2hhcmUucGhw",
"vk.com/share.php",
"vkontakte.ru",
"vkontakte.ru/share.php",
"w3.org",
"w3schools.com",
"warnerbros.com",
"washington.edu",
"washingtonpost.com",
"web.de",
"webmd.com",
"webs.com",
"weebly.com",
"weibo.com",
"whatsapp.com",
"whitehouse.gov",
"who.int",
"wikia.com",
"wikihow.com",
"wikimedia.org",
"wikipedia.org",
"wiley.com",
"windows.microsoft.com/en-us/internet-explorer/products/ie/home",
"windows.microsoft.com/es-es/windows7/how-to-manage-cookies-in-internet-explorer-9",
"windows.microsoft.com/it-it/windows-vista/block-or-allow-cookies",
"windowsphone.com",
"wired.com",
"wisc.edu",
"wix.com",
"wixsite.com",
"woocommerce.com",
"wordpress.com",
"wordpress.com/?ref=footer_blog",
"wordpress.com/?ref=footer_website",
"wordpress.com/themes",
"wordpress.org",
"wordpress.org/extend/ideas",
"wordpress.org/extend/plugins",
"wordpress.org/extend/themes",
"wordpress.org/news",
"wordpress.org/plugins",
"wordpress.org/plugins/asesor-cookies-para-la-ley-en-espana",
"wordpress.org/support",
"wordpress.org/support/forum/requests-and-feedback",
"wordpress.org/themes",
"worldbank.org",
"wowslider.com",
"wp.com",
"wp.me",
"wpfr.net",
"wsimg.com",
"wsj.com",
"wufoo.com",
"wunderground.com",
"www-redirect.ext.hp.com",
"www.000webhost.com/migrate?static=true",
"www.163.com",
"www.1und1.de",
"www.22.cn",
"www.4.cn/company/contactus",
"www.51.la/?19089091",
"www.aboutads.info/choices",
"www.aboutcookies.org",
"www.addthis.com/bookmark.php",
"www.addthis.com/bookmark.php?v=20",
"www.addthis.com/bookmark.php?v=250",
"www.addtoany.com/share",
"www.addtoany.com/share_save",
"www.adobe.com",
"www.adobe.com/go/getflash",
"www.adobe.com/jp/products/acrobat/readstep2.html",
"www.adobe.com/products/acrobat/readstep.html",
"www.adobe.com/products/acrobat/readstep2.html",
"www.adobe.com/shockwave/download/download.cgi?P1_Prod_Version=ShockwaveFlash&promoid=BIOW",
"www.alipay.com",
"www.allaboutcookies.org",
"www.amazon.com",
"www.andersnoren.se",
"www.aol.com",
"www.apache.org",
"www.apache.org/licenses/LICENSE-2.0",
"www.apple.com",
"www.apple.com/mac",
"www.apple.com/safari",
"www.artisteer.com/?p=joomla_templates",
"www.authorize.net",
"www.axs.com",
"www.baidu.com",
"www.bbc.co.uk",
"www.bing.com",
"www.blogger.com",
"www.bluehost.com",
"www.booking.com",
"www.cdc.gov",
"www.chronoengine.com",
"www.cia.gov/redirects/ciaredirect.html",
"www.cisco.com",
"www.cnn.com",
"www.comsenz.com",
"www.cryoutcreations.eu",
"www.dedecms.com",
"www.discuz.net",
"www.domainname.de",
"www.domainnameshop.com",
"www.domainnameshop.com/whois",
"www.domeneshop.no",
"www.dreamhost.com",
"www.dropbox.com",
"www.drupal.org",
"www.e-recht24.de",
"www.e-recht24.de/artikel/datenschutz/6590-facebook-like-button-datenschutz-disclaimer.html",
"www.e-recht24.de/artikel/datenschutz/6635-datenschutz-rechtliche-risiken-bei-der-nutzung-von-google-analytics-und-googleadsense.html",
"www.e-recht24.de/impressum-generator.html",
"www.e-recht24.de/muster-datenschutzerklaerung.html",
"www.e-recht24.de/muster-disclaimer.htm",
"www.e-recht24.de/muster-disclaimer.html",
"www.ebay.com",
"www.elegantthemes.com",
"www.enable-javascript.com",
"www.ename.com.cn/custompage/custompagestyle",
"www.enom.com/help/Default.aspx",
"www.epa.gov",
"www.example.com",
"www.facebook.com",
"www.facebook.com/Onecom",
"www.facebook.com/Plesk",
"www.facebook.com/about/privacy",
"www.facebook.com/business/dashboard",
"www.facebook.com/facebook",
"www.facebook.com/help/cookies",
"www.facebook.com/home.php",
"www.facebook.com/policy.php",
"www.facebook.com/share.php",
"www.facebook.com/sharer.php",
"www.facebook.com/sharer.php?t=",
"www.facebook.com/sharer/sharer.php",
"www.facebook.com/sharer/sharer.php?src=sdkpreparse",
"www.facebook.com/sharer/sharer.php?u=",
"www.fda.gov",
"www.finra.org",
"www.flickr.com",
"www.format.com/l/your_new_portfolio",
"www.forpsi.com",
"www.freecsstemplates.org",
"www.gimp.org",
"www.gmail.com",
"www.gnu.org",
"www.gnu.org/copyleft/gpl.html",
"www.gnu.org/licenses/gpl-2.0.html",
"www.gnu.org/licenses/gpl.html",
"www.godaddy.com",
"www.godaddy.com/hosting/website-builder.aspx?isc=wscfwst304",
"www.godaddy.com/websites/website-builder",
"www.godaddy.com/websites/website-builder?cvosrc=assets.wsb_badge.wsb_badge",
"www.google.co.jp",
"www.google.co.uk",
"www.google.com",
"www.google.com/a/UniversalLogin?service=jotspot",
"www.google.com/analytics",
"www.google.com/analytics/learn/privacy.html",
"www.google.com/analytics/terms/de.html",
"www.google.com/calendar/render",
"www.google.com/chrome",
"www.google.com/gmail",
"www.google.com/intl/de/+/policy/+1button.html",
"www.google.com/intl/de/analytics/privacyoverview.html",
"www.google.com/intl/de/policies/privacy",
"www.google.com/intl/en/policies/privacy",
"www.google.com/intl/it/policies/privacy",
"www.google.com/policies/privacy",
"www.google.com/policies/privacy/ads",
"www.google.com/policies/technologies/cookies",
"www.google.com/privacy_ads.html",
"www.google.com/search?q=whois",
"www.google.com/support/bin/answer.py?answer=23852",
"www.google.de",
"www.google.de/intl/de/policies/privacy",
"www.google.it/intl/it/policies/privacy",
"www.gosuslugi.ru",
"www.gov.cn",
"www.gov.uk",
"www.graphene-theme.com",
"www.haosou.com",
"www.histats.com",
"www.homestead.com",
"www.hostgator.com",
"www.hotmail.com",
"www.hp.com",
"www.huffingtonpost.com",
"www.hupso.com/share",
"www.ibm.com",
"www.ifeng.com",
"www.imdb.com",
"www.instagram.com",
"www.iqiyi.com",
"www.irs.gov",
"www.ispconfig.org",
"www.jd.com",
"www.jiathis.com/share",
"www.joomla.org",
"www.joomlatune.com",
"www.joomshaper.com",
"www.jssor.com",
"www.kickstarter.com",
"www.kriesi.at",
"www.linkedin.com",
"www.linkedin.com/cws/share",
"www.linkedin.com/legal/privacy-policy",
"www.linkedin.com/shareArticle?mini=true",
"www.linkwithin.com",
"www.liveinternet.ru/click",
"www.loopia.se",
"www.luminate.com/webhosting",
"www.lycos.com",
"www.macromedia.com/go/getflashplayer",
"www.mapquest.com",
"www.mapy.cz",
"www.mediawiki.org",
"www.mhthemes.com",
"www.microsoft.com",
"www.microsoft.com/en-us/windows",
"www.miibeian.gov.cn",
"www.miitbeian.gov.cn",
"www.mijndomein.nl",
"www.mijndomein.nl/producten",
"www.mijndomein.nl/producten/websitemaker",
"www.mozilla.org",
"www.mozilla.org/en-US",
"www.mozilla.org/en-US/firefox/new",
"www.mozilla.org/firefox/new",
"www.msn.com",
"www.myspace.com",
"www.myspace.com/Modules/PostTo/Pages",
"www.mysql.com",
"www.nasa.gov",
"www.netvibes.com/subscribe.php",
"www.networkadvertising.org/choices",
"www.networkadvertising.org/managing/opt_out.asp",
"www.networksolutions.com",
"www.nginx.com",
"www.nhs.uk",
"www.nytimes.com",
"www.odin.com",
"www.olark.com/?welcome",
"www.one.com/en",
"www.opencart.com",
"www.opera.com",
"www.oracle.com/index.html",
"www.ovh.com",
"www.pagesjaunes.fr",
"www.parallels.com",
"www.parallels.com/intro",
"www.parallels.com/plesk",
"www.parallels.com/products/automation/intro",
"www.parallels.com/products/containers/intro",
"www.parallels.com/products/desktop/intro",
"www.parallels.com/products/desktop/pd4wl/intro",
"www.parallels.com/products/panel/intro",
"www.parallels.com/products/server/intro",
"www.paypal.com",
"www.people.com.cn",
"www.phoca.cz",
"www.phoca.cz/phocadownload",
"www.phoca.cz/phocagallery",
"www.php.net",
"www.phpbb.com",
"www.phpbb.com/ideas",
"www.pinterest.com",
"www.pinterest.com/pin/create/button",
"www.pinterest.com/pin/create/button/?url=&media=&description=",
"www.plesk.com",
"www.plesk.com/blog",
"www.prestashop.com",
"www.python.org",
"www.qq.com",
"www.redcross.org",
"www.reddit.com",
"www.reuters.com",
"www.safenames.net/?ref=lndrdr",
"www.shinystat.com",
"www.shinystat.com/it",
"www.simplemachines.org",
"www.simplemachines.org/about/smf/license.php",
"www.sina.com.cn",
"www.sipc.org",
"www.skype.com",
"www.slideshare.net",
"www.so.com",
"www.sogou.com",
"www.sohu.com",
"www.statcounter.com",
"www.studiopress.com",
"www.stumbleupon.com",
"www.stumbleupon.com/submit",
"www.taobao.com",
"www.toplist.cz",
"www.tradeindia.com",
"www.tripadvisor.co.uk",
"www.tripadvisor.com",
"www.tucows.com",
"www.tucowsdomains.com",
"www.tumblr.com",
"www.tumblr.com/share/link",
"www.twitter.com",
"www.twitter.com/share",
"www.ubuntu.com",
"www.ucoz.ru",
"www.usatoday.com",
"www.ustream.tv",
"www.value-domain.com",
"www.vektor-inc.co.jp",
"www.visma.com",
"www.w3.org",
"www.washingtonpost.com",
"www.webmd.com",
"www.weebly.com",
"www.whitehouse.gov",
"www.who.int/en",
"www.wikipedia.org",
"www.woothemes.com",
"www.wordpress-fr.net",
"www.wordpress.com",
"www.wordpress.org",
"www.wsj.com",
"www.xing.com/app/share?op=data_protection",
"www.xinhuanet.com",
"www.xml-sitemaps.com",
"www.yahoo.co.jp",
"www.yahoo.com",
"www.yelp.com",
"www.yootheme.com",
"www.youku.com",
"www.youronlinechoices.com",
"www.youronlinechoices.com/uk/your-ad-choices",
"www.youtube.com",
"www.zend.com",
"xing.com",
"xinhuanet.com",
"xiti.com",
"yahoo.co.jp",
"yahoo.com",
"yale.edu",
"yandex.ru",
"yelp.com",
"yootheme.com",
"youku.com",
"youronlinechoices.com",
"youtu.be",
"youtube.com",
"zdnet.com",
"zendesk.com",
"zenfolio.com"
],
"matching_attributes": [
"hostname",
"domain",
"uri",
"url"
]
}

View File

@ -0,0 +1,70 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import csv
import datetime
import json
import os
import sys

import requests
# TODO: Include MozRank
# NOTE(review): moz.com is operated by Moz, not Mozilla. The "Mozilla" wording
# in the description/name strings below is kept byte-for-byte because
# consumers presumably identify the warninglist by these strings -- confirm
# before renaming.
moz_url_domains = "https://moz.com/top500/domains/csv"
moz_url_pages = "https://moz.com/top500/pages/csv"
moz_file_domains = "/tmp/top500.domains.csv"
moz_file_pages = "/tmp/top500.pages.csv"
user_agent = {"User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}


def download(url, path, timeout=30):
    """Download *url* and store the raw response body in *path*.

    Raises ``requests.HTTPError`` on a non-2xx answer so that an error page
    is never mistaken for CSV data further down the pipeline.
    """
    response = requests.get(url, headers=user_agent, timeout=timeout)
    response.raise_for_status()
    with open(path, 'wb') as out_file:
        out_file.write(response.content)


def parse_entries(csv_lines):
    """Return the normalised second column of every data row in *csv_lines*.

    *csv_lines* is any iterable of CSV text lines (an open file works). The
    first row is treated as the header and skipped. Rows with fewer than two
    columns (e.g. blank lines) are ignored instead of raising ``IndexError``.
    Trailing whitespace and trailing slashes are stripped from each value.
    """
    reader = csv.reader(csv_lines, delimiter=',')
    next(reader, None)  # skip the column-name header, tolerate empty input
    return [row[1].rstrip().rstrip('/') for row in reader if len(row) > 1]


def build_warninglist(entries, today=None):
    """Assemble the warninglist dictionary from *entries*.

    The list is de-duplicated and sorted; the version is *today* (defaults
    to the current date) formatted as the integer ``YYYYMMDD``.
    """
    if today is None:
        today = datetime.date.today()
    return {
        'description': "Event contains one or more entries from the top 500 of the most used domains (Mozilla).",
        'version': int(today.strftime('%Y%m%d')),
        'name': "Top 500 domains and pages from Mozilla",
        'type': 'hostname',
        'list': sorted(set(entries)),
        'matching_attributes': ['hostname', 'domain', 'uri', 'url'],
    }


def main():
    """Fetch both Moz Top 500 CSV exports and print the warninglist as JSON."""
    sources = (
        (moz_url_domains, moz_file_domains),
        (moz_url_pages, moz_file_pages),
    )
    entries = []
    try:
        for url, path in sources:
            download(url, path)
        for _, path in sources:
            # newline='' is what the csv module expects for file input.
            with open(path, newline='', encoding='utf-8') as csv_file:
                entries.extend(parse_entries(csv_file))
    finally:
        # Always clean up, one file at a time, so a failure on the first file
        # does not leave the second one behind.
        for _, path in sources:
            try:
                os.remove(path)
            except OSError as err:
                # stdout is piped into jq; diagnostics must go to stderr.
                print(f'Could not remove {path}: {err}', file=sys.stderr)
    print(json.dumps(build_warninglist(entries)))


if __name__ == '__main__':
    main()