Merge tranco scripts: generate_tranco.py generates both full and 10k lists

pull/154/head
Kevin Holvoet 2020-07-17 09:22:34 +02:00
parent 1d59d7f6f5
commit e0b3968635
5 changed files with 7541 additions and 7527 deletions

File diff suppressed because it is too large Load Diff

View File

@ -77,7 +77,6 @@
"2m.ma", "2m.ma",
"2mdn.net", "2mdn.net",
"2o7.net", "2o7.net",
"2track.info",
"300.cn", "300.cn",
"312168.com", "312168.com",
"313.cn", "313.cn",
@ -161,6 +160,7 @@
"8m.com", "8m.com",
"8ozhygzz5eke.com", "8ozhygzz5eke.com",
"8tracks.com", "8tracks.com",
"91boshi.net",
"91jm.com", "91jm.com",
"91mobiles.com", "91mobiles.com",
"92926.com", "92926.com",
@ -335,7 +335,6 @@
"admin5.com", "admin5.com",
"adminbuy.cn", "adminbuy.cn",
"admitad.com", "admitad.com",
"admixer.co.kr",
"admixer.net", "admixer.net",
"admob.com", "admob.com",
"adn.com", "adn.com",
@ -516,10 +515,10 @@
"allabout.co.jp", "allabout.co.jp",
"allaboutcookies.org", "allaboutcookies.org",
"allafrica.com", "allafrica.com",
"allbest.ru",
"allbusiness.com", "allbusiness.com",
"allegro.pl", "allegro.pl",
"allenpress.com", "allenpress.com",
"allkpop.com",
"allmusic.com", "allmusic.com",
"allocine.fr", "allocine.fr",
"allposters.com", "allposters.com",
@ -808,7 +807,6 @@
"aspca.org", "aspca.org",
"aspnetcdn.com", "aspnetcdn.com",
"asriran.com", "asriran.com",
"asrock.com",
"assemblee-nationale.fr", "assemblee-nationale.fr",
"associatedcontent.com", "associatedcontent.com",
"associates-amazon.com", "associates-amazon.com",
@ -866,6 +864,7 @@
"autoblog.com", "autoblog.com",
"autocar.co.uk", "autocar.co.uk",
"autodesk.com", "autodesk.com",
"autoevolution.com",
"autohome.com.cn", "autohome.com.cn",
"automattic.com", "automattic.com",
"automox.com", "automox.com",
@ -880,6 +879,7 @@
"avaaz.org", "avaaz.org",
"avast.com", "avast.com",
"avastbrowser.com", "avastbrowser.com",
"avature.net",
"avaz.ba", "avaz.ba",
"avcdn.net", "avcdn.net",
"avclub.com", "avclub.com",
@ -893,6 +893,7 @@
"avito.ru", "avito.ru",
"avma.org", "avma.org",
"avocet.io", "avocet.io",
"avon.com",
"avvo.com", "avvo.com",
"awardspace.com", "awardspace.com",
"aweber.com", "aweber.com",
@ -1016,6 +1017,7 @@
"bbci.co.uk", "bbci.co.uk",
"bbcollab.com", "bbcollab.com",
"bbt.com", "bbt.com",
"bbva.com.ar",
"bbva.es", "bbva.es",
"bbva.mx", "bbva.mx",
"bbvanet.com.mx", "bbvanet.com.mx",
@ -1139,7 +1141,6 @@
"birminghammail.co.uk", "birminghammail.co.uk",
"bis.org", "bis.org",
"bisnis.com", "bisnis.com",
"bit-z.pro",
"bit.do", "bit.do",
"bit.edu.cn", "bit.edu.cn",
"bit.ly", "bit.ly",
@ -1151,7 +1152,6 @@
"bitcointalk.org", "bitcointalk.org",
"bitdefender.com", "bitdefender.com",
"bitdefender.net", "bitdefender.net",
"biteable.com",
"bitfinex.com", "bitfinex.com",
"bitly.com", "bitly.com",
"bitmoji.com", "bitmoji.com",
@ -1239,7 +1239,6 @@
"blogspot.ru", "blogspot.ru",
"blogspot.se", "blogspot.se",
"blogtalkradio.com", "blogtalkradio.com",
"blogto.com",
"bloody-disgusting.com", "bloody-disgusting.com",
"bloomberg.com", "bloomberg.com",
"bloomberglaw.com", "bloomberglaw.com",
@ -1264,6 +1263,7 @@
"bmwi.de", "bmwi.de",
"bmwusa.com", "bmwusa.com",
"bna.com", "bna.com",
"bna.com.ar",
"bnf.fr", "bnf.fr",
"bnmla.com", "bnmla.com",
"bnnbloomberg.ca", "bnnbloomberg.ca",
@ -1338,7 +1338,6 @@
"bravehost.com", "bravehost.com",
"bravenet.com", "bravenet.com",
"bravesites.com", "bravesites.com",
"bravotube.net",
"bravotv.com", "bravotv.com",
"braze.com", "braze.com",
"brazzers.com", "brazzers.com",
@ -1372,6 +1371,7 @@
"brother.com", "brother.com",
"brown.edu", "brown.edu",
"brownpapertickets.com", "brownpapertickets.com",
"brownsfashion.com",
"brunarosso.com", "brunarosso.com",
"brynmawr.edu", "brynmawr.edu",
"bs.to", "bs.to",
@ -1391,7 +1391,6 @@
"buaa.edu.cn", "buaa.edu.cn",
"bucknell.edu", "bucknell.edu",
"buddypress.org", "buddypress.org",
"budgetphone.nl",
"budsgunshop.com", "budsgunshop.com",
"buff.ly", "buff.ly",
"buffalo.edu", "buffalo.edu",
@ -1560,10 +1559,10 @@
"carousell.com", "carousell.com",
"cars.com", "cars.com",
"carsales.com.au", "carsales.com.au",
"carscoops.com",
"carsensor.net", "carsensor.net",
"carsforsale.com", "carsforsale.com",
"carto.com", "carto.com",
"carview.co.jp",
"carwale.com", "carwale.com",
"cas.cn", "cas.cn",
"casadellibro.com", "casadellibro.com",
@ -1703,7 +1702,6 @@
"chimpstatic.com", "chimpstatic.com",
"china-ef.com", "china-ef.com",
"china-embassy.org", "china-embassy.org",
"china.com",
"china.com.cn", "china.com.cn",
"china.org.cn", "china.org.cn",
"chinaacc.com", "chinaacc.com",
@ -1927,12 +1925,14 @@
"codinghorror.com", "codinghorror.com",
"coe.int", "coe.int",
"cofeed.com", "cofeed.com",
"cofile.net",
"cognitivlabs.com", "cognitivlabs.com",
"cognitoforms.com", "cognitoforms.com",
"cognizant.com", "cognizant.com",
"cogocast.net", "cogocast.net",
"coinbase.com", "coinbase.com",
"coindesk.com", "coindesk.com",
"coingecko.com",
"coinmarketcap.com", "coinmarketcap.com",
"coinpayu.com", "coinpayu.com",
"cointelegraph.com", "cointelegraph.com",
@ -2142,7 +2142,6 @@
"crwdcntrl.net", "crwdcntrl.net",
"cryoutcreations.eu", "cryoutcreations.eu",
"cryptobrowser.site", "cryptobrowser.site",
"cryptocompare.com",
"cryptotabbrowser.com", "cryptotabbrowser.com",
"cs.com.cn", "cs.com.cn",
"csair.com", "csair.com",
@ -2193,7 +2192,6 @@
"custom-writings.net", "custom-writings.net",
"customink.com", "customink.com",
"customs.gov.cn", "customs.gov.cn",
"customwriting.org",
"cutestat.com", "cutestat.com",
"cutt.ly", "cutt.ly",
"cutt.us", "cutt.us",
@ -2309,6 +2307,7 @@
"defense.gov", "defense.gov",
"defensenews.com", "defensenews.com",
"defimedia.info", "defimedia.info",
"definicion.de",
"defra.gov.uk", "defra.gov.uk",
"degruyter.com", "degruyter.com",
"delaware.gov", "delaware.gov",
@ -2352,7 +2351,6 @@
"destatis.de", "destatis.de",
"destructoid.com", "destructoid.com",
"detik.com", "detik.com",
"detnews.com",
"detroitcoralfarms.com", "detroitcoralfarms.com",
"detroitnews.com", "detroitnews.com",
"deutsche-bank.de", "deutsche-bank.de",
@ -2376,7 +2374,6 @@
"diandongwajueji.com", "diandongwajueji.com",
"diannaojc.com", "diannaojc.com",
"dianping.com", "dianping.com",
"dianxiaomi.com",
"diariolibre.com", "diariolibre.com",
"dice.com", "dice.com",
"dicio.com.br", "dicio.com.br",
@ -2646,6 +2643,7 @@
"eastday.com", "eastday.com",
"eastmoney.com", "eastmoney.com",
"easychair.org", "easychair.org",
"easyhindityping.com",
"easyjet.com", "easyjet.com",
"easytomessage.com", "easytomessage.com",
"eater.com", "eater.com",
@ -2902,7 +2900,6 @@
"esuteru.com", "esuteru.com",
"esy.es", "esy.es",
"etao.com", "etao.com",
"etecsa.net",
"etemadonline.com", "etemadonline.com",
"ethereum.org", "ethereum.org",
"etherscan.io", "etherscan.io",
@ -3406,6 +3403,7 @@
"gazetaexpress.com", "gazetaexpress.com",
"gazzetta.gr", "gazzetta.gr",
"gazzetta.it", "gazzetta.it",
"gazzettadelsud.it",
"gbcinternetenforcement.net", "gbcinternetenforcement.net",
"gcloudcs.com", "gcloudcs.com",
"gcs-web.com", "gcs-web.com",
@ -3458,6 +3456,7 @@
"getdonspeg.work", "getdonspeg.work",
"getdropbox.com", "getdropbox.com",
"getemoji.com", "getemoji.com",
"getepic.com",
"getfirefox.com", "getfirefox.com",
"getfvid.com", "getfvid.com",
"getgo.com", "getgo.com",
@ -3470,7 +3469,6 @@
"getty.edu", "getty.edu",
"gettyimages.com", "gettyimages.com",
"getui.com", "getui.com",
"getui.net",
"gfan.com", "gfan.com",
"gfk.com", "gfk.com",
"gfx.ms", "gfx.ms",
@ -3832,6 +3830,7 @@
"hapitas.jp", "hapitas.jp",
"haplat.net", "haplat.net",
"haqqin.az", "haqqin.az",
"haraj.com.sa",
"harborfreight.com", "harborfreight.com",
"hardened-php.net", "hardened-php.net",
"hardrock.com", "hardrock.com",
@ -3987,7 +3986,6 @@
"hometax.go.kr", "hometax.go.kr",
"honda.co.jp", "honda.co.jp",
"honda.com", "honda.com",
"honeygain.com",
"honeywell.com", "honeywell.com",
"hongkiat.com", "hongkiat.com",
"hoodsite.com", "hoodsite.com",
@ -4250,7 +4248,6 @@
"immi.gov.au", "immi.gov.au",
"immobiliare.it", "immobiliare.it",
"immobilienscout24.de", "immobilienscout24.de",
"imo.im",
"imo.org", "imo.org",
"imoim.app", "imoim.app",
"imooc.com", "imooc.com",
@ -4448,6 +4445,7 @@
"isc.org", "isc.org",
"isi.edu", "isi.edu",
"islamweb.net", "islamweb.net",
"islcollective.com",
"ismedia.jp", "ismedia.jp",
"isna.ir", "isna.ir",
"isnssdk.com", "isnssdk.com",
@ -4463,6 +4461,7 @@
"italki.com", "italki.com",
"itar-tass.com", "itar-tass.com",
"itau.com.br", "itau.com.br",
"itavcn.com",
"itch.io", "itch.io",
"itesm.mx", "itesm.mx",
"iteye.com", "iteye.com",
@ -4512,7 +4511,6 @@
"jalopnik.com", "jalopnik.com",
"jamanetwork.com", "jamanetwork.com",
"jamendo.com", "jamendo.com",
"jamesclear.com",
"jamieoliver.com", "jamieoliver.com",
"jamnews.com", "jamnews.com",
"jandan.net", "jandan.net",
@ -4609,6 +4607,7 @@
"join.me", "join.me",
"joinhoney.com", "joinhoney.com",
"joins.com", "joins.com",
"jomashop.com",
"jooble.org", "jooble.org",
"joomag.com", "joomag.com",
"joomla.org", "joomla.org",
@ -4800,7 +4799,6 @@
"knect365.com", "knect365.com",
"knet.cn", "knet.cn",
"knightlab.com", "knightlab.com",
"knoji.com",
"knowyourmeme.com", "knowyourmeme.com",
"knoxnews.com", "knoxnews.com",
"ko-fi.com", "ko-fi.com",
@ -4851,7 +4849,6 @@
"ksu.edu.sa", "ksu.edu.sa",
"kth.se", "kth.se",
"ktla.com", "ktla.com",
"ktvu.com",
"ku.dk", "ku.dk",
"ku.edu", "ku.edu",
"ku6.com", "ku6.com",
@ -4938,6 +4935,7 @@
"lbl.gov", "lbl.gov",
"lboro.ac.uk", "lboro.ac.uk",
"lci.fr", "lci.fr",
"lcl.fr",
"ldoceonline.com", "ldoceonline.com",
"lds.org", "lds.org",
"le.ac.uk", "le.ac.uk",
@ -5168,7 +5166,6 @@
"luckyforbet.com", "luckyforbet.com",
"ludashi.com", "ludashi.com",
"lufthansa.com", "lufthansa.com",
"luisaviaroma.com",
"lulu.com", "lulu.com",
"lululemon.com", "lululemon.com",
"lumenlearning.com", "lumenlearning.com",
@ -5221,7 +5218,6 @@
"mail-archive.com", "mail-archive.com",
"mail-order-bride.biz", "mail-order-bride.biz",
"mail-order-bride.net", "mail-order-bride.net",
"mail-order-brides.org",
"mail.com", "mail.com",
"mail.ru", "mail.ru",
"mailchi.mp", "mailchi.mp",
@ -5260,6 +5256,7 @@
"manhuagui.com", "manhuagui.com",
"manithan.com", "manithan.com",
"manoramaonline.com", "manoramaonline.com",
"manre.me",
"manta.com", "manta.com",
"manualslib.com", "manualslib.com",
"manuscriptcentral.com", "manuscriptcentral.com",
@ -5277,7 +5274,6 @@
"maricopa.edu", "maricopa.edu",
"marieclaire.com", "marieclaire.com",
"marieclaire.com.tw", "marieclaire.com.tw",
"marijuanabreak.com",
"marinetraffic.com", "marinetraffic.com",
"marist.edu", "marist.edu",
"markethive.com", "markethive.com",
@ -5427,6 +5423,7 @@
"mercadolivre.com", "mercadolivre.com",
"mercadolivre.com.br", "mercadolivre.com.br",
"mercadopago.com", "mercadopago.com",
"mercadopago.com.ar",
"mercantilbanco.com", "mercantilbanco.com",
"mercari.com", "mercari.com",
"mercedes-benz.com", "mercedes-benz.com",
@ -5459,7 +5456,6 @@
"metropoles.com", "metropoles.com",
"metropolitanbaptistchurch.org", "metropolitanbaptistchurch.org",
"metrotimes.com", "metrotimes.com",
"metu.edu.tr",
"mewe.com", "mewe.com",
"mewkid.net", "mewkid.net",
"mext.go.jp", "mext.go.jp",
@ -5539,6 +5535,7 @@
"mirror.co.uk", "mirror.co.uk",
"mirtesen.ru", "mirtesen.ru",
"mises.org", "mises.org",
"misionesonline.net",
"missouri.edu", "missouri.edu",
"mit.edu", "mit.edu",
"mitre.org", "mitre.org",
@ -5558,6 +5555,7 @@
"mktoresp.com", "mktoresp.com",
"ml.com", "ml.com",
"ml314.com", "ml314.com",
"mlabs.com.br",
"mlb.com", "mlb.com",
"mlit.go.jp", "mlit.go.jp",
"mlive.com", "mlive.com",
@ -5578,7 +5576,6 @@
"mobile.de", "mobile.de",
"mobile.ir", "mobile.ir",
"mobile01.com", "mobile01.com",
"mobileapptracking.com",
"mobilesystemservice.com", "mobilesystemservice.com",
"mobinsb.com", "mobinsb.com",
"mobirise.com", "mobirise.com",
@ -5637,6 +5634,7 @@
"mooo.com", "mooo.com",
"mootools.net", "mootools.net",
"mop.com", "mop.com",
"moph.go.th",
"mopub.com", "mopub.com",
"morganstanley.com", "morganstanley.com",
"morgenpost.de", "morgenpost.de",
@ -5666,8 +5664,6 @@
"moviefone.com", "moviefone.com",
"movieranker.com", "movieranker.com",
"movieweb.com", "movieweb.com",
"movistar.es",
"movistarplus.es",
"movs4u.live", "movs4u.live",
"moz.com", "moz.com",
"mozaws.net", "mozaws.net",
@ -5990,7 +5986,6 @@
"newscientist.com", "newscientist.com",
"newsday.com", "newsday.com",
"newser.com", "newser.com",
"newsgator.com",
"newsit.gr", "newsit.gr",
"newsmax.com", "newsmax.com",
"newsmth.net", "newsmth.net",
@ -6692,7 +6687,6 @@
"pku.edu.cn", "pku.edu.cn",
"placed.com", "placed.com",
"placeit.net", "placeit.net",
"plagiarismdetector.net",
"plala.or.jp", "plala.or.jp",
"planalto.gov.br", "planalto.gov.br",
"planetminecraft.com", "planetminecraft.com",
@ -6731,6 +6725,7 @@
"poetryfoundation.org", "poetryfoundation.org",
"poets.org", "poets.org",
"pof.com", "pof.com",
"point2homes.com",
"pojoksatu.id", "pojoksatu.id",
"pokemon.com", "pokemon.com",
"poki.com", "poki.com",
@ -6741,6 +6736,7 @@
"politico.eu", "politico.eu",
"politicususa.com", "politicususa.com",
"politifact.com", "politifact.com",
"polito.it",
"polldaddy.com", "polldaddy.com",
"polyfill.io", "polyfill.io",
"polygon.com", "polygon.com",
@ -6759,6 +6755,7 @@
"popularmechanics.com", "popularmechanics.com",
"porngo.com", "porngo.com",
"pornhat.com", "pornhat.com",
"pornhd.com",
"pornhub.com", "pornhub.com",
"pornhubpremium.com", "pornhubpremium.com",
"pornolab.net", "pornolab.net",
@ -6973,6 +6970,7 @@
"quickanddirtytips.com", "quickanddirtytips.com",
"quickconnect.to", "quickconnect.to",
"quicksprout.com", "quicksprout.com",
"quidco.com",
"quikr.com", "quikr.com",
"quillbot.com", "quillbot.com",
"quizizz.com", "quizizz.com",
@ -7094,6 +7092,7 @@
"redditmedia.com", "redditmedia.com",
"redditstatic.com", "redditstatic.com",
"redfin.com", "redfin.com",
"redflagdeals.com",
"redgifs.com", "redgifs.com",
"redhat.com", "redhat.com",
"rediff.com", "rediff.com",
@ -7132,6 +7131,7 @@
"rentalcars.com", "rentalcars.com",
"repec.org", "repec.org",
"repl.it", "repl.it",
"report.az",
"repretel.com", "repretel.com",
"repubblica.it", "repubblica.it",
"republicworld.com", "republicworld.com",
@ -7407,6 +7407,7 @@
"sass-lang.com", "sass-lang.com",
"sastasundar.com", "sastasundar.com",
"sat.gob.mx", "sat.gob.mx",
"saturn.de",
"saude.gov.br", "saude.gov.br",
"saudigazette.com.sa", "saudigazette.com.sa",
"savefrom.net", "savefrom.net",
@ -7436,7 +7437,6 @@
"sc.gov.cn", "sc.gov.cn",
"scamadviser.com", "scamadviser.com",
"scdn.co", "scdn.co",
"scdn.vn",
"scene7.com", "scene7.com",
"sch.gr", "sch.gr",
"schema.org", "schema.org",
@ -7483,7 +7483,6 @@
"scu.edu.cn", "scu.edu.cn",
"scumvv.ca", "scumvv.ca",
"scuoladigitale.info", "scuoladigitale.info",
"scut.edu.cn",
"sd.gov", "sd.gov",
"sda.gov.cn", "sda.gov.cn",
"sdamgia.ru", "sdamgia.ru",
@ -7635,7 +7634,6 @@
"shinezone.com", "shinezone.com",
"shinobi.jp", "shinobi.jp",
"shinystat.com", "shinystat.com",
"shipsautopilot.com",
"sho.com", "sho.com",
"shomanews.com", "shomanews.com",
"shop-pro.jp", "shop-pro.jp",
@ -7691,7 +7689,6 @@
"sina.com", "sina.com",
"sina.com.cn", "sina.com.cn",
"sinaimg.cn", "sinaimg.cn",
"sinajs.cn",
"sindonews.com", "sindonews.com",
"singaporeair.com", "singaporeair.com",
"singular.net", "singular.net",
@ -7726,7 +7723,6 @@
"skillshare.com", "skillshare.com",
"skimlinks.com", "skimlinks.com",
"skimresources.com", "skimresources.com",
"skribbl.io",
"skrill.com", "skrill.com",
"skroutz.gr", "skroutz.gr",
"sky.com", "sky.com",
@ -7813,6 +7809,7 @@
"so-net.ne.jp", "so-net.ne.jp",
"so.com", "so.com",
"soap2day.to", "soap2day.to",
"soccerstreams.net",
"soccerway.com", "soccerway.com",
"socdm.com", "socdm.com",
"socialbakers.com", "socialbakers.com",
@ -7829,7 +7826,6 @@
"softpedia.com", "softpedia.com",
"softwareadvice.com", "softwareadvice.com",
"sogou.com", "sogou.com",
"sogoucdn.com",
"soha.vn", "soha.vn",
"sohatv.vn", "sohatv.vn",
"sohu.com", "sohu.com",
@ -7902,6 +7898,7 @@
"splcenter.org", "splcenter.org",
"splice.com", "splice.com",
"splunk.com", "splunk.com",
"spokeo.com",
"spokesman.com", "spokesman.com",
"sponichi.co.jp", "sponichi.co.jp",
"sporcle.com", "sporcle.com",
@ -7913,6 +7910,7 @@
"sportbox.ru", "sportbox.ru",
"sportingnews.com", "sportingnews.com",
"sportmaster.ru", "sportmaster.ru",
"sportradar.com",
"sportradarserving.com", "sportradarserving.com",
"sports.ru", "sports.ru",
"sportskeeda.com", "sportskeeda.com",
@ -7973,7 +7971,6 @@
"stan.com.au", "stan.com.au",
"standaard.be", "standaard.be",
"standard.co.uk", "standard.co.uk",
"standardchartered.com",
"standardmedia.co.ke", "standardmedia.co.ke",
"stanford.edu", "stanford.edu",
"staples.com", "staples.com",
@ -8014,6 +8011,7 @@
"steepster.com", "steepster.com",
"steepto.com", "steepto.com",
"steinberg.net", "steinberg.net",
"stellamccartney.com",
"stereogum.com", "stereogum.com",
"stern.de", "stern.de",
"stickyadstv.com", "stickyadstv.com",
@ -8043,6 +8041,7 @@
"strath.ac.uk", "strath.ac.uk",
"strava.com", "strava.com",
"streamable.com", "streamable.com",
"streamall-search.com",
"streamlabs.com", "streamlabs.com",
"streamyard.com", "streamyard.com",
"streetinsider.com", "streetinsider.com",
@ -8081,7 +8080,6 @@
"sudouest.fr", "sudouest.fr",
"sudrf.ru", "sudrf.ru",
"sueddeutsche.de", "sueddeutsche.de",
"suhosin.org",
"suicidepreventionlifeline.org", "suicidepreventionlifeline.org",
"suite101.com", "suite101.com",
"sulekha.com", "sulekha.com",
@ -8100,6 +8098,7 @@
"suntrust.com", "suntrust.com",
"sunysb.edu", "sunysb.edu",
"superbthemes.com", "superbthemes.com",
"superhaber.tv",
"superpages.com", "superpages.com",
"supersonic.com", "supersonic.com",
"supersonicads.com", "supersonicads.com",
@ -8143,6 +8142,7 @@
"symcd.com", "symcd.com",
"symfony.com", "symfony.com",
"sympatico.ca", "sympatico.ca",
"sympla.com.br",
"synology.com", "synology.com",
"synology.me", "synology.me",
"synxis.com", "synxis.com",
@ -8311,13 +8311,11 @@
"test.de", "test.de",
"testbook.com", "testbook.com",
"texas.gov", "texas.gov",
"texasmonthly.com",
"texastribune.org", "texastribune.org",
"text.ru", "text.ru",
"textnow.com", "textnow.com",
"tf1.fr", "tf1.fr",
"tfl.gov.uk", "tfl.gov.uk",
"tgd.kr",
"tgju.org", "tgju.org",
"thairath.co.th", "thairath.co.th",
"thalesgroup.com", "thalesgroup.com",
@ -8490,6 +8488,7 @@
"tigris.org", "tigris.org",
"tiki.vn", "tiki.vn",
"tiktok.com", "tiktok.com",
"tiktokcdn-in.com",
"tiktokcdn.com", "tiktokcdn.com",
"tiktokv.com", "tiktokv.com",
"tilda.cc", "tilda.cc",
@ -8632,7 +8631,6 @@
"travelchannel.com", "travelchannel.com",
"travelocity.com", "travelocity.com",
"traveloka.com", "traveloka.com",
"travelpod.com",
"travis-ci.org", "travis-ci.org",
"treas.gov", "treas.gov",
"treasuredata.com", "treasuredata.com",
@ -8674,6 +8672,7 @@
"trontv.com", "trontv.com",
"trudvsem.ru", "trudvsem.ru",
"truecaller.com", "truecaller.com",
"truecar.com",
"trueleadid.com", "trueleadid.com",
"trulia.com", "trulia.com",
"truoptik.com", "truoptik.com",
@ -8804,6 +8803,7 @@
"uc.cn", "uc.cn",
"uc.edu", "uc.edu",
"uc.pt", "uc.pt",
"uc3m.es",
"ucalgary.ca", "ucalgary.ca",
"ucar.edu", "ucar.edu",
"ucas.com", "ucas.com",
@ -9143,7 +9143,6 @@
"validcbdoil.com", "validcbdoil.com",
"value-domain.com", "value-domain.com",
"valuecommerce.com", "valuecommerce.com",
"valueimpression.com",
"valvesoftware.com", "valvesoftware.com",
"vam.ac.uk", "vam.ac.uk",
"vancouversun.com", "vancouversun.com",
@ -9996,6 +9995,7 @@
"zro56hd6szoy.com", "zro56hd6szoy.com",
"zscaler.com", "zscaler.com",
"zulily.com", "zulily.com",
"zum.com",
"zumiez.com", "zumiez.com",
"zurb.com", "zurb.com",
"zxart.cn", "zxart.cn",
@ -10010,5 +10010,5 @@
], ],
"name": "Top 10K most-used sites from Tranco", "name": "Top 10K most-used sites from Tranco",
"type": "hostname", "type": "hostname",
"version": 20200714 "version": 20200715
} }

View File

@ -1,39 +1,57 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import requests
import zipfile
import datetime
import json import json
import zipfile
tranco_url = 'https://tranco-list.eu/top-1m.csv.zip' from generator import download, download_to_file, get_abspath_list_file, get_version
tranco_file = 'top-1m.csv.zip'
user_agent = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0'}
r = requests.get(tranco_url, headers=user_agent)
with open(tranco_file, 'wb') as fd:
for chunk in r.iter_content(4096):
fd.write(chunk)
with zipfile.ZipFile(tranco_file, 'r') as tranco_lists:
for name in tranco_lists.namelist():
if name == 'top-1m.csv':
with tranco_lists.open(name) as tranco:
sites = tranco.readlines()
else:
continue
tranco_warninglist = {}
version = int(datetime.date.today().strftime('%Y%m%d'))
def process(file, warninglist, dst, first_10k=False):
    """Build a Tranco warninglist from a downloaded archive and write it to disk.

    Reads the ``top-1m.csv`` member of the zip archive ``file``, takes the
    second comma-separated field of every row, and stores the sorted,
    de-duplicated values under ``warninglist['list']``. The ``type``,
    ``version`` and ``matching_attributes`` keys are set as well, and the
    result is dumped as indented, key-sorted JSON (followed by a newline) to
    the path returned by ``get_abspath_list_file(dst)``.

    Args:
        file: Path of the zip archive containing ``top-1m.csv``.
        warninglist: Dict with the list metadata; it is modified in place.
            A ``list`` key is optional; entries already present are kept.
        dst: Name handed to ``get_abspath_list_file`` to locate the output.
        first_10k: When true, only the first 10,000 rows are used.

    Raises:
        KeyError: If the archive has no ``top-1m.csv`` member.
    """
    # Function-scope import: keeps the change local to this block.
    from itertools import islice

    # islice(..., None) yields every row, so one code path serves both modes
    # and the 10K case no longer reads the whole file before truncating.
    row_limit = 10000 if first_10k else None
    with zipfile.ZipFile(file, 'r') as tranco_lists:
        # Opening the member by name fails loudly if it is missing, instead of
        # leaving `sites` unbound and crashing later with a confusing error.
        with tranco_lists.open('top-1m.csv') as tranco:
            sites = list(islice(tranco, row_limit))

    warninglist['type'] = 'hostname'
    warninglist['version'] = get_version()
    warninglist['matching_attributes'] = ['hostname', 'domain', 'url', 'domain|ip']

    # Do not assume the caller pre-populated 'list'; start from whatever is
    # there (or nothing) rather than appending to a possibly missing key.
    entries = set(warninglist.get('list', []))
    for site in sites:
        row = site.decode('UTF-8')
        if not row.strip():
            continue  # tolerate blank rows instead of raising IndexError
        entries.add(row.split(',')[1].rstrip())
    warninglist['list'] = sorted(entries)

    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write("\n")
if __name__ == '__main__':
    # Fetch the Tranco archive once; both lists below are generated from it.
    tranco_url = 'https://tranco-list.eu/top-1m.csv.zip'
    tranco_file = 'top-1m.csv.zip'
    download_to_file(tranco_url, tranco_file)

    # Top 1M
    tranco_dst = "tranco"
    tranco_warninglist = {
        'description': "Event contains one or more entries from the top 1,000,000 most-used sites (https://tranco-list.eu/).",
        'name': "Top 1,000,000 most-used sites from Tranco",
        # process() appends to warninglist['list'], so the key has to exist.
        'list': [],
    }
    process(tranco_file, tranco_warninglist, tranco_dst)

    # Top 10K
    tranco_10k_dst = "tranco10k"
    tranco_10k_warninglist = {
        'description': "Event contains one or more entries from the top 10K most-used sites (https://tranco-list.eu/).",
        'name': "Top 10K most-used sites from Tranco",
        # Separate list object per warninglist; nothing is shared between runs.
        'list': [],
    }
    process(tranco_file, tranco_10k_warninglist, tranco_10k_dst, first_10k=True)

View File

@ -1,39 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import zipfile
import datetime
import json
# Generate the Tranco top-10K warninglist and print it as JSON on stdout.
tranco_url = 'https://tranco-list.eu/top-1m.csv.zip'
tranco_file = 'top-1m.csv.zip'
user_agent = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0'}

# Download the zipped ranking into the working directory. Stop on an HTTP
# error status instead of saving the error page as if it were the archive.
r = requests.get(tranco_url, headers=user_agent)
r.raise_for_status()
with open(tranco_file, 'wb') as fd:
    for chunk in r.iter_content(4096):
        fd.write(chunk)

# Keep only the first 10,000 rows. Opening the member by name raises KeyError
# when it is absent, rather than leaving `sites` undefined further down.
with zipfile.ZipFile(tranco_file, 'r') as tranco_lists:
    with tranco_lists.open('top-1m.csv') as tranco:
        sites = tranco.readlines()[:10000]

version = int(datetime.date.today().strftime('%Y%m%d'))

# Collect the second comma-separated field of every row, de-duplicated and sorted.
domains = sorted({site.decode('UTF-8').split(',')[1].rstrip() for site in sites})

# Keys are listed in the order the script always inserted them, so the
# printed JSON keeps the same layout.
tranco_warninglist = {
    'description': "Event contains one or more entries from the top 10K most-used sites (Tranco).",
    'version': version,
    'name': "Top 10K most-used sites from Tranco",
    'type': 'hostname',
    'list': domains,
    'matching_attributes': ['hostname', 'domain', 'url', 'domain|ip'],
}
print(json.dumps(tranco_warninglist))

35
tools/generator.py Normal file
View File

@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import datetime
from inspect import currentframe, getframeinfo
from os import path
import requests
def download_to_file(url, file):
    """Download ``url`` with an HTTP GET request and write the body to ``file``.

    Args:
        url: Address to fetch.
        file: Path of the local file to write; it is opened in binary write
            mode, so existing content is replaced.

    Raises:
        requests.HTTPError: If the server responds with an error status.
            Nothing is written to ``file`` in that case.
    """
    user_agent = {
        "User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}
    # stream=True defers fetching the body until iter_content() asks for it,
    # so the chunked write below actually streams; the context manager
    # releases the connection when done.
    with requests.get(url, headers=user_agent, stream=True) as r:
        # Check the status before touching the target file, so an error page
        # is never saved in place of the requested content.
        r.raise_for_status()
        with open(file, 'wb') as fd:
            for chunk in r.iter_content(4096):
                fd.write(chunk)
def download(url):
    """Send an HTTP GET request for ``url`` and return the response object.

    The request carries the same browser-like ``User-agent`` header used
    elsewhere in this module. The response is returned as-is; its status is
    not checked here.
    """
    request_headers = {}
    request_headers["User-agent"] = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"
    response = requests.get(url, headers=request_headers)
    return response
def get_abspath_list_file(dst):
    """Return the absolute path of ``../lists/<dst>/list.json``.

    The relative path is resolved against the directory containing this
    source file, whose name is read from the current frame.
    """
    this_file = getframeinfo(currentframe()).filename
    here = path.dirname(path.abspath(this_file))
    relative_target = '../lists/{dst}/list.json'.format(dst=dst)
    joined = path.join(here, relative_target)
    # realpath resolves symlinks; abspath normalises the result.
    return path.abspath(path.realpath(joined))
def get_version():
    """Return today's local date as an integer of the form ``YYYYMMDD``."""
    today = datetime.date.today()
    # Same digits as formatting with '%Y%m%d' and converting to int.
    return today.year * 10000 + today.month * 100 + today.day