Merge tranco scripts: generate_tranco.py generates both the full and 10k lists

pull/154/head
Kevin Holvoet 2020-07-17 09:22:34 +02:00
parent 1d59d7f6f5
commit e0b3968635
5 changed files with 7541 additions and 7527 deletions

File diff suppressed because it is too large Load Diff

View File

@ -77,7 +77,6 @@
"2m.ma",
"2mdn.net",
"2o7.net",
"2track.info",
"300.cn",
"312168.com",
"313.cn",
@ -161,6 +160,7 @@
"8m.com",
"8ozhygzz5eke.com",
"8tracks.com",
"91boshi.net",
"91jm.com",
"91mobiles.com",
"92926.com",
@ -335,7 +335,6 @@
"admin5.com",
"adminbuy.cn",
"admitad.com",
"admixer.co.kr",
"admixer.net",
"admob.com",
"adn.com",
@ -516,10 +515,10 @@
"allabout.co.jp",
"allaboutcookies.org",
"allafrica.com",
"allbest.ru",
"allbusiness.com",
"allegro.pl",
"allenpress.com",
"allkpop.com",
"allmusic.com",
"allocine.fr",
"allposters.com",
@ -808,7 +807,6 @@
"aspca.org",
"aspnetcdn.com",
"asriran.com",
"asrock.com",
"assemblee-nationale.fr",
"associatedcontent.com",
"associates-amazon.com",
@ -866,6 +864,7 @@
"autoblog.com",
"autocar.co.uk",
"autodesk.com",
"autoevolution.com",
"autohome.com.cn",
"automattic.com",
"automox.com",
@ -880,6 +879,7 @@
"avaaz.org",
"avast.com",
"avastbrowser.com",
"avature.net",
"avaz.ba",
"avcdn.net",
"avclub.com",
@ -893,6 +893,7 @@
"avito.ru",
"avma.org",
"avocet.io",
"avon.com",
"avvo.com",
"awardspace.com",
"aweber.com",
@ -1016,6 +1017,7 @@
"bbci.co.uk",
"bbcollab.com",
"bbt.com",
"bbva.com.ar",
"bbva.es",
"bbva.mx",
"bbvanet.com.mx",
@ -1139,7 +1141,6 @@
"birminghammail.co.uk",
"bis.org",
"bisnis.com",
"bit-z.pro",
"bit.do",
"bit.edu.cn",
"bit.ly",
@ -1151,7 +1152,6 @@
"bitcointalk.org",
"bitdefender.com",
"bitdefender.net",
"biteable.com",
"bitfinex.com",
"bitly.com",
"bitmoji.com",
@ -1239,7 +1239,6 @@
"blogspot.ru",
"blogspot.se",
"blogtalkradio.com",
"blogto.com",
"bloody-disgusting.com",
"bloomberg.com",
"bloomberglaw.com",
@ -1264,6 +1263,7 @@
"bmwi.de",
"bmwusa.com",
"bna.com",
"bna.com.ar",
"bnf.fr",
"bnmla.com",
"bnnbloomberg.ca",
@ -1338,7 +1338,6 @@
"bravehost.com",
"bravenet.com",
"bravesites.com",
"bravotube.net",
"bravotv.com",
"braze.com",
"brazzers.com",
@ -1372,6 +1371,7 @@
"brother.com",
"brown.edu",
"brownpapertickets.com",
"brownsfashion.com",
"brunarosso.com",
"brynmawr.edu",
"bs.to",
@ -1391,7 +1391,6 @@
"buaa.edu.cn",
"bucknell.edu",
"buddypress.org",
"budgetphone.nl",
"budsgunshop.com",
"buff.ly",
"buffalo.edu",
@ -1560,10 +1559,10 @@
"carousell.com",
"cars.com",
"carsales.com.au",
"carscoops.com",
"carsensor.net",
"carsforsale.com",
"carto.com",
"carview.co.jp",
"carwale.com",
"cas.cn",
"casadellibro.com",
@ -1703,7 +1702,6 @@
"chimpstatic.com",
"china-ef.com",
"china-embassy.org",
"china.com",
"china.com.cn",
"china.org.cn",
"chinaacc.com",
@ -1927,12 +1925,14 @@
"codinghorror.com",
"coe.int",
"cofeed.com",
"cofile.net",
"cognitivlabs.com",
"cognitoforms.com",
"cognizant.com",
"cogocast.net",
"coinbase.com",
"coindesk.com",
"coingecko.com",
"coinmarketcap.com",
"coinpayu.com",
"cointelegraph.com",
@ -2142,7 +2142,6 @@
"crwdcntrl.net",
"cryoutcreations.eu",
"cryptobrowser.site",
"cryptocompare.com",
"cryptotabbrowser.com",
"cs.com.cn",
"csair.com",
@ -2193,7 +2192,6 @@
"custom-writings.net",
"customink.com",
"customs.gov.cn",
"customwriting.org",
"cutestat.com",
"cutt.ly",
"cutt.us",
@ -2309,6 +2307,7 @@
"defense.gov",
"defensenews.com",
"defimedia.info",
"definicion.de",
"defra.gov.uk",
"degruyter.com",
"delaware.gov",
@ -2352,7 +2351,6 @@
"destatis.de",
"destructoid.com",
"detik.com",
"detnews.com",
"detroitcoralfarms.com",
"detroitnews.com",
"deutsche-bank.de",
@ -2376,7 +2374,6 @@
"diandongwajueji.com",
"diannaojc.com",
"dianping.com",
"dianxiaomi.com",
"diariolibre.com",
"dice.com",
"dicio.com.br",
@ -2646,6 +2643,7 @@
"eastday.com",
"eastmoney.com",
"easychair.org",
"easyhindityping.com",
"easyjet.com",
"easytomessage.com",
"eater.com",
@ -2902,7 +2900,6 @@
"esuteru.com",
"esy.es",
"etao.com",
"etecsa.net",
"etemadonline.com",
"ethereum.org",
"etherscan.io",
@ -3406,6 +3403,7 @@
"gazetaexpress.com",
"gazzetta.gr",
"gazzetta.it",
"gazzettadelsud.it",
"gbcinternetenforcement.net",
"gcloudcs.com",
"gcs-web.com",
@ -3458,6 +3456,7 @@
"getdonspeg.work",
"getdropbox.com",
"getemoji.com",
"getepic.com",
"getfirefox.com",
"getfvid.com",
"getgo.com",
@ -3470,7 +3469,6 @@
"getty.edu",
"gettyimages.com",
"getui.com",
"getui.net",
"gfan.com",
"gfk.com",
"gfx.ms",
@ -3832,6 +3830,7 @@
"hapitas.jp",
"haplat.net",
"haqqin.az",
"haraj.com.sa",
"harborfreight.com",
"hardened-php.net",
"hardrock.com",
@ -3987,7 +3986,6 @@
"hometax.go.kr",
"honda.co.jp",
"honda.com",
"honeygain.com",
"honeywell.com",
"hongkiat.com",
"hoodsite.com",
@ -4250,7 +4248,6 @@
"immi.gov.au",
"immobiliare.it",
"immobilienscout24.de",
"imo.im",
"imo.org",
"imoim.app",
"imooc.com",
@ -4448,6 +4445,7 @@
"isc.org",
"isi.edu",
"islamweb.net",
"islcollective.com",
"ismedia.jp",
"isna.ir",
"isnssdk.com",
@ -4463,6 +4461,7 @@
"italki.com",
"itar-tass.com",
"itau.com.br",
"itavcn.com",
"itch.io",
"itesm.mx",
"iteye.com",
@ -4512,7 +4511,6 @@
"jalopnik.com",
"jamanetwork.com",
"jamendo.com",
"jamesclear.com",
"jamieoliver.com",
"jamnews.com",
"jandan.net",
@ -4609,6 +4607,7 @@
"join.me",
"joinhoney.com",
"joins.com",
"jomashop.com",
"jooble.org",
"joomag.com",
"joomla.org",
@ -4800,7 +4799,6 @@
"knect365.com",
"knet.cn",
"knightlab.com",
"knoji.com",
"knowyourmeme.com",
"knoxnews.com",
"ko-fi.com",
@ -4851,7 +4849,6 @@
"ksu.edu.sa",
"kth.se",
"ktla.com",
"ktvu.com",
"ku.dk",
"ku.edu",
"ku6.com",
@ -4938,6 +4935,7 @@
"lbl.gov",
"lboro.ac.uk",
"lci.fr",
"lcl.fr",
"ldoceonline.com",
"lds.org",
"le.ac.uk",
@ -5168,7 +5166,6 @@
"luckyforbet.com",
"ludashi.com",
"lufthansa.com",
"luisaviaroma.com",
"lulu.com",
"lululemon.com",
"lumenlearning.com",
@ -5221,7 +5218,6 @@
"mail-archive.com",
"mail-order-bride.biz",
"mail-order-bride.net",
"mail-order-brides.org",
"mail.com",
"mail.ru",
"mailchi.mp",
@ -5260,6 +5256,7 @@
"manhuagui.com",
"manithan.com",
"manoramaonline.com",
"manre.me",
"manta.com",
"manualslib.com",
"manuscriptcentral.com",
@ -5277,7 +5274,6 @@
"maricopa.edu",
"marieclaire.com",
"marieclaire.com.tw",
"marijuanabreak.com",
"marinetraffic.com",
"marist.edu",
"markethive.com",
@ -5427,6 +5423,7 @@
"mercadolivre.com",
"mercadolivre.com.br",
"mercadopago.com",
"mercadopago.com.ar",
"mercantilbanco.com",
"mercari.com",
"mercedes-benz.com",
@ -5459,7 +5456,6 @@
"metropoles.com",
"metropolitanbaptistchurch.org",
"metrotimes.com",
"metu.edu.tr",
"mewe.com",
"mewkid.net",
"mext.go.jp",
@ -5539,6 +5535,7 @@
"mirror.co.uk",
"mirtesen.ru",
"mises.org",
"misionesonline.net",
"missouri.edu",
"mit.edu",
"mitre.org",
@ -5558,6 +5555,7 @@
"mktoresp.com",
"ml.com",
"ml314.com",
"mlabs.com.br",
"mlb.com",
"mlit.go.jp",
"mlive.com",
@ -5578,7 +5576,6 @@
"mobile.de",
"mobile.ir",
"mobile01.com",
"mobileapptracking.com",
"mobilesystemservice.com",
"mobinsb.com",
"mobirise.com",
@ -5637,6 +5634,7 @@
"mooo.com",
"mootools.net",
"mop.com",
"moph.go.th",
"mopub.com",
"morganstanley.com",
"morgenpost.de",
@ -5666,8 +5664,6 @@
"moviefone.com",
"movieranker.com",
"movieweb.com",
"movistar.es",
"movistarplus.es",
"movs4u.live",
"moz.com",
"mozaws.net",
@ -5990,7 +5986,6 @@
"newscientist.com",
"newsday.com",
"newser.com",
"newsgator.com",
"newsit.gr",
"newsmax.com",
"newsmth.net",
@ -6692,7 +6687,6 @@
"pku.edu.cn",
"placed.com",
"placeit.net",
"plagiarismdetector.net",
"plala.or.jp",
"planalto.gov.br",
"planetminecraft.com",
@ -6731,6 +6725,7 @@
"poetryfoundation.org",
"poets.org",
"pof.com",
"point2homes.com",
"pojoksatu.id",
"pokemon.com",
"poki.com",
@ -6741,6 +6736,7 @@
"politico.eu",
"politicususa.com",
"politifact.com",
"polito.it",
"polldaddy.com",
"polyfill.io",
"polygon.com",
@ -6759,6 +6755,7 @@
"popularmechanics.com",
"porngo.com",
"pornhat.com",
"pornhd.com",
"pornhub.com",
"pornhubpremium.com",
"pornolab.net",
@ -6973,6 +6970,7 @@
"quickanddirtytips.com",
"quickconnect.to",
"quicksprout.com",
"quidco.com",
"quikr.com",
"quillbot.com",
"quizizz.com",
@ -7094,6 +7092,7 @@
"redditmedia.com",
"redditstatic.com",
"redfin.com",
"redflagdeals.com",
"redgifs.com",
"redhat.com",
"rediff.com",
@ -7132,6 +7131,7 @@
"rentalcars.com",
"repec.org",
"repl.it",
"report.az",
"repretel.com",
"repubblica.it",
"republicworld.com",
@ -7407,6 +7407,7 @@
"sass-lang.com",
"sastasundar.com",
"sat.gob.mx",
"saturn.de",
"saude.gov.br",
"saudigazette.com.sa",
"savefrom.net",
@ -7436,7 +7437,6 @@
"sc.gov.cn",
"scamadviser.com",
"scdn.co",
"scdn.vn",
"scene7.com",
"sch.gr",
"schema.org",
@ -7483,7 +7483,6 @@
"scu.edu.cn",
"scumvv.ca",
"scuoladigitale.info",
"scut.edu.cn",
"sd.gov",
"sda.gov.cn",
"sdamgia.ru",
@ -7635,7 +7634,6 @@
"shinezone.com",
"shinobi.jp",
"shinystat.com",
"shipsautopilot.com",
"sho.com",
"shomanews.com",
"shop-pro.jp",
@ -7691,7 +7689,6 @@
"sina.com",
"sina.com.cn",
"sinaimg.cn",
"sinajs.cn",
"sindonews.com",
"singaporeair.com",
"singular.net",
@ -7726,7 +7723,6 @@
"skillshare.com",
"skimlinks.com",
"skimresources.com",
"skribbl.io",
"skrill.com",
"skroutz.gr",
"sky.com",
@ -7813,6 +7809,7 @@
"so-net.ne.jp",
"so.com",
"soap2day.to",
"soccerstreams.net",
"soccerway.com",
"socdm.com",
"socialbakers.com",
@ -7829,7 +7826,6 @@
"softpedia.com",
"softwareadvice.com",
"sogou.com",
"sogoucdn.com",
"soha.vn",
"sohatv.vn",
"sohu.com",
@ -7902,6 +7898,7 @@
"splcenter.org",
"splice.com",
"splunk.com",
"spokeo.com",
"spokesman.com",
"sponichi.co.jp",
"sporcle.com",
@ -7913,6 +7910,7 @@
"sportbox.ru",
"sportingnews.com",
"sportmaster.ru",
"sportradar.com",
"sportradarserving.com",
"sports.ru",
"sportskeeda.com",
@ -7973,7 +7971,6 @@
"stan.com.au",
"standaard.be",
"standard.co.uk",
"standardchartered.com",
"standardmedia.co.ke",
"stanford.edu",
"staples.com",
@ -8014,6 +8011,7 @@
"steepster.com",
"steepto.com",
"steinberg.net",
"stellamccartney.com",
"stereogum.com",
"stern.de",
"stickyadstv.com",
@ -8043,6 +8041,7 @@
"strath.ac.uk",
"strava.com",
"streamable.com",
"streamall-search.com",
"streamlabs.com",
"streamyard.com",
"streetinsider.com",
@ -8081,7 +8080,6 @@
"sudouest.fr",
"sudrf.ru",
"sueddeutsche.de",
"suhosin.org",
"suicidepreventionlifeline.org",
"suite101.com",
"sulekha.com",
@ -8100,6 +8098,7 @@
"suntrust.com",
"sunysb.edu",
"superbthemes.com",
"superhaber.tv",
"superpages.com",
"supersonic.com",
"supersonicads.com",
@ -8143,6 +8142,7 @@
"symcd.com",
"symfony.com",
"sympatico.ca",
"sympla.com.br",
"synology.com",
"synology.me",
"synxis.com",
@ -8311,13 +8311,11 @@
"test.de",
"testbook.com",
"texas.gov",
"texasmonthly.com",
"texastribune.org",
"text.ru",
"textnow.com",
"tf1.fr",
"tfl.gov.uk",
"tgd.kr",
"tgju.org",
"thairath.co.th",
"thalesgroup.com",
@ -8490,6 +8488,7 @@
"tigris.org",
"tiki.vn",
"tiktok.com",
"tiktokcdn-in.com",
"tiktokcdn.com",
"tiktokv.com",
"tilda.cc",
@ -8632,7 +8631,6 @@
"travelchannel.com",
"travelocity.com",
"traveloka.com",
"travelpod.com",
"travis-ci.org",
"treas.gov",
"treasuredata.com",
@ -8674,6 +8672,7 @@
"trontv.com",
"trudvsem.ru",
"truecaller.com",
"truecar.com",
"trueleadid.com",
"trulia.com",
"truoptik.com",
@ -8804,6 +8803,7 @@
"uc.cn",
"uc.edu",
"uc.pt",
"uc3m.es",
"ucalgary.ca",
"ucar.edu",
"ucas.com",
@ -9143,7 +9143,6 @@
"validcbdoil.com",
"value-domain.com",
"valuecommerce.com",
"valueimpression.com",
"valvesoftware.com",
"vam.ac.uk",
"vancouversun.com",
@ -9996,6 +9995,7 @@
"zro56hd6szoy.com",
"zscaler.com",
"zulily.com",
"zum.com",
"zumiez.com",
"zurb.com",
"zxart.cn",
@ -10010,5 +10010,5 @@
],
"name": "Top 10K most-used sites from Tranco",
"type": "hostname",
"version": 20200714
"version": 20200715
}

View File

@ -1,39 +1,57 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import zipfile
import datetime
import json
import zipfile
tranco_url = 'https://tranco-list.eu/top-1m.csv.zip'
tranco_file = 'top-1m.csv.zip'
user_agent = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0'}
r = requests.get(tranco_url, headers=user_agent)
with open(tranco_file, 'wb') as fd:
for chunk in r.iter_content(4096):
fd.write(chunk)
with zipfile.ZipFile(tranco_file, 'r') as tranco_lists:
for name in tranco_lists.namelist():
if name == 'top-1m.csv':
with tranco_lists.open(name) as tranco:
sites = tranco.readlines()
else:
continue
from generator import download, download_to_file, get_abspath_list_file, get_version
tranco_warninglist = {}
version = int(datetime.date.today().strftime('%Y%m%d'))
tranco_warninglist['description'] = "Event contains one or more entries from the top 1,000,000 most-used sites (Tranco)."
d = datetime.datetime.now()
tranco_warninglist['version'] = version
tranco_warninglist['name'] = "Top 1,000,000 most-used sites from Tranco"
tranco_warninglist['type'] = 'hostname'
tranco_warninglist['list'] = []
tranco_warninglist['matching_attributes'] = ['hostname', 'domain', 'url', 'domain|ip']
def process(file, warninglist, dst, first_10k=False):
    """Build a Tranco warninglist from a downloaded archive and write it to disk.

    Args:
        file: Path of the Tranco zip archive; it must contain ``top-1m.csv``.
        warninglist: Dict pre-filled by the caller (the ``__main__`` block
            supplies ``description`` and ``name``). It is updated in place
            with ``type``, ``version``, ``matching_attributes`` and ``list``.
        dst: Name of the list folder, resolved to a file path through
            ``get_abspath_list_file``.
        first_10k: When true, only the first 10,000 rows of the CSV are used.

    Raises:
        KeyError: If the archive has no ``top-1m.csv`` member.
    """
    # Open the member directly: a missing CSV now fails right here instead of
    # leaving ``sites`` unbound and crashing later with a NameError.
    with zipfile.ZipFile(file, 'r') as tranco_lists:
        with tranco_lists.open('top-1m.csv') as tranco:
            sites = tranco.readlines()
    if first_10k:
        sites = sites[:10000]

    warninglist['type'] = 'hostname'
    warninglist['version'] = get_version()
    warninglist['matching_attributes'] = ['hostname', 'domain', 'url', 'domain|ip']

    # Callers pass a dict without a 'list' key, so create it on demand;
    # entries already supplied by a caller are kept and merged.
    entries = warninglist.setdefault('list', [])
    for site in sites:
        row = site.decode('UTF-8')
        if not row.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Each row is "<rank>,<domain>"; keep the domain column only.
        entries.append(row.split(',')[1].rstrip())
    warninglist['list'] = sorted(set(entries))

    # Written with sorted keys, two-space indent and a trailing newline.
    with open(get_abspath_list_file(dst), 'w') as data_file:
        json.dump(warninglist, data_file, indent=2, sort_keys=True)
        data_file.write("\n")
if __name__ == '__main__':
    # Source archive and the local file name it is saved under.
    tranco_url = 'https://tranco-list.eu/top-1m.csv.zip'
    tranco_file = 'top-1m.csv.zip'

    # Target folder and caller-supplied metadata for the full list.
    tranco_dst = "tranco"
    tranco_warninglist = dict(
        description="Event contains one or more entries from the top 1,000,000 most-used sites (https://tranco-list.eu/).",
        name="Top 1,000,000 most-used sites from Tranco",
    )

    # Target folder and caller-supplied metadata for the 10K subset.
    tranco_10k_dst = "tranco10k"
    tranco_10k_warninglist = dict(
        description="Event contains one or more entries from the top 10K most-used sites (https://tranco-list.eu/).",
        name="Top 10K most-used sites from Tranco",
    )

    # One download feeds both lists.
    download_to_file(tranco_url, tranco_file)

    # Top 1M
    process(tranco_file, tranco_warninglist, tranco_dst)

    # Top 10K
    process(tranco_file, tranco_10k_warninglist, tranco_10k_dst, first_10k=True)

View File

@ -1,39 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import zipfile
import datetime
import json
# Download the Tranco top-1M archive, keep its first 10,000 rows and print
# the resulting warninglist as JSON on stdout.
tranco_url = 'https://tranco-list.eu/top-1m.csv.zip'
tranco_file = 'top-1m.csv.zip'
user_agent = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0'}

r = requests.get(tranco_url, headers=user_agent)
# Stop on an HTTP error instead of saving the error page as the zip file.
r.raise_for_status()
with open(tranco_file, 'wb') as fd:
    for chunk in r.iter_content(4096):
        fd.write(chunk)

# Open the CSV member directly; a missing member raises KeyError here rather
# than leaving ``sites`` undefined further down.
with zipfile.ZipFile(tranco_file, 'r') as tranco_lists:
    with tranco_lists.open('top-1m.csv') as tranco:
        sites = tranco.readlines()[:10000]

version = int(datetime.date.today().strftime('%Y%m%d'))

# Keys are inserted in the same order as before so the printed JSON keeps
# its layout.
tranco_warninglist = {}
tranco_warninglist['description'] = "Event contains one or more entries from the top 10K most-used sites (Tranco)."
tranco_warninglist['version'] = version
tranco_warninglist['name'] = "Top 10K most-used sites from Tranco"
tranco_warninglist['type'] = 'hostname'
tranco_warninglist['list'] = []
tranco_warninglist['matching_attributes'] = ['hostname', 'domain', 'url', 'domain|ip']

for site in sites:
    # Each row is "<rank>,<domain>"; keep the domain column only.
    v = site.decode('UTF-8').split(',')[1]
    tranco_warninglist['list'].append(v.rstrip())
tranco_warninglist['list'] = sorted(set(tranco_warninglist['list']))

print(json.dumps(tranco_warninglist))

35
tools/generator.py Normal file
View File

@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import datetime
from inspect import currentframe, getframeinfo
from os import path
import requests
def download_to_file(url, file):
    """Download ``url`` with an HTTP GET and write the body to ``file``.

    Args:
        url: Address to fetch.
        file: Path of the local file to create or overwrite (binary mode).

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status; nothing is written to ``file`` in that case.
    """
    user_agent = {
        "User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}
    # stream=True lets iter_content() pull the body in 4 KiB pieces instead of
    # chunking a response that was already read into memory in full.
    r = requests.get(url, headers=user_agent, stream=True)
    try:
        # Check the status before opening the target so an error page is
        # never saved in place of the expected download.
        r.raise_for_status()
        with open(file, 'wb') as fd:
            for chunk in r.iter_content(4096):
                fd.write(chunk)
    finally:
        # Release the connection even when the download fails midway.
        r.close()
def download(url):
    """Fetch ``url`` with an HTTP GET and return the ``requests`` response.

    The request carries a fixed browser-like ``User-agent`` header. The
    response is returned as-is; its status is not checked here.
    """
    headers = {
        "User-agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0"}
    response = requests.get(url, headers=headers)
    return response
def get_abspath_list_file(dst):
    """Return the absolute path of ``lists/<dst>/list.json``.

    The ``lists`` folder is looked up next to the folder that holds this
    source file (one level up from it).

    Args:
        dst: Name of the sub-folder inside ``lists``.
    """
    # File name recorded for the currently executing frame, i.e. this module.
    this_file = getframeinfo(currentframe()).filename
    here = path.dirname(path.abspath(this_file))
    relative_target = '../lists/{dst}/list.json'.format(dst=dst)
    joined = path.join(here, relative_target)
    resolved = path.realpath(joined)
    return path.abspath(resolved)
def get_version():
    """Return today's local date as an integer in ``YYYYMMDD`` form."""
    today = datetime.date.today()
    # Same value as parsing the '%Y%m%d' rendering of the date as an int.
    return today.year * 10000 + today.month * 100 + today.day