diff --git a/tools/generate-google.py b/tools/generate-google.py new file mode 100644 index 0000000..e5d9299 --- /dev/null +++ b/tools/generate-google.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 + +from lxml import etree +from bs4 import BeautifulSoup +import datetime +import urllib.request, urllib.parse, urllib.error +import json + +#webpage = urllib.request.urlopen("https://en.wikipedia.org/w/index.php?title=List_of_Google_domains&printable=yes") +webpage = urllib.request.urlopen("https://en.wikipedia.org/w/index.php?title=List_of_Google_domains") +soup = BeautifulSoup(webpage,'html.parser') + +tables = soup.find_all("table", { "class" : "wikitable sortable" }) +#print(tables) + +gdomains = [] +for tabl in tables : + + for row in tabl.findAll("tr"): + cells = row.findAll('td') + if len(cells) == 4: + domain = cells[2].find_all(text=True) + + if len(domain) is 1 : + domain = "{}".format(domain[0]) + elif len(domain) is 2 : + domain = "{}{}".format(domain[0], domain[1]) + elif len(domain) is 3 and not "[" in domain[2]: + domain = "{}{}".format(domain[0], domain[2]) + else: + domain = "{}{}".format(domain[0], domain[1]) + + print(domain) + + gdomains.append(domain) +#print(gdomains) +gdomains = sorted(set(gdomains)) + +google_warninglist = {} +version = int(datetime.date.today().strftime('%Y%m%d')) + +google_warninglist['description'] = "Event contains one or more entries from the google owned domains." +d = datetime.datetime.now() +google_warninglist['version'] = version +google_warninglist['name'] = "Known Google domains" +google_warninglist['list'] = [] +google_warninglist['matching_attributes'] = ['hostname', 'domain'] + +for site in gdomains: + #v = str(site).split(',')[1] + google_warninglist['list'].append(site) +google_warninglist['list'] = sorted(set(google_warninglist['list'])) +#print(json.dumps(google_warninglist))