adding wikipedia scrapper for google domains
parent
93b6a5c120
commit
02b4bf8a52
|
@ -0,0 +1,53 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from lxml import etree
|
||||
from bs4 import BeautifulSoup
|
||||
import datetime
|
||||
import urllib.request, urllib.parse, urllib.error
|
||||
import json
|
||||
|
||||
#webpage = urllib.request.urlopen("https://en.wikipedia.org/w/index.php?title=List_of_Google_domains&printable=yes")
|
||||
webpage = urllib.request.urlopen("https://en.wikipedia.org/w/index.php?title=List_of_Google_domains")
|
||||
soup = BeautifulSoup(webpage,'html.parser')
|
||||
|
||||
tables = soup.find_all("table", { "class" : "wikitable sortable" })
|
||||
#print(tables)
|
||||
|
||||
gdomains = []
|
||||
for tabl in tables :
|
||||
|
||||
for row in tabl.findAll("tr"):
|
||||
cells = row.findAll('td')
|
||||
if len(cells) == 4:
|
||||
domain = cells[2].find_all(text=True)
|
||||
|
||||
if len(domain) is 1 :
|
||||
domain = "{}".format(domain[0])
|
||||
elif len(domain) is 2 :
|
||||
domain = "{}{}".format(domain[0], domain[1])
|
||||
elif len(domain) is 3 and not "[" in domain[2]:
|
||||
domain = "{}{}".format(domain[0], domain[2])
|
||||
else:
|
||||
domain = "{}{}".format(domain[0], domain[1])
|
||||
|
||||
print(domain)
|
||||
|
||||
gdomains.append(domain)
|
||||
#print(gdomains)
|
||||
gdomains = sorted(set(gdomains))
|
||||
|
||||
google_warninglist = {}
|
||||
version = int(datetime.date.today().strftime('%Y%m%d'))
|
||||
|
||||
google_warninglist['description'] = "Event contains one or more entries from the google owned domains."
|
||||
d = datetime.datetime.now()
|
||||
google_warninglist['version'] = version
|
||||
google_warninglist['name'] = "Known Google domains"
|
||||
google_warninglist['list'] = []
|
||||
google_warninglist['matching_attributes'] = ['hostname', 'domain']
|
||||
|
||||
for site in gdomains:
|
||||
#v = str(site).split(',')[1]
|
||||
google_warninglist['list'].append(site)
|
||||
google_warninglist['list'] = sorted(set(google_warninglist['list']))
|
||||
#print(json.dumps(google_warninglist))
|
Loading…
Reference in New Issue