#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import ipaddress
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

from generator import download_to_file, get_version, write_to_file, get_abspath_source_file
def is_valid(url):
    """Return True when `url` carries both a scheme and a network location."""
    parts = urlparse(url)
    return all((parts.scheme, parts.netloc))
def get_all_website_links(url):
    """
    Fetch `url` and collect every hyperlink found on the page.

    :param url: absolute URL of the page to crawl
    :return: tuple ``(urls, internal_urls, external_urls)`` of sets of
             normalized (query/fragment-stripped) absolute URLs; ``urls``
             and ``internal_urls`` both hold the same-domain links,
             ``external_urls`` holds links pointing to other domains.
    """
    internal_urls = set()
    external_urls = set()
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    # find_all is the modern bs4 name; findAll is a deprecated alias
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # empty or missing href attribute
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already collected
            continue
        # Compare netlocs instead of a substring test: the old
        # `domain_name not in href` wrongly treated external links whose
        # path merely contains the domain string as internal.
        if parsed_href.netloc != domain_name:
            external_urls.add(href)
            continue
        urls.add(href)
        internal_urls.add(href)
    return urls, internal_urls, external_urls
def get_file_link(base_url, filename):
    """Return the first internal URL on `base_url` containing `filename`, or None."""
    _, internal_urls, _ = get_all_website_links(base_url)
    return next((link for link in internal_urls if filename in link), None)
def process(files, dst):
    """
    Build the Stackpath CDN warninglist from the downloaded IP-block files
    and write it out.

    :param files: iterable of source file names, resolved on disk via
                  ``get_abspath_source_file``; one IP or CIDR per line
    :param dst: destination list name handed to ``write_to_file``
    """
    warninglist = {
        'name': "List of known Stackpath CDN IP ranges",
        'version': get_version(),
        'description': "List of known Stackpath (Highwinds) CDN IP ranges (https://support.stackpath.com/hc/en-us/articles/360001091666-Whitelist-CDN-WAF-IP-Blocks)",
        'type': "cidr",
        'list': [],
        'matching_attributes': ["ip-dst", "ip-src", "domain|ip"]
    }
    for src in files:
        with open(get_abspath_source_file(src), 'r') as f:
            for line in f:
                entry = line.strip()
                if not entry:
                    # blank line: nothing to validate (the old code crashed here)
                    continue
                try:
                    ipaddress.ip_network(entry)
                except ValueError:
                    # Host address with host bits set: keep only the plain IP.
                    # ip_interface accepts IPv4 and IPv6 alike; the original
                    # IPv6Interface raised ValueError on IPv4 host entries.
                    entry = str(ipaddress.ip_interface(entry).ip)
                warninglist['list'].append(entry)
    write_to_file(warninglist, dst)
if __name__ == '__main__':
    # Direct link to the text file attached to
    # https://support.stackpath.com/hc/en-us/articles/360001091666-Whitelist-CDN-WAF-IP-Blocks
    sp_base_url = "https://support.stackpath.com/hc/en-us/article_attachments/360096407372/ipblocks.txt"
    filename = 'ipblocks.txt'
    sp_dst = 'stackpath'
    # download the IP-block list under a source-prefixed local name
    local_name = 'stackpath_{}'.format(filename)
    download_to_file(sp_base_url, local_name)
    process([local_name], sp_dst)