#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import ipaddress
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

from generator import download_to_file, get_version, write_to_file, get_abspath_source_file


def is_valid(url):
    """Return True when `url` has both a scheme and a network location."""
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    """
    Collect the links found in the `<a>` tags of the page at `url`.

    Every href is resolved against `url` and reduced to
    scheme + netloc + path (query string and fragment are dropped).

    Returns a tuple `(urls, internal_urls, external_urls)` of sets:
    `urls` and `internal_urls` both hold the links whose host contains the
    host of `url`; `external_urls` holds all the other valid links.
    """
    urls = set()
    internal_urls = set()
    external_urls = set()
    # host of the page being scanned, without the protocol
    domain_name = urlparse(url).netloc
    # a timeout keeps the script from hanging forever on a stalled server
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # missing or empty href attribute
            continue
        # resolve relative links against the page URL
        parsed_href = urlparse(urljoin(url, href))
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            continue
        # compare hosts only: matching against the whole URL would classify
        # an external link as internal whenever its path mentions the domain
        if domain_name not in parsed_href.netloc:
            external_urls.add(href)
            continue
        urls.add(href)
        internal_urls.add(href)
    return urls, internal_urls, external_urls


def get_file_link(base_url, filename):
    """
    Return an internal link of the page at `base_url` that contains
    `filename`, or None when no such link exists.
    """
    _, internal_urls, _ = get_all_website_links(base_url)
    for url in internal_urls:
        if filename in url:
            return url
    return None


def process(files, dst):
    """
    Build the Stackpath CDN warninglist from `files` and write it to `dst`.

    Each file is read through `get_abspath_source_file` and is expected to
    hold one IPv4/IPv6 address or CIDR range per line. Blank lines are
    skipped. A line that is not a valid network because host bits are set
    is replaced by its bare host address.

    Raises ValueError when a non-blank line is not an IP address at all.
    """
    warninglist = {
        'name': "List of known Stackpath CDN IP ranges",
        'version': get_version(),
        'description': "List of known Stackpath (Highwinds) CDN IP ranges (https://support.stackpath.com/hc/en-us/articles/360001091666-Whitelist-CDN-WAF-IP-Blocks)",
        'type': "cidr",
        'list': [],
        'matching_attributes': ["ip-src", "ip-dst", "domain|ip", "ip-src|port", "ip-dst|port"]
    }

    for file in files:
        with open(get_abspath_source_file(file), 'r') as f:
            for line in f:
                iptoadd = line.strip()
                if not iptoadd:
                    # tolerate empty lines instead of failing the whole run
                    continue
                try:
                    ipaddress.ip_network(iptoadd)
                except ValueError:
                    # host bits are set: keep only the host address.
                    # ip_interface handles IPv4 as well as IPv6 entries.
                    iptoadd = str(ipaddress.ip_interface(iptoadd).ip)
                warninglist['list'].append(iptoadd)

    write_to_file(warninglist, dst)


if __name__ == '__main__':
    # The text file is attached to this article:
    # https://support.stackpath.com/hc/en-us/articles/360001091666-Whitelist-CDN-WAF-IP-Blocks
    sp_base_url = "https://k3t9x2h3.map2.ssl.hwcdn.net/ipblocks.txt"
    filename = 'ipblocks.txt'
    sp_dst = 'stackpath'

    to_process = []

    # Link discovery is currently disabled; the file is fetched directly.
    # url = get_file_link(sp_base_url, filename)
    file = 'stackpath_{}'.format(filename)
    download_to_file(sp_base_url, file)
    to_process.append(file)

    process(to_process, sp_dst)