misp-warninglists/tools/generate-stackpath.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import ipaddress
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

from generator import download_to_file, get_version, write_to_file, get_abspath_source_file


def is_valid(url):
    """
    Tells whether `url` is an absolute URL.

    A URL counts as valid only when parsing it yields both a scheme
    (e.g. "https") and a network location (e.g. "example.com").
    Returns True or False.
    """
    parts = urlparse(url)
    return all((parts.netloc, parts.scheme))


def get_all_website_links(url):
    """
    Collects the links found in the `<a>` tags of the page at `url`.

    Every href is resolved against `url` and reduced to scheme, host and
    path (GET parameters and fragments are dropped). A link is internal
    when its host is the host of `url` or a subdomain of it; every other
    valid link is external.

    Returns a tuple `(urls, internal_urls, external_urls)` of sets, where
    `urls` holds the same links as `internal_urls`.

    Exceptions raised by `requests.get` (including a timeout after 30
    seconds) are not caught here.
    """
    internal_urls = set()
    external_urls = set()

    # host of the page being scraped, without protocol, port or credentials
    domain_name = urlparse(url).hostname or ""
    # a timeout keeps the generator from hanging forever on a stalled server
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # href empty tag
            continue

        # join the URL if it's relative (not absolute link)
        parsed_href = urlparse(urljoin(url, href))
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

        if not is_valid(href):
            # not a valid URL
            continue

        # Compare hosts rather than searching the whole URL for the domain:
        # a substring test would also accept foreign URLs that merely
        # contain the domain somewhere in their path.
        host = parsed_href.hostname or ""
        if host == domain_name or host.endswith("." + domain_name):
            internal_urls.add(href)
        else:
            # external link
            external_urls.add(href)

    # `urls` is kept as a separate set object for callers that unpack three values
    return set(internal_urls), internal_urls, external_urls


def get_file_link(base_url, filename):
    """
    Looks up a link on the page at `base_url` whose URL contains `filename`.

    Only the internal links reported by `get_all_website_links` are
    considered. They come as a set, so when several links match, which
    one is returned is not defined. Returns None when no link matches.
    """
    _, same_site_links, _ = get_all_website_links(base_url)
    return next((link for link in same_site_links if filename in link), None)


def process(files, dst):
    """
    Builds the Stackpath warninglist from downloaded IP block files.

    `files` is an iterable of source file names; each is resolved with
    `get_abspath_source_file` and read line by line, one address or
    network per line. `dst` is handed to `write_to_file` together with
    the finished warninglist.

    Entries that are valid networks are kept as written. Entries with
    host bits set (e.g. "2001:db8::1/64"), which `ipaddress.ip_network`
    rejects, are stored as the bare host address. Blank lines are
    skipped. Any other unparsable line raises ValueError.
    """
    warninglist = {
        'name': "List of known Stackpath CDN IP ranges",
        'version': get_version(),
        'description': "List of known Stackpath (Highwinds) CDN IP ranges (https://support.stackpath.com/hc/en-us/articles/360001091666-Whitelist-CDN-WAF-IP-Blocks)",
        'type': "cidr",
        'list': [],
        'matching_attributes': ["ip-dst", "ip-src", "domain|ip"]
    }

    for file in files:
        with open(get_abspath_source_file(file), 'r') as f:
            for line in f:
                iptoadd = line.strip()
                if not iptoadd:
                    # a blank (e.g. trailing) line carries no address
                    continue
                try:
                    ipaddress.ip_network(iptoadd)
                except ValueError:
                    # Host address given with a prefix length: keep only the
                    # host part. ip_interface handles IPv4 as well as IPv6.
                    iptoadd = str(ipaddress.ip_interface(iptoadd).ip)
                warninglist['list'].append(iptoadd)

    write_to_file(warninglist, dst)


if __name__ == '__main__':
    # Support article that links the current ipblocks.txt attachment
    sp_base_url = "https://support.stackpath.com/hc/en-us/articles/360001091666-Whitelist-CDN-WAF-IP-Blocks"
    filename = 'ipblocks.txt'
    sp_dst = 'stackpath'

    # The attachment URL is scraped from the article on every run.
    url = get_file_link(sp_base_url, filename)
    if url is None:
        # get_file_link found no matching link; stop with a clear message
        # instead of handing None to download_to_file.
        raise SystemExit("No link to {} found on {}".format(filename, sp_base_url))

    file = 'stackpath_{}'.format(filename)
    download_to_file(url, file)
    to_process = [file]

    process(to_process, sp_dst)
Created new Stackpath CDN IP list 2021-04-21 12:09:59 +02:00			`#!/usr/bin/env python3`
			`# -*- coding: utf-8 -*-`
generate-stackpath.py: Added scraper on website to get new link for ipblocks.txt file 2021-06-02 14:51:30 +02:00			`import ipaddress`
			`from urllib.parse import urlparse, urljoin`

			`import requests`
			`from bs4 import BeautifulSoup`
Created new Stackpath CDN IP list 2021-04-21 12:09:59 +02:00
			`from generator import download_to_file, get_version, write_to_file, get_abspath_source_file`
generate-stackpath.py: Added scraper on website to get new link for ipblocks.txt file 2021-06-02 14:51:30 +02:00

			`def is_valid(url):`
			`"""`
			Checks whether `url` is a valid URL.
			`"""`
			`parsed = urlparse(url)`
			`return bool(parsed.netloc) and bool(parsed.scheme)`


			`def get_all_website_links(url):`
			`internal_urls = set()`
			`external_urls = set()`

			`"""`
			Returns all URLs that is found on `url` in which it belongs to the same website
			`"""`
			# all URLs of `url`
			`urls = set()`
			`# domain name of the URL without the protocol`
			`domain_name = urlparse(url).netloc`
			`soup = BeautifulSoup(requests.get(url).content, "html.parser")`

			`for a_tag in soup.findAll("a"):`
			`href = a_tag.attrs.get("href")`
			`if href == "" or href is None:`
			`# href empty tag`
			`continue`

			`# join the URL if it's relative (not absolute link)`
			`href = urljoin(url, href)`

			`parsed_href = urlparse(href)`
			`# remove URL GET parameters, URL fragments, etc.`
			`href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path`

			`if not is_valid(href):`
			`# not a valid URL`
			`continue`
			`if href in internal_urls:`
			`# already in the set`
			`continue`
			`if domain_name not in href:`
			`# external link`
			`if href not in external_urls:`
			`external_urls.add(href)`
			`continue`
			`urls.add(href)`
			`internal_urls.add(href)`

			`return urls, internal_urls, external_urls`


			`def get_file_link(base_url, filename):`
			`urls, internal_urls, external_urls = get_all_website_links(base_url)`
			`for url in internal_urls:`
			`if filename in url:`
			`return url`

Created new Stackpath CDN IP list 2021-04-21 12:09:59 +02:00
			`def process(files, dst):`
			`warninglist = {`
			`'name': "List of known Stackpath CDN IP ranges",`
			`'version': get_version(),`
			`'description': "List of known Stackpath (Highwinds) CDN IP ranges (https://support.stackpath.com/hc/en-us/articles/360001091666-Whitelist-CDN-WAF-IP-Blocks)",`
			`'type': "cidr",`
			`'list': [],`
			`'matching_attributes': ["ip-dst", "ip-src", "domain|ip"]`
			`}`

			`for file in files:`
			`with open(get_abspath_source_file(file), 'r') as f:`
			`ips = f.readlines()`
			`for ip in ips:`
chg: [stackpath] host IPv6 addresses are without subnet 2021-04-21 16:42:55 +02:00			`iptoadd = ip.strip()`
			`try:`
			`ipaddress.ip_network(ip.strip())`
generate-stackpath.py: Added scraper on website to get new link for ipblocks.txt file 2021-06-02 14:51:30 +02:00			`except ValueError as err: # if it's host given strip to the subnet`
chg: [stackpath] host IPv6 addresses are without subnet 2021-04-21 16:42:55 +02:00			`iptoadd = str(ipaddress.IPv6Interface(ip.strip()).ip)`
			`warninglist['list'].append(iptoadd)`
Created new Stackpath CDN IP list 2021-04-21 12:09:59 +02:00
			`write_to_file(warninglist, dst)`

generate-stackpath.py: Added scraper on website to get new link for ipblocks.txt file 2021-06-02 14:51:30 +02:00
Created new Stackpath CDN IP list 2021-04-21 12:09:59 +02:00			`if __name__ == '__main__':`
generate-stackpath.py: Added scraper on website to get new link for ipblocks.txt file 2021-06-02 14:51:30 +02:00			`sp_base_url = "https://support.stackpath.com/hc/en-us/articles/360001091666-Whitelist-CDN-WAF-IP-Blocks"`
			`filename = 'ipblocks.txt'`
Created new Stackpath CDN IP list 2021-04-21 12:09:59 +02:00			`sp_dst = 'stackpath'`

			`to_process = list()`

generate-stackpath.py: Added scraper on website to get new link for ipblocks.txt file 2021-06-02 14:51:30 +02:00			`url = get_file_link(sp_base_url, filename)`
			`file = 'stackpath_{}'.format(filename)`
			`download_to_file(url, file)`
			`to_process.append(file)`
Created new Stackpath CDN IP list 2021-04-21 12:09:59 +02:00
			`process(to_process, sp_dst)`