Merge pull request #574 from VVX7/main

new: [dev] add ASPI's China Defence University Tracker.
2020-08-23 10:12:02 +02:00 · 2020-08-23 10:12:02 +02:00 · a0fd1a2bc5
parent 6d546eb025 4635146b00
commit a0fd1a2bc5
3 changed files with 5814 additions and 0 deletions
--- a/clusters/china-defence-universities.json
+++ b/clusters/china-defence-universities.json
--- a/galaxies/china-defence-universities.json
+++ b/galaxies/china-defence-universities.json
@ -0,0 +1,9 @@
+{
+  "description": "China Defence Universities",
+  "icon": "globe",
+  "name": "China Defence Universities",
+  "namespace": "misp",
+  "type": "china-defence-universities",
+  "uuid": "c51c59e9-f213-4ad4-9913-09a43d78dff5",
+  "version": 1
+}
--- a/tools/gen_defence_university.py
+++ b/tools/gen_defence_university.py
@ -0,0 +1,295 @@
+#!/usr/bin/python3
+import requests
+import json
+from bs4 import BeautifulSoup
+import bs4
+import uuid
+
+# This tool is part of the MISP core project and released under the GNU Affero
+# General Public License v3.0
+#
+# Copyright (C) 2020 Cormac Doherty
+# Copyright (C) 2020 Roger Johnston
+#
+#
+# version 0.1 - initial
+# version 0.2 - fixed typo ( _curRef NOT curRef)
+
+def _buildArticleSection(nxtSibling):
+    """Collect the run of consecutive <p> siblings starting at ``nxtSibling``.
+
+    Each paragraph is broken into "statements": dicts holding the accumulated
+    text under ``"clause"`` and, when a <span> containing a link follows that
+    text, the link's href under ``"ref"``. Text left over at the end of a
+    paragraph without a link becomes a statement with only ``"clause"``.
+
+    Walking stops at the first sibling whose tag name is not ``p`` or when
+    there is no further sibling.
+
+    :param nxtSibling: first candidate element (read via ``.name``,
+        ``.contents`` and ``.find_next_sibling()``); may be None.
+    :returns: list of statement dicts in document order; empty when
+        ``nxtSibling`` is None or is not a <p>.
+    """
+    _sectionParagraphs = []
+    _nxtsib = nxtSibling
+
+    # Headings and their content are at the same hierarchical
+    # level in the html - just a sequence. This loop is bounded on
+    # the next element being a <p>
+    while ((_nxtsib is not None) and (_nxtsib.name == 'p')):
+        # Almost every sentence, if not clause, in paragraph
+        # text is referenced/cited/footnoted.
+        #
+        # The following iterates through the sequence of 'tokens'
+        # in the current <p>, building 'statements' composed of a
+        # clause and a reference.
+        #
+        # so-called "clauses" and "references" are accumulated over
+        # loop iterations i.e. a clause is appended to previous clauses
+        # if a reference has yet to be accumulated. (implicitly -
+        # references come after statements.)
+        #
+        # Once a 'clause' AND a 'reference' are accumulated, an encapsulating
+        # 'statement' is appended to the section's list of paragraphs and
+        # both are reset.
+        #
+        _curClause = None
+        _curRef = None
+
+        for token in _nxtsib.contents:
+            # References (links) are interleaved within text blocks as <spans>.
+            # The following control structure parses 'the next token' as
+            #    - <spans> containing a link
+            #    - disposable 'junk' if its <em>phasised and contains "Last updated"
+            #    - as relevant paragraph text to be accumulated.
+            if (token.name == 'span'):
+                # Only the first anchor inside the span is used as the reference.
+                _anchors = token.find_all('a', recursive=True)
+                _anch = None
+                if (len(_anchors) != 0):
+                    _anch = _anchors[0]
+
+                if (_anch is not None):
+                    _curRef = _anch['href']
+                else:
+                    # A span without any link clears the pending reference.
+                    _curRef = None
+            elif ((token.name != 'em') or (not ("Last updated" in token.text))):  # ignore the "Last updated" footer
+                if (_curClause is not None):
+                    # Extend the clause in progress: plain strings are appended
+                    # as-is, tags contribute their text.
+                    if (isinstance(token, bs4.element.NavigableString)):
+                        _curClause = _curClause + token
+                    else:
+                        _curClause = _curClause + token.text
+                else:
+                    # anomalous html handling
+                    #  - <strong> and
+                    #  - (useless) <a> tags
+                    # appear in a few places
+                    if ((token.name != 'strong') and
+                            (token.name != 'em') and
+                            (token.name != 'br') and
+                            (token.name != 'sup') and
+                            (token.name != 'a')):
+                        # Start a new clause with this token; the tags excluded
+                        # above are dropped when they would open a clause.
+                        # NOTE(review): a tag not in the exclusion list is stored
+                        # as the tag object itself, not its text - confirm this
+                        # cannot occur in the scraped pages.
+                        _curClause = token  # this quashes them
+
+            # Once a 'clause' AND a 'reference' are accumulated, an encapsulating
+            # 'statement' is appended to the section's list of paragraphs and
+            # both are reset.
+            if ((_curRef is not None) and (_curClause is not None)):
+                statement = {}
+                statement["clause"] = _curClause
+                statement["ref"] = _curRef
+                _sectionParagraphs.append(statement)
+                _curClause = None
+                _curRef = None
+
+        # If a sequence of 'clauses' have been accumulated without finding a reference
+        # create a reference-LESS statement.
+        # NOTE(review): a reference seen before any clause text in this paragraph is
+        # discarded here, because _curRef is re-initialised for the next <p>.
+        if ((_curClause is not None) and (not "Last updated" in _curClause)):
+            statement = {}
+            statement["clause"] = _curClause
+            _sectionParagraphs.append(statement)
+
+        _nxtsib = _nxtsib.find_next_sibling()
+
+    return _sectionParagraphs
+
+
+def _buildListSection(listContent):
+    laboratories = []
+    for lab in listContent.find_all('li', recursive="False"):
+        _lab = {}
+        _lab['name'] = lab.contents[0].replace(u'\xa0', '')
+
+        ref = lab.find('a')
+        if (ref is not None):
+            _lab['ref'] = ref['href']
+        else:
+            _lab['ref'] = None
+
+        laboratories.append(_lab)
+
+    return laboratories
+
+
+def _fetchArticle(url):
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html5lib')
+    _article = soup.body.find_all('article')[0]
+
+    article = {}
+    article['url'] = url
+    article['name'] = _article.h1.text.replace('\n', '').strip()
+    article['_name'] = _article.h2.contents[0]
+
+    _artbody = _article.find('div', {"class": "article__copy"})
+
+    # Risk Statement
+    article['risk statement'] = _artbody.find('p').text
+
+    article['intro'] = _buildArticleSection(_artbody.find('p').find_next_sibling())
+
+    # Article body
+    sections = []
+
+    for _heading in _artbody.findChildren('h2'):
+        _nxtSibling = _heading.find_next_sibling()
+
+        section = {}
+        section['title'] = _heading.text
+        if (_nxtSibling.name == 'ul'):
+            section['body'] = _buildListSection(_nxtSibling)
+        else:
+            section['body'] = _buildArticleSection(_nxtSibling)
+        sections.append(section)
+
+    article['sections'] = sections
+
+    #    # Logo
+    #    logo = _article.div[0].aside[0].find("div", {"class": "aside__logo"})
+
+    _panel = _article.find("div", {"class": "aside__groups cf"})
+    _paneldivs = _panel.find_all('div')
+
+    for _paneldiv in _panel.find_all('div'):
+        _title = _paneldiv.find('h3').text
+        _items = []
+        for _item in _paneldiv.find_all('li'):
+            _anch = _item.find('a')
+            if (_anch is not None):
+                if ("Location" in _title):  # locations
+                    _loc = {}
+                    _loc['name'] = _anch.contents[0].replace('\n', '').strip()
+                    _loc['ref'] = _anch['href']
+                    _latlong = _anch['href'].split("=")[1]
+                    _loc['lat'] = _latlong.split(",")[0]
+                    _loc['long'] = _latlong.split(",")[1]
+                    _items.append(_loc)
+                else:
+                    _items.append(_anch.text)
+            else:
+                _items.append(_item.text.replace('\n', '').strip())
+        article[_title.lower()] = _items
+
+    return article
+
+
+def _gen_galaxy(scrape):
+    base = {
+        "authors": [
+            "Australian Strategic Policy Institute"
+        ],
+        "category": "academic-institution",
+        "description": "The China Defence Universities Tracker is a database of Chinese institutions engaged in military or security-related science and technology research. It was created by ASPI’s International Cyber Policy Centre.",
+        "name": "China Defence Universities Tracker",
+        "source": "ASPI International Cyber Policy Centre",
+        "type": "china-defence-universities",
+        "uuid": "d985d2eb-d6ad-4b44-9c69-44eb90095e23",
+        "values": [
+        ],
+        "version": 1
+    }
+
+    for uni in scrape:
+        new_template = template = {
+            "description": "",
+            "meta": {
+                "refs": []
+            },
+            "uuid": "",
+            "value": ""
+        }
+
+        new_template["uuid"] = str(uuid.uuid4())
+
+        new_template["meta"]["refs"].append(uni["url"])
+
+        new_template["value"] = uni["name"] + f" ({uni['_name']})"
+
+        def _append_meta(key, meta):
+            if uni.get(meta):
+                values = []
+                for value in uni[meta]:
+                    if value != "":
+                        values.append(value)
+                if values:
+                    new_template["meta"][key] = values
+
+        if uni.get("intro"):
+            for intro in uni["intro"]:
+                new_template["description"] += intro["clause"]
+            if new_template["description"] == "":
+                new_template["description"] += uni["name"] + f" ({uni['_name']})"
+        else:
+            new_template["description"] += uni["name"] + f" ({uni['_name']})"
+
+        if uni.get("risk"):
+            if uni.get("risk") != "":
+                new_template["meta"]["risk"] = uni["risk statement"]
+
+        _append_meta("aliases", "aliases")
+
+        _append_meta("supervising agencies", "supervising agencies")
+
+        _append_meta("subsidiaries", "subsidiaries")
+
+        _append_meta("topics", "topics")
+
+        _append_meta("categories", "categories")
+
+        if uni.get("sections"):
+            labs = []
+            for section in uni["sections"]:
+                if section["title"] == "Major defence laboratories":
+                    for lab in section["body"]:
+                        if lab.get("name"):
+                            if lab["name"] != "":
+                                labs.append(lab["name"])
+            if labs:
+                new_template["meta"]["major defence laboratories"] = labs
+
+        if uni.get("location"):
+            if uni.get(uni["location"][0]["name"]) != "":
+                new_template["meta"]["address"] = uni["location"][0]["name"]
+            if uni.get(uni["location"][0]["lat"]) != "":
+                new_template["meta"]["lat"] = uni["location"][0]["lat"]
+            if uni.get(uni["location"][0]["long"]) != "":
+                new_template["meta"]["long"] = uni["location"][0]["long"]
+
+        base["values"].append(new_template)
+
+    return base
+
+
+def main():
+    url = "https://unitracker.aspi.org.au"
+    response = requests.get(url)
+
+    soup = BeautifulSoup(response.content, 'html5lib')
+
+    table = soup.find_all('table')[0]  # Grab the first table
+    head = None
+    articles = []
+    for row in table.find_all('tr'):
+        if head is not None:
+            colOne = row.find_all('td')[0].find_all('a')[0]['href']
+            article = _fetchArticle(url + colOne)
+            print("Processing: {}".format(url + colOne))
+            articles.append(article)
+        else:
+            head = "bloop"
+
+    galaxy = _gen_galaxy(articles)
+
+    print(galaxy)
+
+    with open("china-defence-universities.json", "w") as g:
+        g.write(json.dumps(galaxy, indent=4, sort_keys=True))
+
+
+if __name__ == "__main__":
+    main()