add: [api/backend] new full-text indexer

First version using the Python Whoosh library (maybe not optimal in the long run). The indexer runs by enumerating the item(s) from the CyCAT backend.
2021-05-31 11:40:08 +02:00 · 2021-05-31 11:40:08 +02:00 · 44d2176a23
parent 5e0df4b667
commit 44d2176a23
4 changed files with 106 additions and 2 deletions
--- a/backend/bin/server.py
+++ b/backend/bin/server.py
@ -14,7 +14,16 @@ cycat_type = {"1": "Publisher", "2": "Project", "3": "Item"}

 r = redis.Redis(host='127.0.0.1', port='3033', decode_responses=True)

-# genericc lib - TODO: move to cycat Python library
+# full-text part (/search API)
+
+from whoosh import index, qparser
+from whoosh.fields import Schema, TEXT, ID
+from whoosh.qparser import QueryParser
+# Directory holding the whoosh index. The path is relative, so it is resolved
+# against the process's current working directory.
+indexpath = "../index"
+# Opened once when the module is loaded; the /search handler reuses this object.
+ix = index.open_dir(indexpath)
+
+
+# generic lib - TODO: move to cycat Python library

 def _validate_uuid(value=None):
    if uuid is None:
@ -182,5 +191,18 @@ class propose(Resource):
        r.rpush("proposal", json.dumps(x))
        return {'message': 'Proposal submitted'}, 200

+@api.route('/search/<string:searchquery>')
+@api.doc(description="Full-text search in CyCAT and return matching UUID.")
+class search(Resource):
+    # GET handler: parse the query against the "content" field and return the
+    # stored 'path' value of every hit; None when no query is supplied.
+    def get(self, searchquery=None):
+        if searchquery is None:
+            return None
+        with ix.searcher() as searcher:
+            parser = QueryParser("content", ix.schema)
+            parsed = parser.parse(searchquery)
+            # limit=None asks whoosh for every hit instead of a capped page.
+            hits = searcher.search(parsed, limit=None)
+            # Read the hits while the searcher is still open.
+            uuids = [hit['path'] for hit in hits]
+        return uuids
 if __name__ == '__main__':
    app.run()
--- a/backend/run.sh
+++ b/backend/run.sh
@ -1,2 +1,3 @@
 ./kvrocks/src/kvrocks -c ./etc/kvrocks.conf
-python3.8 ./bin/server.py
+# server.py opens "../index" relative to its working directory, so it must be
+# started from bin/. Abort instead of launching it from the wrong place.
+cd bin || exit 1
+python3.8 server.py
--- a/backend/sbin/indexer.py
+++ b/backend/sbin/indexer.py
@ -0,0 +1,61 @@
+import redis
+import os
+import sys
+
+cycat_type = {"1": "Publisher", "2": "Project", "3": "Item"}
+
+rdb = redis.Redis(host='127.0.0.1', port='3033', decode_responses=True)
+
+from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
+from whoosh.analysis import StemmingAnalyzer
+from whoosh.index import create_in, exists_in, open_dir
+
+# Index layout: `title` and `path` are stored with each document, `content` is
+# indexed only. `path` is declared unique so update_document() can replace an
+# existing entry for the same path.
+schema = Schema(
+    title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT
+)
+# Relative path: resolved against the current working directory.
+indexpath = "../index"
+# exist_ok avoids a failure if the directory appears between a check and the
+# creation (e.g. two indexer runs started together).
+os.makedirs(indexpath, exist_ok=True)
+
+# Create the index on first run, otherwise reuse the one already on disk.
+if not exists_in(indexpath):
+    ix = create_in(indexpath, schema)
+else:
+    ix = open_dir(indexpath)
+
+try:
+    writer = ix.writer()
+except Exception:
+    # Exception rather than a bare except: Ctrl-C and SystemExit must not be
+    # reported as a lock problem.
+    print("Index is locked.")
+    sys.exit(1)
+
+def getUUID(oid=None, oidtype=1):
+    """Return the hash stored under "<oidtype>:<oid>", or None if oid is None."""
+    if oid is not None:
+        key = f"{oidtype}:{oid}"
+        return rdb.hgetall(key)
+    return None
+
+# Hash fields that feed the stored `title` and the indexed `content`.
+# `raw` contributes to `content` only.
+title_fields = ('title', 'description', 'mitre-cti:description', 'github:description')
+content_fields = ('title', 'raw', 'description', 'mitre-cti:description', 'github:description')
+
+# Walk every sorted set "t:<type>" in pages of 100 members and (re)index each
+# member under its identifier.
+for ctype in cycat_type:
+    key = "t:{}".format(ctype)
+    card = rdb.zcard(key)
+    for start in range(0, card, 100):
+        # ZRANGE bounds are inclusive: start..start+99 is exactly 100 members,
+        # so consecutive pages no longer overlap by one.
+        x = rdb.zrange(key, start, start + 99)
+        for item in x:
+            toindex = getUUID(oid=item, oidtype=ctype)
+            print(toindex)
+            # Join with a space so the last word of one field and the first
+            # word of the next stay separate tokens for the full-text index.
+            title = " ".join(toindex[f] for f in title_fields if f in toindex)
+            # `raw` is appended alongside the title instead of replacing it.
+            content = " ".join(toindex[f] for f in content_fields if f in toindex)
+            writer.update_document(title=title, path=item, content=content)
+# Single commit at the end: nothing is visible to searchers until this point.
+writer.commit()
--- a/backend/sbin/search.py
+++ b/backend/sbin/search.py
@ -0,0 +1,20 @@
+import argparse
+from whoosh import index, qparser
+from whoosh.fields import Schema, TEXT, ID
+from whoosh.qparser import QueryParser
+# Directory of the whoosh index, relative to the current working directory.
+indexpath = "../index"
+argParser = argparse.ArgumentParser(description="Full text search for cycat")
+# required=True: without -q, args.q would be None and len(args.q) below would
+# raise TypeError; argparse now prints a usage error instead.
+argParser.add_argument("-q", action="append", required=True, help="query to lookup (one or more)")
+args = argParser.parse_args()
+ix = index.open_dir(indexpath)
+
+with ix.searcher() as searcher:
+    if len(args.q) == 1:
+        # Single -q: use the parser's default grouping.
+        query = QueryParser("content", ix.schema).parse(" ".join(args.q))
+    else:
+        # Several -q values: combine them with an explicit AND group.
+        query = QueryParser("content", schema=ix.schema, group=qparser.AndGroup).parse(" ".join(args.q))
+
+    # limit=None returns every hit; print each stored path, then the summary.
+    results = searcher.search(query, limit=None)
+    for result in results:
+        print(result['path'])
+    print(results)