add: [api/backend] new full-text indexer

First version using the Python Whoosh library (may not be optimal in the long run).

The indexer works by enumerating the items from the CyCAT backend.
main
Alexandre Dulaunoy 2021-05-31 11:40:08 +02:00
parent 5e0df4b667
commit 44d2176a23
No known key found for this signature in database
GPG Key ID: 09E2CD4944E6CBCD
4 changed files with 106 additions and 2 deletions

View File

@ -14,7 +14,16 @@ cycat_type = {"1": "Publisher", "2": "Project", "3": "Item"}
r = redis.Redis(host='127.0.0.1', port='3033', decode_responses=True)
# genericc lib - TODO: move to cycat Python library
# full-text part (/search API)
from whoosh import index, qparser
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
# Directory holding the full-text index. This is a relative path, so it is
# resolved against the working directory the server is started from.
indexpath = "../index"
# Index handle opened once when the module is loaded and reused by the
# /search endpoint.
# NOTE(review): presumably this fails at startup if the indexer has never
# been run and the directory holds no index - confirm.
ix = index.open_dir(indexpath)
# generic lib - TODO: move to cycat Python library
def _validate_uuid(value=None):
if uuid is None:
@ -182,5 +191,18 @@ class propose(Resource):
r.rpush("proposal", json.dumps(x))
return {'message': 'Proposal submitted'}, 200
@api.route('/search/<string:searchquery>')
@api.doc(description="Full-text search in CyCAT and return matching UUID.")
class search(Resource):
    # Full-text search endpoint backed by the module-level index `ix`.

    def get(self, searchquery=None):
        # Without a query there is nothing to look up.
        if searchquery is None:
            return None
        # Queries are parsed against the "content" field of the index schema.
        parser = QueryParser("content", ix.schema)
        with ix.searcher() as searcher:
            hits = searcher.search(parser.parse(searchquery), limit=None)
            # Collect the stored paths before the searcher is closed.
            uuids = [hit['path'] for hit in hits]
        return uuids
# Run the application only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    app.run()

View File

@ -1,2 +1,3 @@
./kvrocks/src/kvrocks -c ./etc/kvrocks.conf
python3.8 ./bin/server.py
cd bin
python3.8 server.py

61
backend/sbin/indexer.py Normal file
View File

@ -0,0 +1,61 @@
import redis
import os
import sys
# CyCAT type identifier -> human-readable type name. The identifiers are
# also used to build the "t:<id>" and "<id>:<oid>" backend keys below.
cycat_type = {
    "1": "Publisher",
    "2": "Project",
    "3": "Item",
}
# Client for the backend store; responses are decoded to str.
rdb = redis.Redis(host="127.0.0.1", port="3033", decode_responses=True)
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer
from whoosh.index import create_in, exists_in, open_dir
# Index layout: `title` and `path` are stored so they can be read back from
# search hits; `path` is unique so update_document() replaces an existing
# entry instead of duplicating it; `content` is only indexed for searching.
schema = Schema(
    title=TEXT(stored=True), path=ID(stored=True, unique=True), content=TEXT
)
# Relative path: resolved against the directory the script is started from.
indexpath = "../index"
# Create the directory if needed without racing against a concurrent run.
os.makedirs(indexpath, exist_ok=True)
# Reuse an existing index, otherwise create a fresh one with the schema above.
if exists_in(indexpath):
    ix = open_dir(indexpath)
else:
    ix = create_in(indexpath, schema)
try:
    writer = ix.writer()
except Exception as err:
    # Catch Exception rather than using a bare except so that Ctrl-C and
    # SystemExit are not swallowed, and report the underlying cause so a
    # failure other than a held lock is not misdiagnosed.
    print("Index is locked.")
    print("Unable to open index writer: {!r}".format(err), file=sys.stderr)
    sys.exit(1)
def getUUID(oid=None, oidtype=1):
    """Fetch the hash stored under the key ``<oidtype>:<oid>``.

    Returns ``None`` when no ``oid`` is given; otherwise returns whatever
    ``HGETALL`` yields for that key.
    """
    if oid is not None:
        key = '{}:{}'.format(oidtype, oid)
        return rdb.hgetall(key)
    return None
# Number of sorted-set members fetched per ZRANGE call.
_PAGE_SIZE = 100
# Hash fields whose text is added to both the stored title and the
# searchable content.
_DESCRIPTION_FIELDS = (
    'description',
    'mitre-cti:description',
    'github:description',
)

# Walk every object of every CyCAT type and (re)index it.
for ctype in cycat_type:
    type_key = "t:{}".format(ctype)
    card = rdb.zcard(type_key)
    for start in range(0, card, _PAGE_SIZE):
        # ZRANGE bounds are inclusive: a page of _PAGE_SIZE members ends at
        # start + _PAGE_SIZE - 1. Using start + _PAGE_SIZE would fetch one
        # extra member and index it again on the next page.
        page = rdb.zrange(type_key, start, start + _PAGE_SIZE - 1)
        for item in page:
            toindex = getUUID(oid=item, oidtype=ctype)
            print(toindex)
            title_parts = []
            content_parts = []
            if 'title' in toindex:
                title_parts.append(toindex['title'])
                content_parts.append(toindex['title'])
            if 'raw' in toindex:
                # Append rather than overwrite, so the title text collected
                # above stays searchable.
                content_parts.append(toindex['raw'])
            for field in _DESCRIPTION_FIELDS:
                if field in toindex:
                    title_parts.append(toindex[field])
                    content_parts.append(toindex[field])
            # Join with a space so the last word of one field and the first
            # word of the next are not fused into a single token.
            writer.update_document(
                title=" ".join(title_parts),
                path=item,
                content=" ".join(content_parts),
            )
# Single commit once every type has been processed.
writer.commit()

20
backend/sbin/search.py Normal file
View File

@ -0,0 +1,20 @@
import argparse
from whoosh import index, qparser
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
# Relative path: resolved against the directory the script is started from.
indexpath = "../index"

argParser = argparse.ArgumentParser(description="Full text search for cycat")
# -q is mandatory: without it args.q is None and len(args.q) below would
# raise a TypeError instead of a usable error message.
argParser.add_argument(
    "-q",
    action="append",
    required=True,
    help="query to lookup (one or more)",
)
args = argParser.parse_args()

ix = index.open_dir(indexpath)
with ix.searcher() as searcher:
    terms = " ".join(args.q)
    if len(args.q) == 1:
        query = QueryParser("content", ix.schema).parse(terms)
    else:
        # Several -q values are combined with an explicit AND group.
        query = QueryParser(
            "content", schema=ix.schema, group=qparser.AndGroup
        ).parse(terms)
    results = searcher.search(query, limit=None)
    # Print the stored path of each hit, then the result-set summary, while
    # the searcher is still open.
    for result in results:
        print(result['path'])
    print(results)