2024-03-29 14:43:41 +01:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# A simple convertor of the UK Health Security Agency Culture Collections
# to a MISP Galaxy datastructure.
# Copyright (C) 2024 MISP Project
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import json
import requests
import uuid
2024-06-18 16:58:38 +02:00
from pymispgalaxies import Cluster , Galaxy
2024-03-29 14:43:41 +01:00
'''
From https://www.culturecollections.org.uk/search/?searchScope=Product&pageNumber=1&filter.collectionGroup=0&filter.collection=0&filter.sorting=DateCreated
JSON is loaded, needs to be paginated

Culturecollections.org.uk is published under the Open Government Licence, allowing the reproduction of information as
long as the license terms are obeyed. Material on this website is subject to Crown copyright protection unless otherwise
indicated. Users should be aware that information provided to third parties through feeds may be edited or cached, and
we do not guarantee the accuracy of such third-party products.
https://www.culturecollections.org.uk/training-and-support/policies/terms-and-conditions-of-use/

The Culture Collections represent deposits of cultures from world-wide sources. While every effort is made to ensure
details distributed by Culture Collections are accurate, Culture Collections cannot be held responsible for any
inaccuracies in the data supplied. References where quoted are mainly attributed to the establishment of the cell
culture and not for any specific property of the cell line, therefore further references should be obtained regarding
cell culture characteristics. Passage numbers where given act only as a guide and Culture Collections does not guarantee
the passage number stated will be the passage number received by the customer.
'''
2024-06-18 16:58:38 +02:00
2024-03-29 14:43:41 +01:00
def download_items():
    """Download every product entry from the culturecollections.org.uk search API.

    The API is paginated: pages are requested one after the other, starting
    at page 1, until the page count reported by the API itself is reached.

    Returns:
        dict: a dictionary with three keys:
            'items': list of all item dicts returned by the API, each one
                extended with a 'collection' key holding the title of the
                collection referenced by its 'collectionId';
            'collections': mapping of collection id (int) to title;
            'collection_groups': mapping of collection group id (int) to title.

    Raises:
        requests.RequestException: on network failure, time-out or when the
            server answers with an HTTP error status.
        KeyError: if the response lacks an expected key, or an item refers to
            a collection id that is not listed in the filter aggregation.
    """
    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the script forever.
    request_timeout = 60

    data = {'items': [],
            'collections': {},
            'collection_groups': {}}
    page_number = 1
    while True:
        # The API expects its search parameters as a JSON document passed in
        # the query string; only the page number changes between requests.
        search_params = {
            "searchText": "",
            "searchScope": "Product",
            "pageNumber": page_number,
            "filter": {
                "collectionGroup": "0",
                "collection": "0",
                "facets": {},
                "sorting": "DateCreated",
            },
        }
        url = ('https://www.culturecollections.org.uk/umbraco/api/searchApi/getSearchResults?searchParams='
               + json.dumps(search_params, separators=(',', ':')))
        page_resp = requests.get(url, timeout=request_timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        page_resp.raise_for_status()
        # 'utf-8-sig' is used so that a leading byte-order mark is dropped
        # before the body is decoded as JSON.
        page_resp.encoding = 'utf-8-sig'
        page_data = page_resp.json()
        page_number_max = page_data['pagination']['totalPages']

        # Every page repeats the filter aggregations; they are used to map
        # numeric ids to human readable titles.
        for c in page_data['filter']['collections']['aggregationItems']:
            data['collections'][int(c['value'])] = c['title']
        for cg in page_data['filter']['collectionGroups']['aggregationItems']:
            data['collection_groups'][int(cg['value'])] = cg['title']

        # Resolve the collection title of each item before storing it.
        for item in page_data['items']:
            item['collection'] = data['collections'][item['collectionId']]
        data['items'].extend(page_data['items'])

        print(f"Fetching page {page_number}/{page_number_max}: ", end="")
        print(f"items size is now {len(data['items'])} as I extended with {len(page_data['items'])} items.")

        if page_number >= page_number_max:
            break
        page_number += 1
    return data
def save_items(d):
    """Write *d* as pretty-printed JSON to ``items.json`` in the working directory.

    The file is written with an explicit UTF-8 encoding so the result does
    not depend on the platform's default encoding. Keys are sorted and the
    output is indented by two spaces. An existing file is overwritten.

    Args:
        d: JSON-serialisable object to store.

    Returns:
        bool: always True once the file has been written.
    """
    with open('items.json', 'w', encoding='utf-8') as f:
        json.dump(d, f, indent=2, sort_keys=True)
    return True
2024-06-18 16:58:38 +02:00
2024-03-29 14:43:41 +01:00
def load_saved_items():
    """Load and return the JSON content of ``items.json`` from the working directory.

    The file is read with an explicit UTF-8 encoding so that non-ASCII text
    is decoded the same way on every platform.

    Returns:
        The deserialised JSON document.

    Raises:
        FileNotFoundError: if ``items.json`` does not exist.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open('items.json', 'r', encoding='utf-8') as f:
        return json.load(f)
2024-06-18 16:58:38 +02:00
2024-03-29 14:43:41 +01:00
# Fetch the catalogue. The two commented-out calls below are development
# toggles: store the download in items.json once, then reload it from disk
# instead of querying the website again.
data = download_items()
# save_items(data)
# data = load_saved_items()

# Clusters indexed by their value (the culture name), so that several
# catalogue entries sharing one name are merged into a single cluster.
clusters_dict = {}
for item in data['items']:
    # create a cluster
    cluster = {
        'value': f"{item['name']}",
        # uuid5 is deterministic: the same name always yields the same UUID.
        'uuid': str(uuid.uuid5(uuid.UUID("bbe11c06-1d6a-477e-88f1-cdda2d71de56"), item['name'])),
        'meta': {
            'refs': [item['url']],
            'external_id': [item['catalogueNumber']]
        }
    }

    # add all properties of the culture
    for p in item['properties']:
        if p['value']:
            # Normalise the property name into a meta key ("Cell Type" -> "cell_type").
            p_name = p['name'].lower().replace(' ', '_')
            # The list must be looked up under the normalised key; checking the
            # raw name would recreate the list on every occurrence and drop
            # all but the last value of a repeated property.
            cluster['meta'].setdefault(p_name, []).append(p['value'])

    # merge if the collection already exists
    if cluster['value'] in clusters_dict:
        # Only the references and catalogue numbers of the later entry are
        # added; its other properties are not merged.
        clusters_dict[cluster['value']]['meta']['refs'].extend(cluster['meta']['refs'])
        clusters_dict[cluster['value']]['meta']['external_id'].extend(cluster['meta']['external_id'])
    else:
        clusters_dict[cluster['value']] = cluster
# transform dict to list
# NOTE(review): Cluster(<name>) presumably loads the existing cluster file of
# that name - confirm against pymispgalaxies. Its values are emptied here so
# that the cluster only contains what is in clusters_dict.
cluster = Cluster('ukhsa-culture-collections', skip_duplicates=True)
cluster.cluster_values = {}
for item in clusters_dict.values():
    cluster.append(item, skip_duplicates=True)
cluster.save('ukhsa-culture-collections')

# Report what was skipped. The loop uses its own names so that `cluster`
# keeps referring to the Cluster object instead of being rebound while its
# `duplicates` attribute is being iterated.
for cluster_name, duplicate in cluster.duplicates:
    print(f"WARNING: Skipped duplicate: {duplicate} in cluster {cluster_name}")
# Look the galaxy up by its type first; a KeyError from that lookup means a
# fresh definition has to be created from scratch.
galaxy_type = 'ukhsa-culture-collections'
try:
    galaxy = Galaxy(galaxy_type)
except KeyError:
    new_galaxy = dict(
        icon="virus",
        name="UKHSA Culture Collections",
        description="UK Health Security Agency Culture Collections represent deposits of cultures that consist of expertly preserved, authenticated cell lines and microbial strains of known provenance.",
        namespace="gov.uk",
        type=galaxy_type,
        uuid="bbe11c06-1d6a-477e-88f1-cdda2d71de56",
        version=1,
    )
    galaxy = Galaxy(new_galaxy)

# Persist the galaxy and remind the operator of the remaining manual steps.
galaxy.save(galaxy_type)
print("All done, please don't forget to ./jq_all_the_things.sh, commit, and then ./validate_all.sh.")