add naics generation script

2023-10-31 11:40:20 +01:00 · 2023-10-31 11:40:20 +01:00 · 68903b5ce7
parent 2436c6f326
commit 68903b5ce7
1 changed files with 101 additions and 0 deletions
--- a/tools/generate_naics_clusters.py
+++ b/tools/generate_naics_clusters.py
@ -0,0 +1,101 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Generate the NAICS galaxy cluster file from ``naics.csv``.

``naics.csv`` is extracted from ``[2022]_NAICS_Structure.xlsx`` and only
contains the "2022 NAICS Code" and "2022 NAICS Title" columns, without a
header row.

Note 1: This only generates the file for the "clusters" folder.
Note 2: The generated file still needs to go through the
        ``jq_all_the_things.sh`` script to be normalised.
Note 3: New uuids are generated on every run.
"""

import json
import csv
import uuid

INPUT_CSV = 'naics.csv'
# NOTE: the spelling "naisc" is kept on purpose so that existing workflows
# relying on this output name keep working.
OUTPUT_JSON = "naisc_cluster.json"

LIKELY_TAG = "estimative-language:likelihood-probability=\"likely\""


def _relation(kind, dest_uuid):
    """Return one ``related`` entry of type *kind* pointing at *dest_uuid*."""
    return {'tags': [LIKELY_TAG], 'type': kind, 'dest-uuid': dest_uuid}


def _lookup_keys(code):
    """Return every code under which the cluster *code* can be looked up.

    A plain code is only reachable under itself.  A sector range such as
    ``"31-33"`` is additionally reachable under each code of the range
    (``"31"``, ``"32"``, ``"33"``) so that its subsectors can find it.
    """
    keys = [code]
    start, dash, end = code.partition('-')
    if dash and start.isdigit() and end.isdigit() and len(start) == len(end):
        keys.extend(
            str(number).zfill(len(start))
            for number in range(int(start), int(end) + 1)
        )
    return keys


def _link(child, parent):
    """Add the child-of / parent-of pair between *child* and *parent*."""
    child['related'].append(_relation("child-of", parent['uuid']))
    parent['related'].append(_relation("parent-of", child['uuid']))


def build_clusters(rows):
    """Build the list of cluster dicts from an iterable of CSV rows.

    Each usable row is ``[code, title, ...]``; rows with fewer than two
    columns or an empty code (e.g. blank lines) are skipped.  Every cluster
    gets a fresh uuid4 and a ``related`` list filled as follows:

    * 2 digit codes and sector ranges (``"31-33"``) have no parent;
    * 3 to 5 digit codes are children of the code one digit shorter;
    * 6 digit codes are children of the matching 4 digit code and, when
      they end with ``0``, are marked "similar" to the matching 5 digit code.

    Relationships are only created when the other cluster exists in *rows*;
    the order of the rows does not matter.  When a code appears several
    times, the first occurrence is the one other clusters link to.
    """
    values = []
    by_code = {}

    # First pass: create every cluster and index it by code.
    for row in rows:
        if len(row) < 2:
            continue
        code = row[0].strip()
        if not code:
            continue
        cluster = {
            'value': code,
            'description': row[1].strip(),
            'uuid': str(uuid.uuid4()),
            'related': [],
        }
        values.append(cluster)
        for key in _lookup_keys(code):
            by_code.setdefault(key, cluster)

    # Second pass: wire the relationships now that all uuids are known.
    for cluster in values:
        code = cluster['value']
        if len(code) <= 2 or '-' in code:
            continue  # top-level sectors have no parent

        if len(code) == 6:
            # Specific case of 6 digit codes: the parent has only 4 digits.
            parent = by_code.get(code[:4])
            if parent is not None:
                _link(cluster, parent)

            # A 6 digit code ending with 0 has a similar/identical
            # 5 digit code.
            if code[5] == "0":
                sibling = by_code.get(code[:5])
                if sibling is not None:
                    cluster['related'].append(
                        _relation("similar", sibling['uuid']))
                    sibling['related'].append(
                        _relation("similar", cluster['uuid']))
        else:
            # All other cases (codes with 3 to 5 digits).
            parent = by_code.get(code[:-1])
            if parent is not None:
                _link(cluster, parent)

    return values


def build_galaxy(values):
    """Return the complete cluster document wrapping *values*."""
    return {
        'description': "The North American Industry Classification System or NAICS is a classification of business establishments by type of economic activity (the process of production).",
        'name': "NAICS",
        'source': "North American Industry Classification System - NAICS",
        'type': "naics",
        'uuid': "b73ecad4-6529-4625-8c4f-ee3ef703a72a",
        'version': 2022,  # Change when updating
        'authors': [
            "Executive Office of the President Office of Management and Budget",
        ],
        'category': "sector",
        'values': values,
    }


def main():
    """Read ``naics.csv`` and write the cluster JSON file."""
    # utf-8-sig transparently drops the BOM some spreadsheet exports add,
    # which would otherwise end up inside the first NAICS code.
    with open(INPUT_CSV, newline='', encoding='utf-8-sig') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        values = build_clusters(reader)

    with open(OUTPUT_JSON, "w", encoding='utf-8') as json_file:
        json_file.write(json.dumps(build_galaxy(values), indent=2))


if __name__ == "__main__":
    main()