# csv_to_json.py
# Convert a CSV file of coded references (one reference per row) into a JSON
# file containing the code sets and the references for the mapping tool.
import pandas as pd
import json
# ---------------------------------------------------------------------------
# Parameters for the user
# ---------------------------------------------------------------------------
# Input file: the CSV to convert.
mycsvpath = r'C:\Users\lena.schmidt\Documents\SR automation review\Update_2\includes2024_mapper.csv'

# Output file: where the JSON is written.
myoutpath = r'C:\Users\lena.schmidt\Documents\SR automation review\Update_2\includes2024_bla.json'

# Names of the columns holding codes that should be transferred to the map.
mapping_vars = [
    "Entities mined",
    "Data extraction approach",
    "Target data",
    "Scope of mining",
    "Granularity of extraction",
    "Validation scores",
    "Year",
]

# If a column holds several codes, this delimiter is used to split them.
delimit_str = ","

# Stray newlines (mostly in the authors field) get replaced with this.
nlreplace = ";"

# ---------------------------------------------------------------------------
# Other variables
# ---------------------------------------------------------------------------
def get_metadata(row, i):
    """Build the reference-metadata dict for one CSV row.

    Args:
        row: Mapping-like row keyed by column name (anything supporting
            ``keys()`` and ``row[name]``, e.g. a pandas Series or a dict).
        i: Value stored as "ItemId" when the row has no "ItemId" column.

    Returns:
        A dict with "Codes" and "Outcomes" (both empty lists) followed by one
        entry per required citation field. Fields that have a matching column
        hold the stringified, newline-cleaned cell value; all others hold "".
    """
    # All these fields are required, otherwise refs won't be downloadable from
    # the map. If there is no fitting column, an empty string "" is added.
    citation_vars = (
        "ItemId", "Title", "ParentTitle", "ShortTitle", "DateCreated", "CreatedBy",
        "DateEdited", "EditedBy", "Year", "Month", "StandardNumber", "City", "Country",
        "Publisher", "Institution", "Volume", "Pages", "Edition", "Issue", "Availability",
        "URL", "OldItemId", "Abstract", "Comments", "TypeName", "Authors", "ParentAuthors",
        "DOI", "Keywords", "ItemStatus", "ItemStatusTooltip",
        "QuickCitation",
    )
    mycols = row.keys()
    refdict = {"Codes": [], "Outcomes": []}

    for cvar in citation_vars:
        if cvar not in mycols:
            # An ID must be assigned if the user did not provide one; every
            # other missing field is filled with an empty string.
            refdict[cvar] = i if cvar == "ItemId" else ""
            continue

        # Replace stray newlines with the configured marker (plus a space)
        # instead of a hard-coded "; ", so that `nlreplace` is honoured.
        mv = str(row[cvar]).replace("\n", nlreplace + " ").strip()

        # A trailing newline leaves a dangling marker at the end; remove the
        # whole marker (it may be longer than one character). The truthiness
        # guard prevents an empty marker from wiping out the value.
        if nlreplace and mv.endswith(nlreplace):
            mv = mv[:-len(nlreplace)]
        refdict[cvar] = mv

    return refdict
def custom_json(mycsvpath, myoutpath):
    """Convert the reference CSV at ``mycsvpath`` into a JSON file at ``myoutpath``.

    Every row of the CSV is one reference. For each column named in the
    module-level ``mapping_vars`` the cell is split on ``delimit_str`` into
    codes; each distinct code per column receives a fresh integer ID. After all
    rows are processed, each mapping column itself receives an ID as the
    parent attribute of its codes.

    Args:
        mycsvpath: Path of the CSV file to read.
        myoutpath: Path of the JSON file to write (UTF-8, indented).

    Returns:
        None. The result is written to ``myoutpath``.

    Raises:
        KeyError: If a column listed in ``mapping_vars`` is missing from a row.
    """
    # Read every cell as text so codes match the CSV exactly; with inferred
    # dtypes an integer column containing blanks becomes float and a code
    # such as 2020 would turn into "2020.0".
    df = pd.read_csv(mycsvpath, dtype=str).fillna("")

    reflist = []  # one metadata dict (incl. its list of codes) per reference
    # Per mapping column: code name -> attribute ID, filled as codes appear.
    attributes = {k: {} for k in mapping_vars}
    cnt = 1  # attribute ID counter

    # Parse each reference (one row is one reference).
    for i, row in df.iterrows():
        myattributes = []
        for mvar in mapping_vars:
            known_codes = attributes[mvar]
            for piece in str(row[mvar]).split(delimit_str):
                c = piece.strip()
                if not c:
                    # Skip *all* empty pieces (e.g. from "a,,b,"), otherwise
                    # an attribute with an empty name would be created.
                    continue
                if c not in known_codes:
                    # Every newly discovered code gets the next free ID.
                    known_codes[c] = cnt
                    cnt += 1
                myattributes.append(known_codes[c])

        refdict = get_metadata(row, i)
        # Reformat the reference-level attribute list into the output format.
        refdict["Codes"] = [
            {"AttributeId": att, "ItemAttributeFullTextDetails": []}
            for att in myattributes
        ]
        reflist.append(refdict)

    # Reformat the global attribute dicts into the output format.
    attributeslist = []
    for key, value in attributes.items():
        thisattributes = [
            {"AttributeId": v, "AttributeName": k} for k, v in value.items()
        ]
        attributeslist.append(
            {
                "AttributeId": cnt,
                "AttributeName": key,
                "Attributes": {"AttributesList": thisattributes},
            }
        )
        cnt += 1  # parent-level attributes need their own IDs as well

    # Fill everything into the JSON template.
    final_json = {
        "CodeSets": [
            {
                "SetName": "Mapping tool",
                "Attributes": {"AttributesList": attributeslist},
            }
        ],
        "References": reflist,
    }
    with open(myoutpath, 'w', encoding='utf-8') as f:
        json.dump(final_json, f, ensure_ascii=False, indent=4)
# Run the conversion only when this file is executed as a script, so that
# importing the module does not trigger file I/O on the configured paths.
if __name__ == "__main__":
    custom_json(mycsvpath, myoutpath)