"""Utility functions for the meta-analysis pipeline."""
import os.path as op
import nibabel as nib
import numpy as np
import pandas as pd
import requests
from cognitiveatlas.api import get_concept
from neuromaps import transforms
from neuromaps.datasets import fetch_atlas
from nibabel.gifti import GiftiDataArray
from nilearn._utils.niimg_conversions import check_same_fov
from nilearn.image import concat_imgs, resample_to_img
from nimare.extract import fetch_neuroquery
from nimare.io import convert_neurosynth_to_dataset
from nimare.utils import get_resource_path
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def get_data(dset, imtype="z"):
    """Get data from a Dataset object.

    Parameters
    ----------
    dset : :obj:`nimare.dataset.Dataset`
        Dataset object.
    imtype : :obj:`str`, optional
        Type of image to load. Default is 'z'.

    Returns
    -------
    data : :obj:`numpy.ndarray`
        Data from the Dataset object.
    """
    images = dset.get_images(imtype=imtype)
    _resample_kwargs = {"clip": True, "interpolation": "linear"}

    masker = dset.masker
    imgs = [
        (
            nib.load(img)
            if check_same_fov(nib.load(img), reference_masker=masker.mask_img)
            else resample_to_img(nib.load(img), masker.mask_img, **_resample_kwargs)
        )
        for img in images
    ]
    img4d = concat_imgs(imgs, ensure_ndim=4)
    return masker.transform(img4d)
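
# Minimal usage sketch (hypothetical file name; assumes a saved NiMARE Dataset
# whose studies have local z-statistic images):
#
#     from nimare.dataset import Dataset
#     dset = Dataset.load("dataset_with_images.pkl.gz")
#     z_data = get_data(dset, imtype="z")  # (n_studies, n_voxels_in_mask)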


# Total number of vertices across hemispheres, excluding the medial wall
N_VERTICES = {
    "fsLR": {
        "32k": 59412,
        "164k": 298261,
    },
    "fsaverage": {
        "3k": 4661,
        "10k": 18715,
        "41k": 74947,
        "164k": 299881,
    },
    "civet": {
        "41k": 76910,
    },
}

# Number of vertices per hemisphere, including the medial wall
N_VERTICES_PH = {
    "fsLR": {
        "32k": 32492,
        "164k": 163842,
    },
    "fsaverage": {
        "3k": 2562,
        "10k": 10242,
        "41k": 40962,
        "164k": 163842,
    },
    "civet": {
        "41k": 40962,
    },
}


def _rm_medial_wall(
    data_lh,
    data_rh,
    space="fsLR",
    density="32k",
    join=True,
    neuromaps_dir=None,
):
    """Remove the medial wall from surface data.

    Data in 32k fs_LR space (e.g., Human Connectome Project data in GIFTI
    format) often include the medial wall in their data arrays, for a total
    of 64984 vertices across both hemispheres. This function removes the
    medial wall vertices to produce a data array with 59412 vertices, which
    is the representation used for functional decoding.

    This function was adapted from :func:`surfplot.utils.add_fslr_medial_wall`.

    Parameters
    ----------
    data_lh, data_rh : numpy.ndarray
        Surface vertex arrays for the left and right hemispheres. Each must
        have exactly ``N_VERTICES_PH[space][density]`` vertices (e.g., 32492
        per hemisphere for fsLR 32k); this is checked with assertions.
    space : str
        Surface space ('fsLR', 'fsaverage', or 'civet'). Default: 'fsLR'.
    density : str
        Vertex density of the surface. Default: '32k'.
    join : bool
        Return left and right hemispheres concatenated in a single array.
        Default: True.
    neuromaps_dir : str or None
        Directory where neuromaps atlases are stored. Default: None.

    Returns
    -------
    numpy.ndarray
        Vertices with the medial wall excluded (e.g., 59412 vertices total
        for fsLR 32k). If ``join`` is False, the hemispheres are returned as
        separate arrays.
    """
    assert data_lh.shape[0] == N_VERTICES_PH[space][density]
    assert data_rh.shape[0] == N_VERTICES_PH[space][density]

    atlas = fetch_atlas(space, density, data_dir=neuromaps_dir, verbose=0)
    medial_lh, medial_rh = atlas["medial"]
    wall_lh = nib.load(medial_lh).agg_data()
    wall_rh = nib.load(medial_rh).agg_data()

    data_lh = data_lh[np.where(wall_lh != 0)]
    data_rh = data_rh[np.where(wall_rh != 0)]
    if not join:
        return data_lh, data_rh

    data = np.hstack((data_lh, data_rh))
    assert data.shape[0] == N_VERTICES[space][density]
    return data
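
# Shape sketch for the fsLR 32k case: each hemisphere carries 32492 vertices
# (medial wall included), and the joined output keeps the 59412 non-wall
# vertices.
#
#     lh, rh = np.random.rand(32492), np.random.rand(32492)
#     joined = _rm_medial_wall(lh, rh)  # shape: (59412,)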


def _zero_medial_wall(data_lh, data_rh, space="fsLR", density="32k", neuromaps_dir=None):
    """Zero out the medial wall vertices in a pair of GIFTI images."""
    atlas = fetch_atlas(space, density, data_dir=neuromaps_dir, verbose=0)
    medial_lh, medial_rh = atlas["medial"]
    medial_arr_lh = nib.load(medial_lh).agg_data()
    medial_arr_rh = nib.load(medial_rh).agg_data()

    data_arr_lh = data_lh.agg_data()
    data_arr_rh = data_rh.agg_data()
    data_arr_lh[np.where(medial_arr_lh == 0)] = 0
    data_arr_rh[np.where(medial_arr_rh == 0)] = 0

    data_lh.remove_gifti_data_array(0)
    data_rh.remove_gifti_data_array(0)
    data_lh.add_gifti_data_array(GiftiDataArray(data_arr_lh))
    data_rh.add_gifti_data_array(GiftiDataArray(data_arr_rh))

    return data_lh, data_rh


def _vol_to_surf(metamap, space="fsLR", density="32k", return_hemis=False, neuromaps_dir=None):
    """Transform 4D metamaps from volume to surface space."""
    if space == "fsLR":
        metamap_lh, metamap_rh = transforms.mni152_to_fslr(metamap, fslr_density=density)
    elif space == "fsaverage":
        metamap_lh, metamap_rh = transforms.mni152_to_fsaverage(metamap, fsavg_density=density)
    elif space == "civet":
        metamap_lh, metamap_rh = transforms.mni152_to_civet(metamap, civet_density=density)
    else:
        raise ValueError(f"Unsupported space: {space}")

    metamap_lh, metamap_rh = _zero_medial_wall(
        metamap_lh,
        metamap_rh,
        space=space,
        density=density,
        neuromaps_dir=neuromaps_dir,
    )
    if return_hemis:
        return metamap_lh, metamap_rh

    metamap_arr_lh = metamap_lh.agg_data()
    metamap_arr_rh = metamap_rh.agg_data()
    metamap_surf = _rm_medial_wall(
        metamap_arr_lh,
        metamap_arr_rh,
        space=space,
        density=density,
        neuromaps_dir=neuromaps_dir,
    )
    return metamap_surf
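
# Minimal usage sketch (hypothetical file name): project an MNI152 volume onto
# the fsLR 32k surface and get a flat array without the medial wall.
#
#     surf_data = _vol_to_surf("metamap_z.nii.gz")  # shape: (59412,)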


def nifti_to_grayordinate(nifti_fn, axis):
    """Resample a NIfTI volume onto a CIFTI grayordinate axis.

    Volumetric grayordinates are filled by direct voxel lookup; surface
    grayordinates are filled by projecting the volume to the surface.
    """
    nifti = nib.load(nifti_fn)
    nifti_data = nifti.get_fdata(dtype=np.float32)

    n_go = axis.name.shape[0]
    grayordinate = np.zeros(n_go)

    volume_mask = axis.volume_mask
    vox_indices = tuple(axis.voxel[volume_mask].T)
    grayordinate[volume_mask] = nifti_data[vox_indices]
    grayordinate[~volume_mask] = _vol_to_surf(nifti_fn)

    return grayordinate
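
# The ``axis`` argument is a CIFTI-2 BrainModelAxis; one way to obtain it is
# from an existing dense CIFTI file (hypothetical file name):
#
#     cifti = nib.load("template.dscalar.nii")
#     axis = cifti.header.get_axis(1)  # axis 1 indexes the grayordinates
#     go_data = nifti_to_grayordinate("metamap_z.nii.gz", axis)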


def _generate_counts(
    text_df,
    id_col="id",
    vocabulary=None,
    text_column="abstract",
    tfidf=True,
    min_df=0.01,
    max_df=0.99,
):
    """Generate tf-idf/count weights for unigrams/bigrams derived from textual data.

    Parameters
    ----------
    text_df : :obj:`pandas.DataFrame`
        A DataFrame with at least an ID column (``id_col``) and a text column
        (``text_column``). D = document.
    id_col : str
        Name of the column containing study IDs. Default: 'id'.
    vocabulary : list or None
        Fixed vocabulary to use; if None, the vocabulary is learned from the
        text. Default: None.
    text_column : str
        Name of the column containing the text to vectorize. Default: 'abstract'.
    tfidf : bool
        If True, compute tf-idf weights; otherwise, raw counts. Default: True.
    min_df, max_df : float
        Minimum/maximum document frequency for a term to be retained.

    Returns
    -------
    weights_df : (D x T) :obj:`pandas.DataFrame`
        A DataFrame where the index is ``id_col`` and the columns are the
        unigrams/bigrams derived from the data. D = document. T = term.
    """
    if text_column not in text_df.columns:
        raise ValueError(f"Column '{text_column}' not found in DataFrame")

    # Remove rows with empty text cells
    orig_ids = text_df[id_col].tolist()
    text_df = text_df.fillna("")
    keep_ids = text_df.loc[text_df[text_column] != "", id_col]
    text_df = text_df.loc[text_df[id_col].isin(keep_ids)]

    if len(keep_ids) != len(orig_ids):
        print(f"\t\tRetaining {len(keep_ids)}/{len(orig_ids)} studies", flush=True)

    ids = text_df[id_col].tolist()
    text = text_df[text_column].tolist()
    stoplist = op.join(get_resource_path(), "neurosynth_stoplist.txt")
    with open(stoplist, "r") as fo:
        stop_words = fo.read().splitlines()

    if tfidf:
        vectorizer = TfidfVectorizer(
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            vocabulary=vocabulary,
            stop_words=stop_words,
        )
    else:
        vectorizer = CountVectorizer(
            min_df=min_df,
            max_df=max_df,
            ngram_range=(1, 2),
            vocabulary=vocabulary,
            stop_words=stop_words,
        )
    weights = vectorizer.fit_transform(text).toarray()

    names = vectorizer.get_feature_names_out()
    names = [str(name) for name in names]
    weights_df = pd.DataFrame(weights, columns=names, index=ids)
    weights_df.index.name = id_col
    return weights_df
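
# Minimal usage sketch (toy data): tf-idf weights over unigrams/bigrams.
#
#     toy_df = pd.DataFrame(
#         {"id": ["s1", "s2"], "abstract": ["working memory task", "reward learning"]}
#     )
#     weights_df = _generate_counts(toy_df, min_df=1, max_df=1.0)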


def _fetch_neuroquery_dset():
    """Download the NeuroQuery database and convert it to a NiMARE Dataset."""
    files = fetch_neuroquery(
        data_dir="./data",
        version="1",
        overwrite=False,
        source="combined",
        vocab="neuroquery6308",
        type="tfidf",
    )
    neuroquery_db = files[0]

    return convert_neurosynth_to_dataset(
        coordinates_file=neuroquery_db["coordinates"],
        metadata_file=neuroquery_db["metadata"],
        annotations_files=neuroquery_db["features"],
    )


def _lowercase(words):
    """Lowercase each word in a list."""
    return [word.lower() for word in words]


def _extract_vocabulary(phrase):
    """Generate all contiguous lowercase sub-phrases of a phrase."""
    words = phrase.split()
    sub_phrases = [
        _lowercase(words[i:j]) for i in range(len(words)) for j in range(i + 1, len(words) + 1)
    ]
    return [" ".join(sub_phrase) for sub_phrase in sub_phrases]


def _cogat_vocabulary(cogat_dir):
    """Get vocabulary from Cognitive Atlas concepts."""
    cogat_fn = op.join(cogat_dir, "cogat_concepts.csv")
    if not op.isfile(cogat_fn):
        concepts_df = get_concept(silent=True).pandas
    else:
        concepts_df = pd.read_csv(cogat_fn)

    cog_names = concepts_df["name"].to_list()
    vocabulary = []
    for name in cog_names:
        vocabulary.extend(_extract_vocabulary(name))

    return list(set(vocabulary))


def _add_texts(dset, texts_fn):
    """Add texts to a NeuroQuery dataset."""
    texts_df = pd.read_csv(texts_fn)

    study_ids = [f"{id_}-1" for id_ in texts_df["pmid"]]
    texts_df["id"] = study_ids
    texts_df = texts_df[["id", "title", "keywords", "abstract", "body"]]
    texts_df.set_index("id", inplace=True)

    new_texts_df = dset.texts.copy()
    new_texts_df = pd.merge(new_texts_df, texts_df, left_on="id", right_index=True)

    new_dset = dset.copy()
    new_dset.texts = new_texts_df
    return new_dset


def _get_studies_to_keep(dset, feature_group, min_img_thr=20, freq_thr=0.05):
    """Select features with enough supporting studies in the dataset."""
    feature_names = dset.annotations.columns.values
    feature_names = [f for f in feature_names if f.startswith(feature_group)]

    # Keep features labeled in at least `min_img_thr` studies, where a study
    # counts as labeled if its feature frequency is at least `freq_thr`
    feature_to_keep_ids = []
    feature_to_keep = []
    idx_to_keep = []
    for f_i, feature in enumerate(feature_names):
        temp_feature_ids = dset.get_studies_by_label(
            labels=[feature],
            label_threshold=freq_thr,
        )
        if len(temp_feature_ids) >= min_img_thr:
            idx_to_keep.append(f_i)
            feature_to_keep.append(feature)
            feature_to_keep_ids.append(temp_feature_ids)

    return idx_to_keep, feature_to_keep, feature_to_keep_ids
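
# Usage sketch (hypothetical feature prefix; assumes the dataset carries
# topic-style annotations, e.g. columns starting with "LDA200__"):
#
#     idx, features, ids_per_feature = _get_studies_to_keep(dset, "LDA200__")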


def get_pmcids_from_dois(dois):
    """Query PubMed for the PMC IDs of a list of papers based on their DOIs."""
    # Query E-utilities in batches of 100 DOIs to keep the URLs a manageable size
    pmids = []
    for i in range(0, len(dois), 100):
        chunk = dois[i : i + 100]
        url = (
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
            f"?db=pubmed&term=({' OR '.join(chunk)})&retmode=json"
        )
        response = requests.get(url)
        if response.status_code != 200:
            raise Exception(f"PubMed API returned status code {response.status_code} for {url}")

        data = response.json()
        pmids += data["esearchresult"]["idlist"]

    # Convert PMIDs to PMC IDs with the NCBI ID converter, again in batches of 100
    pmcids = []
    for i in range(0, len(pmids), 100):
        chunk = pmids[i : i + 100]
        url = (
            "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
            f"?ids={'%2C'.join(chunk)}&format=json"
        )
        response = requests.get(url)
        if response.status_code != 200:
            raise Exception(
                f"ID converter API returned status code {response.status_code} for {url}"
            )

        data = response.json()
        for record in data["records"]:
            if "pmcid" in record:
                pmcids.append(record["pmcid"])

    return pmcids
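
# Minimal usage sketch (placeholder DOIs, not real identifiers):
#
#     dois = ["10.1000/example.1", "10.1000/example.2"]
#     pmcids = get_pmcids_from_dois(dois)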