Skip to content

Commit 7743ee5

Browse files
authored
feat: add parser uri to readers, add function to download portal project fo… (#43)
## Pull request overview This PR adds functionality to download AreTomo files from the CryoET Data Portal and improves the reader functions by introducing URI-based object resolution. **Changes:** - Added URI-based resolution for tomograms and segmentations in readers - Renamed variables in writers for consistency (`tomogram` → `tomo`, `segmentation_volume` → `seg_vol`, `segmentation` → `seg`) - Added new portal download functionality with CLI commands for downloading CryoET Data Portal projects
1 parent 08b3356 commit 7743ee5

6 files changed

Lines changed: 346 additions & 81 deletions

File tree

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ dependencies = [
2929
"trimesh",
3030
"manifold3d",
3131
"mapbox-earcut",
32+
"mdocfile",
3233
"tqdm",
3334
"scikit-learn",
3435
"shapely",
@@ -88,6 +89,9 @@ clippicks = "copick_utils.cli.logical_commands:clippicks"
8889
picksin = "copick_utils.cli.logical_commands:picksin"
8990
picksout = "copick_utils.cli.logical_commands:picksout"
9091

92+
[project.entry-points."copick.download.commands"]
93+
project = "copick_utils.cli.download_commands:project"
94+
9195
[tool.hatch.version]
9296
path = "src/copick_utils/__init__.py"
9397

src/copick_utils/cli/download.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import click
2+
3+
4+
@click.command(
    context_settings={"show_default": True},
    short_help="Download tilt series and alignments from the CryoET Data Portal.",
    no_args_is_help=True,
)
# Dataset ids are integers downstream (``download_aretomo_files(dataset_id: int, ...)``),
# so let click validate and convert the value instead of forwarding a raw string.
@click.option(
    "-ds",
    "--dataset",
    required=True,
    type=int,
    help="Dataset ID to download from the CryoET Data Portal.",
)
# ``--output`` has a default, so it is intentionally not marked as required.
@click.option(
    "-o",
    "--output",
    default=".",
    type=str,
    help="Output directory to save the downloaded files.",
)
def project(dataset: int, output: str):
    """
    Download tilt series and alignments from the CryoET Data Portal for sub-tomogram averaging with py2rely.
    """
    # NOTE: the docstring above is rendered by click as the command's help text,
    # so parameter documentation lives in the option ``help=`` strings instead.
    download_project(dataset, output)
29+
30+
31+
def download_project(dataset: str, output: str):
    """Download the AreTomo files of one portal dataset into ``output``.

    Thin wrapper that forwards both arguments unchanged to
    ``copick_utils.io.portal.download_aretomo_files``.

    Args:
        dataset: Dataset ID on the CryoET Data Portal.
        output: Directory the downloaded files are written to.
    """
    # The portal module is imported inside the function, so it is only loaded
    # when the command actually runs (presumably to keep CLI start-up light).
    from copick_utils.io.portal import download_aretomo_files

    download_aretomo_files(dataset, output)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
"""CLI commands for downloading data from the CryoET Data Portal.
2+
3+
This module imports all download commands from specialized files for better organization.
4+
"""
5+
6+
from copick_utils.cli.download import project
7+
8+
# Public re-exports: the commands listed here are what the main CLI imports.
__all__ = ["project"]

src/copick_utils/io/portal.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
"""
2+
A minimal example using minimal libraries / imports to download relevant AreTomo files
3+
from the CryoET Data Portal. Downloads the corresponding files, using the run ID as the
4+
base filename.
5+
6+
Original implementation by Daniel Ji and Utz Ermel.
7+
"""
8+
import multiprocessing
9+
import os
10+
11+
import cryoet_data_portal as cdp
12+
import mdocfile
13+
import numpy as np
14+
import pandas as pd
15+
import requests
16+
import s3fs
17+
18+
global_client = cdp.Client()
19+
20+
21+
def download_aretomo_files(dataset_id: int, output_dir: str, processes: int = 8):
    """Download the AreTomo-related files of every tilt series in a dataset.

    Looks up all runs of the dataset on the CryoET Data Portal, collects their
    tilt series and hands each one to ``download_aretomo_files_for_tiltseries``
    in a pool of worker processes.

    Args:
        dataset_id: Dataset ID on the CryoET Data Portal.
        output_dir: Directory the files are written to; created if missing.
        processes: Number of worker processes used for the downloads.

    Raises:
        ValueError: If no dataset with ``dataset_id`` exists on the portal.
    """
    print(f"Fetching tiltseries for dataset id {dataset_id}...", flush=True)
    dataset = cdp.Dataset.get_by_id(global_client, dataset_id)
    if dataset is None:
        raise ValueError(f"No dataset found for dataset id {dataset_id}")

    # Take the run id from the run being iterated rather than going back
    # through ``ts.run`` for every tilt series (avoids re-resolving the run).
    run_and_tiltseries_ids = [(run.id, ts.id) for run in dataset.runs for ts in run.tiltseries]
    print(
        f"Found {len(run_and_tiltseries_ids)} tiltseries for dataset id {dataset_id}. Starting downloads...",
        flush=True,
    )
    if not run_and_tiltseries_ids:
        # Nothing to download: do not start a worker pool for zero jobs.
        return

    os.makedirs(output_dir, exist_ok=True)

    # The worker's second tuple slot is called ``run_name`` but is only used as
    # a log prefix when a tiltseries id is supplied, so the run id is passed.
    jobs = [(dataset_id, run_id, output_dir, tiltseries_id) for run_id, tiltseries_id in run_and_tiltseries_ids]
    with multiprocessing.Pool(processes=processes) as pool:
        # Drain the iterator so every job runs and worker exceptions surface here.
        for _ in pool.imap_unordered(_worker_download_aretomo_files_for_tiltseries, jobs):
            pass
40+
41+
42+
def _worker_download_aretomo_files_for_tiltseries(args):
    """Pool entry point: unpack one job tuple and run the per-tiltseries download.

    ``args`` is a 4-tuple ``(dataset_id, run_name, output_dir, tiltseries_id)``;
    ``imap_unordered`` passes a single argument, hence the tuple packing.
    """
    ds_id, run_label, out_dir, ts_id = args
    download_aretomo_files_for_tiltseries(
        dataset_id=ds_id,
        run_name=run_label,
        output_dir=out_dir,
        tiltseries_id=ts_id,
    )
45+
46+
47+
# (connect, read) timeout in seconds for the HTTPS tilt-series download, so a
# stalled connection cannot block a worker forever.
_MRC_HTTP_TIMEOUT = (10, 300)
# Chunk size in bytes used when streaming the tilt-series MRC file to disk.
_MRC_CHUNK_SIZE = 1024 * 1024


def _s3_fetch(s3, remote_path, dest_file, run_id):
    """Copy one S3 object to ``dest_file`` and log the rename under ``run_id``."""
    s3.get(remote_path, dest_file)
    print(
        f"[{run_id}] Downloaded {os.path.basename(remote_path)} as {os.path.basename(dest_file)}.",
        flush=True,
    )


def download_aretomo_files_for_tiltseries(dataset_id: int, run_name: str, output_dir: str, tiltseries_id: int = None):
    """Download the AreTomo files belonging to a single tilt series.

    Every file is saved in ``output_dir`` using the run id as base name:
    ``*.tlt`` / ``*.rawtlt``, one CTF text file (``<run id>_CTF.txt``), the
    ``.aln`` file of the alignment, the ``.mdoc`` file found next to the
    frames and the tilt-series ``.mrc`` itself. Finally an order list
    (``ImageNumber``, ``TiltAngle``) derived from the mdoc file is written to
    ``<run id>_Imod/<run id>_order_list.csv``.

    Notes:
        * This function assumes that there is only one tiltseries per run.
        * The tiltseries name is equivalent to the run name.
        * If ``tiltseries_id`` is provided, it takes priority over
          ``dataset_id`` + ``run_name``.

    Args:
        dataset_id: Dataset ID, used for the lookup when no tiltseries id is given.
        run_name: Run name, used for the lookup when no tiltseries id is given
            and as prefix of the first log line.
        output_dir: Directory the files are written to; created if missing.
        tiltseries_id: Portal id of the tilt series; ``None`` triggers the
            lookup by ``dataset_id`` and ``run_name``.

    Raises:
        ValueError: If the tilt series cannot be resolved unambiguously, or if
            no ``.aln`` file, no frames or no ``.mdoc`` file are found.
        requests.HTTPError: If the HTTPS download of the ``.mrc`` file fails.
    """
    print(f"[{run_name}] Downloading AreTomo files for tiltseries id {tiltseries_id}...", flush=True)

    client = cdp.Client()
    s3 = s3fs.S3FileSystem(anon=True)
    if tiltseries_id is None:
        all_tiltseries = cdp.TiltSeries.find(
            client,
            query_filters=[cdp.TiltSeries.run.dataset_id == dataset_id, cdp.TiltSeries.run.name == run_name],
        )
        if len(all_tiltseries) == 0:
            raise ValueError(f"No tiltseries found for dataset_id {dataset_id} and run_name {run_name}")
        if len(all_tiltseries) > 1:
            raise ValueError(f"Multiple tiltseries found for dataset_id {dataset_id} and run_name {run_name}")
        tiltseries = all_tiltseries[0]
    else:
        tiltseries = cdp.TiltSeries.get_by_id(client, tiltseries_id)
        if tiltseries is None:
            raise ValueError(f"No tiltseries found for tiltseries id {tiltseries_id}")

    # Resolve the related run once and reuse it below instead of going through
    # ``tiltseries.run`` on every use.
    run = tiltseries.run
    run_id = run.id

    os.makedirs(output_dir, exist_ok=True)

    # get the s3 folder path and then glob for *.tlt / *.rawtlt files to download them,
    # renaming the base to match the run id
    s3_folder_path = tiltseries.s3_mrc_file.rsplit("/", 1)[0] + "/"
    tlt_files = s3.glob(s3_folder_path + "*.tlt") + s3.glob(s3_folder_path + "*.rawtlt")
    for tlt_file in tlt_files:
        ext = os.path.splitext(os.path.basename(tlt_file))[1]
        _s3_fetch(s3, tlt_file, os.path.join(output_dir, f"{run_id}{ext}"), run_id)

    # do the same for "*CTF*.txt" files and "*ctf*.txt" files
    ctf_files = s3.glob(s3_folder_path + "*CTF*.txt") + s3.glob(s3_folder_path + "*ctf*.txt")
    if len(ctf_files) == 0:
        print(f"WARNING: No CTF files found for tiltseries id {tiltseries.id}")
    else:
        ctf_file = ctf_files[0]
        if len(ctf_files) > 1:
            print(
                f"WARNING: Multiple CTF files found for tiltseries id {tiltseries.id}, "
                f"using {os.path.basename(ctf_file)}",
            )
        _s3_fetch(s3, ctf_file, os.path.join(output_dir, f"{run_id}_CTF.txt"), run_id)

    # now find the corresponding alignment for this tiltseries and download the "*.aln" file;
    # the download only happens when exactly one alignment exists
    alignments = tiltseries.alignments
    if len(alignments) == 0:
        print(f"WARNING: No alignments found for tiltseries id {tiltseries.id}")
    elif len(alignments) > 1:
        print(f"WARNING: Multiple alignments found for tiltseries id {tiltseries.id}")
    else:
        alignment = alignments[0]
        s3_alignment_folder_path = alignment.s3_alignment_metadata.rsplit("/", 1)[0] + "/"
        aln_files = s3.glob(s3_alignment_folder_path + "*.aln")
        if len(aln_files) == 0:
            raise ValueError(f"No .aln files found for run name {run.name} and alignment id {alignment.id}")
        aln_file = aln_files[0]
        aln_base_name = os.path.basename(aln_file)
        if len(aln_files) > 1:
            print(f"WARNING: Multiple .aln files found for run name {run.name}, using {aln_base_name}")
        ext = os.path.splitext(aln_base_name)[1]
        _s3_fetch(s3, aln_file, os.path.join(output_dir, f"{run_id}{ext}"), run_id)

    # now get the mdoc file from the Frames/ folder
    frames = run.frames
    if len(frames) == 0:
        raise ValueError(f"No frames found for run name {run.name}")
    s3_frames_folder_path = frames[0].s3_frame_path.rsplit("/", 1)[0] + "/"
    mdoc_files = s3.glob(s3_frames_folder_path + "*.mdoc")
    if len(mdoc_files) == 0:
        raise ValueError(f"No .mdoc files found for run name {run.name}")
    mdoc_file = mdoc_files[0]
    mdoc_base_name = os.path.basename(mdoc_file)
    if len(mdoc_files) > 1:
        print(f"WARNING: Multiple .mdoc files found for run name {run.name}, using {mdoc_base_name}")
    ext = os.path.splitext(mdoc_base_name)[1]
    _s3_fetch(s3, mdoc_file, os.path.join(output_dir, f"{run_id}{ext}"), run_id)

    # download tiltseries mrc file; the context manager releases the connection
    # even if writing fails half-way
    tiltseries_file = os.path.join(output_dir, f"{run_id}.mrc")
    with requests.get(tiltseries.https_mrc_file, stream=True, timeout=_MRC_HTTP_TIMEOUT) as response:
        response.raise_for_status()
        with open(tiltseries_file, "wb") as f:
            for chunk in response.iter_content(chunk_size=_MRC_CHUNK_SIZE):
                f.write(chunk)
    print(f"[{run_id}] Downloaded tiltseries mrc file as {os.path.basename(tiltseries_file)}.", flush=True)

    # create imod file for order list
    mdoc = mdocfile.read(os.path.join(output_dir, f"{run_id}.mdoc"))
    order_list = mdoc["TiltAngle"]
    imodpath = os.path.join(output_dir, f"{run_id}_Imod")
    os.makedirs(imodpath, exist_ok=True)
    number = np.arange(len(order_list)) + 1  # 1-based image numbers

    # save in csv with 'ImageNumber', 'TiltAngle' headers
    df = pd.DataFrame({"ImageNumber": number, "TiltAngle": order_list})
    df.to_csv(os.path.join(imodpath, f"{run_id}_order_list.csv"), index=False)

0 commit comments

Comments
 (0)