1+ #!/usr/bin/env python3
2+ #
3+ # Create a new database including computation for all PDB entries.
4+ #
5+ import argparse
6+ import logging
7+ import os
8+ import typing
9+ import sys
10+ import requests
11+ import re
12+ import json
13+ import datetime
14+ import dataclasses
15+
16+ # Create a symlink to the directory to allow typed imports
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Make sibling helper directories resolvable as plain imports.
    sys.path.append(os.path.join(script_dir, ".."))
    sys.path.append(os.path.join(script_dir, "../executor-p2rank"))
    sys.path.append(os.path.join(script_dir, "../conservation/hmm_based"))
    sys.path.append(os.path.join(script_dir, "../conservation/alignment_based"))

    # "executor-p2rank" is not a valid Python package name (dash), so expose
    # it under the importable alias "executor_p2rank" via a symlink.
    # target_is_directory is required on Windows; harmless elsewhere.
    os.symlink(os.path.join(script_dir, "..", "executor-p2rank"), os.path.join(script_dir, "..", "executor_p2rank"), target_is_directory=True)
except FileExistsError:
    # Symlink left over from a previous run; nothing to do.
    print("Symlinks already exist")

# Module-level logger; file handler and format are set up in _init_logging().
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
30+
31+ # Define a schema for the prediction.
32+ # We use a schema that corresponds to DatabaseV3 from web_server.src.database_v3
33+ from executor_p2rank .run_p2rank_task import execute_directory_task
34+
@dataclasses.dataclass
class Prediction:
    """Description of one prediction task; serialized into the task's
    info.json / input/configuration.json files (schema corresponds to
    DatabaseV3 from web_server.src.database_v3)."""
    # Directory with given prediction task.
    directory: str
    # User identifier of given task.
    identifier: str
    # Name of a database.
    database: str
    # Name of a conservation to compute.
    conservation: str
    # If true structure is not modified before predictions.
    structure_sealed: bool
    # Configuration file for p2rank.
    p2rank_configuration: str
    # Additional metadata to save to info file.
    metadata: typing.Dict
    # Identification of experimental structure.
    structure_code: typing.Optional[str] = None
    # File with user provided structure.
    structure_file: typing.Optional[str] = None
    # Identification of predicted structure.
    uniprot_code: typing.Optional[str] = None
    # Restriction to given chains.
    chains: typing.Optional[list[str]] = None
59+
60+
def _init_logging():
    """Route all log records into 'create_pdb_database.log', appending
    across runs, at DEBUG level."""
    record_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(
        filename="create_pdb_database.log",
        filemode='a',
        level=logging.DEBUG,
        format=record_format,
    )
67+
def _end_logging():
    """Flush and close every logging handler before the script exits."""
    logging.shutdown()
70+
71+
72+ def _read_arguments () -> typing .Dict [str , str ]:
73+ parser = argparse .ArgumentParser ()
74+ parser .add_argument (
75+ "--output_directory" , required = True ,
76+ help = "Output directory for the database." )
77+ parser .add_argument (
78+ "--database_name" , required = True ,
79+ help = "Name of the database (e.g. 'v3')" )
80+ parser .add_argument (
81+ "--input_pdbs" ,
82+ help = "Path to the JSON file with PDB IDs that will be merged with all current PDB entries available online." )
83+ parser .add_argument (
84+ "--compute_conservation" , action = "store_true" ,
85+ help = "If set, conservation will be computed for all PDB entries." )
86+
87+
88+ return vars (parser .parse_args ())
89+
def _create_folders(args: typing.Dict[str, str]) -> None:
    """Create the database directory tree under args["output_directory"].

    One directory per database variant (plain, alphafold, conservation,
    user-upload); existing directories are left untouched.
    """
    folder_names = ["", "-alphafold", "-alphafold-conservation-hmm", "-conservation-hmm", "-user-upload"]
    for folder_name in folder_names:
        folder_path = os.path.join(args["output_directory"], args["database_name"] + folder_name)
        if not os.path.exists(folder_path):
            # exist_ok guards the TOCTOU window between the exists() check
            # above and this call; the check is kept only for the log line.
            os.makedirs(folder_path, exist_ok=True)
            logger.info(f"Created folder {folder_path}")

    logger.info("All folders created")
99+
def _get_pdb_entries(args: typing.Dict[str, str]) -> typing.List[str]:
    """Return the list of PDB IDs to process.

    Downloads all current entry IDs from the RCSB holdings endpoint and,
    when args["input_pdbs"] points to a JSON file with extra IDs, merges
    those in (deduplicated). Exits the process when the download fails.
    """
    url_entries = "https://data.rcsb.org/rest/v1/holdings/current/entry_ids"

    # A timeout prevents the whole build from hanging on a stuck connection.
    response = requests.get(url_entries, timeout=60)
    if response.status_code != 200:
        logger.error(f"Failed to get PDB entries: {response.text}")
        sys.exit(1)

    entries = response.json()
    logger.info(f"Number of PDB entries: {len(entries)}")

    # If input_pdbs is provided, merge its IDs with the downloaded entries.
    if args["input_pdbs"]:
        with open(args["input_pdbs"], "r") as f:
            input_pdbs = json.load(f)

        entries = set(entries)
        entries.update(input_pdbs)
        logger.info(f"Merged PDB entries with the input_pdbs: {len(entries)}")

    entries_list = list(entries)
    logger.info(f"Final number of PDB entries: {len(entries_list)}")

    logger.info("Returning list of PDB entries")

    return entries_list
128+
def _get_directory(identifier: str, args: typing.Dict[str, str]) -> typing.Optional[str]:
    """Return directory for task with given identifier, or None if invalid.

    The identifier must consist solely of word characters and commas
    (e.g. "2SRC_A,B"); anything else — in particular path separators or
    dots — is rejected so the path cannot escape the output directory.
    """
    # fullmatch, not match: re.match only anchors at the start, so an
    # identifier like "2SRC/../../x" would previously pass validation
    # and be joined straight into a filesystem path (path traversal).
    if not re.fullmatch(r"[\w,]+", identifier):
        return None
    # Shard tasks by the 2nd and 3rd identifier characters, mirroring the
    # PDB directory convention (e.g. "2SRC" -> "SR").
    directory = identifier[1:3]
    db = _get_database_name(args)
    return os.path.join(args["output_directory"], db, directory, identifier)
136+
137+
138+ def _get_database_name (args : typing .Dict [str , str ]) -> str :
139+ return args ["database_name" ] + "-conservation-hmm" if args ["compute_conservation" ] else args ["database_name" ]
140+
141+ def _parser_identifier (identifier : str ):
142+ """2SRC_A,B into 2SRC, [A,B]"""
143+ if "_" not in identifier :
144+ return identifier , []
145+ code , chains = identifier .split ("_" )
146+ return code .upper (), [chain .upper () for chain in chains .split ("," )]
147+
def _prepare_prediction_directory(prediction: Prediction):
    """Initialize the content of the directory for the given task.

    Writes info.json at the task root and input/configuration.json with the
    p2rank run parameters; returns the info dictionary that was written.
    """
    info = _create_info_file(prediction)
    _save_json(_info_file(prediction), info)

    input_directory = os.path.join(prediction.directory, "input")
    os.makedirs(input_directory, exist_ok=True)

    configuration = {
        "p2rank_configuration": prediction.p2rank_configuration,
        "structure_file": prediction.structure_file,
        "structure_code": prediction.structure_code,
        "structure_sealed": prediction.structure_sealed,
        "structure_uniprot": prediction.uniprot_code,
        "conservation": prediction.conservation,
        "chains": prediction.chains,
    }
    _save_json(os.path.join(input_directory, "configuration.json"), configuration)
    return info
166+
167+ def _create_info_file (prediction : Prediction ):
168+ now = datetime .datetime .today ().strftime ("%Y-%m-%dT%H:%M:%S" )
169+ return {
170+ "id" : prediction .identifier ,
171+ "database" : prediction .database ,
172+ "created" : now ,
173+ "lastChange" : now ,
174+ "status" : "queued" ,
175+ "metadata" : prediction .metadata ,
176+ }
177+
178+ def _info_file (prediction : Prediction ) -> str :
179+ return os .path .join (prediction .directory , "info.json" )
180+
181+ def _save_json (path : str , content ):
182+ with open (path , "w" , encoding = "utf-8" ) as stream :
183+ json .dump (content , stream , ensure_ascii = True )
184+
185+
def _run_predictions(args: typing.Dict[str, str], entries_list: typing.List[str]):
    """Prepare a task directory for every entry and execute the p2rank task.

    A failure of any single entry is logged and skipped so it cannot abort
    the remaining entries; a summary of (un)successful entries is logged at
    the end.
    """
    successful_entries = []
    for entry in entries_list:
        logger.info(f"Running prediction for entry {entry}")

        directory = _get_directory(entry, args)
        if directory is None:
            logger.error(f"Invalid entry directory: {entry}")
            continue

        logger.info(f"Preparing prediction for entry {entry}")

        pdb_code, chains = _parser_identifier(entry)

        prediction = Prediction(
            directory=directory,
            identifier=entry,
            database=_get_database_name(args),
            structure_sealed=len(chains) == 0,
            conservation="hmm" if args["compute_conservation"] else "none",
            p2rank_configuration="conservation_hmm" if args["compute_conservation"] else "default",
            structure_code=pdb_code,
            chains=chains,
            metadata={},
        )

        try:
            os.makedirs(prediction.directory, exist_ok=True)
        except OSError:
            logger.error(f"Failed to create directory {prediction.directory}")
            continue

        _prepare_prediction_directory(prediction)

        logger.info(f"Running prediction for entry {entry}")

        try:
            execute_directory_task(prediction.directory, keep_working=False, stdout=False)
        except Exception:
            # Previously an exception here killed the whole database build
            # (and the entry would otherwise have been counted as successful).
            logger.exception(f"Prediction failed for entry {entry}")
            continue

        successful_entries.append(entry)

    logger.info(f"Number of successful entries: {len(successful_entries)}")
    logger.info(f"Number of unsuccessful entries: {len(entries_list) - len(successful_entries)}")

    logger.info("Successful entries:")
    logger.info(successful_entries)

    # Membership test against a set keeps this linear; the previous
    # list-based "not in" scan was O(n^2) over ~200k PDB entries.
    successful_set = set(successful_entries)
    logger.info("Unsuccessful entries:")
    logger.info([entry for entry in entries_list if entry not in successful_set])

    logger.info("All predictions done")
236+
237+
if __name__ == "__main__":
    # Pipeline: parse CLI arguments, configure file logging, create the
    # database directory tree, collect the PDB IDs, then run a prediction
    # for every entry.
    args = _read_arguments()
    _init_logging()
    logger.info("Creating a new database ...")

    _create_folders(args)
    pdb_entries = _get_pdb_entries(args)

    _run_predictions(args, pdb_entries)

    # Flush and close all log handlers before exiting.
    _end_logging()