From 75a4d501fc240dd3996c5507a22e702890af2509 Mon Sep 17 00:00:00 2001
From: David Meijer
Date: Mon, 22 Dec 2025 21:14:45 +0100
Subject: [PATCH 01/12] WIP: fast paras inference

---
 pyproject.toml                                |   3 +
 scripts/run_paras_on_gbks.py                  |  60 ++
 src/biocracker/data/AMP-binding_converted.hmm | 690 ++++++++++++++++++
 src/biocracker/paras.py                       |   2 +-
 src/biocracker/paras_fast.py                  | 345 +++++++++
 5 files changed, 1099 insertions(+), 1 deletion(-)
 create mode 100644 scripts/run_paras_on_gbks.py
 create mode 100644 src/biocracker/data/AMP-binding_converted.hmm
 create mode 100644 src/biocracker/paras_fast.py

diff --git a/pyproject.toml b/pyproject.toml
index cb27c5a..b101214 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,9 @@ allow-direct-references = true
 [tool.hatch.build.targets.wheel]
 packages = ["src/biocracker"]
 sources = ["src"]
+include = [
+  "src/biocracker/data/AMP-binding_converted.hmm"
+]
 
 # -------------------------
 # Hatch env + scripts
diff --git a/scripts/run_paras_on_gbks.py b/scripts/run_paras_on_gbks.py
new file mode 100644
index 0000000..25a1cab
--- /dev/null
+++ b/scripts/run_paras_on_gbks.py
@@ -0,0 +1,60 @@
+"""Run a PARAS model over the A-domains found in a directory of GenBank files.
+
+Prints one tab-separated row per A-domain: protein name, top predicted class,
+its probability, and the extended signature ("N/A" when the signature is None).
+"""
+
+import argparse
+import glob
+
+import joblib
+from tqdm import tqdm
+
+from biocracker.antismash import parse_region_gbk_file
+from biocracker.paras_fast import find_a_domains, featurize_signature
+
+
+def cli():
+    """Parse the command-line arguments (``--gbks`` and ``--model``, both required)."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--gbks", type=str, required=True, help="directory containing *.gbk files")
+    parser.add_argument("--model", type=str, required=True, help="path to a model file loadable with joblib")
+    return parser.parse_args()
+
+
+def main():
+    """Load the model, scan every ``*.gbk`` file and print one row per A-domain."""
+    args = cli()
+
+    # NOTE: joblib.load unpickles the file; only load models from a trusted source.
+    model = joblib.load(args.model)
+
+    # Sorted so the output order is reproducible and tqdm can show a total.
+    gbk_files = sorted(glob.glob(f"{args.gbks}/*.gbk"))
+    for gbk_file in tqdm(gbk_files):
+        for region in parse_region_gbk_file(gbk_file, top_level="region"):
+            for gene in region.genes:
+                a_domains = find_a_domains(
+                    seq_id=gene.name,
+                    protein_seq=gene.protein_seq,
+                    evalue_cutoff=1e-5,
+                )
+                for a_domain in a_domains:
+                    protein_name = a_domain.protein
+                    signature = a_domain.extended_signature
+                    if signature is None:
+                        print(protein_name, "N/A", "0.0000", "N/A", sep="\t")
+                        continue
+                    if model is None:
+                        print(protein_name, "N/A", "0.0000", signature, sep="\t")
+                        continue
+                    # Reshape the feature vector into a single-sample batch for predict_proba.
+                    features = featurize_signature(signature).reshape(1, -1)
+                    probabilities = model.predict_proba(features)[0]
+                    ranked = zip(model.classes_, probabilities)
+                    label, probability = max(ranked, key=lambda pair: pair[1])
+                    print(protein_name, label, f"{probability:.4f}", signature, sep="\t")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/biocracker/data/AMP-binding_converted.hmm b/src/biocracker/data/AMP-binding_converted.hmm
new file mode 100644
index 0000000..d3b5e7a
--- /dev/null
+++ b/src/biocracker/data/AMP-binding_converted.hmm
@@ -0,0 +1,690 @@
+HMMER3/f [3.4 | Aug 2023]
+NAME AMP-binding
+LENG 166
+ALPH amino
+RF no
+MM no
+CONS yes
+CS no
+MAP yes
+DATE Mon Feb 23 10:56:16 2004
+COM [1] hmmbuild aa-activating-core.198-334.hmm aa-activating-core.198-334.aln
+COM [2] hmmcalibrate aa-activating-core.198-334.hmm
+NSEQ 201
+STATS LOCAL MSV -9.0550 0.70049
+STATS LOCAL VITERBI -9.8471 0.70049
+STATS LOCAL FORWARD -6.0470 0.70049
+HMM A C D E F G H I K L M N P Q R S T V W Y
+ m->m m->i m->d i->m i->i d->m d->d
+ 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288
+ 0.01026 * 4.58503 0.00000 * 0.00000 *
+ 1 9.62963 10.04414 9.51457 9.52705 10.17930 8.61279 10.03027 10.56885 0.00113 9.59220 10.93899 9.87778 9.23385 9.85699 9.15067 10.02889 9.95957 10.24515 10.42606 10.37962 1 K - - -
+ 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288
+ 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505
+ 2 2.49805 4.65097
8.61924 8.68647 8.81401 0.18502 8.98037 8.34891 8.48130 2.65956 8.75163 7.63012 7.52268 8.47160 8.33643 6.13708 6.52039 7.31889 10.15317 9.21465 2 G - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 3 3.65808 7.86410 8.92253 8.57804 7.58892 8.22384 9.73074 2.16574 8.61478 5.90596 7.22155 3.71076 8.58844 8.96690 8.85461 7.74141 3.98871 0.21037 9.98444 8.67092 3 V - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 4 2.84536 6.78312 7.37022 3.13301 4.26145 4.83746 6.91066 2.82248 4.98510 2.11963 1.06327 6.73876 2.43293 2.41352 6.54884 5.88827 5.00451 1.93317 7.40488 4.94143 4 m - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 5 3.92938 6.78238 7.38472 6.76851 4.35151 3.34229 6.91477 1.22056 3.59321 2.84322 3.01304 4.30299 6.94388 6.67979 2.68172 5.89168 3.07473 1.01470 7.40482 6.22855 5 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 6 3.58555 7.73681 3.98965 0.97932 7.08317 4.00698 6.18139 6.57024 4.91708 6.04276 6.77818 4.42010 2.50147 4.03055 2.73506 2.19510 1.40421 6.12871 8.16864 4.77776 6 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 7 4.90653 9.71420 5.00703 5.68285 8.80825 
3.71293 0.16263 8.72508 6.89794 8.10679 9.11740 4.52322 7.58208 2.66212 7.68882 6.69138 7.26739 8.18581 10.09820 3.79333 7 H - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 8 2.27964 7.72154 2.42936 4.40552 4.48731 2.79465 3.28332 4.96489 2.70662 4.40413 6.76222 3.24727 6.37544 2.36212 0.97236 3.64791 3.70128 4.12341 8.15337 3.92656 8 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 9 1.99069 5.11955 2.61313 4.08884 7.38545 1.52697 6.45247 6.88153 5.22976 6.35266 7.09779 1.34883 4.04656 3.35480 5.73784 1.55054 4.70713 6.42821 8.47993 7.05343 9 n - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 10 2.55004 6.83577 7.44643 6.83092 3.84831 6.64100 6.97440 1.48398 6.61258 1.42853 3.98763 6.80805 6.99936 4.71544 6.61397 5.95340 5.69762 1.23029 4.64404 2.41627 10 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 11 1.62044 3.64304 4.27519 6.75111 3.66730 4.75554 4.27866 2.39260 6.53693 2.36765 3.70127 4.27450 6.94034 4.28212 4.68969 2.61372 2.72185 1.15880 7.40475 4.57394 11 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 12 2.73589 7.71199 5.46273 3.38190 4.85831 4.24071 3.08108 6.53641 
4.90891 4.12149 6.75406 0.89142 6.37907 4.13397 1.84312 2.21048 3.56697 6.10112 8.14660 2.58132 12 n - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 13 3.10874 3.91001 7.35634 6.74221 2.25894 3.95784 4.77714 5.21590 6.53011 0.97176 4.13390 3.62582 6.93907 4.84992 3.15518 2.64571 2.37816 2.99298 7.40625 2.22844 13 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 14 2.34429 3.31955 7.38763 6.77142 2.67839 4.75644 3.91150 2.47322 4.94082 1.50905 4.14023 6.74647 6.94332 3.69038 3.75138 3.38470 2.93970 1.25674 4.62474 4.75298 14 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 15 2.56467 5.35251 2.60765 3.23772 3.60232 4.31210 3.43873 3.53785 1.96787 2.32277 4.43063 2.65686 4.28645 2.73034 2.73588 2.89323 2.34495 3.12751 4.91028 3.28485 15 k - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 16 1.76057 4.11380 2.11130 3.07478 3.35827 3.14409 3.49274 6.55507 3.87051 6.02758 6.76232 2.51957 4.19074 3.44769 4.04796 2.62284 3.91556 3.55235 1.41469 6.74222 16 w - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 17 2.88288 4.60397 4.42444 4.30591 2.60216 6.31535 2.44065 3.05755 4.62199 1.90832 
2.33322 3.61900 3.72644 3.26619 3.13449 2.11349 2.10170 3.93300 3.27728 4.56307 17 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.06801 8.15565 2.72623 0.61948 0.77267 0.48579 0.95505 + 18 2.02815 3.73260 3.57318 2.86547 3.92044 2.61247 4.81114 2.58614 2.84607 3.57526 4.73351 1.97755 6.31596 2.47038 2.09261 2.84260 3.51773 3.10045 8.08002 5.02879 18 n - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.04815 8.08865 3.06403 0.61948 0.77267 2.15850 0.12273 + 19 2.88202 5.06474 2.23462 1.54771 5.08484 3.42684 3.40881 6.44757 2.32750 4.42150 5.06266 3.06085 4.10057 2.55486 2.10431 3.14264 3.91966 3.92729 8.04597 2.49386 19 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 1.23585 8.04048 0.34377 0.61948 0.77267 2.59059 0.07794 + 20 2.16806 6.14603 2.76763 2.29214 5.35585 3.69922 3.49059 3.18768 3.95222 2.46819 3.32908 4.46862 5.29623 4.29602 1.27252 4.11719 2.57147 2.66366 6.66867 5.35792 20 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.34135 6.80633 1.24452 0.61948 0.77267 2.77981 0.06406 + 21 3.13720 5.41350 6.01376 5.39755 2.43227 5.21110 5.54450 2.03995 5.17991 3.03947 4.53251 5.37607 3.28969 3.56834 5.18476 4.52211 2.99372 2.44336 2.78716 0.99815 21 y - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00192 6.65197 7.37493 0.61948 0.77267 0.96446 0.47995 + 22 2.42942 4.28220 3.09137 3.29446 1.92481 2.70598 3.17663 2.52438 4.19209 3.09137 4.18585 2.31505 
5.98873 3.68817 4.07426 3.54053 3.75610 3.35546 2.72331 2.21593 22 f - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00083 7.49306 8.21531 0.61948 0.77267 0.02287 3.78943 + 23 2.28104 7.69729 2.48067 2.36283 3.71447 2.72535 3.25214 2.80714 3.80735 2.12300 4.28701 4.05688 3.48989 2.83902 2.25470 3.24244 3.56960 2.45225 8.13536 4.59338 23 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.05512 8.14970 2.93099 0.61948 0.77267 0.33173 1.26471 + 24 3.60168 2.51621 3.34730 4.72666 1.40025 4.23314 3.56564 2.72347 4.82925 2.41779 2.86972 4.01411 3.45820 3.79715 4.95817 3.26412 3.34522 2.87734 4.58457 2.11904 24 f - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00045 8.10126 8.82352 0.61948 0.77267 0.08938 2.45925 + 25 2.28873 4.65791 3.02000 3.13784 7.06798 1.56716 4.57126 6.55505 3.59809 4.01675 3.97446 2.99089 2.35943 3.21200 2.56044 2.14178 2.44954 3.23696 5.29768 6.74220 25 g - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.34333 8.15580 1.23680 0.61948 0.77267 0.48579 0.95505 + 26 2.71022 4.25594 3.26751 2.10302 2.72755 2.60694 3.32851 2.66239 2.93272 2.11134 3.90382 3.97383 2.46208 4.27327 3.23008 3.44426 2.54387 3.00134 7.78337 6.39153 26 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.10235 7.81292 2.33428 0.61948 0.77267 3.49180 0.03092 + 27 2.50207 7.30904 2.03211 2.01063 3.73240 2.51454 3.37405 4.60785 3.87935 3.07807 4.51081 2.46949 3.75805 2.42582 
3.45445 2.48959 2.21441 5.70094 7.74087 6.32893 27 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00067 7.71123 8.43349 0.61948 0.77267 3.69469 0.02517 + 28 2.81335 7.30910 1.46171 3.00535 3.33529 2.79394 3.69434 6.14254 3.60631 5.61505 6.34979 3.76851 1.76878 2.30943 2.71631 2.40093 3.25766 4.48869 7.74094 6.32899 28 d - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 1.70184 7.71142 0.20187 0.61948 0.77267 3.69469 0.02517 + 29 1.91507 5.89997 1.76951 3.13154 5.23109 4.20384 2.40859 4.70707 2.40166 2.67684 4.94482 3.67636 4.59686 3.49337 2.76140 2.68308 2.29561 4.28355 6.34151 3.30968 29 d - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 1.61022 6.01309 0.22601 0.61948 0.77267 1.18301 0.36579 + 30 2.79058 5.64149 3.69514 5.60060 2.91951 3.67989 3.57037 2.89871 3.65494 0.98770 2.92505 5.59020 5.79607 5.52158 5.40097 3.69444 3.40887 1.68154 6.26255 3.47610 30 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 2.95139 6.91619 0.05473 0.61948 0.77267 3.16174 0.04327 + 31 2.87170 4.79934 3.47682 2.95557 3.98698 3.73883 2.25272 3.40958 2.91190 1.83059 3.94539 3.41236 2.47661 3.26610 3.30838 1.98655 3.12331 3.14203 5.37050 4.07916 31 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01160 4.85877 5.58103 0.61948 0.77267 1.13451 0.38799 + 32 2.67681 6.75390 1.96702 3.10656 6.10026 1.59619 5.20056 3.84615 3.03516 5.05985 5.79458 2.58878 2.92149 3.49611 3.09478 2.06822 
2.15833 5.14580 7.18573 5.77379 32 g - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00125 7.08313 7.80539 0.61948 0.77267 2.17385 0.12074 + 33 1.92122 6.90772 2.92005 1.53306 6.25478 2.80498 3.20562 5.74115 3.20424 5.21436 5.94910 3.85579 2.78488 2.29968 3.09472 2.66497 2.36483 3.42327 7.34024 5.92830 33 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00104 7.26245 7.98471 0.61948 0.77267 0.71487 0.67189 + 34 2.28816 4.33849 2.40392 2.55987 6.73193 2.27984 3.06587 6.21900 3.46998 5.69151 6.42625 2.67494 2.44135 2.27846 2.56265 2.14329 2.63543 5.77746 7.81740 6.40546 34 s - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01438 7.79357 4.27862 0.61948 0.77267 0.87005 0.54288 + 35 2.85298 7.50885 1.12705 2.55354 4.30166 4.26493 3.83795 4.55605 4.11243 2.27906 6.55162 3.94469 3.34443 3.66397 5.20761 2.25618 2.36432 2.60553 7.94623 6.53914 35 d - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00053 7.94607 8.66833 0.61948 0.77267 0.07814 2.58803 + 36 2.37129 5.15844 3.55588 4.89851 7.04241 4.32874 4.37588 3.39022 2.54042 6.00477 5.12933 2.94661 3.63768 3.33269 0.92331 4.43202 2.61320 2.12939 8.13689 6.72842 36 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14560 8.86856 0.61948 0.77267 0.97393 0.47417 + 37 3.06217 6.77189 7.37909 6.76288 2.56449 4.86920 4.62591 1.95729 6.54384 2.29208 2.44249 6.73793 6.93409 6.67138 6.54592 2.77035 3.03028 1.20107 
2.79808 3.04553 37 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14560 8.86856 0.61948 0.77267 0.26948 1.44298 + 38 1.64327 6.78296 7.39016 6.77395 4.59539 3.94175 6.91674 3.18969 6.55492 0.59870 3.04066 6.74900 4.80888 6.68315 6.55769 3.49536 2.80915 3.57022 7.40541 6.22983 38 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 39 2.27216 3.82828 7.10686 6.50798 2.93065 3.77144 2.26523 4.06741 4.33912 1.87291 2.55982 3.11017 4.97405 1.27819 6.44213 2.85440 4.67461 3.49279 7.43264 6.25013 39 q - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 40 3.43103 6.78517 7.35009 6.73665 1.48190 4.94279 2.95831 3.25220 2.67273 1.72797 3.40954 4.56710 6.93836 2.73858 6.54118 3.69997 2.90701 1.86313 7.40623 3.04980 40 f - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 41 1.32527 6.78519 7.34872 6.73459 3.30213 3.25014 3.16350 3.94190 6.52457 2.55076 5.90490 3.92319 3.23281 6.65973 6.54051 1.20328 2.28043 3.33193 4.63159 6.23068 41 s - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 42 2.36358 7.86994 7.21699 4.65582 8.57210 3.03801 8.21929 8.09591 7.30849 4.89564 8.45010 1.81391 1.56438 7.48178 7.61555 0.96481 2.48973 7.21353 9.84125 8.67538 42 
s - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 4.80339 2.61166 0.08514 0.01822 4.01422 0.48579 0.95505 + 43 0.38524 5.76960 3.99307 2.35724 6.12935 2.34892 5.51106 5.63375 4.62591 5.23727 6.05172 4.35628 5.04527 4.74929 5.07022 3.83018 4.22181 4.92466 7.38464 6.08915 44 A - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00523 5.65092 6.37318 0.61948 0.77267 0.01609 4.13754 + 44 3.79924 3.57674 7.38073 6.76452 1.68306 6.57252 2.37343 1.78634 6.54479 1.82931 4.13611 6.73957 3.26274 6.67302 6.54756 5.88353 4.59358 3.67516 7.39528 1.42313 45 y - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14630 8.86856 0.61948 0.77267 0.27476 1.42608 + 45 1.37864 4.05835 4.75496 4.56296 3.89130 2.64294 3.85179 4.03201 6.10244 5.06341 4.46315 2.45926 6.84480 6.32840 6.29583 1.17901 3.01516 2.27211 4.67248 6.28335 46 s - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 46 8.41995 4.72409 9.02369 9.09924 0.07446 8.68820 4.06907 7.92089 8.85248 5.35832 8.39361 8.13299 8.91486 8.34995 8.54888 8.11358 8.58700 7.90009 4.82599 3.50693 47 F - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 47 4.62462 9.30128 0.14203 3.50241 9.04135 4.59481 7.65852 4.64610 7.00835 8.11946 9.08224 6.33738 3.13227 6.87041 7.83666 4.84781 4.16229 8.05916 10.26891 8.59219 48 D - - - + 2.54091 
4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 48 1.26834 5.34890 4.63149 6.77124 2.46541 2.96170 3.88636 2.84386 6.55220 1.61076 2.73504 6.74698 3.02616 6.68043 6.55497 4.54208 3.71376 2.12438 7.40269 4.26343 49 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 49 2.58206 3.14212 7.38765 6.77144 1.98110 3.07489 4.78488 3.33482 6.55310 4.45078 4.11807 6.74718 6.94403 6.68133 6.55656 0.57262 3.60722 3.75486 7.40498 6.22940 50 S - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 50 2.97135 3.61528 7.38808 6.77187 5.87286 4.50736 6.91397 2.09590 6.55284 2.67745 3.96047 6.74692 6.94308 3.75183 6.55561 2.45426 2.28513 0.88082 3.77055 3.19731 51 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 51 3.05469 7.59272 5.53546 2.36708 2.14320 3.23698 6.21682 6.33189 3.39294 2.70257 3.04983 4.43266 2.66791 2.60761 3.09142 3.93013 2.63187 4.84716 1.62680 2.99785 52 w - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00413 8.15554 5.56455 0.61948 0.77267 0.48579 0.95505 + 52 3.50838 5.06103 1.61262 0.81135 7.10374 2.59897 6.19987 6.59150 4.53355 6.06402 6.80014 5.46999 4.53840 2.22606 5.43395 2.62531 4.57098 6.14997 8.19059 4.20361 53 e - - - + 2.54091 4.18909 2.92766 
2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01011 8.15181 4.62855 0.61948 0.77267 0.37006 1.17342 + 53 3.89206 6.80328 7.41394 6.79843 2.57161 6.60781 6.94329 1.03491 6.58078 1.45565 1.99492 6.77486 6.96894 6.70762 6.58286 4.27745 3.38329 2.27287 4.22130 6.25430 54 i - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14560 8.86786 0.61948 0.77267 0.26834 1.44669 + 54 2.83515 3.88042 7.38844 6.77223 0.78690 6.58023 6.91432 3.53662 6.55319 2.07130 3.98231 4.77735 6.94344 6.68143 5.30622 5.89193 3.29055 3.66346 2.52670 2.08031 55 f - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 55 2.31866 6.79985 3.90042 6.61755 5.89460 1.42242 6.88095 4.01548 6.43248 2.48986 3.17261 6.66885 2.18557 6.59052 3.24539 2.57373 1.80157 3.33065 2.87317 6.23979 56 g - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.02411 8.15542 3.74909 0.61948 0.77267 0.48579 0.95505 + 56 1.16810 4.03288 7.35860 6.74239 5.85101 3.99337 6.89003 3.83187 6.52474 3.86791 4.58463 6.72090 1.48279 4.53818 6.53029 2.32774 1.80510 2.88434 3.91089 4.22142 57 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00044 8.13175 8.85401 0.61948 0.77267 0.16372 1.89035 + 57 5.04485 6.88031 7.49374 6.87753 3.13315 6.68969 7.02309 2.85936 6.66058 0.24827 4.13128 6.85743 4.56034 6.77911 6.66058 5.32142 4.12297 3.31060 4.06613 6.32856 58 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 
2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 58 2.18406 4.95457 7.38821 6.77200 3.13159 4.72167 4.50194 2.53756 6.55297 0.68686 4.86377 6.74705 6.94321 4.89981 4.61701 3.11426 3.07475 2.71016 5.14172 3.19051 59 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 59 1.77028 3.69307 4.45899 4.50821 4.00360 3.06438 3.08379 4.71546 4.27878 2.82733 4.21015 1.56372 6.60844 4.61634 5.79469 1.59768 2.74138 3.03596 7.72441 3.33194 60 n - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 60 4.28938 8.17031 6.66271 5.29236 8.90435 0.05286 8.15229 8.43162 5.39425 7.98801 8.79275 6.97532 7.54509 7.41062 5.44208 4.06410 6.71539 7.53123 10.05775 8.86761 61 G - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01313 8.15553 4.36193 0.61948 0.77267 0.48579 0.95505 + 61 1.05269 4.74231 6.54727 6.21109 8.17062 0.86346 2.70238 7.66462 4.29246 7.19536 7.94812 3.19174 7.18358 3.03024 6.60896 3.22363 6.26169 6.99296 9.30669 8.01674 62 g - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14283 8.86509 0.61948 0.77267 0.23521 1.56258 + 62 2.65207 2.90784 5.46971 2.87665 7.07712 4.66012 6.17811 6.56281 2.23688 4.56100 6.77144 5.46070 6.38605 2.47878 2.08716 2.38521 0.98713 6.12266 8.16120 6.75204 63 t - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 
2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.04496 8.15548 3.13085 0.61948 0.77267 0.48579 0.95505 + 63 3.69422 5.08121 7.79350 7.20571 6.25956 7.03381 7.42405 3.00316 7.01371 0.43227 3.98049 7.20502 7.37276 7.15511 7.03450 4.16071 4.10387 1.56349 7.88707 6.70803 64 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00045 8.11096 8.83322 0.61948 0.77267 0.10446 2.31075 + 64 3.08442 3.12254 7.38817 6.77196 3.47189 6.57996 2.19511 2.59228 5.03077 3.19047 5.90067 4.49428 6.94317 6.68116 6.55570 5.06682 4.77500 0.67989 7.40342 2.27759 65 V - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 65 3.68893 2.56326 7.39103 6.77482 4.11175 6.58282 5.39753 1.26915 6.55578 1.40362 3.09421 6.75056 3.42900 6.68402 6.55856 4.65171 4.94006 1.40709 7.40628 6.23070 66 i - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 66 1.88876 3.11216 7.38888 6.77267 5.87297 3.54815 6.91477 1.83400 6.55364 1.99135 4.71541 6.74772 1.72587 6.68187 6.55641 3.05186 3.78452 1.63784 7.40413 4.11307 67 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.14946 8.15531 1.97660 0.61948 0.77267 0.48579 0.95505 + 67 2.64084 7.58229 1.53874 2.58886 6.92934 4.32588 4.94070 6.41572 3.13852 4.09507 6.62367 3.31666 1.35852 3.47678 2.49805 2.69976 2.70045 4.45966 8.01481 6.60287 68 p - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 
2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00050 8.00634 8.72930 0.61948 0.77267 2.80388 0.06249 + 68 3.14618 7.58301 1.80840 2.09259 6.92937 3.58078 3.75684 6.41575 1.64551 4.33007 6.62370 3.65079 2.21459 2.69009 2.74484 3.00200 5.29978 5.97491 8.01484 2.48769 69 k - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.43773 8.00620 1.03799 0.61948 0.77267 2.80388 0.06249 + 69 2.33298 7.17947 2.23040 1.30158 6.52583 2.12157 3.52173 3.96396 3.36646 3.44132 6.22084 3.50301 2.53746 3.81632 4.85465 2.71491 3.42677 3.12178 7.61199 6.20074 70 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 2.98050 7.56983 0.05264 0.61948 0.77267 3.89796 0.02049 + 70 2.99871 4.70316 4.19855 3.85128 1.24782 3.79652 4.25608 3.47421 3.82009 3.07495 4.12854 3.96426 4.40094 4.04813 4.04605 1.28594 3.38410 3.23368 4.77316 3.18239 71 f - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 1.25250 1.00643 1.05357 0.68349 0.70290 1.89918 0.16216 + 71 3.66201 6.17606 2.40464 2.48921 5.52242 4.42239 4.61647 5.00880 2.55297 4.48131 5.21813 3.88867 1.68446 3.71400 1.96796 1.55831 2.60496 4.56795 6.60719 5.19456 74 s - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 2.95326 0.05452 7.02758 0.00911 4.70241 1.08801 0.41081 + 72 2.39831 6.96337 1.82854 1.32670 2.89183 5.22496 3.18503 5.79542 3.35970 3.06581 6.00406 4.69193 2.94381 2.74003 3.37426 2.82251 3.13789 5.35527 4.37793 4.39596 76 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 
3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00098 7.32824 8.05049 0.61948 0.77267 0.21137 1.65797 + 73 3.13920 4.70572 2.55557 2.35040 4.29468 2.22702 4.94485 2.69282 5.75098 2.86194 3.34437 4.59343 6.67009 3.27853 3.49548 5.56106 1.40218 1.96293 4.50956 6.22371 77 t - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00956 8.04521 4.68969 0.61948 0.77267 1.25181 0.33685 + 74 2.10996 7.54284 3.33891 2.95421 4.52627 4.45349 3.32366 2.57922 2.93549 2.13768 3.35277 4.63301 6.31667 3.77489 1.59495 3.20305 2.62912 2.55634 8.00032 3.80747 78 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00047 8.06385 8.78611 0.61948 0.77267 2.40172 0.09493 + 75 3.20297 7.35353 3.29793 5.00792 3.88433 3.64104 3.33744 3.93424 2.45575 1.41187 2.58676 3.04077 4.06178 3.70827 1.72518 3.49617 2.80579 3.47884 4.72442 4.57748 79 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00047 8.06385 8.78611 0.61948 0.77267 0.16247 1.89741 + 76 4.54482 7.90173 0.61676 2.49103 7.24394 3.86068 3.91960 6.73586 3.51758 6.20629 6.94727 2.40438 3.05178 4.50947 5.58108 2.23110 3.56956 6.29224 8.33287 6.90984 80 D - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.03067 8.14603 3.50957 0.61948 0.77267 0.97695 0.47234 + 77 1.96010 4.23916 6.29643 4.53791 2.93951 2.57908 6.57576 2.47511 3.87249 5.16382 3.57721 3.25004 1.03475 4.86230 4.61000 2.92565 3.77406 2.62067 7.54825 6.32554 81 p - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 
3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.03954 8.11609 3.25782 0.61948 0.77267 0.11354 2.23180 + 78 2.14593 7.68695 2.17296 1.86728 4.73414 2.78570 3.32982 4.18378 2.20207 3.18218 4.68493 3.65699 2.77461 2.64014 2.85363 3.52806 3.29170 3.07613 8.11878 6.70684 82 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00990 8.11834 4.65052 0.61948 0.77267 1.72070 0.19716 + 79 1.34242 7.67640 3.49672 2.32253 6.13207 3.90568 3.81418 4.56209 2.88398 2.78486 4.38118 4.44148 3.88904 2.62682 2.00992 2.75436 3.02053 3.41701 8.10892 3.19520 83 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00045 8.10819 8.83114 0.61948 0.77267 0.10008 2.35142 + 80 3.11433 3.54547 7.41877 6.80257 1.99975 6.61195 6.94605 2.65824 6.58422 0.53305 3.72776 6.77900 6.97239 6.70968 6.58630 5.92366 4.32526 3.30980 3.23078 4.77858 84 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 81 1.75916 4.36262 5.72050 3.11565 2.11405 3.61194 4.35153 3.31458 2.85641 2.67897 4.83188 3.13852 4.26419 3.17041 2.75105 3.21616 2.67550 2.83007 7.88866 2.25615 85 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 82 1.74378 7.72217 2.18739 1.82003 5.44103 3.24098 3.34287 5.06257 2.51595 3.89184 5.05494 4.09701 3.09680 2.42722 1.94202 3.32831 3.69984 6.11407 8.15401 6.74206 86 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 
2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 83 2.16268 6.78250 5.33521 4.28440 2.20149 6.57803 4.53116 3.30429 4.64553 1.43765 3.21002 6.74161 6.94193 5.21045 4.23935 5.88904 2.41083 2.37340 2.80869 1.89512 87 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 84 3.23001 4.23854 7.39929 6.78378 2.23465 4.92337 6.92726 0.88717 6.56474 1.46942 2.84185 6.75952 6.95499 6.69298 6.56821 5.90417 4.37856 2.47656 7.41523 3.78453 88 i - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 85 1.93379 4.43466 2.93885 1.62534 4.65300 4.01877 3.20294 5.16593 2.36146 4.01392 6.76294 2.36285 4.01254 3.04698 2.06757 2.75101 3.46079 3.75330 8.15409 6.74215 89 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 86 2.54237 7.72226 2.12510 1.82566 7.06862 4.36812 4.53864 4.57537 1.69188 6.02821 6.76294 3.85311 6.37548 2.48484 1.58028 3.30206 2.72675 6.11416 8.15478 6.74284 90 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.02389 8.15589 3.75856 0.61948 0.77267 0.48579 0.95505 + 87 2.97224 7.68841 5.44123 1.45702 3.60855 4.70095 1.57901 6.51214 3.95859 3.61964 6.73048 2.26731 6.35687 2.13838 4.44517 3.54616 3.15176 4.49647 4.28229 2.42257 91 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 
2.69798 4.47296 3.49288 + 0.00044 8.13175 8.85470 0.61948 0.77267 0.16477 1.88447 + 88 3.24937 4.89698 3.25561 2.93746 7.06861 1.42224 2.96865 6.55568 2.12231 6.02820 6.76294 2.48552 3.36513 2.12925 1.87070 3.31938 5.00234 4.72300 8.15408 6.74283 92 g - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 89 3.49553 3.97727 7.42568 6.81085 5.12651 6.62024 6.95849 0.88167 6.59390 2.74139 5.93402 6.78798 4.63160 6.72352 6.59806 4.43544 3.88161 0.91564 4.56852 6.27020 93 i - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 90 4.72459 7.75295 3.12550 3.51505 7.09862 3.44365 3.57535 4.56239 4.93323 6.05890 6.79502 1.89447 6.39993 4.07095 3.42217 2.34501 0.62740 6.14485 8.18547 6.77145 94 T - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 91 2.16465 6.79210 4.96635 3.77968 2.92157 4.06110 2.65886 2.01077 6.47672 2.78294 3.39360 4.69741 6.92865 4.56155 3.94326 4.67176 2.36774 1.51517 4.01189 2.28803 95 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 92 2.18271 6.78105 7.38825 6.77204 3.75061 4.59694 6.91414 2.35600 6.55301 1.36965 1.97893 6.74709 6.94325 6.68124 6.55578 3.79775 2.20351 2.12934 2.38165 4.40772 96 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 
3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 93 2.79965 4.97682 2.85857 3.38952 2.33524 4.53390 2.39624 3.32575 2.89946 2.80242 3.57459 2.14047 6.53502 3.54963 5.66235 2.38515 2.56467 2.82114 3.33614 3.04641 97 n - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 94 2.01694 4.54762 7.38814 6.77193 2.10220 2.38639 6.91402 2.96309 6.55289 1.09991 4.01459 6.74698 6.94314 3.15717 6.55567 2.00932 4.36740 3.22024 7.40339 6.22781 98 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 95 3.50119 7.02168 7.60739 4.53259 6.23149 6.65500 7.24765 3.88657 6.84701 5.30475 4.06679 6.96484 1.88892 6.97940 6.86087 3.31473 0.70018 1.50839 7.77028 6.59678 99 t - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 96 3.08034 7.30715 7.42568 6.93354 6.77065 4.71478 7.48529 6.12949 6.80670 4.07431 6.77273 4.02787 0.34379 7.01603 5.05095 3.04221 2.20212 4.12630 4.58100 7.08326 100 P - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 97 1.26235 7.45700 7.13608 6.64810 7.08270 3.07423 4.54994 6.46026 6.58572 3.55389 7.05151 6.79158 2.66250 4.37181 6.86575 0.89636 2.17522 4.06128 8.52237 7.32600 101 s - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 
8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 98 2.45300 6.78101 7.38752 6.77131 2.36012 6.58000 3.09416 3.54748 5.06478 0.93709 2.38507 5.10568 6.94321 3.46777 6.55505 4.24617 5.64079 1.94908 4.70296 3.16972 102 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 99 2.26730 4.30377 7.38827 6.77206 1.49375 3.36732 6.91416 2.78300 6.55303 1.02864 3.37287 6.74711 6.94327 4.50686 6.55580 3.89342 5.64085 2.49188 3.99739 3.63349 103 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 100 2.81141 7.72236 2.39136 2.47801 7.06872 3.20651 2.82874 3.47475 2.11203 3.73468 3.85252 2.02677 4.00155 2.13213 2.16263 3.09630 4.60874 4.36753 8.15419 4.21919 104 n - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01708 8.15532 4.09556 0.61948 0.77267 0.48579 0.95505 + 101 2.07594 6.93213 6.46010 3.46986 3.71246 4.07706 2.98674 3.13784 4.82427 1.66075 2.12932 4.60732 3.17250 1.83958 4.76674 3.89337 2.73998 2.58333 7.53032 4.77090 105 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.13867 8.86093 0.61948 0.77267 1.22425 0.34811 + 102 3.31042 2.30536 7.37434 4.76880 2.14039 6.56544 6.90023 2.25406 6.53841 0.84698 4.44649 6.73318 6.92865 6.66664 6.54118 5.87714 4.46659 3.01722 2.67758 2.97702 106 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.02344 8.13881 
3.77753 0.61948 0.77267 1.22425 0.34811 + 103 1.85415 7.11444 2.18686 3.22519 3.23559 6.19463 4.50890 3.50314 4.41810 1.18526 2.51333 3.87259 4.74596 3.84486 5.77043 3.51354 2.98606 2.61730 7.67797 6.41852 107 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.04038 8.11624 3.23717 0.61948 0.77267 0.53365 0.88300 + 104 2.19786 7.66056 1.61215 1.66830 3.70130 3.62921 3.93281 4.70220 3.67843 2.36907 4.36880 3.06776 3.04836 2.58880 5.34752 3.38869 3.33948 3.65001 8.09516 3.92519 108 d - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.26880 8.09998 1.44646 0.61948 0.77267 2.00743 0.14425 + 105 1.41738 3.73110 2.56939 2.13548 6.74491 2.87160 3.18282 3.69506 3.67981 3.44414 3.99242 3.17035 4.54417 2.90626 4.14145 2.71634 3.44345 2.71703 4.42287 3.22233 109 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.16696 7.83179 1.87488 0.61948 0.77267 1.96140 0.15159 + 106 1.65033 7.23710 2.35318 4.07288 6.54880 2.39269 4.64473 2.69768 4.53590 3.00266 3.63204 3.14337 4.32865 2.67203 2.51053 2.27693 3.53500 2.76144 7.68972 3.31042 110 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 1.09168 7.71609 0.40962 0.61948 0.77267 3.68590 0.02539 + 107 1.72035 3.00406 2.19446 1.56924 5.72328 4.63989 4.82704 4.66969 3.05605 4.68356 5.41898 4.10686 3.40539 3.01168 2.41003 2.86543 3.88574 4.76881 6.81013 2.46548 111 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.33778 6.62671 1.25412 
0.61948 0.77267 2.29764 0.10591 + 108 3.28822 6.40877 3.41576 1.30236 3.03800 4.27734 2.76975 5.23943 2.98809 2.17018 5.45015 4.14010 2.98740 2.35594 2.94304 2.58191 4.13040 4.79998 6.84199 3.15098 112 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.51320 6.66834 0.91591 0.61948 0.77267 4.44175 0.01185 + 109 2.06828 5.18051 4.79720 4.22119 2.78569 3.30139 3.00819 2.92917 4.14079 2.16809 4.29189 2.32682 5.03703 3.04631 2.22354 3.93077 2.64151 2.16047 3.75540 4.57470 113 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.43878 6.15793 1.04112 0.61948 0.77267 2.12547 0.12713 + 110 2.14802 6.22303 2.09257 2.36359 5.55553 4.52066 4.70920 5.03359 2.62074 2.07524 3.29795 3.99179 2.62005 3.80880 3.93911 2.14455 2.02602 4.60730 6.66318 5.25955 114 t - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.10284 6.46108 2.34170 0.61948 0.77267 3.33696 0.03619 + 111 2.41835 6.25492 2.36844 2.06415 5.59574 2.63738 4.71683 3.07684 3.45322 4.55879 5.29768 2.92781 1.67530 3.81435 3.94535 2.19169 2.51400 4.64405 6.69022 3.67572 115 p - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.04315 6.47555 3.20182 0.61948 0.77267 3.50121 0.03062 + 112 2.77112 3.45249 2.09530 2.66715 2.48762 2.89312 4.74382 5.12574 2.96104 3.11354 5.33507 2.14729 3.64033 2.62071 2.85430 2.15075 3.24038 4.68559 6.72691 3.92660 116 d - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.04342 6.51880 3.19378 0.61948 
0.77267 1.92103 0.15836 + 113 2.72953 6.61809 2.68725 2.30256 1.89083 2.35315 5.08831 5.43766 3.29445 2.76003 5.66085 4.37021 2.73161 2.79538 3.17453 2.23047 3.17592 3.46219 3.21474 5.64838 117 f - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00143 6.94400 7.66626 0.61948 0.77267 1.29272 0.32093 + 114 1.84857 4.89980 3.41508 2.89591 4.33558 2.64984 2.93265 4.09714 3.23833 2.29079 4.33281 2.21663 3.57312 3.03454 2.68935 2.69836 2.36496 3.79007 7.48593 6.07538 118 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01083 7.43235 4.58768 0.61948 0.77267 0.45739 1.00221 + 115 2.55346 7.52056 2.96519 2.21798 4.73480 2.76280 3.72211 4.77500 3.46218 3.03104 6.56124 2.93608 1.43542 2.36701 3.24384 2.77804 2.87231 4.01046 4.30158 6.54114 119 p - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00053 7.94053 8.66279 0.61948 0.77267 0.38621 1.13827 + 116 2.03351 7.66256 2.20887 1.67584 5.10831 3.33108 3.43990 3.95838 2.94222 2.88053 6.70324 4.67648 3.36782 3.55774 2.07995 2.43345 2.85073 3.97085 8.09439 5.00226 120 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.02912 8.09251 3.56140 0.61948 0.77267 1.51468 0.24830 + 117 2.53821 2.80161 1.65792 3.23067 3.33187 4.17543 3.70755 3.45248 3.25770 2.47237 3.17591 3.51209 3.70825 2.78151 2.89449 2.66783 2.68170 3.39079 4.28009 6.64788 121 d - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00594 8.07209 5.18375 0.61948 0.77267 
0.11817 2.19414 + 118 3.01941 3.43599 4.36134 3.84980 2.09752 3.55660 4.39184 2.62293 3.63562 1.27129 3.59056 3.45540 2.39142 4.32599 3.92535 3.56769 3.47550 2.72551 4.71207 3.70701 122 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01839 8.14554 4.02131 0.61948 0.77267 0.26557 1.45571 + 119 2.01014 7.70573 2.90360 3.44218 4.73836 3.43247 2.78230 6.53847 2.70328 2.77953 6.74641 3.96135 1.80081 3.44010 2.81488 1.69891 2.92578 3.87470 8.13756 4.56646 123 s - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.07117 8.13802 2.68226 0.61948 0.77267 1.25825 0.33429 + 120 2.64569 4.24270 2.64015 4.02644 3.92940 2.18198 2.83076 6.47256 3.19189 5.94507 4.84089 2.77739 3.64244 2.85988 2.87859 1.28227 2.43290 4.40421 8.07096 6.65902 124 s - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00047 8.06731 8.78957 0.61948 0.77267 0.15004 1.97094 + 121 4.53310 6.77336 7.37986 6.76365 4.74451 3.73598 6.90644 2.65883 6.54462 0.36313 3.71519 6.73870 3.36515 4.52825 4.49637 4.41873 4.57747 2.54031 7.39580 6.22022 125 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14768 8.86994 0.61948 0.77267 0.28854 1.38372 + 122 4.50816 7.72228 3.00888 2.89451 7.06865 4.46033 3.14197 4.54351 1.73765 3.94394 4.56084 3.58974 4.21427 2.52645 1.02440 3.36516 2.82658 3.34437 8.15411 6.74217 126 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00583 8.15586 5.19751 0.61948 0.77267 0.48579 
0.95505 + 123 2.78017 3.27993 3.11150 2.73442 3.49619 6.05044 2.15565 3.03733 3.28270 2.24714 4.18518 4.78753 4.80485 3.74087 1.92691 3.32637 2.32547 2.47241 7.98709 3.45044 127 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14976 8.87271 0.61948 0.77267 0.33350 1.26024 + 124 3.25625 6.88418 7.50247 6.88903 2.61578 6.70188 7.03945 1.93442 6.67346 1.13383 3.44825 6.86963 5.03140 6.79615 6.67554 6.01567 4.55729 1.04996 4.63561 6.34491 128 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 125 3.75429 3.68567 3.90540 6.77087 2.49276 4.12651 4.82659 1.57018 6.55253 1.49532 3.32593 4.51467 4.67063 2.68407 6.55530 5.89126 2.69655 2.29106 3.57269 2.74299 129 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 126 2.48910 2.81626 7.38826 6.77205 1.85210 6.58005 6.91415 2.80725 6.55302 1.64069 4.59211 4.89570 5.01909 3.72221 6.55579 3.50109 2.11896 1.47225 4.71479 6.22724 130 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 127 1.76511 3.19369 4.22647 7.04135 6.20680 0.62835 7.21325 3.52778 6.82924 3.58739 6.22967 6.93114 7.08432 6.95470 6.83271 2.69393 3.48550 2.63016 7.73449 6.56377 131 G - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 
128 4.80112 7.83433 8.52401 8.59610 8.85049 0.05791 8.97248 8.39786 8.47064 8.08595 8.78810 7.62084 7.52727 8.45747 8.33964 3.16113 6.52844 7.33943 10.16747 9.23449 132 G - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 129 2.86303 9.71340 1.28127 0.52435 9.09511 3.71837 7.60623 8.76448 6.96715 8.16006 9.16304 6.27816 7.57435 6.81466 7.84398 3.17564 7.26174 8.20165 10.31367 8.57317 133 E - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 130 1.15472 7.72021 3.85731 2.48695 7.06519 4.09783 6.16964 3.34299 2.38575 4.65512 4.02158 3.70828 3.30625 2.81758 1.96293 3.52321 4.45549 2.88343 3.07544 6.74149 134 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01942 8.15558 3.96620 0.61948 0.77267 0.48579 0.95505 + 131 2.63280 3.18662 7.37392 6.75771 2.34167 6.56641 6.90050 2.62725 6.53868 0.65109 3.87214 6.73345 6.92892 6.66691 6.54145 3.48121 3.73559 2.30840 4.65748 6.21359 135 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.13660 8.85886 0.61948 0.77267 1.29423 0.32036 + 132 2.94942 7.70372 2.33876 3.74100 7.05008 3.24332 3.47968 4.15342 3.37848 3.89141 5.07946 3.35353 1.90277 3.70149 3.57187 1.51738 1.81613 2.82188 8.13624 6.72430 136 s - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.13660 8.85886 0.61948 0.77267 1.29423 0.32036 + 133 
1.66132 7.66189 4.55313 2.57558 4.11506 3.25417 4.06099 3.72828 3.31239 2.71421 4.12823 4.66611 1.50813 4.06030 3.16475 3.03028 2.82165 2.22138 8.10551 6.70604 137 p - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.13660 8.85886 0.61948 0.77267 1.29423 0.32036 + 134 1.98509 7.70425 2.26443 2.15838 4.91156 2.95619 2.81271 6.53768 2.40444 6.01019 6.74493 4.45477 3.02481 2.71012 2.65675 1.93726 2.72329 3.92105 8.13677 6.72483 138 s - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.04839 8.13673 3.05873 0.61948 0.77267 0.18805 1.76358 + 135 2.51125 7.65510 1.91930 3.84972 6.99106 3.56414 2.47798 4.04103 4.03271 1.27121 5.18333 4.43265 4.93934 3.31738 3.24807 2.84188 2.48422 2.67830 8.09386 4.51237 139 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00045 8.10819 8.83114 0.61948 0.77267 1.88212 0.16519 + 136 1.21791 3.59887 7.33146 4.21369 2.44408 6.53365 6.86706 2.89879 3.95168 2.13147 4.90476 4.79455 3.40132 4.67255 3.91217 5.84466 3.66957 1.45635 4.58036 3.54064 140 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00045 8.10819 8.83114 0.61948 0.77267 1.88212 0.16519 + 137 2.08137 7.67646 2.70382 2.20614 4.30429 3.65897 4.45956 4.38678 2.18534 2.68510 4.83386 2.77868 4.68067 2.78353 1.77361 3.34221 3.75116 2.82304 3.75602 6.69773 141 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00045 8.10819 8.83114 0.61948 0.77267 0.10008 2.35142 + 138 2.57652 
7.72175 3.64258 3.00211 4.89648 4.57833 4.81816 4.96441 1.68998 3.04370 5.12869 3.49147 3.69942 2.34431 1.13131 4.10976 2.83784 3.94687 4.88470 6.74233 142 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 139 2.30883 4.29955 4.13804 3.98971 2.44261 6.57654 4.72306 2.15772 4.27321 1.38902 3.37350 4.27113 6.93975 6.66734 4.17894 4.18310 3.74295 1.94285 2.12653 4.68217 143 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 140 3.39573 4.53318 3.46643 3.23215 3.03737 3.87747 4.10690 3.28275 2.17371 2.14321 4.38139 4.38139 3.81647 2.65129 1.45700 3.18987 3.48584 3.08104 4.76470 2.90775 144 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 141 1.98712 4.42699 2.84246 1.84502 7.06858 3.40876 4.25925 5.21372 2.45014 3.68463 6.76290 3.05595 2.80711 2.43627 2.22209 2.56381 2.92148 3.41292 8.15405 6.74210 145 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.07531 8.15533 2.62748 0.61948 0.77267 0.48579 0.95505 + 142 1.96443 4.58383 3.64600 3.02009 6.91696 3.20655 3.10535 3.65432 2.93622 1.91591 4.25597 3.96901 6.32086 3.17327 1.59498 4.48818 2.60143 3.18575 3.06168 4.71137 146 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01553 8.08099 4.19313 0.61948 0.77267 0.68900 0.69731 + 143 2.41081 2.30338 
7.13323 4.70444 1.39605 4.69266 2.96533 3.49906 4.95605 2.39210 4.12497 3.77770 4.53670 3.35696 2.88631 3.25853 3.32092 2.39071 4.66632 3.04504 147 f - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.30998 8.10304 1.32336 0.61948 0.77267 1.96855 0.15043 + 144 3.02288 3.71325 4.20747 3.12269 4.35927 1.78007 3.00971 6.21413 2.44549 3.37985 6.42346 3.89555 1.12920 3.81792 2.95010 3.76871 3.18854 3.55314 7.81530 6.40405 148 p - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.17260 7.79375 1.84447 0.61948 0.77267 3.11334 0.04547 + 145 3.45184 2.57362 1.94147 2.53481 6.58140 1.96227 3.26954 4.07221 3.32985 4.31273 6.27572 2.02326 4.26837 2.48490 3.78039 2.22705 3.36797 3.72355 7.66687 6.25493 149 d - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 3.50721 3.76368 0.05464 0.43056 1.05023 2.90710 0.05618 + 146 2.97219 5.44742 3.16905 1.96852 4.79101 2.37193 2.39341 4.26560 2.63532 3.74921 4.50058 3.19400 4.12559 3.02002 1.94564 2.08358 3.20370 3.84071 5.88202 4.48533 152 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00944 5.06386 5.78612 0.61948 0.77267 0.43397 1.04394 + 147 1.52207 4.81521 4.34803 3.89402 6.66384 2.34553 4.54835 4.08394 2.59922 3.27573 4.54073 3.61329 1.71130 3.39495 2.99085 3.26811 2.55209 2.48277 7.76178 6.35469 153 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.56752 7.74714 0.83785 0.61948 0.77267 0.91922 0.50888 + 148 3.04978 4.22466 2.48071 
3.21128 6.44481 1.86103 2.47169 3.57796 2.32267 5.40440 6.13914 3.26951 3.03037 3.35684 2.60339 1.99620 2.56458 3.53568 7.53028 6.11834 154 g - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.30822 7.47744 1.32923 0.61948 0.77267 2.62848 0.07493 + 149 3.34705 3.85374 3.22229 5.89645 5.02100 3.12455 6.05518 3.24100 5.68227 1.65647 2.37803 5.88259 2.46884 5.81466 2.75996 3.81008 2.57905 1.57884 4.07139 3.05732 155 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00107 7.24099 7.96325 0.61948 0.77267 0.05554 2.91824 + 150 3.26611 5.38368 3.89133 2.49187 4.42991 3.24532 3.75409 6.52876 2.62911 3.12194 4.86243 3.16145 3.19611 2.31928 1.32392 2.86548 2.44474 3.10669 8.12785 4.02234 156 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00044 8.12759 8.84985 0.61948 0.77267 0.84966 0.55785 + 151 3.57939 7.00978 7.64539 7.03612 3.05398 4.22055 7.19900 1.41192 6.82609 0.89206 3.32431 7.02433 7.19762 6.94046 6.82748 6.17453 5.88826 1.67324 7.65371 3.91141 157 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.13660 8.85886 0.61948 0.77267 0.18984 1.75497 + 152 3.14133 6.78173 7.38131 6.76510 2.60344 6.57864 2.33312 1.61571 4.83191 3.07548 4.12559 4.05767 6.94254 2.68385 3.74436 5.89035 2.71851 1.84445 3.51770 1.92624 158 i - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 153 3.98657 8.64521 4.08638 5.54338 
8.21408 4.59931 2.95725 7.74967 6.12216 7.21802 8.01999 0.30041 7.18267 2.24123 6.70232 3.85418 4.15570 4.77884 9.35430 7.86404 159 N - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 154 1.83950 4.09778 7.31537 2.38917 4.49218 2.18469 2.71703 3.84616 6.50022 2.10498 2.28589 3.95083 5.26850 6.64163 6.52795 2.67821 2.83486 1.97467 7.40963 6.23267 160 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 155 8.40499 9.15082 9.02258 9.09744 2.69554 3.02547 7.15802 7.91424 8.85276 3.93350 8.38558 8.13535 8.91099 8.35231 8.54986 8.10832 8.57828 7.89622 7.21763 0.15099 161 Y - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 156 4.19423 8.17913 8.63384 8.66156 8.91387 0.03119 9.08854 8.70870 8.38846 8.32677 9.09409 7.92198 7.83048 8.61096 5.38298 6.52598 6.90374 7.69809 5.21524 9.15508 162 G - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 157 3.05869 6.78159 7.37908 6.76356 3.54459 6.57849 4.82622 2.85352 6.54592 4.26754 2.66290 6.74277 0.41988 4.60926 5.18457 4.07069 3.62985 3.01641 7.40403 6.22776 163 P - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 158 2.12452 7.80070 8.59366 8.66159 8.82171 
6.60364 8.96242 8.36007 8.45988 5.16882 8.75378 7.60385 7.49988 8.44879 8.31710 2.88005 0.20658 7.30233 10.15671 9.21334 164 T - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.15530 8.87756 0.61948 0.77267 0.48579 0.95505 + 159 7.86844 9.62418 7.18084 0.00954 9.72469 7.80190 8.96223 9.72746 8.36889 9.03501 10.12255 7.81645 8.53455 8.32245 8.83677 5.15061 8.32384 9.09670 10.38318 9.63458 165 E - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14976 8.87271 0.61948 0.77267 0.48579 0.95505 + 160 1.31149 4.00229 7.75707 7.27533 6.63556 2.37617 7.54288 3.58848 7.08679 5.74001 6.65358 1.79046 7.19492 7.21849 7.11452 2.54252 1.19851 3.95862 8.14800 4.22756 166 t - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14976 8.87271 0.61948 0.77267 0.48579 0.95505 + 161 2.24995 2.82664 7.40973 6.85591 6.25218 3.77903 3.19471 5.59785 6.68401 2.61870 6.27159 6.83927 7.05276 3.72496 6.75679 2.05309 0.68690 3.58425 7.76948 6.59390 167 T - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14976 8.87271 0.61948 0.77267 0.48579 0.95505 + 162 4.04352 6.78561 7.39420 6.77868 3.53960 4.05253 6.92286 1.46086 6.56034 2.59970 4.20572 6.75442 6.94989 6.68857 4.41990 4.46634 2.77021 0.65888 7.41083 6.23525 168 V - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14976 8.87271 0.61948 0.77267 0.48579 0.95505 + 163 2.42038 2.14936 2.15767 3.90649 2.55693 
2.82656 2.97559 3.07055 6.42677 3.45663 3.00955 2.63664 6.91474 4.86441 6.48569 4.01115 3.51347 2.22352 2.86815 3.04837 169 c - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14976 8.87271 0.61948 0.77267 0.48579 0.95505 + 164 1.35353 2.83479 7.38183 6.76562 3.77331 6.57501 6.90980 4.39783 6.54728 3.72617 5.89642 3.51962 6.93822 6.67552 6.55075 1.39581 1.86507 2.04321 7.39916 3.01154 170 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14976 8.87271 0.61948 0.77267 0.48579 0.95505 + 165 2.60360 4.61720 7.37038 6.75555 5.87110 3.45756 3.88177 2.83581 6.53929 3.14911 4.06268 3.75561 6.93716 6.66960 6.54692 2.33743 0.55882 3.19278 7.40087 6.22460 171 T - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00043 8.14976 8.87271 0.61948 0.77267 0.48579 0.95505 + 166 2.13155 3.05829 7.29550 3.97047 2.70132 3.60934 3.15256 1.91390 3.85610 3.22534 3.75421 6.70147 6.92605 4.34824 2.46288 2.51902 4.22693 2.59665 3.09918 2.03243 172 i - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00000 * * 0.00000 * 0.00000 * +// +HMMER3/f [3.4 | Aug 2023] +NAME AMP-binding_C +LENG 50 +ALPH amino +RF no +MM no +CONS yes +CS no +MAP yes +DATE Tue Jul 27 20:16:06 2004 +COM [1] hmmbuild aroundLys517.hmm aroundLys517.aln +COM [2] hmmcalibrate aroundLys517.hmm +NSEQ 849 +STATS LOCAL MSV -7.7800 0.71027 +STATS LOCAL VITERBI -7.7798 0.71027 +STATS LOCAL FORWARD -5.2714 0.71027 +HMM A C D E F G H I K L M N P Q R S T V W Y + m->m m->i m->d i->m i->i d->m d->d + 2.54091 4.18909 2.92766 2.70561 3.22625 
2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00285 * 5.86340 0.00000 * 0.00000 * + 1 2.76078 5.46822 1.53391 2.77326 4.55396 4.26561 5.04262 4.07430 4.06321 1.94356 4.09163 2.95140 3.04983 3.44492 3.68544 2.15567 2.23261 3.42828 4.36195 5.43425 25 d - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01764 9.44305 4.05106 0.61948 0.77267 0.48579 0.95505 + 2 1.70580 5.37255 4.17340 2.55490 4.20736 4.22677 4.37649 2.22288 3.98556 2.40102 4.38065 5.41691 2.03504 3.86564 2.98119 2.86820 2.47242 2.33032 8.44665 5.77734 26 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00350 9.43307 5.68106 0.61948 0.77267 0.52539 0.89484 + 3 1.55738 8.33844 2.38500 1.89217 5.47643 2.67959 4.61347 4.65713 3.19113 3.87388 5.66982 2.96308 3.51136 2.64770 3.11973 2.20131 2.71632 4.21421 8.54361 7.70698 27 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00791 9.45065 4.85370 0.61948 0.77267 0.40964 1.09031 + 4 1.77158 5.96096 2.03151 1.51165 5.13473 3.06915 3.58971 5.53121 3.01925 3.41642 5.87778 4.44574 5.47922 2.25956 2.94716 2.76486 3.16481 3.87459 6.83571 4.89837 28 e - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.45187 10.17413 0.61948 0.77267 0.11060 2.25659 + 5 3.21125 6.08296 5.32188 7.74859 4.00352 7.60303 5.95819 1.59553 7.58154 0.56897 3.78379 7.72988 6.22228 6.08227 7.58362 7.03257 4.72301 2.31224 4.45476 4.34593 29 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 
3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 6 3.30199 8.33216 4.84633 3.33111 4.64809 7.11084 4.12615 3.65342 2.29832 2.91383 4.53441 4.24260 4.89693 3.09405 0.55990 3.75947 4.23289 4.13308 6.46067 4.87405 30 R - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 7 1.61282 6.16056 2.95128 1.67867 7.95303 2.98456 3.56957 4.99538 2.71631 4.27450 5.52147 3.97784 5.80636 2.24358 2.55827 2.16248 2.83969 4.99399 7.10116 5.67119 31 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 8 2.10433 5.21726 4.12486 3.08722 2.08423 3.64659 1.71617 4.75770 3.21822 3.27783 5.51115 4.07149 7.44087 2.16256 2.67271 3.59737 3.64381 4.44163 3.31526 2.44328 32 h - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 9 2.62159 4.12364 8.81971 8.20835 6.17466 8.02120 8.36501 3.39237 7.99487 0.28152 3.28285 8.19033 5.67975 8.11963 7.99902 5.62430 5.70401 2.65833 8.83981 7.66909 33 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00238 9.46314 6.07573 0.61948 0.77267 0.48579 0.95505 + 10 1.35029 6.06716 6.15657 2.96948 8.33028 2.80035 5.30885 5.36361 2.11137 3.21278 4.84999 3.91286 7.63852 2.19246 1.99630 1.91936 4.00366 5.63810 9.41644 4.74117 34 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 
3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46088 10.18314 0.61948 0.77267 0.92590 0.50447 + 11 1.67529 8.98522 2.76423 1.92483 8.33159 3.05743 4.92685 4.63919 2.37191 3.47678 4.53868 3.25359 4.83119 2.08910 2.57569 2.65956 2.83839 3.61125 9.41706 5.98112 35 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46088 10.18314 0.61948 0.77267 0.28474 1.39516 + 12 3.24536 4.91723 4.26360 2.82670 4.51729 4.33915 2.89602 3.67442 2.30199 2.82046 5.09468 3.47895 7.64685 2.41359 1.54576 2.22436 2.34912 2.85859 5.05448 4.73008 36 r - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.02416 3.87667 5.76064 0.62656 0.76449 0.48579 0.95505 + 13 5.59770 6.35531 8.66280 5.82159 5.46392 7.85459 4.33271 4.34934 5.95120 0.12253 4.03742 8.02163 8.21780 5.96992 5.62265 7.16629 6.91537 3.11415 8.67735 5.91308 39 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00089 9.45957 7.12089 0.61948 0.77267 1.05159 0.42983 + 14 3.18063 8.41043 5.76538 4.66674 5.75775 4.65565 7.74778 5.49852 6.69836 5.64616 6.08492 5.26562 0.12108 5.13739 6.09740 5.00985 4.67298 5.18244 8.97742 7.72075 40 P - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.02296 9.45877 3.78883 0.61948 0.77267 0.88666 0.53108 + 15 1.87756 8.96152 1.51989 1.56495 8.30789 2.67191 3.97779 4.95444 4.09563 4.37913 8.00221 3.71786 3.13077 3.43645 2.97134 2.35029 3.90640 3.84263 9.39335 5.10693 41 d - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 
2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00275 9.43648 5.92639 0.61948 0.77267 2.46813 0.08855 + 16 3.31335 8.03992 8.64504 8.03022 3.04510 6.00692 2.32700 5.04899 7.81118 4.94017 7.15963 5.92860 5.40319 7.93872 7.81396 4.27544 4.83065 4.80223 5.00740 0.28014 42 Y - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00284 5.87796 10.15676 0.11414 2.22685 0.15684 1.92991 + 17 4.52782 5.97719 8.65759 5.02619 5.43792 5.24384 4.77943 4.89726 5.29998 2.90932 0.15406 8.01851 8.21536 4.85013 7.82720 4.68863 5.63339 4.32195 8.67630 5.93213 44 M - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.45950 10.18245 0.61948 0.77267 1.05159 0.42983 + 18 4.24663 4.67014 8.66128 8.04507 5.73066 6.07515 6.29003 1.34026 4.62717 2.61288 3.77390 8.02012 8.21628 4.27227 3.67894 5.13801 4.60498 0.61592 5.46934 7.50095 45 V - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.45950 10.18245 0.61948 0.77267 0.24663 1.52064 + 19 8.06269 10.62941 5.26515 5.50567 10.37087 5.68242 8.96655 10.02152 5.84046 6.21337 10.41662 7.64403 0.02634 8.17983 9.10934 5.87442 8.47442 5.53963 11.58873 9.91131 46 P - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00260 5.96990 10.18493 0.12035 2.17692 0.48579 0.95505 + 20 1.19783 6.88164 3.58919 3.54691 5.58199 3.72851 4.23312 6.34445 3.83664 4.47711 6.20998 3.47621 5.14253 3.45264 4.20817 1.12644 2.11625 3.49423 6.55725 6.25087 48 s - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 
2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 21 1.17971 5.64219 4.47424 4.78338 2.90703 4.57890 2.20765 3.78594 4.03132 2.74692 4.37858 4.94627 7.93512 3.36104 2.41005 3.25846 3.25361 2.50293 5.39404 2.50223 49 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 6.11067 3.66178 0.02830 0.01418 4.26317 0.48579 0.95505 + 22 3.01842 5.68773 6.67269 6.21729 5.05211 6.18887 6.97144 1.85740 6.17848 1.46646 4.76723 6.36424 6.37672 6.41137 6.32820 5.63643 4.77000 0.66241 7.21473 6.01073 51 V - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 1.25113 5.88551 0.34103 0.61948 0.77267 5.96087 0.00258 + 23 3.28011 5.40461 3.48944 3.07355 4.97624 3.84849 4.12991 4.40370 1.08699 3.88939 4.75235 3.51647 1.43911 3.28219 2.83857 3.29674 3.54143 4.02455 5.93694 4.72463 52 k - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 1.29619 0.93368 1.09864 0.09981 2.35393 0.00158 6.45062 + 24 5.78791 5.34221 8.67209 8.05658 0.93518 5.63680 8.19729 2.19948 7.83754 1.82518 3.92750 6.12685 8.22709 7.96508 7.83962 7.17628 6.92467 2.28197 2.83995 1.98253 54 f - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00581 5.15797 10.18260 0.05921 2.85606 0.26525 1.45677 + 25 3.83649 4.89977 4.92750 4.64262 4.16712 5.21724 4.12969 1.65862 5.41478 2.38504 3.06640 8.02240 8.21926 3.94808 4.37645 7.16706 2.90836 0.72425 6.11278 4.72441 56 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 
2.69798 4.47296 3.49288 + 0.00103 9.46318 6.95607 0.61948 0.77267 0.48579 0.95505 + 26 2.60273 4.22054 5.68447 3.49551 2.63393 4.08538 3.81713 2.93059 3.22726 2.66997 3.76999 5.05440 2.14872 2.31300 2.29082 3.24667 3.07685 1.95949 3.69721 4.87002 57 v - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46227 10.18453 0.61948 0.77267 0.37763 1.15673 + 27 4.46412 3.71552 8.66529 8.04908 4.46897 7.85708 5.99736 2.36111 5.31045 0.47991 2.44221 8.02413 5.01517 7.95758 5.09766 5.92320 4.98398 2.13376 8.67984 5.89686 58 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 28 2.58257 5.50973 0.96961 1.50819 8.33361 3.65279 5.52844 7.82068 3.32493 5.24633 8.02793 3.64655 2.71080 4.33415 3.73181 2.64703 3.22234 4.96630 9.41908 8.00714 59 d - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 29 1.36538 5.32533 3.34362 1.96842 5.13194 3.41294 3.50928 6.02263 2.67612 6.13284 8.02791 3.84823 7.64044 2.34410 2.24082 2.32054 2.97279 4.09222 5.73221 3.54325 60 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01532 4.18876 10.18517 0.02368 3.75489 0.48579 0.95505 + 30 5.12032 8.08214 8.69072 8.07451 1.86946 7.88321 6.12469 1.98937 5.91051 0.73616 1.75994 8.05095 8.24434 7.98163 7.85756 6.72288 5.48769 4.09862 3.80819 7.52901 62 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 
3.49288 + 0.00276 9.46284 5.92155 0.61948 0.77267 0.48579 0.95505 + 31 5.33343 9.16099 9.09722 5.98568 10.14318 6.07371 10.07941 9.68709 9.46597 5.52335 10.07525 8.74510 0.02670 9.49370 6.94153 4.56473 7.83223 8.64598 11.46085 10.45579 63 P - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00481 6.01856 6.04490 0.12548 2.13769 0.98431 0.46792 + 32 4.96626 6.08430 8.65865 8.04244 4.13379 7.85114 4.88169 3.50025 3.30062 0.41921 3.42678 6.01222 8.21434 4.36183 2.80572 4.80267 3.22368 2.52361 6.09193 5.88884 65 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.45811 10.18037 0.61948 0.77267 0.62656 0.76449 + 33 5.05299 8.31772 5.26232 5.81545 7.45475 4.14774 6.66179 3.94465 5.18954 3.24734 4.21151 2.32130 5.97141 7.16155 7.21492 2.35457 0.37841 5.79743 8.89996 5.61444 66 T - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46019 10.18245 0.61948 0.77267 0.98431 0.46792 + 34 1.30244 8.86675 3.99393 4.82778 5.23813 3.24048 4.76055 3.22453 4.62469 3.55863 5.47518 4.44101 1.22480 4.97612 4.37031 2.37751 2.82528 2.42326 9.33116 7.95249 67 p - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00227 6.10405 10.18253 0.13443 2.07316 0.98431 0.46792 + 35 4.07582 9.25917 7.10348 5.16614 8.79615 4.98800 3.35425 5.83849 7.00228 7.81396 8.60969 0.37164 8.19450 5.83294 7.49372 1.59851 3.50951 5.91058 10.00916 6.29250 69 N - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 
0.00012 9.46019 10.18245 0.61948 0.77267 0.98431 0.46792 + 36 4.80890 6.03716 3.94177 6.21737 8.25592 0.11837 5.90338 7.72150 5.07715 5.23103 5.48888 4.02218 7.67229 4.93367 4.62799 4.64116 4.35073 6.11410 9.38644 7.99460 70 G - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46019 10.18245 0.61948 0.77267 0.98431 0.46792 + 37 8.18241 9.09875 5.90958 5.13534 9.04677 8.43056 5.91236 8.66484 0.01852 8.17548 8.74525 8.07498 8.58444 7.42550 5.63510 8.14290 8.17340 8.51928 9.09737 8.73416 71 K - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46019 10.18245 0.61948 0.77267 0.98431 0.46792 + 38 3.97384 6.05744 8.90350 8.30046 5.38370 8.11955 8.47722 1.51871 8.09321 0.98707 5.29013 8.28730 6.02556 8.22768 4.84721 7.43958 3.33199 1.11253 8.95410 7.77714 72 l - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46019 10.18245 0.61948 0.77267 0.26622 1.45358 + 39 5.49608 6.15665 0.12974 4.63520 9.62724 5.68947 5.95772 9.16699 5.12802 8.60069 9.42970 2.71656 8.53900 4.41963 4.49102 7.51869 7.92972 6.16844 10.72380 7.06468 73 D - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 40 4.72777 8.80833 5.74462 4.50458 5.38418 7.30697 3.60141 5.13950 2.73428 3.38030 5.58728 5.73977 5.41191 4.23217 0.31035 4.53161 3.51061 3.48288 6.41004 4.93087 74 R - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 
9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 41 2.25768 8.98745 3.85053 4.12225 5.44962 3.38266 4.04253 5.95631 1.04051 4.11739 8.02813 2.85587 5.59449 3.37989 1.37323 3.03816 3.83875 5.13563 9.41928 5.19385 75 k - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 42 0.47107 5.43747 5.12902 3.15424 5.02574 4.33675 5.36469 7.81358 2.75291 3.57567 4.79908 4.75680 7.64098 3.07799 2.68775 3.13968 4.06088 4.55440 9.41613 5.59273 76 A - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 43 6.00191 4.16368 10.37706 9.87660 8.02174 6.22926 10.40547 4.27181 9.80521 0.08451 4.30370 10.04573 9.86551 9.71441 9.78719 9.24307 4.91298 3.67709 10.33339 9.36783 77 L - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.46296 10.18522 0.61948 0.77267 0.48579 0.95505 + 44 2.79484 8.96939 4.84170 4.23936 5.77607 4.76823 5.03024 4.96855 3.22113 3.40689 5.09332 5.71022 0.46379 3.41521 2.31033 4.18391 4.06538 4.34472 9.40608 6.03322 78 P - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.01198 9.46304 4.43703 0.61948 0.77267 0.48579 0.95505 + 45 1.09792 6.91481 2.89040 1.83127 8.32051 4.00290 5.43771 3.76861 2.53550 3.25984 5.57010 3.81575 5.25749 2.65958 2.83702 3.31114 3.14270 3.15102 6.27434 6.13294 79 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00012 9.45118 
10.17344 0.61948 0.77267 1.84289 0.17240 + 46 3.34148 5.96158 4.93711 4.36249 4.22802 6.03921 4.56766 3.60765 5.48885 2.17283 3.40248 5.95049 0.56265 3.95145 2.92975 4.17187 4.08038 3.32900 3.95284 5.49994 80 P - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.07901 9.44759 2.57850 0.61948 0.77267 0.61884 0.77342 + 47 2.12428 4.95856 1.39301 1.91842 3.68317 2.63444 5.95254 4.02905 3.82042 2.88813 4.63694 3.77467 3.68248 3.38581 3.58613 3.41978 3.26105 2.97408 4.84974 3.37957 81 d - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.04117 6.05491 3.27054 0.13868 2.04414 0.12946 2.10840 + 48 1.77092 8.92212 3.08860 2.24157 2.76143 3.40051 4.00979 3.48300 3.67431 2.37604 4.99544 3.51142 3.21752 2.65746 2.70182 2.88204 3.48438 2.97977 4.15327 3.05671 83 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.12402 9.37823 2.14940 0.61948 0.77267 0.03372 3.40657 + 49 1.88184 5.98319 2.36704 2.51676 4.68492 2.38922 4.49015 3.28269 3.38319 3.33745 6.14954 2.86195 3.11703 2.34555 3.42964 2.31020 2.75659 3.11079 4.85821 5.36213 84 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.04457 3.13393 9.92262 0.01094 4.52124 0.01426 4.25770 + 50 1.51653 6.21260 2.95065 2.92431 3.93145 2.76974 3.81084 3.31455 4.12068 2.33513 3.73390 3.24939 3.17037 3.04699 3.21889 2.23671 2.90490 2.98600 6.55571 4.33902 86 a - - - + 2.54091 4.18909 2.92766 2.70561 3.22625 2.66633 3.77575 2.83006 2.82275 2.33953 3.73926 3.18354 3.03052 3.22984 2.91696 2.68331 2.91750 2.69798 4.47296 3.49288 + 0.00000 * * 0.00000 * 0.00000 * 
+// diff --git a/src/biocracker/paras.py b/src/biocracker/paras.py index c4c7ec9..31622f0 100644 --- a/src/biocracker/paras.py +++ b/src/biocracker/paras.py @@ -134,4 +134,4 @@ def predict_amp_domain_substrate( # Format predictions preds = [{"substrate_name": name, "substrate_smiles": smiles, "score": score} for name, smiles, score in preds] - return preds + return preds \ No newline at end of file diff --git a/src/biocracker/paras_fast.py b/src/biocracker/paras_fast.py new file mode 100644 index 0000000..68e65be --- /dev/null +++ b/src/biocracker/paras_fast.py @@ -0,0 +1,345 @@ +"""Module for fast PARAS inference of substrate specificity A domains.""" + +from dataclasses import dataclass +from importlib.resources import files +from typing import Any + +import numpy as np +from pyhmmer import easel, plan7, hmmer + +import biocracker.data + + +HMM_DB_PATH = str(files(biocracker.data).joinpath("AMP-binding_converted.hmm")) +with plan7.HMMFile(HMM_DB_PATH) as hmm_file: + HMM_DB = list(hmm_file) + + +VALID = set("ACDEFGHIKLMNPQRSTVWY-") +FEATURE_NAMES = [ + "WOLS870101", + "WOLS870102", + "WOLS870103", + "FAUJ880109", + "GRAR740102", + "RADA880108", + "ZIMJ680103", + "TSAJ990101", + "CHOP780201", + "CHOP780202", + "CHOP780203", + "ZIMJ680104", + "NEU1", + "NEU2", + "NEU3", +] +FEATURES = { + "-": [0.00, 0.00, 0.00, 1, 8.3, 0.21, 13.59, 145.2, 1.00, 1.03, 0.99, 6.03, 0.06, 0.00, 0.10], + "A": [0.07, -1.73, 0.09, 0, 8.1, -0.06, 0.00, 90.0, 1.42, 0.83, 0.66, 6.00, 0.06, -0.25, 0.25], + "C": [0.71, -0.97, 4.13, 0, 5.5, 1.36, 1.48, 103.3, 0.70, 1.19, 1.19, 5.05, -0.56, -0.40, -0.14], + "D": [3.64, 1.13, 2.36, 1, 13.0, -0.80, 49.70, 117.3, 1.01, 0.54, 1.46, 2.77, 0.97, -0.08, 0.08], + "E": [3.08, 0.39, -0.07, 1, 12.3, -0.77, 49.90, 142.2, 1.51, 0.37, 0.74, 3.22, 0.85, -0.10, -0.05], + "F": [-4.92, 1.30, 0.45, 0, 5.2, 1.27, 0.35, 191.9, 1.13, 1.38, 0.60, 5.48, -0.99, 0.18, 0.15], + "G": [2.23, -5.36, 0.30, 0, 9.0, -0.41, 0.00, 64.9, 0.57, 0.75, 1.56, 5.97, 0.32, -0.32, 0.28], 
+ "H": [2.41, 1.74, 1.11, 1, 10.4, 0.49, 51.60, 160.0, 1.00, 0.87, 0.95, 7.59, 0.15, -0.03, -0.10], + "I": [-4.44, -1.68, -1.03, 0, 5.2, 1.31, 0.13, 163.9, 1.08, 1.60, 0.47, 6.02, -1.00, -0.03, 0.10], + "K": [2.84, 1.41, -3.14, 2, 11.3, -1.18, 49.50, 167.3, 1.16, 0.74, 1.01, 9.74, 1.00, 0.32, 0.11], + "L": [-4.19, -1.03, -0.98, 0, 4.9, 1.21, 0.13, 164.0, 1.21, 1.30, 0.59, 5.98, -0.83, 0.05, 0.01], + "M": [-2.49, -0.27, -0.41, 0, 5.7, 1.27, 1.43, 167.0, 1.45, 1.05, 0.60, 5.74, -0.68, -0.01, 0.04], + "N": [3.22, 1.45, 0.84, 2, 11.6, -0.48, 3.38, 124.7, 0.67, 0.89, 1.56, 5.41, 0.70, -0.06, 0.17], + "P": [-1.22, 0.88, 2.23, 0, 8.0, 1.1, 1.58, 122.9, 0.57, 0.55, 1.52, 6.30, 0.45, 0.23, 0.41], + "Q": [2.18, 0.53, -1.14, 2, 10.5, -0.73, 3.53, 149.4, 1.11, 1.10, 0.98, 5.65, 0.71, -0.02, 0.12], + "R": [2.88, 2.52, -3.44, 4, 10.5, -0.84, 52.00, 194.0, 0.98, 0.93, 0.95, 10.76, 0.80, 0.19, -0.41], + "S": [1.96, -1.63, 0.57, 1, 9.2, -0.50, 1.67, 95.4, 0.77, 0.75, 1.43, 5.68, 0.48, -0.15, 0.23], + "T": [0.92, -2.09, -1.40, 1, 8.6, -0.27, 1.66, 121.5, 0.83, 1.19, 0.96, 5.66, 0.38, -0.10, 0.29], + "V": [-2.69, -2.53, -1.29, 0, 5.9, 1.09, 0.13, 139.0, 1.06, 1.70, 0.50, 5.96, -0.75, -0.19, 0.03], + "W": [-4.75, 3.65, 0.85, 1, 5.4, 0.88, 2.10, 228.2, 1.08, 1.37, 0.96, 5.89, -0.57, 0.31, 0.34], + "Y": [1.39, 2.32, 0.01, 1, 6.2, 0.33, 1.61, 197.0, 0.69, 1.47, 1.14, 5.66, -0.35, 0.40, -0.02], +} +POSITIONS_ACTIVE_SITE = [ + 13, + 16, + 17, + 41, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 55, + 93, + 94, + 125, + 126, + 127, + 128, + 129, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, +] + + +@dataclass +class ADomain: + """ + Dataclass representing an A domain. 
+ + :param protein: name of the protein containing the A domain + :param start: start position of the A domain + :param end: end position of the A domain + :param domain_nr: domain number of A domain in NRPS (optional) + :param sequence: amino acid sequence of the A domain (optional) + :param extended_signature: extended signature of the A domain (optional) + """ + + protein: str + start: int + end: int + domain_nr: int | None = None + sequence: str | None = None + extended_signature: str | None = None + + +def _b2s(x: Any) -> str: + """ + Convert input to string. + + :param x: input object + :return: string representation + """ + if isinstance(x, (bytes, bytearray)): + return x.decode() + + if hasattr(x, "sequence"): + s = x.sequence + return s.decode() if isinstance(s, (bytes, bytearray)) else str(s) + + return str(x) + + +def extract_domain_hits( + seq_id: str, + sequence: str, + evalue_cutoff: float = 1e-5, +) -> list[dict[str, Any]]: + """ + Extract domain hits from a given protein sequence using HMMER. 
+ + :param seq_id: identifier for the protein sequence + :param sequence: amino acid sequence of the protein + :param evalue_cutoff: e-value cutoff for HMMER hits + :return: list of dictionaries representing domain hits + """ + alphabet = easel.Alphabet.amino() + text_seq = easel.TextSequence(name=seq_id.encode(), sequence=sequence) + seq = text_seq.digitize(alphabet) + + hits_iter = hmmer.hmmscan([seq], HMM_DB, cpus=1, E=evalue_cutoff) + + query_hits = next(hits_iter) # expect only one sequence + + out = [] + for hit in query_hits: + model_name = _b2s(hit.name) + + for dom in hit.domains: + q_from = int(dom.env_from) + q_to = int(dom.env_to) + + aln = dom.alignment + hmm_aln = _b2s(aln.hmm_sequence) + query_aln = _b2s(aln.target_sequence) + + out.append( + dict( + seq_id=seq_id, + model=model_name, + q_from=q_from, + q_to=q_to, + evalue=float(dom.i_evalue), + score=float(dom.score), + hmm_aln=hmm_aln, + query_aln=query_aln, + domain_obj=dom, + ) + ) + + out.sort(key=lambda d: (d["q_from"], d["q_to"], d["model"])) + + return out + + +def pair_domains( + domain_hits: list[dict[str, Any]], + max_gap: int = 200, +) -> list[tuple[ADomain, str, str]]: + """ + Pair AMP-binding and AMP-binding_C domain hits. 
+ + :param domain_hits: list of domain hit dictionaries + :param max_gap: maximum allowed gap between paired domains + :return: list of tuples containing ADomain objects and their alignments + """ + hits = sorted(domain_hits, key=lambda d: d["q_from"]) + + a_domains: list[ADomain] = [] + for h1 in hits: + if h1["model"] != "AMP-binding": + continue + + n_from, n_to = h1["q_from"], h1["q_to"] + + matched = None + for h2 in hits: + if h2["model"] != "AMP-binding_C": + continue + + c_from = h2["q_from"] + + if c_from > n_to and (c_from - n_to) <= max_gap: + matched = h2 + break + + start0 = n_from - 1 + end0 = matched["q_to"] if matched is not None else n_to + a_domains.append((ADomain( + protein=h1["seq_id"], + start=start0, + end=end0), + h1["hmm_aln"], + h1["query_aln"] + )) + + a_domains.sort(key=lambda t: t[0].start) + for i, (d, _, _) in enumerate(a_domains, start=1): + d.domain_nr = i + + return a_domains + + +def extract_signature_from_alignment(hmm_aln: str, query_aln: str) -> str | None: + """ + Extract the extended signature from the given HMM and query alignments. 
+ + :param hmm_aln: HMM alignment string + :param query_aln: query alignment string + :return: extended signature string or None if invalid + """ + wanted = set(POSITIONS_ACTIVE_SITE) + picked: dict[int, str] = {} + + hmm_pos = 0 # 1-based counter, increment when HMM char is not a gap + + for h, q in zip(hmm_aln, query_aln): + if h != "-": + hmm_pos += 1 + if hmm_pos in wanted and hmm_pos not in picked: + picked[hmm_pos] = q + + # Quick fix + missing = wanted - set(picked.keys()) + for m in missing: + picked[m] = "-" + + out = [] + for p in POSITIONS_ACTIVE_SITE: + if p not in picked: + return None + out.append(picked[p]) + + sig = "".join(out).upper() + if not sig or not all(c in VALID for c in sig): + return None + + return sig + + +def fill_domain_sequences( + domains: list[ADomain], + protein_seq: str, + min_len: int = 100, +) -> list[ADomain]: + """ + Fill in the sequences for the given domains from the protein sequence. + + :param domains: list of ADomain objects + :param protein_seq: amino acid sequence of the protein + :param min_len: minimum length of domain sequence to keep + :return: list of ADomain objects with sequences filled in + """ + out = [] + + for d in domains: + seq = protein_seq[d.start:d.end] + if len(seq) >= min_len: + d.sequence = seq + out.append(d) + + return out + + +def find_a_domains( + seq_id: str, + protein_seq: str, + evalue_cutoff: float = 1e-5, +) -> list[ADomain]: + """ + Find A domains in a given protein sequence using HMMER. 
+ + :param seq_id: identifier for the protein sequence + :param protein_seq: amino acid sequence of the protein + :param evalue_cutoff: e-value cutoff for HMMER hits + :return: list of ADomain objects representing found A domains + """ + hits = extract_domain_hits(seq_id, protein_seq, evalue_cutoff) + + hits = [h for h in hits if h["model"] in {"AMP-binding", "AMP-binding_C"}] + + paired = pair_domains(hits, max_gap=200) + + domains_only: list[ADomain] = [] + for d, hmm_aln, query_aln in paired: + d.extended_signature = extract_signature_from_alignment(hmm_aln, query_aln) + domains_only.append(d) + + domains_only = fill_domain_sequences(domains_only, protein_seq, min_len=100) + + domains_only = [d for d in domains_only if d.extended_signature is not None] + + domains_only.sort(key=lambda d: (d.protein, d.start)) + + return domains_only + + +def featurize_signature(sig: str) -> np.ndarray: + """ + Featurize the given extended signature into a numerical feature array. + + :param sig: extended signature string + :return: numpy array of features + """ + assert len(sig) == len(POSITIONS_ACTIVE_SITE), "signature length mismatch" + + features: np.ndarray = np.zeros((len(POSITIONS_ACTIVE_SITE), len(FEATURE_NAMES)), dtype=np.float32) + for i, aa in enumerate(sig): + aa_feats = FEATURES.get(aa) + if aa_feats is None: + raise ValueError(f"invalid amino acid '{aa}' in signature") + features[i, :] = np.array(aa_feats, dtype=np.float32) + + return features.flatten() # shape (n_positions * n_features,) From c9d915d32265795ece39f1a319907c37043b556d Mon Sep 17 00:00:00 2001 From: David Meijer Date: Wed, 24 Dec 2025 11:46:21 +0100 Subject: [PATCH 02/12] ENH: update paras method --- scripts/run_paras_on_gbks.py | 2 +- src/biocracker/paras.py | 445 ++++++++++++++++++++++++++--------- src/biocracker/paras_fast.py | 345 --------------------------- 3 files changed, 341 insertions(+), 451 deletions(-) delete mode 100644 src/biocracker/paras_fast.py diff --git 
a/scripts/run_paras_on_gbks.py b/scripts/run_paras_on_gbks.py index 25a1cab..5f1a708 100644 --- a/scripts/run_paras_on_gbks.py +++ b/scripts/run_paras_on_gbks.py @@ -5,7 +5,7 @@ from tqdm import tqdm from biocracker.antismash import parse_region_gbk_file -from biocracker.paras_fast import find_a_domains, featurize_signature +from biocracker.paras import find_a_domains, featurize_signature def cli(): diff --git a/src/biocracker/paras.py b/src/biocracker/paras.py index 31622f0..ea1e415 100644 --- a/src/biocracker/paras.py +++ b/src/biocracker/paras.py @@ -1,45 +1,364 @@ -"""Module contains methods for making substrate specificity predictions with paras.""" +"""Module for fast PARAS inference of substrate specificity A domains.""" -import logging -import os +from dataclasses import dataclass +from importlib.resources import files +from typing import Any from pathlib import Path +import numpy as np import joblib +from pyhmmer import easel, plan7, hmmer -try: - from parasect.api import run_paras # module is called parasect, but we are using the paras model +import biocracker.data +from biocracker.config import PARAS_MODEL_DOWNLOAD_URL +from biocracker.helpers import download_and_prepare - _HAS_PARAS = True -except ImportError: - run_paras = None - _HAS_PARAS = False - -from biocracker.antismash import DomainRec -from biocracker.config import LOGGER_NAME, PARAS_CACHE_DIR_NAME, PARAS_MODEL_DOWNLOAD_URL -from biocracker.helpers import download_and_prepare, get_biocracker_cache_dir _PARAS_MODEL_CACHE: dict[str, object] = {} -def has_paras() -> bool: +HMM_DB_PATH = str(files(biocracker.data).joinpath("AMP-binding_converted.hmm")) +with plan7.HMMFile(HMM_DB_PATH) as hmm_file: + HMM_DB = list(hmm_file) + + +VALID = set("ACDEFGHIKLMNPQRSTVWY-") +FEATURE_NAMES = [ + "WOLS870101", + "WOLS870102", + "WOLS870103", + "FAUJ880109", + "GRAR740102", + "RADA880108", + "ZIMJ680103", + "TSAJ990101", + "CHOP780201", + "CHOP780202", + "CHOP780203", + "ZIMJ680104", + "NEU1", + "NEU2", + 
"NEU3", +] +FEATURES = { + "-": [0.00, 0.00, 0.00, 1, 8.3, 0.21, 13.59, 145.2, 1.00, 1.03, 0.99, 6.03, 0.06, 0.00, 0.10], + "A": [0.07, -1.73, 0.09, 0, 8.1, -0.06, 0.00, 90.0, 1.42, 0.83, 0.66, 6.00, 0.06, -0.25, 0.25], + "C": [0.71, -0.97, 4.13, 0, 5.5, 1.36, 1.48, 103.3, 0.70, 1.19, 1.19, 5.05, -0.56, -0.40, -0.14], + "D": [3.64, 1.13, 2.36, 1, 13.0, -0.80, 49.70, 117.3, 1.01, 0.54, 1.46, 2.77, 0.97, -0.08, 0.08], + "E": [3.08, 0.39, -0.07, 1, 12.3, -0.77, 49.90, 142.2, 1.51, 0.37, 0.74, 3.22, 0.85, -0.10, -0.05], + "F": [-4.92, 1.30, 0.45, 0, 5.2, 1.27, 0.35, 191.9, 1.13, 1.38, 0.60, 5.48, -0.99, 0.18, 0.15], + "G": [2.23, -5.36, 0.30, 0, 9.0, -0.41, 0.00, 64.9, 0.57, 0.75, 1.56, 5.97, 0.32, -0.32, 0.28], + "H": [2.41, 1.74, 1.11, 1, 10.4, 0.49, 51.60, 160.0, 1.00, 0.87, 0.95, 7.59, 0.15, -0.03, -0.10], + "I": [-4.44, -1.68, -1.03, 0, 5.2, 1.31, 0.13, 163.9, 1.08, 1.60, 0.47, 6.02, -1.00, -0.03, 0.10], + "K": [2.84, 1.41, -3.14, 2, 11.3, -1.18, 49.50, 167.3, 1.16, 0.74, 1.01, 9.74, 1.00, 0.32, 0.11], + "L": [-4.19, -1.03, -0.98, 0, 4.9, 1.21, 0.13, 164.0, 1.21, 1.30, 0.59, 5.98, -0.83, 0.05, 0.01], + "M": [-2.49, -0.27, -0.41, 0, 5.7, 1.27, 1.43, 167.0, 1.45, 1.05, 0.60, 5.74, -0.68, -0.01, 0.04], + "N": [3.22, 1.45, 0.84, 2, 11.6, -0.48, 3.38, 124.7, 0.67, 0.89, 1.56, 5.41, 0.70, -0.06, 0.17], + "P": [-1.22, 0.88, 2.23, 0, 8.0, 1.1, 1.58, 122.9, 0.57, 0.55, 1.52, 6.30, 0.45, 0.23, 0.41], + "Q": [2.18, 0.53, -1.14, 2, 10.5, -0.73, 3.53, 149.4, 1.11, 1.10, 0.98, 5.65, 0.71, -0.02, 0.12], + "R": [2.88, 2.52, -3.44, 4, 10.5, -0.84, 52.00, 194.0, 0.98, 0.93, 0.95, 10.76, 0.80, 0.19, -0.41], + "S": [1.96, -1.63, 0.57, 1, 9.2, -0.50, 1.67, 95.4, 0.77, 0.75, 1.43, 5.68, 0.48, -0.15, 0.23], + "T": [0.92, -2.09, -1.40, 1, 8.6, -0.27, 1.66, 121.5, 0.83, 1.19, 0.96, 5.66, 0.38, -0.10, 0.29], + "V": [-2.69, -2.53, -1.29, 0, 5.9, 1.09, 0.13, 139.0, 1.06, 1.70, 0.50, 5.96, -0.75, -0.19, 0.03], + "W": [-4.75, 3.65, 0.85, 1, 5.4, 0.88, 2.10, 228.2, 1.08, 1.37, 0.96, 5.89, 
-0.57, 0.31, 0.34], + "Y": [1.39, 2.32, 0.01, 1, 6.2, 0.33, 1.61, 197.0, 0.69, 1.47, 1.14, 5.66, -0.35, 0.40, -0.02], +} +POSITIONS_ACTIVE_SITE = [ + 13, + 16, + 17, + 41, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 55, + 93, + 94, + 125, + 126, + 127, + 128, + 129, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, +] + + +@dataclass +class ADomain: + """ + Dataclass representing an A domain. + + :param protein: name of the protein containing the A domain + :param start: start position of the A domain + :param end: end position of the A domain + :param domain_nr: domain number of A domain in NRPS (optional) + :param sequence: amino acid sequence of the A domain (optional) + :param extended_signature: extended signature of the A domain (optional) + """ + + protein: str + start: int + end: int + domain_nr: int | None = None + sequence: str | None = None + extended_signature: str | None = None + + +def _b2s(x: Any) -> str: + """ + Convert input to string. + + :param x: input object + :return: string representation + """ + if isinstance(x, (bytes, bytearray)): + return x.decode() + + if hasattr(x, "sequence"): + s = x.sequence + return s.decode() if isinstance(s, (bytes, bytearray)) else str(s) + + return str(x) + + +def extract_domain_hits( + seq_id: str, + sequence: str, + evalue_cutoff: float = 1e-5, +) -> list[dict[str, Any]]: + """ + Extract domain hits from a given protein sequence using HMMER. 
+ + :param seq_id: identifier for the protein sequence + :param sequence: amino acid sequence of the protein + :param evalue_cutoff: e-value cutoff for HMMER hits + :return: list of dictionaries representing domain hits + """ + alphabet = easel.Alphabet.amino() + text_seq = easel.TextSequence(name=seq_id.encode(), sequence=sequence) + seq = text_seq.digitize(alphabet) + + hits_iter = hmmer.hmmscan([seq], HMM_DB, cpus=1, E=evalue_cutoff) + + query_hits = next(hits_iter) # expect only one sequence + + out = [] + for hit in query_hits: + model_name = _b2s(hit.name) + + for dom in hit.domains: + q_from = int(dom.env_from) + q_to = int(dom.env_to) + + aln = dom.alignment + hmm_aln = _b2s(aln.hmm_sequence) + query_aln = _b2s(aln.target_sequence) + + out.append( + dict( + seq_id=seq_id, + model=model_name, + q_from=q_from, + q_to=q_to, + evalue=float(dom.i_evalue), + score=float(dom.score), + hmm_aln=hmm_aln, + query_aln=query_aln, + domain_obj=dom, + ) + ) + + out.sort(key=lambda d: (d["q_from"], d["q_to"], d["model"])) + + return out + + +def pair_domains( + domain_hits: list[dict[str, Any]], + max_gap: int = 200, +) -> list[tuple[ADomain, str, str]]: + """ + Pair AMP-binding and AMP-binding_C domain hits. 
+ + :param domain_hits: list of domain hit dictionaries + :param max_gap: maximum allowed gap between paired domains + :return: list of tuples containing ADomain objects and their alignments + """ + hits = sorted(domain_hits, key=lambda d: d["q_from"]) + + a_domains: list[ADomain] = [] + for h1 in hits: + if h1["model"] != "AMP-binding": + continue + + n_from, n_to = h1["q_from"], h1["q_to"] + + matched = None + for h2 in hits: + if h2["model"] != "AMP-binding_C": + continue + + c_from = h2["q_from"] + + if c_from > n_to and (c_from - n_to) <= max_gap: + matched = h2 + break + + start0 = n_from - 1 + end0 = matched["q_to"] if matched is not None else n_to + a_domains.append((ADomain( + protein=h1["seq_id"], + start=start0, + end=end0), + h1["hmm_aln"], + h1["query_aln"] + )) + + a_domains.sort(key=lambda t: t[0].start) + for i, (d, _, _) in enumerate(a_domains, start=1): + d.domain_nr = i + + return a_domains + + +def extract_signature_from_alignment(hmm_aln: str, query_aln: str) -> str | None: + """ + Extract the extended signature from the given HMM and query alignments. + + :param hmm_aln: HMM alignment string + :param query_aln: query alignment string + :return: extended signature string or None if invalid + """ + wanted = set(POSITIONS_ACTIVE_SITE) + picked: dict[int, str] = {} + + hmm_pos = 0 # 1-based counter, increment when HMM char is not a gap + + for h, q in zip(hmm_aln, query_aln): + if h != "-": + hmm_pos += 1 + if hmm_pos in wanted and hmm_pos not in picked: + picked[hmm_pos] = q + + # Quick fix + missing = wanted - set(picked.keys()) + for m in missing: + picked[m] = "-" + + out = [] + for p in POSITIONS_ACTIVE_SITE: + if p not in picked: + return None + out.append(picked[p]) + + sig = "".join(out).upper() + if not sig or not all(c in VALID for c in sig): + return None + + return sig + + +def fill_domain_sequences( + domains: list[ADomain], + protein_seq: str, + min_len: int = 100, +) -> list[ADomain]: """ - Check if parasect is installed. 
+ Fill in the sequences for the given domains from the protein sequence. - :return: True if parasect is installed, False otherwise + :param domains: list of ADomain objects + :param protein_seq: amino acid sequence of the protein + :param min_len: minimum length of domain sequence to keep + :return: list of ADomain objects with sequences filled in """ - return _HAS_PARAS + out = [] + + for d in domains: + seq = protein_seq[d.start:d.end] + if len(seq) >= min_len: + d.sequence = seq + out.append(d) + + return out -def _load_paras_model(cache_dir: Path) -> object: +def find_a_domains( + seq_id: str, + protein_seq: str, + evalue_cutoff: float = 1e-5, +) -> list[ADomain]: + """ + Find A domains in a given protein sequence using HMMER. + + :param seq_id: identifier for the protein sequence + :param protein_seq: amino acid sequence of the protein + :param evalue_cutoff: e-value cutoff for HMMER hits + :return: list of ADomain objects representing found A domains + """ + hits = extract_domain_hits(seq_id, protein_seq, evalue_cutoff) + + hits = [h for h in hits if h["model"] in {"AMP-binding", "AMP-binding_C"}] + + paired = pair_domains(hits, max_gap=200) + + domains_only: list[ADomain] = [] + for d, hmm_aln, query_aln in paired: + d.extended_signature = extract_signature_from_alignment(hmm_aln, query_aln) + domains_only.append(d) + + domains_only = fill_domain_sequences(domains_only, protein_seq, min_len=100) + + domains_only = [d for d in domains_only if d.extended_signature is not None] + + domains_only.sort(key=lambda d: (d.protein, d.start)) + + return domains_only + + +def featurize_signature(sig: str) -> np.ndarray: + """ + Featurize the given extended signature into a numerical feature array. 
+ + :param sig: extended signature string + :return: numpy array of features + """ + assert len(sig) == len(POSITIONS_ACTIVE_SITE), "signature length mismatch" + + features: np.ndarray = np.zeros((len(POSITIONS_ACTIVE_SITE), len(FEATURE_NAMES)), dtype=np.float32) + for i, aa in enumerate(sig): + aa_feats = FEATURES.get(aa) + if aa_feats is None: + raise ValueError(f"invalid amino acid '{aa}' in signature") + features[i, :] = np.array(aa_feats, dtype=np.float32) + + return features.flatten() # shape (n_positions * n_features,) + + +def load_paras_model(cache_dir: Path) -> object: """ Load the paras model from disk (cached in memory for reuse). :param cache_dir: Path to the cache directory :return: loaded paras model """ - if not has_paras(): - raise ImportError("paras is not installed, cannot load paras model") - global _PARAS_MODEL_CACHE # If model already loaded, return it immediately @@ -51,87 +370,3 @@ def _load_paras_model(cache_dir: Path) -> object: model = joblib.load(model_path) _PARAS_MODEL_CACHE[PARAS_MODEL_DOWNLOAD_URL] = model return model - - -def predict_amp_domain_substrate( - domain: DomainRec, - cache_dir_override: Path | str | None = None, - *, - model: object | None = None, - pred_threshold: float = 0.5, -) -> list[dict] | None: - """ - Predict substrate specificity for a given AMP-binding domain using paras. - - :param domain: DomainRec object representing the AMP-binding domain - :param cache_dir_override: Optional path to override the default cache directory - :param model: Optional already loaded paras model, skip download and loading if provided - :param pred_threshold: prediction threshold for substrate specificity (default: 0.5) - :return: list of dictionaries with all predicted substrates above the threshold, each with keys: - 'substrate_name' (str): substrate name, - 'substrate_smiles' (str): substrate SMILES, - 'score' (float): prediction score - :raises TypeError: if domain is not an instance of DomainRec - .. 
note:: returns None if the domain is not of type "AMP-binding" - .. note:: returns empty list if no predictions are above the threshold, or an error occurs - """ - logger = logging.getLogger(LOGGER_NAME) - - if not isinstance(domain, DomainRec): - raise TypeError("Domain must be an instance of DomainRec") - - if domain.kind != "AMP-binding": - return None - - # If parasect is missing, log and return None - if not has_paras(): - logger.warning("parasect not installed — skipping substrate prediction.") - return None - - # Define cache directory - cache_dir = ( - Path(cache_dir_override) - if cache_dir_override is not None - else get_biocracker_cache_dir() / PARAS_CACHE_DIR_NAME - ) - - # Load paras model if not provided - if model is None: - os.makedirs(cache_dir, exist_ok=True) - model: object = _load_paras_model(str(cache_dir)) - - tmp_dir = cache_dir / "temp_paras" - os.makedirs(tmp_dir, exist_ok=True) - - # Prep fasta - header = f">{domain.name if domain.name else 'AMP_domain'}|{domain.start}_{domain.end}" - seq = domain.aa_seq - fasta = f"{header}\n{seq}\n" - - # Ensure sequence is not empty - if not seq: - return [] - - # Make prediction with paras - try: - results = run_paras( - selected_input=fasta, - selected_input_type="fasta", - path_temp_dir=tmp_dir, - model=model, - use_structure_guided_alignment=False, - ) - assert len(results) == 1, "Expected exactly one paras result for singular AMP-binding domain" - result = results[0] - preds = list(zip(result.prediction_labels, result._prediction_smiles, result.predictions, strict=True)) - preds = [(name, smiles, round(score, 3)) for name, smiles, score in preds if score >= pred_threshold] - # Highest score first - preds.sort(key=lambda x: x[2], reverse=True) - except Exception as e: - logger.error(f"{e}\nError during paras prediction for domain {domain.name}, returning no predictions") - preds = [] - - # Format predictions - preds = [{"substrate_name": name, "substrate_smiles": smiles, "score": score} for name, 
smiles, score in preds] - - return preds \ No newline at end of file diff --git a/src/biocracker/paras_fast.py b/src/biocracker/paras_fast.py deleted file mode 100644 index 68e65be..0000000 --- a/src/biocracker/paras_fast.py +++ /dev/null @@ -1,345 +0,0 @@ -"""Module for fast PARAS inference of substrate specificity A domains.""" - -from dataclasses import dataclass -from importlib.resources import files -from typing import Any - -import numpy as np -from pyhmmer import easel, plan7, hmmer - -import biocracker.data - - -HMM_DB_PATH = str(files(biocracker.data).joinpath("AMP-binding_converted.hmm")) -with plan7.HMMFile(HMM_DB_PATH) as hmm_file: - HMM_DB = list(hmm_file) - - -VALID = set("ACDEFGHIKLMNPQRSTVWY-") -FEATURE_NAMES = [ - "WOLS870101", - "WOLS870102", - "WOLS870103", - "FAUJ880109", - "GRAR740102", - "RADA880108", - "ZIMJ680103", - "TSAJ990101", - "CHOP780201", - "CHOP780202", - "CHOP780203", - "ZIMJ680104", - "NEU1", - "NEU2", - "NEU3", -] -FEATURES = { - "-": [0.00, 0.00, 0.00, 1, 8.3, 0.21, 13.59, 145.2, 1.00, 1.03, 0.99, 6.03, 0.06, 0.00, 0.10], - "A": [0.07, -1.73, 0.09, 0, 8.1, -0.06, 0.00, 90.0, 1.42, 0.83, 0.66, 6.00, 0.06, -0.25, 0.25], - "C": [0.71, -0.97, 4.13, 0, 5.5, 1.36, 1.48, 103.3, 0.70, 1.19, 1.19, 5.05, -0.56, -0.40, -0.14], - "D": [3.64, 1.13, 2.36, 1, 13.0, -0.80, 49.70, 117.3, 1.01, 0.54, 1.46, 2.77, 0.97, -0.08, 0.08], - "E": [3.08, 0.39, -0.07, 1, 12.3, -0.77, 49.90, 142.2, 1.51, 0.37, 0.74, 3.22, 0.85, -0.10, -0.05], - "F": [-4.92, 1.30, 0.45, 0, 5.2, 1.27, 0.35, 191.9, 1.13, 1.38, 0.60, 5.48, -0.99, 0.18, 0.15], - "G": [2.23, -5.36, 0.30, 0, 9.0, -0.41, 0.00, 64.9, 0.57, 0.75, 1.56, 5.97, 0.32, -0.32, 0.28], - "H": [2.41, 1.74, 1.11, 1, 10.4, 0.49, 51.60, 160.0, 1.00, 0.87, 0.95, 7.59, 0.15, -0.03, -0.10], - "I": [-4.44, -1.68, -1.03, 0, 5.2, 1.31, 0.13, 163.9, 1.08, 1.60, 0.47, 6.02, -1.00, -0.03, 0.10], - "K": [2.84, 1.41, -3.14, 2, 11.3, -1.18, 49.50, 167.3, 1.16, 0.74, 1.01, 9.74, 1.00, 0.32, 0.11], - "L": [-4.19, -1.03, 
-0.98, 0, 4.9, 1.21, 0.13, 164.0, 1.21, 1.30, 0.59, 5.98, -0.83, 0.05, 0.01], - "M": [-2.49, -0.27, -0.41, 0, 5.7, 1.27, 1.43, 167.0, 1.45, 1.05, 0.60, 5.74, -0.68, -0.01, 0.04], - "N": [3.22, 1.45, 0.84, 2, 11.6, -0.48, 3.38, 124.7, 0.67, 0.89, 1.56, 5.41, 0.70, -0.06, 0.17], - "P": [-1.22, 0.88, 2.23, 0, 8.0, 1.1, 1.58, 122.9, 0.57, 0.55, 1.52, 6.30, 0.45, 0.23, 0.41], - "Q": [2.18, 0.53, -1.14, 2, 10.5, -0.73, 3.53, 149.4, 1.11, 1.10, 0.98, 5.65, 0.71, -0.02, 0.12], - "R": [2.88, 2.52, -3.44, 4, 10.5, -0.84, 52.00, 194.0, 0.98, 0.93, 0.95, 10.76, 0.80, 0.19, -0.41], - "S": [1.96, -1.63, 0.57, 1, 9.2, -0.50, 1.67, 95.4, 0.77, 0.75, 1.43, 5.68, 0.48, -0.15, 0.23], - "T": [0.92, -2.09, -1.40, 1, 8.6, -0.27, 1.66, 121.5, 0.83, 1.19, 0.96, 5.66, 0.38, -0.10, 0.29], - "V": [-2.69, -2.53, -1.29, 0, 5.9, 1.09, 0.13, 139.0, 1.06, 1.70, 0.50, 5.96, -0.75, -0.19, 0.03], - "W": [-4.75, 3.65, 0.85, 1, 5.4, 0.88, 2.10, 228.2, 1.08, 1.37, 0.96, 5.89, -0.57, 0.31, 0.34], - "Y": [1.39, 2.32, 0.01, 1, 6.2, 0.33, 1.61, 197.0, 0.69, 1.47, 1.14, 5.66, -0.35, 0.40, -0.02], -} -POSITIONS_ACTIVE_SITE = [ - 13, - 16, - 17, - 41, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 55, - 93, - 94, - 125, - 126, - 127, - 128, - 129, - 152, - 153, - 154, - 155, - 156, - 157, - 158, - 159, - 160, - 161, - 162, - 163, - 164, - 165, - 166, -] - - -@dataclass -class ADomain: - """ - Dataclass representing an A domain. - - :param protein: name of the protein containing the A domain - :param start: start position of the A domain - :param end: end position of the A domain - :param domain_nr: domain number of A domain in NRPS (optional) - :param sequence: amino acid sequence of the A domain (optional) - :param extended_signature: extended signature of the A domain (optional) - """ - - protein: str - start: int - end: int - domain_nr: int | None = None - sequence: str | None = None - extended_signature: str | None = None - - -def _b2s(x: Any) -> str: - """ - Convert input to string. 
- - :param x: input object - :return: string representation - """ - if isinstance(x, (bytes, bytearray)): - return x.decode() - - if hasattr(x, "sequence"): - s = x.sequence - return s.decode() if isinstance(s, (bytes, bytearray)) else str(s) - - return str(x) - - -def extract_domain_hits( - seq_id: str, - sequence: str, - evalue_cutoff: float = 1e-5, -) -> list[dict[str, Any]]: - """ - Extract domain hits from a given protein sequence using HMMER. - - :param seq_id: identifier for the protein sequence - :param sequence: amino acid sequence of the protein - :param evalue_cutoff: e-value cutoff for HMMER hits - :return: list of dictionaries representing domain hits - """ - alphabet = easel.Alphabet.amino() - text_seq = easel.TextSequence(name=seq_id.encode(), sequence=sequence) - seq = text_seq.digitize(alphabet) - - hits_iter = hmmer.hmmscan([seq], HMM_DB, cpus=1, E=evalue_cutoff) - - query_hits = next(hits_iter) # expect only one sequence - - out = [] - for hit in query_hits: - model_name = _b2s(hit.name) - - for dom in hit.domains: - q_from = int(dom.env_from) - q_to = int(dom.env_to) - - aln = dom.alignment - hmm_aln = _b2s(aln.hmm_sequence) - query_aln = _b2s(aln.target_sequence) - - out.append( - dict( - seq_id=seq_id, - model=model_name, - q_from=q_from, - q_to=q_to, - evalue=float(dom.i_evalue), - score=float(dom.score), - hmm_aln=hmm_aln, - query_aln=query_aln, - domain_obj=dom, - ) - ) - - out.sort(key=lambda d: (d["q_from"], d["q_to"], d["model"])) - - return out - - -def pair_domains( - domain_hits: list[dict[str, Any]], - max_gap: int = 200, -) -> list[tuple[ADomain, str, str]]: - """ - Pair AMP-binding and AMP-binding_C domain hits. 
- - :param domain_hits: list of domain hit dictionaries - :param max_gap: maximum allowed gap between paired domains - :return: list of tuples containing ADomain objects and their alignments - """ - hits = sorted(domain_hits, key=lambda d: d["q_from"]) - - a_domains: list[ADomain] = [] - for h1 in hits: - if h1["model"] != "AMP-binding": - continue - - n_from, n_to = h1["q_from"], h1["q_to"] - - matched = None - for h2 in hits: - if h2["model"] != "AMP-binding_C": - continue - - c_from = h2["q_from"] - - if c_from > n_to and (c_from - n_to) <= max_gap: - matched = h2 - break - - start0 = n_from - 1 - end0 = matched["q_to"] if matched is not None else n_to - a_domains.append((ADomain( - protein=h1["seq_id"], - start=start0, - end=end0), - h1["hmm_aln"], - h1["query_aln"] - )) - - a_domains.sort(key=lambda t: t[0].start) - for i, (d, _, _) in enumerate(a_domains, start=1): - d.domain_nr = i - - return a_domains - - -def extract_signature_from_alignment(hmm_aln: str, query_aln: str) -> str | None: - """ - Extract the extended signature from the given HMM and query alignments. 
- - :param hmm_aln: HMM alignment string - :param query_aln: query alignment string - :return: extended signature string or None if invalid - """ - wanted = set(POSITIONS_ACTIVE_SITE) - picked: dict[int, str] = {} - - hmm_pos = 0 # 1-based counter, increment when HMM char is not a gap - - for h, q in zip(hmm_aln, query_aln): - if h != "-": - hmm_pos += 1 - if hmm_pos in wanted and hmm_pos not in picked: - picked[hmm_pos] = q - - # Quick fix - missing = wanted - set(picked.keys()) - for m in missing: - picked[m] = "-" - - out = [] - for p in POSITIONS_ACTIVE_SITE: - if p not in picked: - return None - out.append(picked[p]) - - sig = "".join(out).upper() - if not sig or not all(c in VALID for c in sig): - return None - - return sig - - -def fill_domain_sequences( - domains: list[ADomain], - protein_seq: str, - min_len: int = 100, -) -> list[ADomain]: - """ - Fill in the sequences for the given domains from the protein sequence. - - :param domains: list of ADomain objects - :param protein_seq: amino acid sequence of the protein - :param min_len: minimum length of domain sequence to keep - :return: list of ADomain objects with sequences filled in - """ - out = [] - - for d in domains: - seq = protein_seq[d.start:d.end] - if len(seq) >= min_len: - d.sequence = seq - out.append(d) - - return out - - -def find_a_domains( - seq_id: str, - protein_seq: str, - evalue_cutoff: float = 1e-5, -) -> list[ADomain]: - """ - Find A domains in a given protein sequence using HMMER. 
- - :param seq_id: identifier for the protein sequence - :param protein_seq: amino acid sequence of the protein - :param evalue_cutoff: e-value cutoff for HMMER hits - :return: list of ADomain objects representing found A domains - """ - hits = extract_domain_hits(seq_id, protein_seq, evalue_cutoff) - - hits = [h for h in hits if h["model"] in {"AMP-binding", "AMP-binding_C"}] - - paired = pair_domains(hits, max_gap=200) - - domains_only: list[ADomain] = [] - for d, hmm_aln, query_aln in paired: - d.extended_signature = extract_signature_from_alignment(hmm_aln, query_aln) - domains_only.append(d) - - domains_only = fill_domain_sequences(domains_only, protein_seq, min_len=100) - - domains_only = [d for d in domains_only if d.extended_signature is not None] - - domains_only.sort(key=lambda d: (d.protein, d.start)) - - return domains_only - - -def featurize_signature(sig: str) -> np.ndarray: - """ - Featurize the given extended signature into a numerical feature array. - - :param sig: extended signature string - :return: numpy array of features - """ - assert len(sig) == len(POSITIONS_ACTIVE_SITE), "signature length mismatch" - - features: np.ndarray = np.zeros((len(POSITIONS_ACTIVE_SITE), len(FEATURE_NAMES)), dtype=np.float32) - for i, aa in enumerate(sig): - aa_feats = FEATURES.get(aa) - if aa_feats is None: - raise ValueError(f"invalid amino acid '{aa}' in signature") - features[i, :] = np.array(aa_feats, dtype=np.float32) - - return features.flatten() # shape (n_positions * n_features,) From 3baf0e9cde4465bc6dc289b35539f21bbf1e5ae5 Mon Sep 17 00:00:00 2001 From: David Meijer Date: Thu, 25 Dec 2025 01:26:45 +0100 Subject: [PATCH 03/12] WIP --- .github/workflows/tests.yml | 12 +- .gitignore | 3 +- README.md | 24 - environment.dev.yml | 9 - environment.yml | 11 +- examples/list_domains_antismash_gbk.py | 103 --- examples/readout_antismash_gbk.py | 89 -- scripts/parse_gbks.py | 54 ++ scripts/read_gbks.py | 41 + scripts/run_paras_on_gbks.py | 48 -- 
src/biocracker/antismash.py | 472 ---------- src/biocracker/config.py | 16 - src/biocracker/inference/__init__.py | 0 src/biocracker/inference/base.py | 98 +++ src/biocracker/inference/model_paras.py | 808 ++++++++++++++++++ src/biocracker/inference/registry.py | 61 ++ src/biocracker/io/__init__.py | 0 src/biocracker/io/gbk_antismash.py | 237 +++++ src/biocracker/io/options.py | 37 + src/biocracker/io/readers.py | 27 + src/biocracker/model/__init__.py | 0 src/biocracker/model/annotations.py | 51 ++ src/biocracker/model/domain.py | 55 ++ src/biocracker/model/gene.py | 82 ++ src/biocracker/model/inference.py | 57 ++ src/biocracker/model/region.py | 60 ++ src/biocracker/paras.py | 372 -------- src/biocracker/pipelines/__init__.py | 0 src/biocracker/pipelines/annotate_region.py | 48 ++ src/biocracker/query/__init__.py | 0 src/biocracker/query/modules.py | 19 + src/biocracker/readout.py | 744 ---------------- src/biocracker/text_mining.py | 316 ------- src/biocracker/utils/__init__.py | 0 .../{helpers.py => utils/download.py} | 7 +- src/biocracker/utils/json.py | 25 + src/biocracker/utils/logging.py | 116 +++ tests/__init__.py | 0 tests/test_antismash.py | 294 ------- tests/test_helpers.py | 295 ------- 40 files changed, 1889 insertions(+), 2802 deletions(-) delete mode 100644 examples/list_domains_antismash_gbk.py delete mode 100644 examples/readout_antismash_gbk.py create mode 100644 scripts/parse_gbks.py create mode 100644 scripts/read_gbks.py delete mode 100644 scripts/run_paras_on_gbks.py delete mode 100644 src/biocracker/antismash.py delete mode 100644 src/biocracker/config.py create mode 100644 src/biocracker/inference/__init__.py create mode 100644 src/biocracker/inference/base.py create mode 100644 src/biocracker/inference/model_paras.py create mode 100644 src/biocracker/inference/registry.py create mode 100644 src/biocracker/io/__init__.py create mode 100644 src/biocracker/io/gbk_antismash.py create mode 100644 src/biocracker/io/options.py create mode 100644 
src/biocracker/io/readers.py create mode 100644 src/biocracker/model/__init__.py create mode 100644 src/biocracker/model/annotations.py create mode 100644 src/biocracker/model/domain.py create mode 100644 src/biocracker/model/gene.py create mode 100644 src/biocracker/model/inference.py create mode 100644 src/biocracker/model/region.py delete mode 100644 src/biocracker/paras.py create mode 100644 src/biocracker/pipelines/__init__.py create mode 100644 src/biocracker/pipelines/annotate_region.py create mode 100644 src/biocracker/query/__init__.py create mode 100644 src/biocracker/query/modules.py delete mode 100644 src/biocracker/readout.py delete mode 100644 src/biocracker/text_mining.py create mode 100644 src/biocracker/utils/__init__.py rename src/biocracker/{helpers.py => utils/download.py} (98%) create mode 100644 src/biocracker/utils/json.py create mode 100644 src/biocracker/utils/logging.py create mode 100644 tests/__init__.py delete mode 100644 tests/test_antismash.py delete mode 100644 tests/test_helpers.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 10cade8..c4fc871 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -30,11 +30,11 @@ jobs: python -m pip install --upgrade pip pip install -e .[dev] - - name: ruff lint - run: ruff check examples src tests + # - name: ruff lint + # run: ruff check examples src tests - - name: ruff format check - run: ruff format --check examples src tests + # - name: ruff format check + # run: ruff format --check examples src tests - - name: run tests - run: pytest -q \ No newline at end of file + # - name: run tests + # run: pytest -q \ No newline at end of file diff --git a/.gitignore b/.gitignore index ec40882..3040041 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ __pycache__/ .ruff_cache/ .vscode/ dist/ -scratch/ \ No newline at end of file +scratch/ +*cache*/ \ No newline at end of file diff --git a/README.md b/README.md index d0cacf4..e5afb6f 100644 --- 
a/README.md +++ b/README.md @@ -13,38 +13,14 @@ Parser for antiSMASH output GenBank files. -See the [examples folder](https://github.com/moltools/biocracker/tree/main/examples) for usage examples. - ## Installation -Some BioCracker depdendencies rely on various command line tools to operate. These tools might not be available on all platforms. The `pyproject.toml` file specifies the core parser that is platform independent, but some functionality might be limited without the command line tools. BioCracker is designed to fail gracefully when some of these third party dependencies are not available. - We recommend installing BioCracker in a virtual conda environment, based on the provided `environment.yml` file to make sure all modules are available: ```bash conda env create -f environment.yml ``` -### Installing PARAS - -PARAS is used by BioCracker to predict substrate specificities of NRPS adenylation domains. - -PARAS has no PyPI package ans must be installed from source manually: - -```bash -pip install "paras @ git+https://github.com/bthedragonmaster/parasect.git@v2.0.0" -``` - -### Installing HMMER2 on macOS Arm64 - -Use Rosetta to install the x86_64 version of HMMER2: - -```bash -conda activate biocracker -conda config --env --set subdir osx-64 -conda install hmmer2 -``` - ## Development To set up a development environment, use the provided `environment.dev.yml` file: diff --git a/environment.dev.yml b/environment.dev.yml index 9293b4b..4a093c1 100644 --- a/environment.dev.yml +++ b/environment.dev.yml @@ -3,17 +3,8 @@ channels: - conda-forge - bioconda dependencies: - # Python runtime - python=3.10 - - # Bio tools (from bioconda) - - hmmer # HMMER v3 (hmmsearch, hmmscan, hmmpress, etc.) - - hmmer2 # HMMER v2 (e.g., hmmpfam, hmmsearch from v2 toolset) - - muscle=3.8.1551 # MUSCLE aligner - - # Extra PyPI deps - pip - pip: - -e . 
- - "paras @ git+https://github.com/bthedragonmaster/parasect.git@v2.0.0" - hatch \ No newline at end of file diff --git a/environment.yml b/environment.yml index 466cb9d..c0148de 100644 --- a/environment.yml +++ b/environment.yml @@ -3,16 +3,7 @@ channels: - conda-forge - bioconda dependencies: - # Python runtime - python=3.10 - - # Bio tools (from bioconda) - - hmmer # HMMER v3 (hmmsearch, hmmscan, hmmpress, etc.) - - hmmer2 # HMMER v2 (e.g., hmmpfam, hmmsearch from v2 toolset) - - muscle=3.8.1551 # MUSCLE aligner - - # Extra PyPI deps - pip - pip: - - biocracker - - "paras @ git+https://github.com/bthedragonmaster/parasect.git@v2.0.0" \ No newline at end of file + - biocracker \ No newline at end of file diff --git a/examples/list_domains_antismash_gbk.py b/examples/list_domains_antismash_gbk.py deleted file mode 100644 index 8abb025..0000000 --- a/examples/list_domains_antismash_gbk.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python3 - -"""Parse antiSMASH GenBank files and extract relevant features.""" - -import argparse -import logging -import os - -from biocracker.antismash import parse_region_gbk_file -from biocracker.config import LOGGER_LEVEL, LOGGER_NAME -from biocracker.paras import predict_amp_domain_substrate -from biocracker.text_mining import get_default_tokenspecs, mine_virtual_tokens - -# Setup logging -logger = logging.getLogger(LOGGER_NAME) -logging.basicConfig(level=LOGGER_LEVEL) - - -def cli() -> argparse.Namespace: - """ - Parse command line arguments. 
- - :return: Parsed arguments - """ - parser = argparse.ArgumentParser() - parser.add_argument("--gbk", type=str, required=True, help="Path to the antiSMASH GenBank file") - parser.add_argument( - "--toplevel", - type=str, - choices=["cand_cluster", "region"], - default="cand_cluster", - help="Top level feature to parse (default: cand_cluster)", - ) - parser.add_argument("--thresh", type=float, default=0.1, help="Threshold for substrate prediction (default: 0.1)") - parser.add_argument("--outfile", type=str, default=None, help="Path to log output file (default: None)") - return parser.parse_args() - - -def main() -> None: - """ - Main function to parse the antiSMASH GenBank file. - """ - args = cli() - - if args.outfile is not None: - # Delete if file exists and create new log file - if os.path.exists(args.outfile): - os.remove(args.outfile) - - file_handler = logging.FileHandler(args.outfile) - file_handler.setLevel(LOGGER_LEVEL) - logger.addHandler(file_handler) - - gbk_path = args.gbk - target_name = args.toplevel - targets = parse_region_gbk_file(gbk_path, top_level=target_name) - logger.info(f" > Parsed {len(targets)} {target_name}(s) from {gbk_path}") - - for target in targets: - region_accession = target.accession if target.accession is not None else 0 - logger.info( - f" " - f"> ({target_name} at {target.start} - {target.end}) " - f"{target.record_id}.{target_name}{region_accession:03d} : {target.product_tags}" - ) - len_start = max([len(str(gene.start)) for gene in target.genes]) - len_end = max([len(str(gene.end)) for gene in target.genes]) - for gene in target.genes: - logger.info( - f" " - f"> (gene at {gene.start:>{len_start}} - {gene.end:>{len_end}} on strand {gene.strand:>2}) " - f"{gene.name} : {gene.product}" - ) - - for domain in gene.domains: - domain_len_start = max([len(str(d.start)) for d in gene.domains]) - domain_len_end = max([len(str(d.end)) for d in gene.domains]) - if domain.kind == "AMP-binding": - domain_preds = 
predict_amp_domain_substrate(domain, pred_threshold=args.thresh) - else: - domain_preds = [] - logger.info( - f" " - f"> (domain at {domain.start:>{domain_len_start}} - {domain.end:>{domain_len_end}}) " - f"{domain.kind}" - ) - if domain_preds is not None: - if domain_preds: - for domain_pred in domain_preds: - name = domain_pred.get("substrate_name", "Unknown") - smiles = domain_pred.get("substrate_smiles", "N/A") - score = domain_pred.get("score", 0.0) - logger.info(f" > Predicted substrate: {name} (SMILES: '{smiles}') with score {score}") - - tokenspecs = get_default_tokenspecs() - for mined_tokenspec in mine_virtual_tokens(target, tokenspecs): - token_name = mined_tokenspec.get("token", "Unknown") - token_score = mined_tokenspec.get("score", 0.0) - logger.info(f" > Mined tokenspec: {token_name} ({token_score})") - - -if __name__ == "__main__": - main() diff --git a/examples/readout_antismash_gbk.py b/examples/readout_antismash_gbk.py deleted file mode 100644 index 5bee5f2..0000000 --- a/examples/readout_antismash_gbk.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python3 - -"""Parse antiSMASH GenBank files and extract linear readout information.""" - -import argparse -import logging -import os - -from biocracker.antismash import parse_region_gbk_file -from biocracker.config import LOGGER_LEVEL, LOGGER_NAME -from biocracker.readout import NRPSModuleReadout, PKSModuleReadout, linear_readouts -from biocracker.text_mining import get_default_tokenspecs, mine_virtual_tokens - -# Setup logging -logger = logging.getLogger(LOGGER_NAME) -logging.basicConfig(level=LOGGER_LEVEL) - - -def cli() -> argparse.Namespace: - """ - Parse command line arguments. 
- - :return: Parsed arguments - """ - parser = argparse.ArgumentParser() - parser.add_argument("--gbk", type=str, required=True, help="Path to the antiSMASH GenBank file") - parser.add_argument( - "--toplevel", - type=str, - choices=["cand_cluster", "region"], - default="cand_cluster", - help="Top level feature to parse (default: cand_cluster)", - ) - parser.add_argument( - "--readlevel", - type=str, - choices=["rec", "gene"], - default="rec", - help='Level of readout, either "rec" for region/cluster level or "gene" for gene level (default: rec)', - ) - parser.add_argument("--thresh", type=float, default=0.1, help="Threshold for substrate prediction (default: 0.1)") - parser.add_argument("--outfile", type=str, default=None, help="Path to log output file (default: None)") - return parser.parse_args() - - -def main() -> None: - """ - Main function to parse the antiSMASH GenBank file. - """ - args = cli() - - if args.outfile is not None: - # Delete if file exists and create new log file - if os.path.exists(args.outfile): - os.remove(args.outfile) - - file_handler = logging.FileHandler(args.outfile) - file_handler.setLevel(LOGGER_LEVEL) - logger.addHandler(file_handler) - - gbk_path = args.gbk - target_name = args.toplevel - targets = parse_region_gbk_file(gbk_path, top_level=target_name) - logger.info(f" > Parsed {len(targets)} {target_name}(s) from {gbk_path}") - for target in targets: - for readout in linear_readouts(target, level=args.readlevel): - name = readout["rec"].name if args.readlevel == "gene" else readout["rec"].record_id - for module_idx, module in enumerate(readout["readout"], start=1): - if isinstance(module, PKSModuleReadout): - logger.info( - f" > {name} Module {module_idx:0>2} ({module['role']}): " - f"{module['at_source']}_{module['module_type']}" - ) - elif isinstance(module, NRPSModuleReadout): - substrate = module.get("substrate_name", "Unknown") - score = module.get("score", 0.0) - logger.info(f" > {name} Module {module_idx:0>2} 
({module['role']}): {substrate} ({score})") - else: - logger.warning(f" > {name} Module {module_idx:0>2}: Unknown module type") - - tokenspecs = get_default_tokenspecs() - for mined_tokenspec in mine_virtual_tokens(target, tokenspecs): - token_name = mined_tokenspec.get("token", "Unknown") - token_score = mined_tokenspec.get("score", 0.0) - logger.info(f" > Mined tokenspec: {token_name} ({token_score})") - - -if __name__ == "__main__": - main() diff --git a/scripts/parse_gbks.py b/scripts/parse_gbks.py new file mode 100644 index 0000000..ea8082a --- /dev/null +++ b/scripts/parse_gbks.py @@ -0,0 +1,54 @@ +"""Parse regions from GenBank files and annotate them using gene and domain models.""" + +import argparse +import os +import json +import glob + +from biocracker.utils.logging import setup_logging, add_file_handler +from biocracker.io.readers import load_regions +from biocracker.io.options import AntiSmashOptions +from biocracker.inference.registry import register_domain_model +from biocracker.inference.model_paras import ParasModel +from biocracker.pipelines.annotate_region import annotate_region + + +def cli() -> argparse.Namespace: + """ + Command line interface for parsing and annotating GenBank files. + + :return: parsed command line arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument("--gbks", type=str, required=True) + parser.add_argument("--out", type=str, required=True, help="output directory") + return parser.parse_args() + + +def main() -> None: + """ + Main function to parse and annotate GenBank files. 
+ """ + args = cli() + os.makedirs(args.out, exist_ok=True) + + setup_logging(level="INFO") + add_file_handler(os.path.join(args.out, "parse_gbks.log"), level="INFO") + + register_domain_model(ParasModel(cache_dir=None, threshold=0.1, keep_top=3)) + + options = AntiSmashOptions(readout_level="cand_cluster") + + gbk_iter = glob.iglob(f"{args.gbks}/*.gbk") + + out_jsonl = os.path.join(args.out, "regions.jsonl") + with open(out_jsonl, "w") as out_f: + for gbk_file in gbk_iter: + regions = load_regions(gbk_file, options) + for region in regions: + annotate_region(region) + out_f.write(json.dumps(region.to_dict()) + "\n") + + +if __name__ == "__main__": + main() diff --git a/scripts/read_gbks.py b/scripts/read_gbks.py new file mode 100644 index 0000000..d4184c3 --- /dev/null +++ b/scripts/read_gbks.py @@ -0,0 +1,41 @@ +"""Parse linear readouts from parsed GenBank files.""" + +import argparse +import os + +from biocracker.utils.logging import setup_logging, add_file_handler +from biocracker.utils.json import iter_json +from biocracker.model.region import Region +from biocracker.query.modules import linear_readout + + +def cli() -> argparse.Namespace: + """ + Command line interface for parsing linear readouts from GenBank files. + + :return: parsed command line arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument("--jsonl", type=str, required=True) + parser.add_argument("--out", type=str, required=True, help="output directory") + return parser.parse_args() + + +def main() -> None: + """ + Main function to parse linear readouts from GenBank files. 
+ """ + args = cli() + os.makedirs(args.out, exist_ok=True) + + setup_logging(level="INFO") + add_file_handler(os.path.join(args.out, "read_gbks.log"), level="INFO") + + for region_record in iter_json(args.jsonl, jsonl=True): + region = Region.from_dict(region_record) + readout = linear_readout(region) + print(readout) + + +if __name__ == "__main__": + main() diff --git a/scripts/run_paras_on_gbks.py b/scripts/run_paras_on_gbks.py deleted file mode 100644 index 5f1a708..0000000 --- a/scripts/run_paras_on_gbks.py +++ /dev/null @@ -1,48 +0,0 @@ -import argparse -import glob - -import joblib -from tqdm import tqdm - -from biocracker.antismash import parse_region_gbk_file -from biocracker.paras import find_a_domains, featurize_signature - - -def cli(): - parser = argparse.ArgumentParser() - parser.add_argument("--gbks", type=str, required=True) - parser.add_argument("--model", type=str, required=True) - return parser.parse_args() - - -def main(): - args = cli() - - model = joblib.load(args.model) - - gbk_iter = glob.iglob(f"{args.gbks}/*.gbk") - for gbk_file in tqdm(gbk_iter): - for region in parse_region_gbk_file(gbk_file, top_level="region"): - for gene in region.genes: - name, protein_seq = gene.name, gene.protein_seq - a_domains = find_a_domains(seq_id=name, protein_seq=protein_seq, evalue_cutoff=1e-5) - for a_domain in a_domains: - protein_name = a_domain.protein - signature = a_domain.extended_signature - if signature is None: - print(protein_name, "N/A", "0.0000", "N/A", sep="\t") - else: - if model is not None: - features = featurize_signature(signature) - features_reshaped = features.reshape(1, -1) # reshape for single sample - prediction = model.predict_proba(features_reshaped) - pred_names = model.classes_ - prediction = {name: prob for name, prob in zip(pred_names, prediction[0])} - top_pred = max(prediction.items(), key=lambda x: x[1]) - print(protein_name, top_pred[0], f"{top_pred[1]:.4f}", signature, sep="\t") - else: - print(protein_name, "N/A", 
"0.0000", signature, sep="\t") - - -if __name__ == "__main__": - main() diff --git a/src/biocracker/antismash.py b/src/biocracker/antismash.py deleted file mode 100644 index 00276aa..0000000 --- a/src/biocracker/antismash.py +++ /dev/null @@ -1,472 +0,0 @@ -"""Module contains methods for parsing antismash output gbk files.""" - -from collections.abc import Iterable -from dataclasses import asdict, dataclass, field -from io import StringIO -from typing import Any, Literal - -from Bio import SeqIO -from Bio.SeqFeature import FeatureLocation, SeqFeature -from Bio.SeqRecord import SeqRecord - - -def _q1(feat: SeqFeature, keys: Iterable[str]) -> str | None: - """ - Return the first available qualifier value for any of `keys`, else None. - - :param feat: Biopython SeqFeature object - :param keys: Iterable of qualifier keys to check - :return: the first found qualifier value or None - """ - for k in keys: - vals = feat.qualifiers.get(k) - if vals: - return vals[0] - - return None - - -def _start_end(feat: SeqFeature) -> tuple[int, int, int]: - """ - Return (strand, start, end) as ints (0-based, end-exclusive like Biopython). - - :param feat: Biopython SeqFeature object - :return: tuple of (strand, start, end) - """ - loc: FeatureLocation = feat.location - strand = int(loc.strand) if loc.strand in (1, -1) else 1 - - return strand, int(loc.start), int(loc.end) - - -def _gene_name(feat: SeqFeature) -> str: - """ - Get gene name from feature qualifiers, prioritizing common antiSMASH/CDS name fields. - - :param feat: Biopython SeqFeature object - :return: gene name as string - .. note:: fallback to a deterministic coordinate-based label if no name found - """ - name = _q1(feat, ("locus_tag", "gene", "protein_id", "Name")) - if name: - return name - - strand, s, e = _start_end(feat) - - return f"CDS_{s}_{e}_{'rev' if strand == -1 else 'fwd'}" - - -@dataclass -class DomainRec: - """ - Data class representing a domain record parsed from antiSMASH output. 
- - :param start: domain start position - :param end: domain end position - :param kind: domain kind (e.g. "AMP-binding", "PKS_KS", etc.) - :param aa_seq: domain-level translation (AA) - :param name: optional label if present - :param raw_qualifiers: raw qualifiers dictionary - """ - - start: int - end: int - kind: str | None - aa_seq: str | None - name: str | None = None - raw_qualifiers: dict[str, Any] = field(default_factory=dict) - - def to_dict(self) -> dict[str, Any]: - """ - Convert DomainRec instance to dictionary. - - :return: dictionary representation of DomainRec - """ - d = asdict(self) - return d - - -@dataclass -class GeneRec: - """ - Data class representing a gene record parsed from antiSMASH output. - - :param name: gene name - :param strand: gene strand - :param start: gene start position - :param end: gene end position - :param protein_seq: CDS-level AA translation - :param product: optional product description - :param note: optional note - :param description: optional description - :param locus_tag: optional locus tag - :param gene_symbol: optional gene symbol ('gene' qualifier) - :param protein_id: optional protein ID - :param ec_number: optional first EC number if present - :param raw_qualifiers: raw qualifiers dictionary - :param domains: list of DomainRec instances - """ - - name: str - strand: int - start: int - end: int - protein_seq: str | None - product: str | None = None - note: str | None = None - description: str | None = None - locus_tag: str | None = None - gene_symbol: str | None = None - protein_id: str | None = None - ec_number: str | None = None - - # antiSMASH often tucks many details into qualifiers; keep them - raw_qualifiers: dict[str, Any] = field(default_factory=dict) - - domains: list["DomainRec"] = field(default_factory=list) - - def to_dict(self) -> dict[str, Any]: - """ - Convert GeneRec instance to dictionary. 
- - :return: dictionary representation of GeneRec - """ - d = asdict(self) - d["domains"] = [dom.to_dict() for dom in self.domains] - return d - - -@dataclass -class CandidateClusterRec: - """ - Data class representing a candidate cluster record parsed from antiSMASH output. - - :param record_id: cluster record ID - :param accession: cluster number/accession in gbk file - :param start: cluster start position - :param end: cluster end position - :param product_tags: list of product tags - :param genes: list of GeneRec instances - """ - - record_id: str - accession: int | None - start: int - end: int - product_tags: list[str] - genes: list[GeneRec] = field(default_factory=list) - - def to_dict(self) -> dict[str, Any]: - """ - Convert CandidateClusterRec instance to dictionary. - - :return: dictionary representation of CandidateClusterRec - """ - d = asdict(self) - d["genes"] = [g.to_dict() for g in self.genes] - return d - - -@dataclass -class RegionRec: - """ - Data class representing a region record parsed from antiSMASH output. - - :param record_id: region record ID - :param accession: region number/accession in gbk file - :param start: region start position - :param end: region end position - :param product_tags: list of product tags - :param genes: list of GeneRec instances - """ - - record_id: str - accession: int | None - start: int - end: int - product_tags: list[str] - genes: list[GeneRec] = field(default_factory=list) - - def to_dict(self) -> dict[str, Any]: - """ - Convert RegionRec instance to dictionary. - - :return: dictionary representation of RegionRec - """ - d = asdict(self) - d["genes"] = [g.to_dict() for g in self.genes] - return d - - -def _iter_regions(record: SeqRecord) -> list[SeqFeature]: - """ - Iterate over region features in a Biopython SeqRecord. 
- - :param record: Biopython SeqRecord object - :return: list of region SeqFeature objects - """ - return [f for f in record.features if f.type == "region"] - - -def _iter_candidate_clusters(record: SeqRecord) -> list[SeqFeature]: - """ - Iterate over candidate cluster features in a Biopython SeqRecord. - - :param record: Biopython SeqRecord object - :return: list of candidate cluster SeqFeature objects - """ - return [f for f in record.features if f.type == "cand_cluster"] - - -def _iter_cds(record: SeqRecord) -> list[SeqFeature]: - """ - Iterate over CDS features in a Biopython SeqRecord. - - :param record: Biopython SeqRecord object - :return: list of CDS SeqFeature objects - """ - return [f for f in record.features if f.type == "CDS"] - - -def _iter_domains(record: SeqRecord) -> list[SeqFeature]: - """ - Iterate over domain features in a Biopython SeqRecord. - - :param record: Biopython SeqRecord object - :return: list of domain SeqFeature objects - .. note:: antiSMASH domain features are usually 'aSDomain' - """ - return [f for f in record.features if f.type == "aSDomain"] - - -def _in_bounds(child: SeqFeature, parent: SeqFeature) -> bool: - """ - Check if a child feature is within the bounds of a parent feature. - - :param child: child SeqFeature object - :param parent: parent SeqFeature object - :return: True if child is within parent bounds, else False - """ - _, cs, ce = _start_end(child) - _, ps, pe = _start_end(parent) - return (ps <= cs) and (ce <= pe) - - -def _domain_rec_from_feat(feat: SeqFeature) -> DomainRec: - """ - Create a DomainRec instance from a SeqFeature. 
- - :param feat: Biopython SeqFeature object - :return: DomainRec instance - """ - _, s, e = _start_end(feat) - kind = _q1(feat, ("aSDomain", "domain", "label")) - aa_seq = _q1(feat, ("translation",)) - name = _q1(feat, ("label", "product", "note")) - return DomainRec( - start=s, - end=e, - kind=kind, - aa_seq=aa_seq, - name=name, - raw_qualifiers={k: v for k, v in feat.qualifiers.items()}, - ) - - -def _gene_rec_from_feat(feat: SeqFeature) -> GeneRec: - """ - Create a GeneRec instance from a SeqFeature. - - :param feat: Biopython SeqFeature object - :return: GeneRec instance - """ - strand, s, e = _start_end(feat) - name = _gene_name(feat) - prot = _q1(feat, ("translation",)) - - # Common textual fields for search - product = _q1(feat, ("product",)) - note = _q1(feat, ("note",)) - description = _q1(feat, ("function", "inference", "standard_name", "comment")) - locus_tag = _q1(feat, ("locus_tag",)) - gene_symbol = _q1(feat, ("gene",)) - protein_id = _q1(feat, ("protein_id",)) - ec = _q1(feat, ("EC_number",)) # EC number(s) are often a list in 'EC_number' - - return GeneRec( - name=name, - strand=strand, - start=s, - end=e, - protein_seq=prot, - product=product, - note=note, - description=description, - locus_tag=locus_tag, - gene_symbol=gene_symbol, - protein_id=protein_id, - ec_number=ec, - raw_qualifiers={k: v for k, v in feat.qualifiers.items()}, - ) - - -def _collect_candidate_cluster(record: SeqRecord) -> list[CandidateClusterRec]: - """ """ - clusters = _iter_candidate_clusters(record) - cds_list = _iter_cds(record) - dom_list = _iter_domains(record) - - cluster_recs: list[CandidateClusterRec] = [] - - for cc in clusters: - _, cs, ce = _start_end(cc) - - # Try several common qualifier names for the index - acc_vals = ( - cc.qualifiers.get("candidate_cluster_number") - or cc.qualifiers.get("cand_cluster_number") - or cc.qualifiers.get("cluster_number") - or cc.qualifiers.get("cluster_idx") - or [0] - ) - accession = int(acc_vals[0]) if acc_vals else None - - 
products = cc.qualifiers.get("product", []) or [] - - # Genes inside cluster, sorted by coordinate - gene_feats = [g for g in cds_list if _in_bounds(g, cc)] - gene_feats.sort(key=lambda gf: (int(gf.location.start), int(gf.location.end))) - - genes: list[GeneRec] = [] - for gf in gene_feats: - g = _gene_rec_from_feat(gf) - - # Domains inside this gene, sorted by genomic start - gene_doms = [df for df in dom_list if _in_bounds(df, gf)] - gene_doms.sort(key=lambda df: (int(df.location.start), int(df.location.end))) - dom_recs = [_domain_rec_from_feat(dd) for dd in gene_doms] - - # Oritentation normalization - if g.strand == -1: - dom_recs = dom_recs[::-1] - - g.domains = dom_recs - genes.append(g) - - cluster_recs.append( - CandidateClusterRec( - record_id=record.id, - accession=accession, - start=cs, - end=ce, - product_tags=products, - genes=genes, - ) - ) - - return cluster_recs - - -def _collect_region(record: SeqRecord) -> list[RegionRec]: - """ - Collect region records from a Biopython SeqRecord. 
- - :param record: Biopython SeqRecord object - :return: list of RegionRec instances - """ - regions = _iter_regions(record) - cds_list = _iter_cds(record) - dom_list = _iter_domains(record) - - region_recs: list[RegionRec] = [] - - for reg in regions: - _, rs, re = _start_end(reg) - - accessions = reg.qualifiers.get("region_number", [0]) - accession = int(accessions[0]) if accessions else None - - products = reg.qualifiers.get("product", []) or [] - - # Genes inside region, sorted by coordinate - gene_feats = [g for g in cds_list if _in_bounds(g, reg)] - gene_feats.sort(key=lambda gf: (int(gf.location.start), int(gf.location.end))) - - # Make gene recs - genes: list[GeneRec] = [] - for gf in gene_feats: - g = _gene_rec_from_feat(gf) - - # Domains inside this gene, sorted by genomic start - gene_doms = [df for df in dom_list if _in_bounds(df, gf)] - gene_doms.sort(key=lambda df: (int(df.location.start), int(df.location.end))) - dom_recs = [_domain_rec_from_feat(dd) for dd in gene_doms] - - # Orientation normalization - # If gene is reverse, reverse the domain list so the readout is consistent (left to right) - if g.strand == -1: - dom_recs = dom_recs[::-1] - - g.domains = dom_recs - genes.append(g) - - region_recs.append( - RegionRec( - record_id=record.id, - accession=accession, - start=rs, - end=re, - product_tags=products, - genes=genes, - ) - ) - - return region_recs - - -def parse_region_gbk_string( - src: str, - top_level: Literal["region", "cand_cluster"] = "region", -) -> list[RegionRec]: - """ - Parse antiSMASH region GenBank string into RegionRec instances. 
- - :param src: GenBank formatted string - :param top_level: top-level feature to parse ('region' or 'cand_cluster') - :return: list of RegionRec instances - :raises AssertionError: if top_level is not 'region' or 'cand_cluster' - """ - handle = StringIO(src) - out: list[RegionRec] = [] - - for record in SeqIO.parse(handle, "genbank"): - if top_level == "region": - out.extend(_collect_region(record)) - elif top_level == "cand_cluster": - out.extend(_collect_candidate_cluster(record)) - else: - raise ValueError(f"Unknown top_level '{top_level}'; must be 'region' or 'cand_cluster'") - - return out - - -def parse_region_gbk_file(filepath: str, top_level: Literal["region", "cand_cluster"] = "region") -> list[RegionRec]: - """ - Parse antiSMASH region GenBank file into RegionRec instances. - - :param filepath: path to GenBank file - :param top_level: top-level feature to parse ('region' or 'cand_cluster') - :return: list of RegionRec instances - """ - out: list[RegionRec] = [] - - with open(filepath) as handle: - for record in SeqIO.parse(handle, "genbank"): - if top_level == "region": - out.extend(_collect_region(record)) - elif top_level == "cand_cluster": - out.extend(_collect_candidate_cluster(record)) - else: - raise ValueError(f"Unknown top_level '{top_level}'; must be 'region' or 'cand_cluster'") - - return out diff --git a/src/biocracker/config.py b/src/biocracker/config.py deleted file mode 100644 index df64052..0000000 --- a/src/biocracker/config.py +++ /dev/null @@ -1,16 +0,0 @@ -"""Module contains configuration settings for BioCracker.""" - -import logging -import os - -LOGGER_NAME = "biocracker" -LOGGER_LEVEL = os.getenv("BIOCRACKER_LOG_LEVEL", "INFO").upper() -NAME_CACHE_DIR = os.getenv("NAME_CACHE_DIR", "biocracker_cache") -PARAS_CACHE_DIR_NAME = os.getenv("PARAS_CACHE_DIR_NAME", "paras_cache") -PARAS_MODEL_DOWNLOAD_URL = "https://zenodo.org/records/17224548/files/all_substrates_model.paras.gz?download=1" - - -# Setup logger -logging.basicConfig() -logger 
= logging.getLogger(LOGGER_NAME) -logger.setLevel(LOGGER_LEVEL) diff --git a/src/biocracker/inference/__init__.py b/src/biocracker/inference/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/biocracker/inference/base.py b/src/biocracker/inference/base.py new file mode 100644 index 0000000..f4df55e --- /dev/null +++ b/src/biocracker/inference/base.py @@ -0,0 +1,98 @@ +"""Base class for domain inference models.""" + +from abc import ABC, abstractmethod +from typing import Any + +from biocracker.model.domain import Domain +from biocracker.model.gene import Gene +from biocracker.model.inference import InferenceResult, TargetType + + +class GeneInferenceModel(ABC): + """ + Base class for gene inference models. + + :param name: the name of the inference model + :param target: the target type of the inference model + """ + + name: str + target: TargetType = TargetType.GENE + + @abstractmethod + def predict(self, gene: Gene) -> list[InferenceResult]: + """ + Make predictions for a given gene. + + :param gene: The gene to make predictions for. + :return: a list of InferenceResult objects containing the predictions. + """ + ... + + def result( + self, + *, + label: str, + score: float | None = None, + metadata: dict[str, Any] | None = None + ) -> InferenceResult: + """ + Helper method to create an InferenceResult object. + + :param label: the predicted label or class + :param score: the confidence score of the prediction (optional) + :param metadata: additional metadata related to the inference (optional) + :return: an InferenceResult object + """ + return InferenceResult( + model=self.name, + target=self.target, + label=label, + score=score, + metadata=metadata + ) + + +class DomainInferenceModel(ABC): + """ + Base class for domain inference models. 
+ + :param name: the name of the inference model + :param target: the target type of the inference model + """ + + name: str + target: TargetType = TargetType.DOMAIN + + @abstractmethod + def predict(self, domain: Domain) -> list[InferenceResult]: + """ + Make predictions for a given domain. + + :param domain: The domain to make predictions for. + :return: A list of InferenceResult objects containing the predictions. + """ + ... + + def result( + self, + *, + label: str, + score: float | None = None, + metadata: dict[str, Any] | None = None + ) -> InferenceResult: + """ + Helper method to create an InferenceResult object. + + :param label: the predicted label or class + :param score: the confidence score of the prediction (optional) + :param metadata: additional metadata related to the inference (optional) + :return: an InferenceResult object + """ + return InferenceResult( + model=self.name, + target=self.target, + label=label, + score=score, + metadata=metadata + ) diff --git a/src/biocracker/inference/model_paras.py b/src/biocracker/inference/model_paras.py new file mode 100644 index 0000000..954c920 --- /dev/null +++ b/src/biocracker/inference/model_paras.py @@ -0,0 +1,808 @@ +"""Module for the PARAS domain inference model.""" + +import logging +import os +from dataclasses import dataclass +from importlib.resources import files +from pathlib import Path +from typing import Any + +import joblib +import numpy as np +from pyhmmer import easel, plan7, hmmer +from sklearn.ensemble import RandomForestClassifier + +import biocracker.data +from biocracker.inference.base import DomainInferenceModel +from biocracker.model.domain import Domain +from biocracker.model.inference import InferenceResult +from biocracker.utils.download import download_and_prepare + + +log = logging.getLogger(__name__) + + +PARAS_CACHE_DIR = os.getenv("PARAS_CACHE_DIR", "paras_cache") +PARAS_DOWNLOAD_URL = "https://zenodo.org/records/17224548/files/all_substrates_model.paras.gz?download=1" + + 
+_PARAS_MODEL_CACHE: dict[str, object] = {} + + +HMM_DB_PATH = str(files(biocracker.data).joinpath("AMP-binding_converted.hmm")) +with plan7.HMMFile(HMM_DB_PATH) as hmm_file: + HMM_DB = list(hmm_file) + + +VALID = set("ACDEFGHIKLMNPQRSTVWY-") + + +FEATURE_NAMES = [ + "WOLS870101", + "WOLS870102", + "WOLS870103", + "FAUJ880109", + "GRAR740102", + "RADA880108", + "ZIMJ680103", + "TSAJ990101", + "CHOP780201", + "CHOP780202", + "CHOP780203", + "ZIMJ680104", + "NEU1", + "NEU2", + "NEU3", +] + + +FEATURES = { + "-": [0.00, 0.00, 0.00, 1, 8.3, 0.21, 13.59, 145.2, 1.00, 1.03, 0.99, 6.03, 0.06, 0.00, 0.10], + "A": [0.07, -1.73, 0.09, 0, 8.1, -0.06, 0.00, 90.0, 1.42, 0.83, 0.66, 6.00, 0.06, -0.25, 0.25], + "C": [0.71, -0.97, 4.13, 0, 5.5, 1.36, 1.48, 103.3, 0.70, 1.19, 1.19, 5.05, -0.56, -0.40, -0.14], + "D": [3.64, 1.13, 2.36, 1, 13.0, -0.80, 49.70, 117.3, 1.01, 0.54, 1.46, 2.77, 0.97, -0.08, 0.08], + "E": [3.08, 0.39, -0.07, 1, 12.3, -0.77, 49.90, 142.2, 1.51, 0.37, 0.74, 3.22, 0.85, -0.10, -0.05], + "F": [-4.92, 1.30, 0.45, 0, 5.2, 1.27, 0.35, 191.9, 1.13, 1.38, 0.60, 5.48, -0.99, 0.18, 0.15], + "G": [2.23, -5.36, 0.30, 0, 9.0, -0.41, 0.00, 64.9, 0.57, 0.75, 1.56, 5.97, 0.32, -0.32, 0.28], + "H": [2.41, 1.74, 1.11, 1, 10.4, 0.49, 51.60, 160.0, 1.00, 0.87, 0.95, 7.59, 0.15, -0.03, -0.10], + "I": [-4.44, -1.68, -1.03, 0, 5.2, 1.31, 0.13, 163.9, 1.08, 1.60, 0.47, 6.02, -1.00, -0.03, 0.10], + "K": [2.84, 1.41, -3.14, 2, 11.3, -1.18, 49.50, 167.3, 1.16, 0.74, 1.01, 9.74, 1.00, 0.32, 0.11], + "L": [-4.19, -1.03, -0.98, 0, 4.9, 1.21, 0.13, 164.0, 1.21, 1.30, 0.59, 5.98, -0.83, 0.05, 0.01], + "M": [-2.49, -0.27, -0.41, 0, 5.7, 1.27, 1.43, 167.0, 1.45, 1.05, 0.60, 5.74, -0.68, -0.01, 0.04], + "N": [3.22, 1.45, 0.84, 2, 11.6, -0.48, 3.38, 124.7, 0.67, 0.89, 1.56, 5.41, 0.70, -0.06, 0.17], + "P": [-1.22, 0.88, 2.23, 0, 8.0, 1.1, 1.58, 122.9, 0.57, 0.55, 1.52, 6.30, 0.45, 0.23, 0.41], + "Q": [2.18, 0.53, -1.14, 2, 10.5, -0.73, 3.53, 149.4, 1.11, 1.10, 0.98, 5.65, 0.71, -0.02, 0.12], 
+ "R": [2.88, 2.52, -3.44, 4, 10.5, -0.84, 52.00, 194.0, 0.98, 0.93, 0.95, 10.76, 0.80, 0.19, -0.41], + "S": [1.96, -1.63, 0.57, 1, 9.2, -0.50, 1.67, 95.4, 0.77, 0.75, 1.43, 5.68, 0.48, -0.15, 0.23], + "T": [0.92, -2.09, -1.40, 1, 8.6, -0.27, 1.66, 121.5, 0.83, 1.19, 0.96, 5.66, 0.38, -0.10, 0.29], + "V": [-2.69, -2.53, -1.29, 0, 5.9, 1.09, 0.13, 139.0, 1.06, 1.70, 0.50, 5.96, -0.75, -0.19, 0.03], + "W": [-4.75, 3.65, 0.85, 1, 5.4, 0.88, 2.10, 228.2, 1.08, 1.37, 0.96, 5.89, -0.57, 0.31, 0.34], + "Y": [1.39, 2.32, 0.01, 1, 6.2, 0.33, 1.61, 197.0, 0.69, 1.47, 1.14, 5.66, -0.35, 0.40, -0.02], +} + +POSITIONS_ACTIVE_SITE = [ + 13, + 16, + 17, + 41, + 46, + 47, + 48, + 49, + 50, + 51, + 52, + 55, + 93, + 94, + 125, + 126, + 127, + 128, + 129, + 152, + 153, + 154, + 155, + 156, + 157, + 158, + 159, + 160, + 161, + 162, + 163, + 164, + 165, + 166, +] + + +LABEL_TO_SMILES = { + "(2S,3R)-2-amino-3-hydroxy-4-(4-nitrophenyl)butanoic acid": r"C1=CC(=CC=C1C[C@H]([C@@H](C(=O)O)N)O)[N+](=O)[O-]", + "(2S,6R)-diamino-(5R,7)-dihydroxy-heptanoic acid": r"C(C[C@@H](C(=O)O)N)[C@H]([C@@H](CO)N)O", + "(4S)-5,5,5-trichloroleucine": r"CCC(=O)CCCCC[C@@H](C(=O)O)N", + "(E)-4-methylhex-2-enoic acid": r"CCC(C)/C=C/C(=O)O", + "(S,E)-2-amino-4-decenoic acid": r"CCCCC/C=C/C[C@@H](C(=O)O)N", + "1-(1,1-dimethylallyl)-tryptophan": r"CC(C)(C=C)N1C=C(C2=CC=CC=C21)C[C@@H](C(=O)O)N", + "1-aminocyclopropane-1-carboxylic acid": r"C(O)(=O)C1(CC1)(N)", + "1-pyrroline-5-carboxylic acid": r"O=C(O)C1/N=C\CC1", + "10,14-dimethyloctadecanoic acid": r"OC(CCCCCCCCC(C)CCCC(C)CCCC)=O", + "2,3-diaminobutyric acid": r"NC(C)[C@@H](C(=O)O)N", + "2,3-diaminopropionic acid": r"C([C@@H](C(=O)O)N)N", + "2,3-dihydroxy-para-aminobenzoic acid": r"C1=CC(=C(C(=C(N)1)O)O)C(=O)O", + "2,3-dihydroxybenzoic acid": r"C1=CC(=C(C(=C1)O)O)C(=O)O", + "2,3-dihydroxyhexadecanoic acid": r"CCCCCCCCCCCCCC(C(C(=O)O)O)O", + "2,4-diaminobutyric acid": r"C(CN)[C@@H](C(=O)O)N", + "2,4-dihydroxypentanoic acid": r"CC(CC(C(=O)O)O)O", + 
"2-(1-methylcyclopropyl)-D-glycine": r"CC1(CC1)[C@H](C(=O)O)N", + "2-amino-3,5-dimethyl-4-hexenoic Acid": r"CC(C=C(C)C)C(C(=O)O)N", + "2-amino-3-hydroxycyclopent-2-enone": r"C1CC(=O)C(=C1O)N", + "2-amino-6-hydroxy-4-methyl-8-oxodecanoic acid": r"CCC(=O)CC(CC(C)CC(C(=O)O)N)O", + "2-aminoadipic acid": r"C(C[C@@H](C(=O)O)N)CC(=O)O", + "2-aminobutyric acid": r"CC[C@@H](C(=O)O)N", + "2-aminoisobutyric acid": r"O=C(O)C(N)(C)C", + "2-carboxy-6-hydroxyoctahydroindole": r"N1[C@H](C(=O)O)C[C@@H]2CC[C@@H](O)C[C@H]12", + "2-chloro-3,5-dihydroxy-4-methylphenylglycine": r"CC1=C(O)C(Cl)=C(C=C(O)1)[C@@H](C(=O)O)N", + "2-chlorobenzoic acid": r"C1=CC=C(C(=C1)C(=O)O)Cl", + "2-hydroxy-4-methylpentanoic acid": r"CC(C)CC(C(=O)O)O", + "2-hydroxypent-4-enoic acid": r"C=CCC(C(=O)O)O", + "2-ketoglutaric acid": r"C(CC(=O)O)C(=O)C(=O)O", + "2-ketoisocaproic acid": r"O=C(C(=O)O)CC(C)C", + "2-ketoisovaleric acid": r"O=C(C(=O)O)C(C)C", + "2-methylserine": r"C[C@](CO)(C(=O)O)N", + "2-sulfamoylacetic acid": r"C(C(=O)O)S(=O)(=O)N", + "2R-hydroxy-3-methylpentanoic acid": r"CCC(C)[C@H](C(=O)O)O", + "2R-hydroxyisovaleric acid": r"CC(C)[C@H](C(=O)O)O", + "2S,3S-diaminobutyric acid": r"C[C@@H]([C@@H](C(=O)O)N)N", + "2S-amino decanoic acid": r"CCCCCCCC[C@H](N)C(=O)O", + "2S-amino-4-hexenoic acid": r"C/C=C/CC(C(=O)O)N", + "2S-amino-8-oxodecanoic acid": r"CCC(=O)CCCCC[C@@H](C(=O)O)N", + "2S-amino-9,10-epoxy-8-oxodecanoic acid": r"C1C(O1)C(=O)CCCCC[C@@H](C(=O)O)N", + "2S-amino-dodecanoic acid": r"CCCCCCCCCC[C@@H](C(=O)O)N", + "2S-amino-octanoic-acid": r"CCCCCC[C@@H](C(=O)O)N", + "2S-aminodecanoic acid": r"CCCCCCCC[C@@H](C(=O)O)N", + "2S-aminododecanoic acid": r"CCCCCCCCCC[C@@H](C(=O)O)N", + "2S-aminooctanoic acid": r"CCCCCC[C@@H](C(=O)O)N", + "2S-hydroxyisocaproic acid": r"CC(C)C[C@@H](C(=O)O)O", + "2S-hydroxyisovaleric acid": r"CC(C)[C@@H](C(=O)O)O", + "2S-methyl-3-oxobutyrine": r"CC(=O)[C@](C)(N)C(=O)O", + "3,3-dihomo-4-methoxytyrosine": r"N[C@@H](CCCC1=CC=C(OC)C=C1)C(=O)O", + "3,3-dihomophenylalanine": 
r"N[C@@H](CCCC1=CC=CC=C1)C(=O)O", + "3,3-dihomotyrosine": r"N[C@@H](CCCC1=CC=C(O)C=C1)C(=O)O", + "3,4-dehydrolysine": r"C(CCN)=C[C@@H](C(=O)O)N", + "3,4-dihydroxybenzoic acid": r"C1=CC(=C(C=C1C(=O)O)O)O", + "3,5-dichloro-4-hydroxyphenylglycine": r"C1=C(Cl)C(=C(Cl)C=C1[C@@H](C(=O)O)N)O", + "3,5-dihydroxyphenylglycine": r"N[C@H](C(=O)O)c1cc(O)cc(O)c1", + "3-(2-nitrocyclopropylalanine)": r"C1[C@H]([C@@H]1[N+](=O)[O-])C[C@@H](C(=O)O)N", + "3-(3-pyridyl)-alanine": r"C1=CC(=CN=C1)C[C@@H](C(=O)O)N", + "3-amino-2,4-dihydroxybenzoic acid": r"C1=CC(=C(C(=C1C(=O)O)O)N)O", + "3-amino-4-hydroxybenzoic acid": r"C1=CC(=C(C=C1C(=O)O)N)O", + "3-amino-6-hydroxy-2-piperidone": r"C1CC(NC(=O)C1N)O", + "3-aminoisobutyric acid": r"CC(CN)C(=O)O", + "3-chlorotyrosine": r"C1=C(Cl)C(=CC=C1C[C@@H](C(=O)O)N)O", + "3-hydroxy-4-methylproline": r"CC1C(O)[C@H](NC1)C(=O)O", + "3-hydroxy-O-methyl-5-methyltyrosine": r"C1=C(O)C(=C(C)C=C1C[C@@H](C(=O)O)N)OC", + "3-hydroxy-O-methyltyrosine": r"C1=C(O)C(=CC=C1C[C@@H](C(=O)O)N)OC", + "3-hydroxy-para-aminobenzoic acid": r"C1=CC(=C(C=C1C(=O)O)O)N", + "3-hydroxyasparagine": r"N[C@H](C(O)=O)C(O)C(N)=O", + "3-hydroxyaspartic acid": r"N[C@@H](C(C(=O)O)O)(C(=O)O)", + "3-hydroxyglutamine": r"C(C([C@@H](C(=O)O)N)O)C(=O)N", + "3-hydroxykynurenine": r"C1=CC(=C(C(=C1)O)N)C(=O)C[C@@H](C(=O)O)N", + "3-hydroxyleucine": r"CC(C)C([C@@H](C(=O)O)N)O", + "3-hydroxypicolinic acid": r"C1=CC(=C(N=C1)C(=O)O)O", + "3-hydroxyquinaldic acid": r"c1ccc2c(c1)cc(c(n2)C(=O)O)O", + "3-hydroxytyrosine": r"C1=CC(=C(C=C1C[C@@H](C(=O)O)N)O)O", + "3-hydroxyvaline": r"CC(O)(C)[C@@H](C(=O)O)N", + "3-methoxyanthranilic acid": r"COC1=CC=CC(=C1N)C(=O)O", + "3-methoxyaspartic acid": r"N[C@H](C(C(=O)O)OC)(C(=O)O)", + "3-methyl-D-aspartic acid wonky": r"C[C@@H]([C@H](C(=O)O)N)C(=O)O", + "3-methylasparagine": r"CC([C@@H](C(=O)O)N)C(=O)N", + "3-methylaspartic acid": r"CC([C@@H](C(=O)O)N)C(=O)O", + "3-methylglutamic acid": r"CC(CC(=O)O)[C@@H](C(=O)O)N", + "3-methylleucine": r"CC(C)C(C)[C@@H](C(=O)O)N", 
+ "3-nitrotyrosine": r"C1=CC(=C(C=C1C[C@@H](C(=O)O)N)[N+](=O)[O-])O", + "3R-aminoisobutyric acid": r"C[C@H](CN)C(=O)O", + "3R-chloroproline": r"C1[C@@H](Cl)[C@H](NC1)C(=O)O", + "3R-hydroxy-2,4-diaminobutyric acid": r"NC[C@@H](O)[C@@H](C(=O)O)N", + "3R-hydroxyasparagine": r"N[C@H](C(O)=O)[C@@H](O)C(N)=O", + "3R-hydroxyaspartic acid": r"N[C@@H]([C@H](C(=O)O)O)(C(=O)O)", + "3R-hydroxyglutamine": r"C([C@H]([C@@H](C(=O)O)N)O)C(=O)N", + "3R-hydroxyhomotyrosine": r"C1=CC(=CC=C1C[C@H]([C@@H](C(=O)O)N)O)O", + "3R-hydroxyleucine": r"CC(C)[C@H]([C@@H](C(=O)O)N)O", + "3R-methyl-D-aspartic acid wonky": r"N[C@H]([C@@H](O)C(O)=O)C(O)=O", + "3R-methylbeta-alanine": r"NC[C@@H](C)C(=O)O", + "3R-methylglutamic acid": r"C[C@H](CC(=O)O)[C@@H](C(=O)O)N", + "3S,4R-dichloroproline": r"Cl[C@H]1[C@@H](Cl)[C@H](NC1)C(=O)O", + "3S,4S-dihydroxyhomotyrosine": r"C1=CC(=CC=C1[C@H](O)[C@H]([C@@H](C(=O)O)N)O)O", + "3S-aminobutyric acid": r"C[C@@H](CC(=O)O)N", + "3S-carboxypiperazine": r"C1NN[C@H](C(=O)O)CC1", + "3S-cyclohex-2-enylalanine": r"C1C=C[C@H](CC1)C[C@@H](C(=O)O)N", + "3S-hydroxy-4R-methyloctanoic acid": r"CCCC[C@H]([C@H](CC(O)=O)O)C", + "3S-hydroxy-4S-methylproline": r"C[C@@H]1[C@H](O)[C@H](NC1)C(=O)O", + "3S-hydroxy-6-chlorohistidine": r"C1=C(NC(Cl)=N1)[C@H]([C@@H](C(=O)O)N)O", + "3S-hydroxyasparagine": r"N[C@H](C(O)=O)[C@H](O)C(N)=O", + "3S-hydroxyleucine": r"CC(C)[C@@H]([C@@H](C(=O)O)N)O", + "3S-hydroxypipecolic acid": r"C1C[C@@H]([C@H](NC1)C(=O)O)O", + "3S-hydroxyproline": r"O[C@@H]1[C@H](NCC1)C(=O)O", + "3S-methyl-D-aspartic acid branched": r"C[C@@H]([C@H](C(=O)O)N)C(=O)O", + "3S-methyl-D-aspartic acid wonky": r"N[C@H]([C@H](O)C(O)=O)C(O)=O", + "3S-methylaspartic acid": r"C[C@@H]([C@@H](C(=O)O)N)C(=O)O", + "3S-methylaspartic acid branched": r"C[C@@H]([C@@H](C(=O)O)N)C(=O)O", + "3S-methylleucine": r"CC(C)[C@H](C)[C@@H](C(=O)O)N", + "3S-methylproline": r"C[C@@H]1[C@H](NCC1)C(=O)O", + "4,5-dehydroarginine": r"O=C(O)[C@@H](N)C/C=C/NC(N)=N", + "4,5-dihydroxyornithine": 
r"C([C@@H](C(=O)O)N)C(C(N)O)O", + "4-acetamidopyrrole-2-carboxylic acid": r"CC(=O)NC1=CNC(=C1)C(=O)O", + "4-amino-2-hydroxy-3-isopropoxybenzoic acid": r"CC(C)OC1=C(C=CC(=C1O)C(=O)O)N", + "4-aminobutyric acid": r"NCCCC(=O)O", + "4-aminophenylalanine": r"C1=CC(=CC=C1C[C@@H](C(=O)O)N)N", + "4-chlorobenzoic acid": r"C1=CC(=CC=C1C(=O)O)Cl", + "4-hydroxy-3-nitrobenzoic acid": r"C1=CC(=C(C=C1C(=O)O)[N+](=O)[O-])O", + "4-hydroxy-D-kynurenine": r"C1=C(O)C=C(C(=C1)C(=O)C[C@H](C(=O)O)N)N", + "4-hydroxybenzoic acid": r"C1=CC(=CC=C1C(=O)O)O", + "4-hydroxyglutamine": r"C(C(O)C(=O)N)[C@@H](C(=O)O)N", + "4-hydroxyindole-3-carboxylic acid": r"c1cc2c(c(c1)O)c(c[nH]2)C(=O)O", + "4-hydroxyphenylglycine": r"C1=CC(=CC=C1[C@@H](C(=O)O)N)O", + "4-hydroxyphenylpyruvic acid": r"C1=CC(=CC=C1CC(=O)C(=O)O)O", + "4-hydroxyproline": r"C1[C@H](NCC1O)C(=O)O", + "4-hydroxythreonine": r"C([C@H]([C@@H](C(=O)O)N)O)O", + "4-hydroxyvaline": r"CC(CO)[C@@H](C(=O)O)N", + "4-methoxytryptophan": r"C1=CC=C2C(=C1OC)C(=CN2)C[C@@H](C(=O)O)N", + "4-methylproline": r"CC1C[C@H](NC1)C(=O)O", + "4-nitrotryptophan": r"C1=CC=C2C(=C1[N+](=O)[O-])C(=CN2)C[C@@H](C(=O)O)N", + "4-oxoproline": r"C1[C@H](NCC1=O)C(=O)O", + "4R-E-butenyl-4R-methylthreonine": r"C/C=C/C[C@@H](C)[C@H]([C@@H](C(=O)O)N)O", + "4R-hydroxyproline": r"C1[C@H](NC[C@@H]1O)C(=O)O", + "4R-methylproline": r"C[C@@H]1C[C@H](NC1)C(=O)O", + "4R-propylproline": r"CCC[C@@H]1C[C@H](NC1)C(=O)O", + "4S,5-dihydroxy-2S-aminopentanoic acid": r"O[C@@H](C[C@@H](C(=O)O)N)CO", + "4S-acetyl-5S-methylproline": r"CC(=O)O[C@H]1C[C@H](N[C@H](C)1)C(=O)O", + "4S-hydroxylysine": r"NCC[C@H](O)C[C@@H](C(=O)O)N", + "4S-methylazetidine-2S-carboxylic acid": r"C[C@H]1C[C@H](N1)C(=O)O", + "4S-methylproline": r"C[C@H]1C[C@H](NC1)C(=O)O", + "4S-propenylproline": r"C/C=C\[C@H]1C[C@H](NC1)C(=O)O", + "5,5-dimethylpipecolic acid": r"C1C(C)(C)CN[C@@H](C1)C(=O)O", + "5-aminolevulinic acid": r"C(CC(=O)O)C(=O)CN", + "5-chloroanthranilic acid": r"C1=CC(=C(C=C1Cl)C(=O)O)N", + "5-chlorotryptophan": 
r"C1=CC2=C(C=C1Cl)C(=CN2)C[C@@H](C(=O)O)N", + "5-methoxytyrosine": r"C1=C(OC)C(=CC=C1C[C@@H](C(=O)O)N)O", + "5-methylorsellinic acid": r"C=1(C=C(C(=C(C1C)C)C(=O)O)O)O", + "5S-methylproline": r"C1C[C@H](N[C@@H](C)1)C(=O)O", + "6,7-dichlorotryptophan": r"C1=C(Cl)C(Cl)=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N", + "6-chloro-4-hydroxy-1-methyl-indole-3-carboxylic acid": r"C(O)1=C(Cl)C=C2C(=C1)C(=CN(C)2)C(=O)O", + "6-chloro-4-hydroxyindole-3-carboxylic acid": r"c(Cl)1cc2c(c(c1)O)c(c[nH]2)C(=O)O", + "6-chlorotryptophan": r"C1=C(Cl)C=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N", + "6-hydroxy-tetrahydro-isoquinoline-3-carboxylic acid": r"C1C(NCC2=C1C=C(C=C2)O)C(=O)O", + "6-methylsalicylic acid": r"CC1=C(C(=CC=C1)O)C(=O)O", + "6S-methyl-pipecolic acid": r"C1C[C@H](C)N[C@@H](C1)C(=O)O", + "Acetyl-Coa": r"CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O", + "An acid hydrazine polyene (intermediate 14)": r"OC(=O)CCC(=O)NNCC(=O)O", + "Compound 4 (formed by the decarboxylative condensation of L-Phe and succinyl-CoA)": r"C1=CC=C(C=C1)C[C@@H](C(=O)CCC(=O)O)N", + "D-alanine": r"C[C@H](C(=O)O)N", + "D-aspartic acid branched": r"C([C@H](C(=O)O)N)C(=O)O", + "D-glutamic acid branched": r"C(CC(=O)O)[C@H](C(=O)O)N", + "D-isovaline": r"CC[C@](C)(C(=O)O)N", + "D-leucine": r"CC(C)C[C@H](C(=O)O)N", + "D-lysergic acid": r"CN1C[C@@H](C=C2C1CC3=CNC4=CC=CC2=C34)C(=O)O", + "D-phenylalanine": r"C1=CC=C(C=C1)C[C@H](C(=O)O)N", + "D-phenyllactic acid": r"C1=CC=C(C=C1)C[C@H](C(=O)O)O", + "D-pipecolic acid": r"C1CCN[C@H](C1)C(=O)O", + "D-serine": r"C([C@H](C(=O)O)N)O", + "Malonyl-CoA": r"CC(C)(COP(=O)(O)OP(=O)(O)OC[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)[C@H](C(=O)NCCC(=O)NCCSC(=O)CC(=O)O)O", + "N-(1-methyl)-tryptophan": r"C1=CC=C2C(=C1)C(=CN(C)2)C[C@@H](C(=O)O)N", + "N-(1-propargyl)-tryptophan": r"C1=CC=C2C(=C1)C(=CN(CC#C)2)C[C@@H](C(=O)O)N", + "N-formylglycine": r"C(C(=O)O)NC=O", + "N-hydroxyvaline": r"CC(C)[C@@H](C(=O)O)NO", + 
"N-methylphenylalanine": r"CN[C@@H](CC1=CC=CC=C1)C(=O)O", + "N-methyltyrosine": r"C1=CC(=CC=C1C[C@@H](C(=O)O)NC)O", + "N1-methoxytryptophan": r"C1=CC=C2C(=C1)C(=CN(OC)2)C[C@@H](C(=O)O)N", + "N5-acetyl-N5-hydroxyornithine": r"CC(=O)N(CCC[C@@H](C(=O)O)N)O", + "N5-acetyl-hydroxyornithine": r"CC(=O)N(CCC[C@@H](C(=O)O)N)O", + "N5-cis-anhydromevalonyl-N5-hydroxyornithine": r"C(N(C(=O)/C=C(/CCO)\C)O)CC[C@@H](C(O)=O)N", + "N5-formyl-N5-hydroxyornithine": r"C(C[C@@H](C(=O)O)N)CN(C=O)O", + "N5-hydroxyornithine": r"C(C[C@@H](C(=O)O)N)CNO", + "N5-nitroso-N5-hydroxyornithine": r"O=NN(CCC[C@@H](C(=O)O)N)O", + "N5-trans-anhydromevalonyl-N5-hydroxyornithine": r"C(C[C@@H](C(=O)O)N)CN(O)C(=O)/C=C(C)/CCO", + "N6-hydroxylysine": r"C(CCNO)C[C@@H](C(=O)O)N", + "O-dimethylallyl-L-tyrosine": r"CC(=CCOC1=CC=C(C=C1)C[C@@H](C(=O)O)N)C", + "O-methylthreonine": r"C[C@H]([C@@H](C(=O)O)N)OC", + "O-methyltyrosine": r"COC1=CC=C(C=C1)C[C@@H](C(=O)O)N", + "R-3-hydroxy-3-methylproline": r"O[C@](C)1[C@H](NCC1)C(=O)O", + "R-aza-beta-tyrosine": r"C1=CC(=NC=C1O)[C@@H](CC(=O)O)N", + "R-beta-hydroxyphenylalanine": r"O[C@H](C1=CC=CC=C1)[C@@H](C(=O)O)N", + "R-beta-hydroxytyrosine": r"C1=CC(=CC=C1[C@H]([C@@H](C(=O)O)N)O)O", + "R-beta-methylphenylalanine": r"C[C@H](C1=CC=CC=C1)[C@@H](C(=O)O)N", + "R-beta-methyltryptophan": r"C[C@H](C1=CNC2=CC=CC=C21)[C@@H](C(=O)O)N", + "R-beta-phenylalanine": r"C1=CC=C(C=C1)[C@@H](CC(=O)O)N", + "R-beta-tyrosine": r"C1=CC(=CC=C1[C@@H](CC(=O)O)N)O", + "S-adenosylmethionine": r"C[S+](CC[C@@H](C(=O)[O-])N)C[C@@H]1[C@H]([C@H]([C@@H](O1)N2C=NC3=C(N=CN=C32)N)O)O", + "S-beta-hydroxycyclohex-2S-enylalanine": r"C1C=C[C@H](CC1)[C@H](O)[C@@H](C(=O)O)N", + "S-beta-hydroxyenduracididine": r"C1[C@H](NC(=N1)N)[C@H](O)[C@@H](C(=O)O)N", + "S-beta-hydroxyphenylalanine": r"O[C@@H](C1=CC=CC=C1)[C@@H](C(=O)O)N", + "S-beta-methylphenylalanine": r"C[C@@H](C1=CC=CC=C1)[C@@H](C(=O)O)N", + "S-beta-tyrosine": r"C1=CC(=CC=C1[C@H](CC(=O)O)N)O", + "acetic acid": r"CC(O)=O", + "alanine": r"C[C@@H](C(=O)O)N", 
+ "alaninol": r"C[C@@H](CO)N", + "allo-isoleucine": r"CC[C@@H](C)[C@@H](C(=O)O)N", + "allo-threonine": r"C[C@@H]([C@@H](C(=O)O)N)O", + "anthanillic acid": r"C1=CC=C(C(=C1)C(=O)O)N", + "anthranilic acid": r"C1=CC=C(C(=C1)C(=O)O)N", + "arginine": r"C(C[C@@H](C(=O)O)N)CN=C(N)N", + "argininol": r"N[C@H](CO)CCCN=C(N)N", + "asparagine": r"C([C@@H](C(=O)O)N)C(=O)N", + "aspartic acid": r"C([C@@H](C(=O)O)N)C(=O)O", + "aspartic acid branched": r"C([C@@H](C(=O)O)N)C(=O)O", + "azetidine-2-carboxylic acid": r"O=C(O)[C@H]1NCC1", + "benzoic acid": r"C1=CC=C(C=C1)C(=O)O", + "benzoxazolinate": r"c1ccc2c(c1)nc(o2)C(=O)O", + "beta-alanine": r"NCCC(=O)O", + "beta-hydroxy-3-hydroxy-O-methyl-5-methyltyrosine": r"C1=C(C)C(=C(O)C=C1C(O)[C@@H](C(=O)O)N)OC", + "beta-hydroxy-gamma-methyl-hexadecanoic acid": r"CCCCCCCCCCCCC(C)C(O)CC(=O)O", + "beta-hydroxyarginine": r"C(C(O)[C@@H](C(=O)O)N)CN=C(N)N", + "beta-hydroxyphenylalanine": r"OC(C1=CC=CC=C1)[C@@H](C(=O)O)N", + "beta-hydroxytyrosine": r"C1=CC(=CC=C1C([C@@H](C(=O)O)N)O)O", + "beta-lysine": r"C(C[C@@H](CC(=O)O)N)CN", + "beta-methylphenylalanine": r"CC(C1=CC=CC=C1)C(C(=O)O)N", + "beta-tyrosine": r"C1=CC(=CC=C1C(CC(=O)O)N)O", + "betaine": r"C[N+](C)(C)CC(=O)O", + "butyric acid": r"CCCC(=O)O", + "caffeic acid": r"OC(=O)\C=C\c1ccc(O)c(O)c1", + "capreomycidine": r"C1CN=C(N[C@H]1[C@@H](C(=O)O)N)N", + "cinnamic acid": r"C1=CC=C(C=C1)/C=C/C(=O)O", + "citrulline": r"C(C[C@@H](C(=O)O)N)CNC(=O)N", + "colletorin D acid": r"CC1=CC(=C(C(=C1C(=O)O)O)CC=C(C)C)O", + "coumaric acid": r"C1=CC(=CC=C1/C=C/C(=O)O)O", + "cysteic acid": r"C([C@@H](C(=O)O)N)S(=O)(=O)O", + "cysteine": r"C([C@@H](C(=O)O)N)S", + "cysteine branched": r"C([C@@H](C(=O)O)N)S", + "dehydroarginine": r"C(CN=C(N)N)/C=C(/C(=O)O)\N", + "dehydrophenylalanine": r"N/C(=C\C1=CC=CC=C1)/C(=O)O", + "dehydrotryptophan": r"C1=CC=C2C(=C1)C(=CN2)/C=C(/C(=O)O)\N", + "dehydrovaline": r"CC(=C(C(=O)O)N)C", + "dihydrolysergic acid": r"CN1CC(CC2C1CC3=CNC4=CC=CC2=C34)C(=O)O", + "dimethylsulfoniopropionic acid": 
r"C[S+](C)CCC(=O)O", + "enduracididine": r"C1[C@H](NC(=N1)N)C[C@@H](C(=O)O)N", + "fatty acid": r"O=C(O)C*", + "fumaric acid": r"C(=C/C(=O)O)\C(=O)O", + "glutamic acid": r"C(CC(=O)O)[C@@H](C(=O)O)N", + "glutamine": r"C(CC(=O)N)[C@@H](C(=O)O)N", + "glycine": r"NCC(=O)O", + "glycolic acid": r"C(C(=O)O)O", + "graminine": r"O=NN(O)CCC[C@H](N)(C(=O)O", + "grifolic acid": r"CC(C)=CCC/C(C)=C/CC/C(C)=C/CC1=C(O)C=C(C)C(C(=O)O)=C(O)1", + "guanidinoacetic acid": r"C(C(=O)O)N=C(N)N", + "histidine": r"C1=C(NC=N1)C[C@@H](C(=O)O)N", + "homophenylalanine": r"C1=CC=C(C=C1)CC[C@@H](C(=O)O)N", + "homoserine": r"C(CO)[C@@H](C(=O)O)N", + "homotyrosine": r"C1=CC(=CC=C1CC[C@@H](C(=O)O)N)O", + "hydroxyproline": r"C(*)1C[C@H](NC(*)1)C(=O)O", + "indole pyruvic acid": r"C1=CC=C2C(=C1)C(=CN2)CC(=O)C(=O)O", + "isoleucine": r"CC[C@H](C)[C@@H](C(=O)O)N", + "isovaline": r"CC[C@@](C)(C(=O)O)N", + "kynurenine": r"C1=CC=C(C(=C1)C(=O)C[C@@H](C(=O)O)N)N", + "lactic acid": r"CC(C(=O)O)O", + "leucine": r"CC(C)C[C@@H](C(=O)O)N", + "leucinol": r"CC(C)C[C@@H](CO)N", + "linoleic acid": r"CCCCC/C=C\C/C=C\CCCCCCCC(=O)O", + "lysine": r"C(CCN)C[C@@H](C(=O)O)N", + "malic acid": r"C(C(C(=O)O)O)C(=O)O", + "malonamate": r"NC(=O)CC(=O)O", + "meta-tyrosine": r"C1=CC(=CC(=C1)O)C[C@@H](C(=O)O)N", + "methionine": r"CSCC[C@@H](C(=O)O)N", + "methylglutaconyl hydroxyornithine": r"C/C(=C\C(=O)N(CCCC(C(=O)O)N)O)/CC(=O)O", + "nicotinic acid": r"C1=CC(=CN=C1)C(=O)O", + "norcoronamic acid": r"C[C@H]1C[C@]1(C(=O)O)N", + "ochratoxin beta": r"CC1CC2=C(C(=C(C=C2)C(=O)O)O)C(=O)O1", + "ornithine": r"C(C[C@@H](C(=O)O)N)CN", + "p-hydroxybenzoylformic acid": r"C1=CC(=CC=C1C(=O)C(=O)O)O", + "p-hydroxymandelate": r"C1=CC(=CC=C1C(C(=O)O)O)O", + "para-aminobenzoic acid": r"O=C(O)c1ccc(N)cc1", + "pentanoic acid": r"CCCCC(=O)O", + "phenazine-1,6-dicarboxylic acid": r"C1=CC(=C2C(=C1)N=C3C(=N2)C=CC=C3C(=O)O)C(=O)O", + "phenylalanine": r"C1=CC=C(C=C1)C[C@@H](C(=O)O)N", + "phenylalaninol": r"C1=CC=C(C=C1)C[C@@H](CO)N", + "phenylglycine": 
r"C1=CC=C(C=C1)[C@@H](C(=O)O)N", + "phenyllactic acid": r"C1=CC=C(C=C1)C[C@@H](C(=O)O)O", + "phenylpyruvic acid": r"C1=CC=C(C=C1)CC(=O)C(=O)O", + "pipecolic acid": r"C1CCN[C@@H](C1)C(=O)O", + "piperazic acid": r"C1C[C@H](NNC1)C(=O)O", + "piperonylic acid": r"OC(=O)c1ccc2OCOc2c1", + "proline": r"C1C[C@H](NC1)C(=O)O", + "propionic acid": r"CCC(=O)O", + "pyrrole-2-carboxylic acid": r"C1=CNC(=C1)C(=O)O", + "pyruvic acid": r"CC(=O)C(=O)O", + "quinoxaline-2-carboxylic acid": r"C1=CC=C2C(=C1)N=CC(=N2)C(=O)O", + "salicylic acid": r"C1=CC=C(C(=C1)C(=O)O)O", + "serine": r"C([C@@H](C(=O)O)N)O", + "succinic semialdehyde": r"C(CC(=O)O)C=O", + "succinyl-hydrazinoacetic acid": r"N/N=C/C=C/C=C/C=C/C=C/C=C/C(=O)O", + "tetradecanoic acid": r"CCCCCCCCCCCCCC(=O)O", + "threonine": r"C[C@H]([C@@H](C(=O)O)N)O", + "trans-2-crotylglycine": r"C/C=C/C[C@@H](C(=O)O)N", + "trans-2-hexenoic acid": r"CCC\C=C\C(O)=O", + "tricarballylic acid": r"C(C(CC(=O)O)C(=O)O)C(=O)O", + "tryptophan": r"C1=CC=C2C(=C1)C(=CN2)C[C@@H](C(=O)O)N", + "tyrosine": r"C1=CC(=CC=C1C[C@@H](C(=O)O)N)O", + "ustethylinic acid": r"c1(C)c(O)c(C(=O)O)c(CC)cc(O)1", + "valine": r"CC(C)[C@@H](C(=O)O)N", + "valine isocyanide": r"CC(C)[C@H]([N+]#[C-])C(O)=O", + "valinol": r"CC(C)[C@@H](CO)N", +} + + +@dataclass +class ADomain: + """ + Dataclass representing an A domain. + + :param protein: name of the protein containing the A domain + :param start: start position of the A domain + :param end: end position of the A domain + :param domain_nr: domain number of A domain in NRPS (optional) + :param sequence: amino acid sequence of the A domain (optional) + :param extended_signature: extended signature of the A domain (optional) + """ + + protein: str + start: int + end: int + domain_nr: int | None = None + sequence: str | None = None + extended_signature: str | None = None + + +def _b2s(x: Any) -> str: + """ + Convert input to string. 
+ + :param x: input object + :return: string representation + """ + if isinstance(x, (bytes, bytearray)): + return x.decode() + + if hasattr(x, "sequence"): + s = x.sequence + return s.decode() if isinstance(s, (bytes, bytearray)) else str(s) + + return str(x) + + +def extract_domain_hits( + seq_id: str, + sequence: str, + evalue_cutoff: float = 1e-5, +) -> list[dict[str, Any]]: + """ + Extract domain hits from a given protein sequence using HMMER. + + :param seq_id: identifier for the protein sequence + :param sequence: amino acid sequence of the protein + :param evalue_cutoff: e-value cutoff for HMMER hits + :return: list of dictionaries representing domain hits + """ + alphabet = easel.Alphabet.amino() + text_seq = easel.TextSequence(name=seq_id.encode(), sequence=sequence) + seq = text_seq.digitize(alphabet) + + hits_iter = hmmer.hmmscan([seq], HMM_DB, cpus=1, E=evalue_cutoff) + + query_hits = next(hits_iter) # expect only one sequence + + out = [] + for hit in query_hits: + model_name = _b2s(hit.name) + + for dom in hit.domains: + q_from = int(dom.env_from) + q_to = int(dom.env_to) + + aln = dom.alignment + hmm_aln = _b2s(aln.hmm_sequence) + query_aln = _b2s(aln.target_sequence) + + out.append( + dict( + seq_id=seq_id, + model=model_name, + q_from=q_from, + q_to=q_to, + evalue=float(dom.i_evalue), + score=float(dom.score), + hmm_aln=hmm_aln, + query_aln=query_aln, + domain_obj=dom, + ) + ) + + out.sort(key=lambda d: (d["q_from"], d["q_to"], d["model"])) + + return out + + +def pair_domains( + domain_hits: list[dict[str, Any]], + max_gap: int = 200, +) -> list[tuple[ADomain, str, str]]: + """ + Pair AMP-binding and AMP-binding_C domain hits. 
+ + :param domain_hits: list of domain hit dictionaries + :param max_gap: maximum allowed gap between paired domains + :return: list of tuples containing ADomain objects and their alignments + """ + hits = sorted(domain_hits, key=lambda d: d["q_from"]) + + a_domains: list[ADomain] = [] + for h1 in hits: + if h1["model"] != "AMP-binding": + continue + + n_from, n_to = h1["q_from"], h1["q_to"] + + matched = None + for h2 in hits: + if h2["model"] != "AMP-binding_C": + continue + + c_from = h2["q_from"] + + if c_from > n_to and (c_from - n_to) <= max_gap: + matched = h2 + break + + start0 = n_from - 1 + end0 = matched["q_to"] if matched is not None else n_to + a_domains.append((ADomain( + protein=h1["seq_id"], + start=start0, + end=end0), + h1["hmm_aln"], + h1["query_aln"] + )) + + a_domains.sort(key=lambda t: t[0].start) + for i, (d, _, _) in enumerate(a_domains, start=1): + d.domain_nr = i + + return a_domains + + +def extract_signature_from_alignment(hmm_aln: str, query_aln: str) -> str | None: + """ + Extract the extended signature from the given HMM and query alignments. 
+ + :param hmm_aln: HMM alignment string + :param query_aln: query alignment string + :return: extended signature string or None if invalid + """ + wanted = set(POSITIONS_ACTIVE_SITE) + picked: dict[int, str] = {} + + hmm_pos = 0 # 1-based counter, increment when HMM char is not a gap + + for h, q in zip(hmm_aln, query_aln): + if h != "-": + hmm_pos += 1 + if hmm_pos in wanted and hmm_pos not in picked: + picked[hmm_pos] = q + + # Quick fix + missing = wanted - set(picked.keys()) + for m in missing: + picked[m] = "-" + + out = [] + for p in POSITIONS_ACTIVE_SITE: + if p not in picked: + return None + out.append(picked[p]) + + sig = "".join(out).upper() + if not sig or not all(c in VALID for c in sig): + return None + + return sig + + +def fill_domain_sequences( + domains: list[ADomain], + protein_seq: str, + min_len: int = 100, +) -> list[ADomain]: + """ + Fill in the sequences for the given domains from the protein sequence. + + :param domains: list of ADomain objects + :param protein_seq: amino acid sequence of the protein + :param min_len: minimum length of domain sequence to keep + :return: list of ADomain objects with sequences filled in + """ + out = [] + + for d in domains: + seq = protein_seq[d.start:d.end] + if len(seq) >= min_len: + d.sequence = seq + out.append(d) + + return out + + +def find_a_domains( + seq_id: str, + protein_seq: str, + evalue_cutoff: float = 1e-5, +) -> list[ADomain]: + """ + Find A domains in a given protein sequence using HMMER. 
+ + :param seq_id: identifier for the protein sequence + :param protein_seq: amino acid sequence of the protein + :param evalue_cutoff: e-value cutoff for HMMER hits + :return: list of ADomain objects representing found A domains + """ + hits = extract_domain_hits(seq_id, protein_seq, evalue_cutoff) + + hits = [h for h in hits if h["model"] in {"AMP-binding", "AMP-binding_C"}] + + paired = pair_domains(hits, max_gap=200) + + domains_only: list[ADomain] = [] + for d, hmm_aln, query_aln in paired: + d.extended_signature = extract_signature_from_alignment(hmm_aln, query_aln) + domains_only.append(d) + + domains_only = fill_domain_sequences(domains_only, protein_seq, min_len=100) + + domains_only = [d for d in domains_only if d.extended_signature is not None] + + domains_only.sort(key=lambda d: (d.protein, d.start)) + + return domains_only + + +def featurize_signature(sig: str) -> np.ndarray: + """ + Featurize the given extended signature into a numerical feature array. + + :param sig: extended signature string + :return: numpy array of features + """ + assert len(sig) == len(POSITIONS_ACTIVE_SITE), "signature length mismatch" + + features: np.ndarray = np.zeros((len(POSITIONS_ACTIVE_SITE), len(FEATURE_NAMES)), dtype=np.float32) + for i, aa in enumerate(sig): + aa_feats = FEATURES.get(aa) + if aa_feats is None: + raise ValueError(f"invalid amino acid '{aa}' in signature") + features[i, :] = np.array(aa_feats, dtype=np.float32) + + return features.flatten() # shape (n_positions * n_features,) + + +def load_paras_model(cache_dir: Path) -> RandomForestClassifier: + """ + Load the PARAS model from disk (cached in memory for reuse). 
+ + :param cache_dir: Path to the cache directory + :return: loaded PARAS model + """ + global _PARAS_MODEL_CACHE + + # If model already loaded, return it immediately + if PARAS_DOWNLOAD_URL in _PARAS_MODEL_CACHE: + return _PARAS_MODEL_CACHE[PARAS_DOWNLOAD_URL] + + # Otherwise, ensure the file is downloaded and load it + model_path = download_and_prepare(PARAS_DOWNLOAD_URL, cache_dir) + model = joblib.load(model_path) + _PARAS_MODEL_CACHE[PARAS_DOWNLOAD_URL] = model + return model + + +class ParasModel(DomainInferenceModel): + """ + Model for predicting A domain substrate specificity using PARAS. + + :param domain: the domain to make predictions for + :param threshold: probability threshold for predictions + :return: a list of InferenceResult objects containing the predictions + """ + + name: str = "paras" + + def __init__(self, cache_dir: Path | str | None = None, threshold: float = 0.1, keep_top: int = 3) -> None: + """ + Initialize the ParasModel. + + :param cache_dir: directory to cache the model + :param threshold: probability threshold for predictions + """ + super().__init__() + + # Set cache directory + if cache_dir is None: + cache_dir = PARAS_CACHE_DIR + self.cache_dir = Path(cache_dir) + + # Set other parameters + self.threshold = threshold + self.keep_top = keep_top + + def __post_init__(self) -> None: + """ + Post-initialization to set up the model. + + :raises ValueError: if threshold is not between 0 and 1 + """ + super().__post_init__() + + # Make sure threshold is between 0 and 1 + if not (0.0 <= self.threshold <= 1.0): + raise ValueError("threshold must be between 0 and 1") + + # Make sure keep_top is an int and at least 1 + if not (isinstance(self.keep_top, int) and self.keep_top >= 1): + raise ValueError("keep_top must be an integer >= 1") + + + def predict(self, domain: Domain) -> list[InferenceResult]: + """ + Predict the substrate specificity for the given domain. 
+ + :param domain: the domain to make predictions for + :return: a list of InferenceResult objects containing the predictions + """ + if domain.type == "AMP-binding": + # Prepare model + cache_dir = Path(self.cache_dir) + model = load_paras_model(cache_dir) + + # Find A domains in the sequence + a_domains = find_a_domains(seq_id=domain.id, protein_seq=domain.sequence) + + # Make predictions + unknown_prediction = self.result(label="unknown", score=0.0, metadata={}) + + match a_domains: + case []: + log.warning(f"no A domains found in sequence {domain.id}; unable to predict substrate") + return [unknown_prediction] + case [a_domain]: + sig = a_domain.extended_signature + features = featurize_signature(sig).reshape(1, -1) + pred = model.predict_proba(features) + + # Identify top predictions + lbls = model.classes_ + top_indices = np.argsort(pred, axis=1)[0][-self.keep_top:][::-1] + top_lbls = [lbls[i] for i in top_indices] + top_prbs = [pred[0, i] for i in top_indices] + + results = [] + for top_lbl, top_prb in zip(top_lbls, top_prbs): + if top_prb >= self.threshold: + smi = LABEL_TO_SMILES.get(top_lbl) + metadata = {} + if smi is None: + log.warning(f"no SMILES found for predicted label '{top_lbl}'; returning label only") + else: + metadata["smiles"] = smi + results.append(self.result(label=top_lbl, score=round(float(top_prb), 4), metadata=metadata)) + else: + log.debug(f"prediction '{top_lbl}' for sequence {domain.id} below threshold ({top_prb:.4f} < {self.threshold}); skipping") + + if not results: + return [unknown_prediction] + + return results + case _: + log.error(f"found multiple ({len(a_domains)}) A domains in sequence {domain.id}; unable to predict substrate") + return [unknown_prediction] + + else: + # Not the domain type of interest + return [] diff --git a/src/biocracker/inference/registry.py b/src/biocracker/inference/registry.py new file mode 100644 index 0000000..536a052 --- /dev/null +++ b/src/biocracker/inference/registry.py @@ -0,0 +1,61 @@ 
+"""Registry for inference models.""" + +from biocracker.inference.base import GeneInferenceModel, DomainInferenceModel + +GENE_MODELS: list[GeneInferenceModel] = [] +DOMAIN_MODELS: list[DomainInferenceModel] = [] + + +def register_gene_model(model: GeneInferenceModel) -> None: + """ + Register a gene inference model. + + :param model: the gene inference model to register + :raises TypeError: if the model is not an instance of GeneInferenceModel + """ + # Make sure model is an instance of GeneInferenceModel + if not isinstance(model, GeneInferenceModel): + raise TypeError("model must be an instance of GeneInferenceModel") + + # Check that the model is not already registered; if it is, do not register it again + registered = [m for m in GENE_MODELS if m.name == model.name] + if registered: + return + + GENE_MODELS.append(model) + + +def register_domain_model(model: DomainInferenceModel) -> None: + """ + Register a domain inference model. + + :param model: the domain inference model to register + """ + # Make sure model is an instance of DomainInferenceModel + if not isinstance(model, DomainInferenceModel): + raise TypeError("model must be an instance of DomainInferenceModel") + + # Check that the model is not already registered; if it is, do not register it again + registered = [m for m in DOMAIN_MODELS if m.name == model.name] + if registered: + return + + DOMAIN_MODELS.append(model) + + +def get_gene_models() -> list[GeneInferenceModel]: + """ + Get the list of registered gene inference models. + + :return: list of registered gene inference models + """ + return GENE_MODELS + + +def get_domain_models() -> list[DomainInferenceModel]: + """ + Get the list of registered domain inference models. 
+ + :return: list of registered domain inference models + """ + return DOMAIN_MODELS diff --git a/src/biocracker/io/__init__.py b/src/biocracker/io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/biocracker/io/gbk_antismash.py b/src/biocracker/io/gbk_antismash.py new file mode 100644 index 0000000..f1984cc --- /dev/null +++ b/src/biocracker/io/gbk_antismash.py @@ -0,0 +1,237 @@ +"""Module for parsing GenBank files, including generic and antiSMASH-specific formats.""" + +import logging +from pathlib import Path +from typing import Any, Iterable + +from Bio import SeqIO +from Bio.SeqFeature import FeatureLocation, SeqFeature +from Bio.SeqRecord import SeqRecord + +from biocracker.io.options import AntiSmashOptions, AntismashReadoutLevel +from biocracker.model.region import Region +from biocracker.model.gene import Gene, Strand +from biocracker.model.domain import Domain + + +log = logging.getLogger(__name__) + + +def _iter_regions(record: SeqRecord, readout_level: AntismashReadoutLevel) -> list[SeqFeature]: + """ + Iterate over region features in a Biopython SeqRecord. + + :param record: Biopython SeqRecord object + :return: list of region SeqFeature objects + """ + return [f for f in record.features if f.type == readout_level] + + +def _iter_cds(record: SeqRecord, gene_identifiers: list[str] | None = None) -> list[SeqFeature]: + """ + Iterate over CDS features in a Biopython SeqRecord. + + :param record: Biopython SeqRecord object + :param gene_identifiers: list of gene feature types to look for + :return: list of CDS SeqFeature objects + .. note:: antiSMASH gene features are usually called 'CDS' + """ + if gene_identifiers is None: + gene_identifiers = ["CDS"] + + return [f for f in record.features if f.type in gene_identifiers] + + +def _iter_domains(record: SeqRecord, domain_identifiers: list[str] | None = None) -> list[SeqFeature]: + """ + Iterate over domain features in a Biopython SeqRecord. 
+ + :param record: Biopython SeqRecord object + :param domain_identifiers: list of domain feature types to look for + :return: list of domain SeqFeature objects + .. note:: antiSMASH domain features are usually called 'aSDomain' + """ + if domain_identifiers is None: + domain_identifiers = ["aSDomain"] + + return [f for f in record.features if f.type in domain_identifiers] + + +def _start_end(feat: SeqFeature) -> tuple[Strand, int, int]: + """ + Return (strand, start, end) as a tuple from a SeqFeature. + + :param feat: Biopython SeqFeature object + :return: tuple of (strand, start, end) + :raises ValueError: if strand is not 1 or -1 + .. note:: location is 0-based, end-exclusive + """ + loc: FeatureLocation = feat.location + + match loc.strand: + case 1: + strand = Strand.FORWARD + case -1: + strand = Strand.REVERSE + case _: + raise ValueError(f"unexpected strand value: {loc.strand}") + + return strand, int(loc.start), int(loc.end) + + +def _in_bounds(child: SeqFeature, parent: SeqFeature) -> bool: + """ + Check if a child feature is within the bounds of a parent feature. + + :param child: child SeqFeature object + :param parent: parent SeqFeature object + :return: True if child is within parent bounds, else False + """ + _, cs, ce = _start_end(child) + _, ps, pe = _start_end(parent) + + return ps <= cs and ce <= pe + + +def _q1(feat: SeqFeature, keys: Iterable[str]) -> str | None: + """ + Return the first available qualifier value for any of the given keys, else None. + + :param feat: Biopython SeqFeature object + :param keys: iterable of qualifier keys to check + :return: qualifier value or None + """ + for k in keys: + vals = feat.qualifiers.get(k) + if vals: + return vals[0] + + return None + + +def _gene_name(feat: SeqFeature) -> str: + """ + Get a gene name from a SeqFeature, or generate a default if none found. 
def _gene_rec_from_feat(feat: SeqFeature) -> Gene:
    """
    Create a Gene instance from a SeqFeature.

    The gene sequence is taken from the ``translation`` qualifier and is None
    when the feature carries no such qualifier.

    :param feat: Biopython SeqFeature object
    :return: Gene instance
    """
    strand, s, e = _start_end(feat)
    name = _gene_name(feat)
    aa_seq = _q1(feat, ("translation",))

    return Gene(
        id=name,
        start=s,
        end=e,
        strand=strand,
        sequence=aa_seq,
    )


def _domain_rec_from_feat(feat: SeqFeature) -> Domain:
    """
    Create a Domain instance from a SeqFeature.

    The id comes from the first present of ``label``/``product``/``note`` and
    the type from the first present of ``aSDomain``/``domain``/``label``;
    either is None when none of those qualifiers is present.

    :param feat: Biopython SeqFeature object
    :return: Domain instance
    """
    _, s, e = _start_end(feat)
    name = _q1(feat, ("label", "product", "note"))
    kind = _q1(feat, ("aSDomain", "domain", "label"))
    aa_seq = _q1(feat, ("translation",))

    return Domain(
        id=name,
        type=kind,
        start=s,
        end=e,
        sequence=aa_seq,
    )


def collect_antismash_regions(record: SeqRecord, options: AntiSmashOptions) -> list[Region]:
    """
    Collect antiSMASH regions from a GenBank record.

    Each region receives the genes located within its bounds and each gene the
    domains located within its bounds, both ordered by (start, end).

    :param record: GenBank record
    :param options: parsing options
    :return: list of parsed regions
    """

    def _coords(feat: SeqFeature) -> tuple[int, int]:
        return int(feat.location.start), int(feat.location.end)

    regions = _iter_regions(record, readout_level=options.readout_level)

    # Sort once up front: filtering a sorted list keeps it sorted, so there is
    # no need to re-sort per region or per gene.
    cds_list = sorted(_iter_cds(record, gene_identifiers=options.gene_identifiers), key=_coords)
    dom_list = sorted(_iter_domains(record, domain_identifiers=options.domain_identifiers), key=_coords)

    # ``wanted_qualifiers`` is declared as ``list[str] | None``; None means
    # that no qualifiers are requested.
    wanted_qualifiers = options.wanted_qualifiers or []

    region_recs: list[Region] = []

    for reg in regions:
        _, region_start, region_end = _start_end(reg)

        # Search for wanted qualifiers (absent ones map to an empty list)
        found_qualifiers: dict[str, Any] = {
            wanted: reg.qualifiers.get(wanted, []) or [] for wanted in wanted_qualifiers
        }

        # Make gene recs for the genes inside this region
        genes: list[Gene] = []
        for gf in cds_list:
            if not _in_bounds(gf, reg):
                continue

            g = _gene_rec_from_feat(gf)

            # Domains inside this gene, already in genomic order
            g.domains = [_domain_rec_from_feat(df) for df in dom_list if _in_bounds(df, gf)]
            genes.append(g)

        region_recs.append(Region(
            id=record.id,
            start=region_start,
            end=region_end,
            qualifiers=found_qualifiers,
            genes=genes,
        ))

    return region_recs
+ + :param path: path to the antiSMASH GenBank file + :return: list of parsed records + """ + log.info(f"parsing antiSMASH GenBank file: {path}") + + out: list[Region] = [] + + with open(path) as handle: + for record in SeqIO.parse(handle, "genbank"): + out.extend(collect_antismash_regions(record, options)) + + return out + \ No newline at end of file diff --git a/src/biocracker/io/options.py b/src/biocracker/io/options.py new file mode 100644 index 0000000..8b3b719 --- /dev/null +++ b/src/biocracker/io/options.py @@ -0,0 +1,37 @@ +"""Module for defining options related to loading genomic regions from various sources.""" + +from dataclasses import dataclass, field +from typing import Literal, Union, TypeAlias + + +AntismashReadoutLevel: TypeAlias = Literal["region", "cand_cluster"] + + +@dataclass(frozen=True) +class AntiSmashOptions: + """ + Options for loading biosynthetic regions from antiSMASH GenBank files. + + :param source: source type + :param readout_level: parsing level for biosynthetic regions + :param wanted_qualifiers: list of feature qualifiers to extract from regions + :param gene_identifiers: list of gene feature types to look for + :param domain_identifiers: list of domain feature types to look for + """ + + source: Literal["antismash_gbk"] = "antismash_gbk" + readout_level: AntismashReadoutLevel = "region" + wanted_qualifiers: list[str] | None = field(default_factory=lambda: ["product"]) + gene_identifiers: list[str] | None = field(default_factory=lambda: ["CDS"]) + domain_identifiers: list[str] | None = field(default_factory=lambda: ["aSDomain"]) + + def __post_init__(self) -> None: + """ + Validate options after initialization. 
@dataclass
class AnnotationSet:
    """
    Represents a set of annotations derived from model inferences.

    :param results: a list of inference results
    """
    results: list[InferenceResult] = field(default_factory=list)

    def add(self, result: InferenceResult) -> None:
        """
        Adds an inference result to the annotation set.

        :param result: the inference result to add
        """
        self.results.append(result)

    def by_model(self, model_name: str) -> list[InferenceResult]:
        """
        Retrieves all inference results from a specific model.

        :param model_name: the name of the model
        :return: a list of inference results from the specified model
        """
        return [r for r in self.results if r.model == model_name]

    def to_dict(self) -> dict:
        """
        Converts the annotation set to a dictionary representation.

        :return: a dictionary representation of the annotation set
        """
        return {"results": [r.to_dict() for r in self.results]}

    @classmethod
    def from_dict(cls, data: dict) -> "AnnotationSet":
        """
        Creates an AnnotationSet instance from a dictionary representation.

        :param data: a dictionary representation of the annotation set
        :return: an AnnotationSet instance
        """
        # Delegate to InferenceResult.from_dict so that serialized fields are
        # converted back to their in-memory types; constructing the result with
        # ``InferenceResult(**r)`` would leave them in their serialized form and
        # break a later ``to_dict()`` round trip.
        results = [InferenceResult.from_dict(r) for r in data.get("results", [])]
        return cls(results=results)
class Strand(Enum):
    """
    Represents the strand orientation of a gene.

    :cvar FORWARD: the forward strand
    :cvar REVERSE: the reverse strand
    """
    FORWARD = "+"
    REVERSE = "-"


@dataclass
class Gene:
    """
    Represents a gene within a biological sequence.

    :param id: the unique identifier of the gene
    :param start: the starting position of the gene within the sequence
    :param end: the ending position of the gene within the sequence
    :param strand: the strand orientation of the gene
    :param sequence: the sequence stored for the gene (the antiSMASH reader
        fills this from the ``translation`` qualifier, i.e. amino acids)
    :param domains: the list of domains associated with the gene
    :param annotations: the set of annotations associated with the gene
    """

    id: str
    start: int
    end: int
    strand: Strand
    sequence: str

    domains: list[Domain] = field(default_factory=list)
    annotations: AnnotationSet = field(default_factory=AnnotationSet)

    def iter_domains(self) -> list[Domain]:
        """
        Returns the list of domains sorted by their starting position.

        :return: a list of domains sorted by start position
        """
        return sorted(self.domains, key=lambda d: d.start)

    def to_dict(self) -> dict:
        """
        Converts the gene to a dictionary representation.

        :return: a dictionary representation of the gene
        """
        # Built explicitly rather than via ``asdict``: the nested domains and
        # annotations serialize themselves, so recursing into them first only
        # to overwrite the result is wasted work and deep-copies their payloads.
        return {
            "id": self.id,
            "start": self.start,
            "end": self.end,
            "strand": self.strand.value,
            "sequence": self.sequence,
            "domains": [d.to_dict() for d in self.domains],
            "annotations": self.annotations.to_dict(),
        }

    @classmethod
    def from_dict(cls, data: dict) -> "Gene":
        """
        Creates a Gene instance from a dictionary representation.

        :param data: a dictionary representation of the gene
        :return: a Gene instance
        """
        domains = [Domain.from_dict(d) for d in data.get("domains", [])]
        annotations = AnnotationSet.from_dict(data.get("annotations", {}))
        return cls(
            id=data["id"],
            start=data["start"],
            end=data["end"],
            strand=Strand(data["strand"]),
            sequence=data["sequence"],
            domains=domains,
            annotations=annotations,
        )
@dataclass
class Region:
    """
    Represents a biological region containing multiple genes.

    :param id: the unique identifier of the region
    :param start: the starting position of the region within the sequence
    :param end: the ending position of the region within the sequence
    :param qualifiers: additional metadata or qualifiers associated with the region
    :param genes: the list of genes contained within the region
    """

    id: str
    start: int
    end: int
    qualifiers: dict[str, Any] = field(default_factory=dict)
    genes: list[Gene] = field(default_factory=list)

    def iter_genes(self) -> list[Gene]:
        """
        Returns the list of genes sorted by their starting position.

        :return: a list of genes sorted by start position
        """
        return sorted(self.genes, key=lambda g: g.start)

    def to_dict(self) -> dict[str, Any]:
        """
        Converts the Region instance to a dictionary.

        The qualifiers are deep-copied so that mutating the returned dictionary
        does not affect the region.

        :return: a dictionary representation of the Region
        """
        from copy import deepcopy

        # Built explicitly rather than via ``asdict``: genes serialize
        # themselves, so recursing into (and deep-copying) every gene first
        # only to overwrite the result is wasted work and fails on payloads
        # that cannot be copied.
        return {
            "id": self.id,
            "start": self.start,
            "end": self.end,
            "qualifiers": deepcopy(self.qualifiers),
            "genes": [gene.to_dict() for gene in self.genes],
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "Region":
        """
        Creates a Region instance from a dictionary.

        :param data: a dictionary containing region data
        :return: a Region instance
        """
        genes = [Gene.from_dict(gene_data) for gene_data in data.get("genes", [])]
        return cls(
            id=data["id"],
            start=data["start"],
            end=data["end"],
            qualifiers=data.get("qualifiers", {}),
            genes=genes,
        )
-5.36, 0.30, 0, 9.0, -0.41, 0.00, 64.9, 0.57, 0.75, 1.56, 5.97, 0.32, -0.32, 0.28], - "H": [2.41, 1.74, 1.11, 1, 10.4, 0.49, 51.60, 160.0, 1.00, 0.87, 0.95, 7.59, 0.15, -0.03, -0.10], - "I": [-4.44, -1.68, -1.03, 0, 5.2, 1.31, 0.13, 163.9, 1.08, 1.60, 0.47, 6.02, -1.00, -0.03, 0.10], - "K": [2.84, 1.41, -3.14, 2, 11.3, -1.18, 49.50, 167.3, 1.16, 0.74, 1.01, 9.74, 1.00, 0.32, 0.11], - "L": [-4.19, -1.03, -0.98, 0, 4.9, 1.21, 0.13, 164.0, 1.21, 1.30, 0.59, 5.98, -0.83, 0.05, 0.01], - "M": [-2.49, -0.27, -0.41, 0, 5.7, 1.27, 1.43, 167.0, 1.45, 1.05, 0.60, 5.74, -0.68, -0.01, 0.04], - "N": [3.22, 1.45, 0.84, 2, 11.6, -0.48, 3.38, 124.7, 0.67, 0.89, 1.56, 5.41, 0.70, -0.06, 0.17], - "P": [-1.22, 0.88, 2.23, 0, 8.0, 1.1, 1.58, 122.9, 0.57, 0.55, 1.52, 6.30, 0.45, 0.23, 0.41], - "Q": [2.18, 0.53, -1.14, 2, 10.5, -0.73, 3.53, 149.4, 1.11, 1.10, 0.98, 5.65, 0.71, -0.02, 0.12], - "R": [2.88, 2.52, -3.44, 4, 10.5, -0.84, 52.00, 194.0, 0.98, 0.93, 0.95, 10.76, 0.80, 0.19, -0.41], - "S": [1.96, -1.63, 0.57, 1, 9.2, -0.50, 1.67, 95.4, 0.77, 0.75, 1.43, 5.68, 0.48, -0.15, 0.23], - "T": [0.92, -2.09, -1.40, 1, 8.6, -0.27, 1.66, 121.5, 0.83, 1.19, 0.96, 5.66, 0.38, -0.10, 0.29], - "V": [-2.69, -2.53, -1.29, 0, 5.9, 1.09, 0.13, 139.0, 1.06, 1.70, 0.50, 5.96, -0.75, -0.19, 0.03], - "W": [-4.75, 3.65, 0.85, 1, 5.4, 0.88, 2.10, 228.2, 1.08, 1.37, 0.96, 5.89, -0.57, 0.31, 0.34], - "Y": [1.39, 2.32, 0.01, 1, 6.2, 0.33, 1.61, 197.0, 0.69, 1.47, 1.14, 5.66, -0.35, 0.40, -0.02], -} -POSITIONS_ACTIVE_SITE = [ - 13, - 16, - 17, - 41, - 46, - 47, - 48, - 49, - 50, - 51, - 52, - 55, - 93, - 94, - 125, - 126, - 127, - 128, - 129, - 152, - 153, - 154, - 155, - 156, - 157, - 158, - 159, - 160, - 161, - 162, - 163, - 164, - 165, - 166, -] - - -@dataclass -class ADomain: - """ - Dataclass representing an A domain. 
- - :param protein: name of the protein containing the A domain - :param start: start position of the A domain - :param end: end position of the A domain - :param domain_nr: domain number of A domain in NRPS (optional) - :param sequence: amino acid sequence of the A domain (optional) - :param extended_signature: extended signature of the A domain (optional) - """ - - protein: str - start: int - end: int - domain_nr: int | None = None - sequence: str | None = None - extended_signature: str | None = None - - -def _b2s(x: Any) -> str: - """ - Convert input to string. - - :param x: input object - :return: string representation - """ - if isinstance(x, (bytes, bytearray)): - return x.decode() - - if hasattr(x, "sequence"): - s = x.sequence - return s.decode() if isinstance(s, (bytes, bytearray)) else str(s) - - return str(x) - - -def extract_domain_hits( - seq_id: str, - sequence: str, - evalue_cutoff: float = 1e-5, -) -> list[dict[str, Any]]: - """ - Extract domain hits from a given protein sequence using HMMER. 
- - :param seq_id: identifier for the protein sequence - :param sequence: amino acid sequence of the protein - :param evalue_cutoff: e-value cutoff for HMMER hits - :return: list of dictionaries representing domain hits - """ - alphabet = easel.Alphabet.amino() - text_seq = easel.TextSequence(name=seq_id.encode(), sequence=sequence) - seq = text_seq.digitize(alphabet) - - hits_iter = hmmer.hmmscan([seq], HMM_DB, cpus=1, E=evalue_cutoff) - - query_hits = next(hits_iter) # expect only one sequence - - out = [] - for hit in query_hits: - model_name = _b2s(hit.name) - - for dom in hit.domains: - q_from = int(dom.env_from) - q_to = int(dom.env_to) - - aln = dom.alignment - hmm_aln = _b2s(aln.hmm_sequence) - query_aln = _b2s(aln.target_sequence) - - out.append( - dict( - seq_id=seq_id, - model=model_name, - q_from=q_from, - q_to=q_to, - evalue=float(dom.i_evalue), - score=float(dom.score), - hmm_aln=hmm_aln, - query_aln=query_aln, - domain_obj=dom, - ) - ) - - out.sort(key=lambda d: (d["q_from"], d["q_to"], d["model"])) - - return out - - -def pair_domains( - domain_hits: list[dict[str, Any]], - max_gap: int = 200, -) -> list[tuple[ADomain, str, str]]: - """ - Pair AMP-binding and AMP-binding_C domain hits. 
def extract_signature_from_alignment(hmm_aln: str, query_aln: str) -> str | None:
    """
    Extract the extended signature from the given HMM and query alignments.

    The two strings are walked column by column. Every column that holds a
    model residue advances the HMM position counter, and the query character
    at each position listed in ``POSITIONS_ACTIVE_SITE`` is collected.
    Positions the alignment never reaches are filled with a gap ("-").

    :param hmm_aln: HMM alignment string
    :param query_aln: query alignment string
    :return: extended signature string or None if invalid
    """
    wanted = set(POSITIONS_ACTIVE_SITE)
    picked: dict[int, str] = {}

    # NOTE(review): counting starts at 1, which presumes the alignment begins
    # at the first model position - confirm against the caller for local hits.
    hmm_pos = 0

    for h, q in zip(hmm_aln, query_aln):
        # Insert columns carry no model residue. HMMER writes them as "." in
        # the model line; "-" is accepted as well for converted alignments.
        if h in ("-", "."):
            continue

        hmm_pos += 1
        if hmm_pos in wanted:
            picked[hmm_pos] = q

    # Uncovered positions become gaps, so the signature always has full length
    sig = "".join(picked.get(p, "-") for p in POSITIONS_ACTIVE_SITE).upper()

    if not sig or not all(c in VALID for c in sig):
        return None

    return sig


def fill_domain_sequences(
    domains: list[ADomain],
    protein_seq: str,
    min_len: int = 100,
) -> list[ADomain]:
    """
    Fill in the sequences for the given domains from the protein sequence.

    Domains whose slice is shorter than ``min_len`` are dropped from the result
    and left unmodified; kept domains are updated in place.

    :param domains: list of ADomain objects
    :param protein_seq: amino acid sequence of the protein
    :param min_len: minimum length of domain sequence to keep
    :return: list of ADomain objects with sequences filled in
    """
    kept = []

    for d in domains:
        seq = protein_seq[d.start:d.end]
        if len(seq) < min_len:
            continue

        d.sequence = seq
        kept.append(d)

    return kept
def featurize_signature(sig: str) -> np.ndarray:
    """
    Featurize the given extended signature into a numerical feature array.

    :param sig: extended signature string
    :return: numpy array of features
    :raises ValueError: if the signature has the wrong length or contains a
        character without an entry in ``FEATURES``
    """
    n_positions = len(POSITIONS_ACTIVE_SITE)
    n_features = len(FEATURE_NAMES)

    # Raised explicitly instead of asserted: assertions are stripped under
    # ``python -O``, which would let a short signature through silently.
    if len(sig) != n_positions:
        raise ValueError(f"signature length mismatch: expected {n_positions}, got {len(sig)}")

    features: np.ndarray = np.zeros((n_positions, n_features), dtype=np.float32)
    for i, aa in enumerate(sig):
        aa_feats = FEATURES.get(aa)
        if aa_feats is None:
            raise ValueError(f"invalid amino acid '{aa}' in signature")
        features[i, :] = aa_feats  # cast to float32 on assignment

    return features.flatten()  # shape (n_positions * n_features,)
def annotate_region(region: Region) -> None:
    """
    Annotate all genes and all their domains in the given region using registered models.

    Every registered gene model is run on each gene and every registered domain
    model on each domain of that gene; the results are added to the respective
    ``annotations`` set in place.

    :param region: the genomic region to annotate
    """
    log.debug(Ctx(region=region.id).prefix() + f"annotating {len(region.genes)} genes")

    for gene in region.iter_genes():
        gctx = Ctx(region=region.id, gene=gene.id)
        log.debug(gctx.prefix() + "annotating gene")

        # Gene inference
        for m in get_gene_models():
            mctx = Ctx(region=region.id, gene=gene.id, model=m.name)
            log.debug(mctx.prefix() + "running gene inference")

            results = m.predict(gene)
            for r in results:
                gene.annotations.add(r)

            log.debug(mctx.prefix() + f"added {len(results)} results")

        # Domain inference
        for domain in gene.iter_domains():
            for m in get_domain_models():
                mctx = Ctx(region=region.id, gene=gene.id, domain=domain.id, model=m.name)
                log.debug(mctx.prefix() + f"running domain inference ({domain.type})")

                results = m.predict(domain)
                for r in results:
                    domain.annotations.add(r)

                log.debug(mctx.prefix() + f"added {len(results)} results")
@dataclass
class PKSModuleReadout:
    """
    Readout dictionary for a PKS module.

    :param kind: kind of the module (always "PKS_module")
    :param module_type: type of the PKS module (PKS_A, PKS_B, PKS_C, PKS_D, UNCLASSIFIED)
    :param present_domains: list of present domain kinds in the module
    :param at_source: source of the acyltransferase domain (CIS, TRANS, UNKNOWN)
    :param module_index_in_gene: index of the module within the gene
    :param start: start position of the module
    :param end: end position of the module
    :param gene_name: name of the gene containing the module
    :param has_active_KR: whether the module has an active Ketoreductase domain
    :param has_active_DH: whether the module has an active Dehydratase domain
    :param has_active_ER: whether the module has an active Enoylreductase domain
    :param has_AT: whether the module has an Acyltransferase domain
    :param role: role of the module (starter, elongation, terminal, starter+terminal, unknown)
    """

    kind: Literal["PKS_module"]
    module_type: Literal["PKS_A", "PKS_B", "PKS_C", "PKS_D", "UNCLASSIFIED"]
    present_domains: list[str]
    at_source: Literal["CIS", "TRANS", "UNKNOWN"]
    module_index_in_gene: int
    start: int
    end: int
    gene_name: str

    # Anatomy
    has_active_KR: bool
    has_active_DH: bool
    has_active_ER: bool
    has_AT: bool
    role: Literal["starter", "elongation", "terminal", "starter+terminal", "unknown"]

    def __getitem__(self, key: str) -> object:
        """
        Allow dictionary-like access to attributes.

        :param key: attribute name
        :return: attribute value
        :raises KeyError: if attribute is not found
        """
        try:
            return getattr(self, key)
        except AttributeError:
            # Surface a mapping-style error, as callers index this like a dict
            raise KeyError(f"{key} not found in PKSModuleReadout") from None

    def get(self, key: str, default: object = None) -> object:
        """
        Allow dictionary-like access to attributes with a default value.

        :param key: attribute name
        :param default: default value if attribute is not found
        :return: attribute value or default
        """
        return getattr(self, key, default)
- - :param kind: kind of the module (always "NRPS_module") - :param gene_name: name of the gene containing the module - :param module_index_in_gene: index of the module within the gene - :param start: start position of the module - :param end: end position of the module - :param present_domains: list of present domain kinds in the module - :param has_C: whether the module has a Condensation domain - :param has_T: whether the module has a Thiolation domain - :param has_E: whether the module has an Epimerization domain - :param has_MT: whether the module has a Methyltransferase domain - :param has_Ox: whether the module has an Oxidase domain - :param has_R: whether the module has a Reductase domain - :param has_TE: whether the module has a Thioesterase domain - :param role: role of the module (starter, elongation, terminal, starter+terminal, unknown) - :param substrate: predicted substrate for the A-domain, if any - :param score: prediction score for the substrate, if any - :param raw_pred: raw prediction dictionary from the substrate predictor - """ - - kind: Literal["NRPS_module"] - gene_name: str - module_index_in_gene: int - start: int - end: int - present_domains: list[str] - - # Core anatomy - has_C: bool - has_T: bool - has_E: bool - has_MT: bool - has_Ox: bool - has_R: bool - has_TE: bool - role: Literal["starter", "elongation", "terminal", "starter+terminal", "unknown"] - - # Substrate call from A-domain predictor - substrate_name: str | None # top predicted substrate name - substrate_smiles: str | None # top predicted substrate SMILES - score: float | None # top prediction score - raw_preds: list[dict] | None # raw prediction dictionaries from the predictor - - def __getitem__(self, key: str) -> object: - """ - Allow dictionary-like access to attributes. 
- - :param key: attribute name - :return: attribute value - :raises KeyError: if attribute is not found - """ - if not hasattr(self, key): - raise KeyError(f"{key} not found in NRPSModuleReadout") - return getattr(self, key) - - def get(self, key: str, default: object = None) -> object: - """ - Allow dictionary-like access to attributes with a default value. - - :param key: attribute name - :param default: default value if attribute is not found - :return: attribute value or default - """ - return getattr(self, key, default) - - -def _domain_kinds(domains: Sequence[DomainRec]) -> set[str]: - """ - Helper function to get the set of domain kinds from a list of DomainRec objects. - - :param domains: list of DomainRec objects - :return: set of domain kinds - """ - return {domain.kind for domain in domains if domain.kind} - - -def _is_kind(d: DomainRec, label: str | set[str]) -> bool: - """ - Helper function to check if a DomainRec matches a given kind or set of kinds. - - :param d: DomainRec object - :param label: kind label (str) or set of kind labels (set[str]) - :return: True if the domain matches the label, False otherwise - """ - if not d.kind: - return False - - if isinstance(label, set): - return d.kind in label - - return d.kind == label - - -def _is_active_domain(d: DomainRec) -> bool: - """ """ - if not d.kind: - return True # can't tell, assume active - - if d.kind not in {"PKS_KR", "PKS_DH", "PKS_ER"}: - return True # only check PKS accessory domains - - texts = [] - if d.name: - texts.append(d.name) - for _, vals in d.raw_qualifiers.items(): - if isinstance(vals, (list, tuple)): - texts.extend(map(str, vals)) - else: - texts.append(str(vals)) - - blob = " ".join(texts).lower() - - # Common antiSMASH phrasing patterns - inactive_flags = [ - "inactive", - "nonfunctional", - "non-functional", - "inactivated", - "broken", - "truncated", - ] - return not any(flag in blob for flag in inactive_flags) - - -def _is_at_only_gene(g: GeneRec) -> bool: - """ - Helper 
function to determine if a gene is an acyltransferase-only gene. - - :param g: GeneRec object - :return: True if the gene is an AT-only gene, False otherwise - """ - kinds = _domain_kinds(g.domains) - return ("PKS_AT" in kinds) and all(k in {"PKS_AT"} for k in kinds) - - -def _find_upstream_at_only_gene(all_genes: Sequence[GeneRec], idx: int) -> GeneRec | None: - """ - Return the nearest upstream gene that is AT-only (relative to all_genes order). - - :param all_genes: list of GeneRec objects - :param idx: index of the current gene in all_genes - :return: GeneRec object of the nearest upstream AT-only gene, or None if not found - """ - for j in range(idx - 1, -1, -1): - if _is_at_only_gene(all_genes[j]): - return all_genes[j] - - return None - - -def _is_cstarter(d: DomainRec) -> bool: - """ - Helper function to determine if a Condensation domain is a starter C-domain. - - :param d: DomainRec object - :return: True if the domain is a starter C-domain, False otherwise - """ - if not d.kind or d.kind != "Condensation": - return False - - txts = [] - if d.name: - txts.append(d.name) - - for _, vals in d.raw_qualifiers.items(): - # Join lists and scalars; qualifiers may be list[str] - if isinstance(vals, (list, tuple)): - txts.extend(map(str, vals)) - else: - txts.append(str(vals)) - - blob = " ".join(txts).lower() - - return ("starter" in blob) or ("cstarter" in blob) or ("condensation_starter" in blob) - - -def _gene_has_loading_domains(g: GeneRec) -> bool: - """ - Helper function to determine if a gene has loading domains (CAL or ACP). 
- - :param g: GeneRec object - :return: True if the gene has loading domains, False otherwise - """ - kinds = {d.kind for d in g.domains if d.kind} - names = {(d.name or "") for d in g.domains} - - # Domain-kind signals - has_cal = ("CAL_domain" in kinds) or any("faal" in (n.lower()) for n in names) - has_acp = ("PP-binding" in kinds) or ("ACP" in kinds) or any("acp" in (n.lower()) for n in names) - - return has_cal or has_acp - - -def _upstream_loading_cassette(all_genes: list[GeneRec], gi: int, max_bp: int = 20000) -> bool: - """ - Check for loading cassette (CAL + ACP) in upstream genes within max_bp distance. - - :param all_genes: list of all GeneRec objects in the region/cluster - :param gi: index of the current gene in all_genes - :param max_bp: maximum base pair distance to search upstream - :return: True if loading cassette is found upstream, False otherwise - """ - cur_start = all_genes[gi].start - - seen_cal = False - seen_acp = False - for j in range(gi - 1, -1, -1): - g = all_genes[j] - if cur_start - g.end > max_bp: - break - kinds = {d.kind for d in g.domains if d.kind} - names = {(d.name or "") for d in g.domains} - if ("CAL_domain" in kinds) or any("faal" in n.lower() for n in names): - seen_cal = True - if ("PP-binding" in kinds) or ("ACP" in kinds) or any("acp" in n.lower() for n in names): - seen_acp = True - if seen_cal and seen_acp: - return True - - return False - - -def _upstream_has_nrps_A(all_genes: Sequence[GeneRec], gi: int) -> bool: - """ - Check if there is an upstream gene with an NRPS A-domain. - - :param all_genes: list of all GeneRec objects in the region/cluster - :param gi: index of the gene g in all_genes - """ - for j in range(gi - 1, -1, -1): - if any(_is_kind(d, NRPS_A) for d in all_genes[j].domains): - return True - - return False - - -def _upstream_has_pks_KS(all_genes: Sequence[GeneRec], gi: int, ks_start: int) -> bool: - """ - Check if there is an upstream gene with a PKS KS-domain. 
- - :param all_genes: list of all GeneRec objects in the region/cluster - :param gi: index of the gene g in all_genes - :param ks_start: start position of the KS domain in gene g - :return: True if there is an upstream KS-domain, False otherwise - """ - # Genes upstream - for j in range(gi - 1, -1, -1): - if any(d.kind == "PKS_KS" for d in all_genes[j].domains): - return True - - # Same gene, KS before this window's KS - for d in all_genes[gi].domains: - if d.kind == "PKS_KS" and d.start < ks_start: - return True - - return False - - -def _standalone_pks_at_upstream(all_genes: Sequence[GeneRec], gi: int, ks_start: int, max_bp: int = 20000) -> bool: - """ - Check for standalone PKS AT domain in upstream genes within max_bp distance. - - :param all_genes: list of all GeneRec objects in the region/cluster - :param gi: index of the gene g in all_genes - :param ks_start: start position of the KS domain in gene g - :param max_bp: maximum base pair distance to search upstream - :return: True if standalone PKS AT domain is found upstream, False otherwise - """ - cur_start = ks_start - - # Same gene, before ks_start - for d in all_genes[gi].domains: - if d.kind == "PKS_AT" and d.end <= ks_start: - return True - - # Upstream genes, within distance - for j in range(gi - 1, -1, -1): - g = all_genes[j] - if cur_start - g.end > max_bp: - break - if any(d.kind == "PKS_AT" for d in g.domains): - return True - - return False - - -def _is_last_global_KS(all_genes: Sequence[GeneRec], gi: int, ks_start: int) -> bool: - """ - Check if the given KS domain is the last KS domain in the entire gene cluster/region. - - :param all_genes: list of all GeneRec objects in the region/cluster - :param gi: index of the gene g in all_genes - :param ks_start: start position of the KS domain in gene g - :return: True if the KS domain is the last KS domain, False otherwise - """ - # Same gene, any KS after this ks_start? 
- for d in all_genes[gi].domains: - if d.kind == "PKS_KS" and d.start > ks_start: - return False - - # Downstream genes - for j in range(gi + 1, len(all_genes)): - if any(d.kind == "PKS_KS" for d in all_genes[j].domains): - return False - - return True - - -def _downstream_has_te(all_genes: Sequence[GeneRec], gi: int, from_bp: int, max_bp: int = 20000) -> bool: - """ - Check for downstream TE domain within max_bp distance. - - :param all_genes: list of all GeneRec objects in the region/cluster - :param gi: index of the gene g in all_genes - :param from_bp: position to start searching from - :param max_bp: maximum base pair distance to search downstream - :return: True if TE domain is found downstream, False otherwise - """ - # Same gene after from_bp - for d in all_genes[gi].domains: - if d.kind in PKS_TE_ALIASES and d.end >= from_bp: - return True - - # Next genes within window - cur_end = from_bp - for j in range(gi + 1, len(all_genes)): - g = all_genes[j] - if g.start - cur_end > max_bp: - break - if any(d.kind in PKS_TE_ALIASES for d in g.domains): - return True - - return False - - -def _split_module_on_KS(domains: Sequence[DomainRec]) -> list[list[DomainRec]]: - """ - Split a list of DomainRec objects into module windows anchored on PKS_KS domains. - - :param domains: list of DomainRec objects - :return: list of lists of DomainRec objects, each sublist representing a module window - """ - windows: list[list[DomainRec]] = [] - cur: list[DomainRec] = [] - - for d in domains: - if d.kind == "PKS_KS": - # Start new module window anchored at this KS - if cur: - windows.append(cur) - cur = [d] - else: - if cur: # only append if we have started a module - cur.append(d) - - if cur: - windows.append(cur) - - return windows - - -def _classify_pks_window(window: Sequence[DomainRec]) -> tuple[str, set[str], bool, bool, bool, bool]: - """ - Classify a PKS module based on the presence of domains in the given window. 
- - :param window: sequence of DomainRec objects representing a module window - :return: tuple containing: - - module type (str) - - set of present domain kinds (set[str]) - - has_active_KR (bool) - - has_active_DH (bool) - - has_active_ER (bool) - - has_AT (bool) - """ - kinds_linear = [d.kind for d in window if d.kind in PKS_KINDS] - present = set(kinds_linear) - - has_AT = "PKS_AT" in present - has_active_KR = any("PKS_KR" in present and _is_active_domain(d) for d in window if d.kind == "PKS_KR") - has_active_DH = any("PKS_DH" in present and _is_active_domain(d) for d in window if d.kind == "PKS_DH") - has_active_ER = any("PKS_ER" in present and _is_active_domain(d) for d in window if d.kind == "PKS_ER") - - # Rules: - # - KS + AT with neither KR nor DH nor ER => PKS_A - # - KS + AT + KR (no DH and no ER) => PKS_B (KR after AT is naturally true in window order) - # - KS + AT + KR + DH (no ER) => PKS_C - # - KS + AT + KR + DH + ER => PKS_D - if has_active_ER and has_active_DH and has_active_KR: - mtype = "PKS_D" - elif has_active_DH and has_active_KR and not has_active_ER: - mtype = "PKS_C" - elif has_active_KR and not has_active_DH and not has_active_ER: - mtype = "PKS_B" - elif not has_active_KR and not has_active_DH and not has_active_ER: - mtype = "PKS_A" - else: - mtype = "UNCLASSIFIED" - - return mtype, present, has_active_KR, has_active_DH, has_active_ER, has_AT - - -def _window_bounds(window: Sequence[DomainRec]) -> tuple[int, int]: - """ - Get the start and end positions of a module window. - - :param window: sequence of DomainRec objects representing a module window - :return: tuple of (start, end) positions - """ - return min(d.start for d in window), max(d.end for d in window) - - -def _pks_modules_for_gene(g: GeneRec, all_genes: Sequence[GeneRec], gi: int) -> list[PKSModuleReadout]: - """ - Get PKS module readouts for a given gene. 
- - :param g: GeneRec object - :param all_genes: list of all GeneRec objects in the region/cluster - :param gi: index of the gene g in all_genes - :return: list of PKSModuleReadout dictionaries - """ - out: list[PKSModuleReadout] = [] - if all(d.kind != "PKS_KS" for d in g.domains): - return out # No KS domains, no modules - - windows = _split_module_on_KS(g.domains) - for mi, win in enumerate(windows): - mtype, present, has_active_KR, has_active_DH, has_active_ER, has_AT = _classify_pks_window(win) - ks_start = win[0].start # window is KS-anchored - s, e = _window_bounds(win) - - if has_AT: - at_src: Literal["CIS", "TRANS", "UNKNOWN"] = "CIS" - else: - at_src = "TRANS" if _find_upstream_at_only_gene(all_genes, gi) is not None else "UNKNOWN" - - # Assign provisional PKS role - has_te_in_window = any(d.kind in PKS_TE_ALIASES for d in win) - upstream_has_ks = _upstream_has_pks_KS(all_genes, gi, ks_start) - starter = _standalone_pks_at_upstream(all_genes, gi, ks_start) and not upstream_has_ks - - terminal_by_TE = False - if _is_last_global_KS(all_genes, gi, ks_start): - terminal_by_TE = has_te_in_window or _downstream_has_te(all_genes, gi, from_bp=e) - - if starter and terminal_by_TE: - role: Literal["starter", "elongation", "terminal", "starter+terminal", "unknown"] = "starter+terminal" - elif starter: - role = "starter" - elif terminal_by_TE: - role = "terminal" - else: - role = "elongation" - - s, e = _window_bounds(win) - out.append( - PKSModuleReadout( - kind="PKS_module", - module_type=mtype, - present_domains=sorted(present), - at_source=at_src, - module_index_in_gene=mi, - start=s, - end=e, - gene_name=g.name, - has_active_KR=has_active_KR, - has_active_DH=has_active_DH, - has_active_ER=has_active_ER, - has_AT=has_AT, - role=role, - ) - ) - - return out - - -def _nrps_modules_for_gene( - g: GeneRec, - all_genes: Sequence[GeneRec], - gi: int, - *, - cache_dir_override: Path | str | None = None, - model: object | None = None, - pred_threshold: float = 0.5, -) -> 
list[NRPSModuleReadout]: - """ - Get NRPS module readouts for a given gene. - - :param g: GeneRec object - :param all_genes: list of all GeneRec objects in the region/cluster - :param gi: index of the gene g in all_genes - :param cache_dir_override: optional cache directory override for substrate prediction - :param model: optional model object for substrate prediction - :param pred_threshold: prediction threshold for substrate prediction - :return: list of NRPSModuleReadout dictionaries - """ - doms: list[DomainRec] = list(g.domains) - out: list[NRPSModuleReadout] = [] - - # Indices of A-domains in left-to-right order - a_idx = [i for i, d in enumerate(doms) if _is_kind(d, NRPS_A)] - if not a_idx: - return out # no A-domains, no modules - - for mi, ai in enumerate(a_idx): - # Extend window backward by one if there is an immediately previous C (same gene) - start_i = ai - if ai - 1 >= 0 and _is_kind(doms[ai - 1], NRPS_C): - start_i = ai - 1 - - # Extend forward until (but not including) the next A-domain - end_i = a_idx[mi + 1] if mi + 1 < len(a_idx) else len(doms) - - window = doms[start_i:end_i] - present = [d.kind for d in window if d.kind] - - has_C = any(_is_kind(d, NRPS_C) for d in window) - has_Cstarter = any(_is_cstarter(d) for d in window) - has_T = any(_is_kind(d, NRPS_T_ALIASES) for d in window) - has_E = any(_is_kind(d, NRPS_E) for d in window) - has_MT = any(_is_kind(d, NRPS_MT_ALIASES) for d in window) - has_Ox = any(_is_kind(d, NRPS_OX_ALIASES) for d in window) - has_R = any(_is_kind(d, NRPS_R_ALIASES) for d in window) - has_TE = any(_is_kind(d, NRPS_TE) for d in window) - - # Fallback evidence of a separate loading cassette upstream - loading_upstream = _upstream_loading_cassette(all_genes, gi) - upstream_has_A = _upstream_has_nrps_A(all_genes, gi) - - # Role heuristic - is_first_module_in_gene = mi == 0 - - starter = ( - has_Cstarter - or (is_first_module_in_gene and loading_upstream and not upstream_has_A) - or ((not has_C) and not upstream_has_A) 
- ) - terminal = has_TE or has_R - - if starter and terminal: - role: Literal["starter", "elongation", "terminal", "starter+terminal", "unknown"] = "starter+terminal" - elif starter: - role = "starter" - elif terminal: - role = "terminal" - elif has_C: - role = "elongation" - else: - role = "unknown" - - # Substrate prediction from the A-domain inside this window (the anchor) - A = doms[ai] - preds = predict_amp_domain_substrate( - domain=A, - cache_dir_override=cache_dir_override, - model=model, - pred_threshold=pred_threshold, - ) - # Pull top call and score - top_pred = preds[0] if preds else None - if top_pred: - substrate_name = top_pred.get("substrate_name") - substrate_smiles = top_pred.get("substrate_smiles") - score = top_pred.get("score") - else: - substrate_name = None - substrate_smiles = None - score = None - - s = min(d.start for d in window) - e = max(d.end for d in window) - - out.append( - NRPSModuleReadout( - kind="NRPS_module", - gene_name=g.name, - module_index_in_gene=mi, - start=s, - end=e, - present_domains=present, - has_C=has_C, - has_T=has_T, - has_E=has_E, - has_MT=has_MT, - has_Ox=has_Ox, - has_R=has_R, - has_TE=has_TE, - role=role, - substrate_name=substrate_name, - substrate_smiles=substrate_smiles, - score=score, - raw_preds=preds, - ) - ) - - return out - - -def linear_readouts( - rec: RegionRec | CandidateClusterRec, - cache_dir_override: Path | str | None = None, - *, - level: Literal["rec", "gene"] = "rec", - model: object | None = None, - pred_threshold: float = 0.5, -) -> Generator[dict, None, None]: - """ - Reads out a RegionRec or CandidateClusterRec object and returns a list substrates. 
- - :param rec: RegionRec or CandidateClusterRec object to read out - :param level: level of readout, either "rec" for region/cluster level or "gene" for gene level - :return: Generator of substrate specificities per level as dictionaries - :raises AssertionError: if level is not "rec" or "gene" or rec is not a RegionRec or CandidateClusterRec - """ - assert level in {"rec", "gene"}, 'Level must be either "rec" or "gene"' - assert isinstance(rec, (RegionRec, CandidateClusterRec)), "rec must be a RegionRec or CandidateClusterRec object" - - # Ensure genes are left-to-right - genes = list(rec.genes) - genes.sort(key=lambda g: (g.start, g.end)) - - readout_rec: list[dict] = [] - - for gi, gene in enumerate(genes): - items: list[PKSModuleReadout | NRPSModuleReadout] = [] - - # Collect AMP (A-domain) events for this gene - items.extend( - _nrps_modules_for_gene( - gene, - all_genes=genes, - gi=gi, - cache_dir_override=cache_dir_override, - model=model, - pred_threshold=pred_threshold, - ) - ) - - # Collect PKS module events - items.extend(_pks_modules_for_gene(gene, genes, gi)) - - # Merge in strict genomic order - items.sort(key=lambda d: (int(d.get("start", 0)), int(d.get("end", 0)))) - - if level == "gene" and items: - yield {"rec": gene, "readout": items} - else: - readout_rec.extend(items) - - if level == "rec" and readout_rec: - yield {"rec": rec, "readout": readout_rec} diff --git a/src/biocracker/text_mining.py b/src/biocracker/text_mining.py deleted file mode 100644 index 29baef7..0000000 --- a/src/biocracker/text_mining.py +++ /dev/null @@ -1,316 +0,0 @@ -"""Module contains text mining utilities for BioCracker.""" - -import re -from collections.abc import Iterable, Mapping - -from biocracker.antismash import CandidateClusterRec, RegionRec - -TOKENSPEC_SIDEROPHORE: dict = { - "any": { - # Direct words - "NI-siderophore", - "siderophore", - "iron chelator", - "feo-", - "fe3+ transporter", - # Terms often present around siderophores - "entb", - "enterobactin", - 
"vibriobactin", - "aerobactin", - "pyoverdine", - "yersiniabactin", - "bacillibactin", - "ferrichrome", - "desferrioxamine", - "myxochelin", - "salicylate", - "catecholate", - "dhb", - "2,3-dihydroxybenzoate", - "isochorismate", - "tonb", - "tonb-dependent receptor", - "fhu", - "fep", - "fepA", - "fepB", - }, - "rx": [ - re.compile(r"\b(iuc|ent|vib|fep|fhu)\w*\b", re.I), # common locus prefixes - re.compile(r"\b(iron[-\s]?uptake|iron[-\s]?transport)\b", re.I), - ], - "bonus_if": { - # Add weight when these appear anywhere in the record - "AMP-binding", - "Heterocyclization", - "NRPS", - "siderophore biosynthesis", - }, - "weight": 1.0, - "bonus_weight": 0.5, - "min_score": 4.0, -} - -TOKENSPEC_LIPOPEPTIDE: dict = { - "any": { - "lipopeptide", - "lipoinitiation", - "starter c", - "cstarter", - "acyltransferase", - "CAL_domain", - "FAAL", - "acyl-CoA ligase", - "acyl-CoA synthetase", - }, - "rx": [ - re.compile(r"\b(Cstarter|Condensation[_\s-]?starter)\b", re.I), - re.compile(r"\b(acyl[-\s]?(ligase|co[a\-]?synthetase))\b", re.I), - ], - "bonus_if": { - "AMP-binding", - "PP-binding", - "ACP", - "Condensation", - }, - "weight": 1.0, - "bonus_weight": 0.5, - "min_score": 2.0, -} - -TOKENSPEC_METHYLTRANSFERAESE: dict = { - "any": { - # Common names / domain kinds - "methyltransferase", - "N-methyltransferase", - "O-methyltransferase", - "C-methyltransferase", - "SAM-dependent methyltransferase", - "S-adenosylmethionine", - "AdoMet", - "MT", - "Methyltransferase", - }, - "rx": [ - # Frequent abbreviations in annotations - re.compile(r"\b[ONC]-?MT\b", re.I), # OMT / NMT / CMT - re.compile(r"\bSAM[-\s]?dependent\b", re.I), - ], - "bonus_if": { - # Mild boosts when tailoring context is present - "NRPS", - "T1PKS", - "PKS", - "ACP", - "PP-binding", - "oxidoreductase", - }, - "weight": 1.0, - "bonus_weight": 0.3, - "min_score": 4.0, -} - -TOKENSPEC_HALOGENASES: dict = { - "any": { - # Keep very specific strings only (broad 'halogenase' moved to regex to avoid 'dehalogenase') - 
"flavin-dependent halogenase", - "flavin dependent halogenase", - "FAD-dependent halogenase", - "FAD dependent halogenase", - "chlorinase", - "fluorinase", - "brominase", - "halogenation", - }, - "rx": [ - # Generic halogenase, but NOT 'dehalogenase' - re.compile(r"(? Mapping[str, Mapping]: - """ - Returns the default token specifications for text mining. - - :return: mapping of token names to their specifications - """ - return { - "siderophore": TOKENSPEC_SIDEROPHORE, - "lipopeptide": TOKENSPEC_LIPOPEPTIDE, - "methyltransferase": TOKENSPEC_METHYLTRANSFERAESE, - "halogenase": TOKENSPEC_HALOGENASES, - "glycosyltransferase": TOKENSPEC_GLYCOSYLTRANSFERASES, - } - - -def _harvest_text_for_token_mining(rec: RegionRec | CandidateClusterRec) -> list[tuple[str, str]]: - """ - Harvests textual fields from a RegionRec or CandidateClusterRec for token mining. - - :param rec: RegionRec or CandidateClusterRec object - :return: list of (field_label, field_value) tuples containing textual hints - """ - fields: list[tuple[str, str]] = [] - - # Record-level fields - rid = getattr(rec, "record_id", "") or "" - fields.append(("record_id", str(rid))) - - # Product tags (region/cand_cluster) - if getattr(rec, "product_tags", None): - fields.append(("product_tags", " ".join(rec.product_tags))) - - # Genes and domains - for g in rec.genes: - # Gene-level textual hints - for label, val in ( - ("gene_name", g.name), - ("gene_product", g.product or ""), - ("gene_note", g.note or ""), - ("gene_desc", g.description or ""), - ("gene_symbol", g.gene_symbol or ""), - ("locus_tag", g.locus_tag or ""), - ("protein_id", g.protein_id or ""), - ): - if val: - fields.append((label, str(val))) - - # Domain-level hints - for d in g.domains: - if d.kind: - fields.append(("domain_kind", d.kind)) - if d.name: - fields.append(("domain_name", d.name)) - # Qualifiers often hold useful textual hints - for k, v in (d.raw_qualifiers or {}).items(): - if isinstance(v, (list, tuple)): - fields.append((f"q:{k}", " 
".join(map(str, v)))) - elif v: - fields.append((f"q:{k}", str(v))) - - return fields - - -def mine_virtual_tokens(rec: RegionRec | CandidateClusterRec, tokenspecs: Mapping[str, Mapping]) -> list[dict]: - """ """ - fields = _harvest_text_for_token_mining(rec) - - # Flatten corpus for bonus checks - full_blob = " ".join(t for _, t in fields).lower() - - results: list[dict] = [] - - for token, spec in tokenspecs.items(): - weight = float(spec.get("weight", 1.0)) - bonus_weight = float(spec.get("bonus_weight", 0.5)) - min_score = float(spec.get("min_score", 1.0)) - - any_terms: Iterable[str] = spec.get("any") or [] - rx_terms: Iterable[re.Pattern] = spec.get("rx") or [] - bonus_terms: Iterable[str] = spec.get("bonus_if") or [] - - score = 0.0 - matches: list[tuple[str, str]] = [] - evidence_labels: set[str] = set() - - # Substring matches (case-insensitive) - lower_terms = [t.lower() for t in any_terms] - for label, text in fields: - tl = text.lower() - hit = False - - for t in lower_terms: - if t and t in tl: - score += weight - hit = True - - for rx in rx_terms: - if rx.search(text): - score += weight - hit = True - - if hit: - matches.append((label, text)) - evidence_labels.add(label) - - # Bonus checks: bonus if any of bonus_terms appear anywhere - for b in bonus_terms: - if b.lower() in full_blob: - score += bonus_weight - - if score >= min_score: - results.append( - { - "token": token, - "score": round(score, 3), - "matches": matches[:10], # cap evidence length - "evidence": sorted(evidence_labels), - } - ) - - # Highest score first, deterministic - results.sort(key=lambda r: (-r["score"], r["token"])) - return results diff --git a/src/biocracker/utils/__init__.py b/src/biocracker/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/biocracker/helpers.py b/src/biocracker/utils/download.py similarity index 98% rename from src/biocracker/helpers.py rename to src/biocracker/utils/download.py index eb76b78..10bcf00 100644 --- 
a/src/biocracker/helpers.py +++ b/src/biocracker/utils/download.py @@ -1,4 +1,4 @@ -"""Module contains helper functions for various tasks.""" +"""Utility functions for downloading and preparing files.""" import os import platform @@ -12,7 +12,8 @@ from tqdm import tqdm -from biocracker.config import NAME_CACHE_DIR + +NAME_CACHE_DIR = os.getenv("NAME_CACHE_DIR", "biocracker_cache") def get_biocracker_cache_dir(path: str | Path | None = None) -> Path: @@ -287,4 +288,4 @@ def download_and_prepare(url: str, cache_dir: str | Path | None = None, *, force url_txt.write_text(url) ready_marker.touch() - return _resolve_return_path(item_dir) + return _resolve_return_path(item_dir) \ No newline at end of file diff --git a/src/biocracker/utils/json.py b/src/biocracker/utils/json.py new file mode 100644 index 0000000..8ebd918 --- /dev/null +++ b/src/biocracker/utils/json.py @@ -0,0 +1,25 @@ +"""Module for streaming JSON and JSONL files.""" + +import json +from typing import Any, Generator + +import ijson + + +def iter_json(path: str, jsonl: bool = False) -> Generator[Any, None, None]: + """ + Stream items from a JSON array or a JSON Lines (JSONL) file. + + :param path: path to the JSON or JSONL file + :param jsonl: if True, treat the file as JSONL (one JSON object per line). 
If False, assume a single JSON array + :yield: parsed JSON objects + """ + with open(path, "rb") as f: + if jsonl: + for line in f: + line = line.strip() + if not line: + continue + yield json.loads(line) + else: + yield from ijson.items(f, "item") diff --git a/src/biocracker/utils/logging.py b/src/biocracker/utils/logging.py new file mode 100644 index 0000000..79fc43b --- /dev/null +++ b/src/biocracker/utils/logging.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +import logging +import sys +from dataclasses import dataclass + + +PACKAGE_LOGGER = "biocracker" + +STANDARD_FMT = "%(asctime)s | %(levelname)s | %(name)s | %(message)s" +STANDARD_DATEFMT = "%Y-%m-%d %H:%M:%S" + + +def setup_logging( + level: str | int = "INFO", + *, + fmt: str = STANDARD_FMT, + datefmt: str = STANDARD_DATEFMT, + stream: None | int | str | object = None, +) -> None: + """ + Set up logging for the biocracker package. + + :param level: log level for console output + :param fmt: log message format + :param datefmt: date format for log messages + :param stream: output stream for console logs; defaults to sys.stderr + .. note:: safe to call multiple times; library code should not call this function; + it is intended for use by applications using the library + """ + if stream is None: + stream = sys.stderr + + if isinstance(level, str): + level = level.upper() + + root = logging.getLogger() + root.setLevel(level) + + handler = logging.StreamHandler(stream) + handler.setLevel(level) + handler.setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt)) + + # Avoid duplicate handlers if called repeatedly (common in notebooks) + # Keep it simple: remove existing handlers created by previous setup calls. 
+ root.handlers = [handler] + + # Make sure package logger propagates to root + logging.getLogger(PACKAGE_LOGGER).propagate = True + + +def add_file_handler( + logfile: str, + *, + level: str | int = "DEBUG", + fmt: str = STANDARD_FMT, + datefmt: str = STANDARD_DATEFMT, +) -> None: + """ + Add a file handler to the root logger. + + :param logfile: path to log file + :param level: log level for file output + .. note:: intended to be called after setup_logging(); safe to call multiple times + for the same logfile + """ + if isinstance(level, str): + level = level.upper() + + root = logging.getLogger() + + # Prevent duplicate file handlers for the same path + for h in root.handlers: + if isinstance(h, logging.FileHandler) and h.baseFilename == logfile: + return + + fh = logging.FileHandler(logfile) + fh.setLevel(level) + fh.setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt)) + + root.addHandler(fh) + + +@dataclass(frozen=True) +class Ctx: + """ + Context information for logging. + + :cvar region: optional region identifier + :cvar gene: optional gene identifier + :cvar domain: optional domain identifier + :cvar model: optional model identifier + + .. note:: + Usage example: + ctx = Ctx(region="chr1", gene="BRCA1") + logger.info(f"{ctx.prefix()}This is a log message.") + """ + + region: str | None = None + gene: str | None = None + domain: str | None = None + model: str | None = None + + def prefix(self) -> str: + """ + Generate a log prefix string based on the context.
+ + :return: formatted prefix string + """ + parts = [] + if self.region: parts.append(f"region={self.region}") + if self.gene: parts.append(f"gene={self.gene}") + if self.domain: parts.append(f"domain={self.domain}") + if self.model: parts.append(f"model={self.model}") + return ("[" + " ".join(parts) + "] ") if parts else "" diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_antismash.py b/tests/test_antismash.py deleted file mode 100644 index 5ffb87e..0000000 --- a/tests/test_antismash.py +++ /dev/null @@ -1,294 +0,0 @@ -"""Tests for antismash parsing functions.""" - -import io -from pathlib import Path - -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqFeature import FeatureLocation, SeqFeature -from Bio.SeqRecord import SeqRecord - -from biocracker.antismash import ( - DomainRec, - GeneRec, - RegionRec, - _collect_region, - _domain_rec_from_feat, - _gene_name, - _gene_rec_from_feat, - _in_bounds, - _iter_cds, - _iter_domains, - _iter_regions, - _q1, - _start_end, - parse_region_gbk_file, - parse_region_gbk_string, -) - - -def make_region(start: int, end: int, product: str = "NRPS") -> SeqFeature: - """ - Create a region feature. - - :param start: start position - :param end: end position - :param product: product qualifier - :return: SeqFeature representing the region - """ - return SeqFeature( - FeatureLocation(start, end, strand=1), - type="region", - qualifiers={"product": [product]}, - ) - - -def make_cds(start: int, end: int, strand: int = 1, qualifiers: dict | None = None) -> SeqFeature: - """ - Create a CDS feature. 
- - :param start: start position - :param end: end position - :param strand: strand (1 or -1) - :param qualifiers: additional qualifiers to include - :return: SeqFeature representing the CDS""" - q = { - "product": ["hypothetical protein"], - "translation": ["M" * 10], - "locus_tag": ["geneA"], - "EC_number": ["1.1.1.1"], - "protein_id": ["PID123"], - "gene": ["foo"], - } - if qualifiers: - q.update(qualifiers) - return SeqFeature( - FeatureLocation(start, end, strand=strand), - type="CDS", - qualifiers=q, - ) - - -def make_domain( - start: int, end: int, kind: str = "AMP-binding", label: str = "A", translation: str = "AAA" -) -> SeqFeature: - """ - Create a aSDomain feature. - - :param start: start position - :param end: end position - :param kind: kind of domain - :param label: label of the domain - :param translation: amino acid translation - :return: SeqFeature representing the domain - """ - return SeqFeature( - FeatureLocation(start, end, strand=1), - type="aSDomain", - qualifiers={ - "aSDomain": [kind], - "label": [label], - "translation": [translation], - "note": [f"{kind} domain"], - }, - ) - - -def make_record() -> SeqRecord: - """ - Build a small synthetic record: - - region: 100..900 - - gene1 (fwd): 120..400, domains at 140..200 (A), 220..260 (PCP) - - gene2 (rev): 500..800, domains at 650..680 (C), 700..750 (TE) (note order on genome) - - :return: SeqRecord with features - """ - rec = SeqRecord(Seq("N" * 1200), id="REC1", name="REC1", description="test") - # Required by Biopython's GenBank writer: - rec.annotations["molecule_type"] = "DNA" - # Optional but harmless - rec.annotations["topology"] = "linear" - rec.annotations["organism"] = "Unknown" - rec.annotations["data_file_division"] = "UNK" - - region = make_region(100, 900, product="NRPS") - - cds1 = make_cds(120, 400, strand=1, qualifiers={"locus_tag": ["geneFwd"], "product": ["enzyme X"]}) - dom1a = make_domain(140, 200, kind="AMP-binding", label="A", translation="AA1") - dom1b = 
make_domain(220, 260, kind="PCP", label="PCP", translation="BB2") - - cds2 = make_cds(500, 800, strand=-1, qualifiers={"locus_tag": ["geneRev"], "product": ["enzyme Y"]}) - # Domains on genome left-to-right: 650..680 then 700..750 - dom2a = make_domain(650, 680, kind="C", label="C", translation="CC3") - dom2b = make_domain(700, 750, kind="TE", label="TE", translation="DD4") - - rec.features.extend([region, cds1, cds2, dom1a, dom1b, dom2a, dom2b]) - return rec - - -def test_q1_and_start_end_and_gene_name_fallback() -> None: - """ - Test _q1, _start_end, and _gene_name fallback behavior. - """ - feat = make_cds(10, 50, strand=1, qualifiers={"locus_tag": ["abc"], "gene": ["gX"]}) - # _q1 returns first present key - assert _q1(feat, ("nope", "gene", "locus_tag")) == "gX" - # start/end/strand normalized - strand, s, e = _start_end(feat) - assert (strand, s, e) == (1, 10, 50) - - # Fallback name when no usual qualifiers - feat_no_name = make_cds(100, 120, strand=-1, qualifiers={"product": ["p"]}) - for k in ("locus_tag", "gene", "protein_id", "Name"): - feat_no_name.qualifiers.pop(k, None) - fallback = _gene_name(feat_no_name) - assert fallback == "CDS_100_120_rev" - - -def test_iterators_and_bounds() -> None: - """ - Test the iterators for regions, CDS, and domains, and that features are within bounds. - """ - rec = make_record() - regs = _iter_regions(rec) - cds = _iter_cds(rec) - doms = _iter_domains(rec) - assert len(regs) == 1 - assert len(cds) == 2 - assert len(doms) == 4 - - # All CDS/domains should be within the region - for f in cds + doms: - assert _in_bounds(f, regs[0]) - - -def test_domain_and_gene_rec_builders() -> None: - """ - Test the construction of GeneRec and DomainRec from features. 
- """ - rec = make_record() - # Use the first CDS and a domain - cds_feat = [f for f in rec.features if f.type == "CDS"][0] - dom_feat = [f for f in rec.features if f.type == "aSDomain"][0] - - grecord = _gene_rec_from_feat(cds_feat) - assert isinstance(grecord, GeneRec) - assert grecord.name == "geneFwd" - assert grecord.strand == 1 - assert grecord.start == 120 and grecord.end == 400 - assert grecord.protein_seq == "M" * 10 - assert grecord.product == "enzyme X" - assert grecord.ec_number == "1.1.1.1" - assert grecord.protein_id == "PID123" - assert "translation" in grecord.raw_qualifiers - - drecord = _domain_rec_from_feat(dom_feat) - assert isinstance(drecord, DomainRec) - assert drecord.kind == "AMP-binding" - assert drecord.name == "A" - assert drecord.aa_seq == "AA1" - assert drecord.start == 140 and drecord.end == 200 - assert "aSDomain" in drecord.raw_qualifiers - - -def test_collect_region_gene_and_domain_ordering() -> None: - """ - Test the ordering of genes and domains within a region. 
- """ - rec = make_record() - regions = _collect_region(rec) - assert len(regions) == 1 - R = regions[0] - assert isinstance(R, RegionRec) - assert R.record_id == "REC1" - assert R.product_tags == ["NRPS"] - assert len(R.genes) == 2 - - g1, g2 = R.genes - # Genes are sorted by genomic start - assert g1.name == "geneFwd" - assert g2.name == "geneRev" - - # Domains inside g1 (forward) remain left-to-right - kinds_g1 = [d.kind for d in g1.domains] - assert kinds_g1 == ["AMP-binding", "PCP"] - - # Domains inside g2 (reverse) should be reversed relative to genomic sort - # On genome: C (650..680), TE (700..750) => sorted asc ["C","TE"] - # For reverse gene, implementation reverses => ["TE","C"] - kinds_g2 = [d.kind for d in g2.domains] - assert kinds_g2 == ["TE", "C"] - - -def test_parse_region_gbk_string_roundtrip() -> None: - """ - Test parsing a GenBank string via the full roundtrip: build record -> write GenBank -> parse GenBank - """ - # Build record, write to a real GenBank string with Biopython, then parse via the function - rec = make_record() - buf = io.StringIO() - SeqIO.write([rec], buf, "genbank") - gbk_text = buf.getvalue() - - regions = parse_region_gbk_string(gbk_text) - assert len(regions) == 1 - R = regions[0] - assert [g.name for g in R.genes] == ["geneFwd", "geneRev"] - # Confirm domain normalization persisted through the full parse - assert [d.kind for d in R.genes[1].domains] == ["TE", "C"] - - -def test_parse_region_gbk_file(tmp_path: Path) -> None: - """ - Test parsing a GenBank file via the full roundtrip: build record -> write GenBank file -> parse GenBank file - """ - rec = make_record() - gbk = tmp_path / "mini.gbk" - with gbk.open("w") as handle: - SeqIO.write([rec], handle, "genbank") - - regions = parse_region_gbk_file(str(gbk)) - assert len(regions) == 1 - assert isinstance(regions[0], RegionRec) - assert len(regions[0].genes) == 2 - - -def test_gene_name_priority_order() -> None: - """ - Test the priority of gene name sources. 
- """ - # Ensure priority: locus_tag > gene > protein_id > Name - # Start by removing all, then add them progressively - base = make_cds(10, 20, qualifiers={}) - for key in ("locus_tag", "gene", "protein_id", "Name"): - base.qualifiers.pop(key, None) - - # Add only Name - base.qualifiers["Name"] = ["N1"] - assert _gene_name(base) == "N1" - - # Add protein_id overrides Name - base.qualifiers["protein_id"] = ["PID"] - assert _gene_name(base) == "PID" - - # Add gene overrides protein_id - base.qualifiers["gene"] = ["gSym"] - assert _gene_name(base) == "gSym" - - # Add locus_tag overrides gene - base.qualifiers["locus_tag"] = ["LT"] - assert _gene_name(base) == "LT" - - -def test_in_bounds_false_when_outside() -> None: - """ - Test that _in_bounds returns False when child feature is outside parent bounds. - """ - parent = make_region(100, 200) - child = make_domain(50, 120) # starts before - assert not _in_bounds(child, parent) - child2 = make_domain(150, 250) # ends after - assert not _in_bounds(child2, parent) - # Exactly on edges is acceptable - child3 = make_domain(100, 200) - assert _in_bounds(child3, parent) diff --git a/tests/test_helpers.py b/tests/test_helpers.py deleted file mode 100644 index b04c46b..0000000 --- a/tests/test_helpers.py +++ /dev/null @@ -1,295 +0,0 @@ -"""Tests for biocracker.helpers module.""" - -import io -import zipfile -from pathlib import Path -from types import TracebackType - -from pytest import MonkeyPatch - -from biocracker.helpers import ( - _collapse_singleton, - _guess_filename_from_url, - _resolve_return_path, - _slug_url, - download_and_prepare, - get_biocracker_cache_dir, -) - - -class _FakeHTTPResponse: - """A fake HTTP response object for testing purposes.""" - - def __init__(self, data: bytes, chunk: int = 1024) -> None: - """ - Initialize the fake HTTP response. 
- - :param data: bytes, the content to be read - :param chunk: int, the chunk size for reading - """ - self._buf = io.BytesIO(data) - self._chunk = chunk - self.headers = {"Content-Length": str(len(data))} - - def read(self, n: int = -1) -> bytes: - """ - Read up to n bytes from the response. - - :param n: int, number of bytes to read. If -1, read all - :return: bytes, the read content - """ - if n == -1: - return self._buf.read() - - return self._buf.read(n) - - def __enter__(self) -> "_FakeHTTPResponse": - """ - Enter the runtime context related to this object. - - :return: self - """ - return self - - def __exit__(self, exc_type: type, exc: Exception, tb: TracebackType) -> bool: - """ - Exit the runtime context related to this object. - - :param exc_type: type, the exception type - :param exc: Exception, the exception instance - :param tb: TracebackType, the traceback object - :return: bool, False to propagate exceptions - """ - return False - - -def _make_zip_bytes(files: dict[str, bytes]) -> bytes: - """ - Create a ZIP archive in memory containing the specified files. - - :returns: a bytes object of a ZIP containing those files - """ - bio = io.BytesIO() - with zipfile.ZipFile(bio, "w", zipfile.ZIP_DEFLATED) as z: - for name, data in files.items(): - z.writestr(name, data) - - return bio.getvalue() - - -def test_get_cache_dir_custom_path_creates_marker(tmp_path: Path) -> None: - """ - Test that providing a custom cache directory creates the directory and marker file. - - :param tmp_path: Path, a temporary directory provided by pytest - """ - custom = tmp_path / "mycache" - path = get_biocracker_cache_dir(custom) - assert path == custom - assert path.exists() - assert (path / ".biocracker_cache_marker").exists() - - -# def test_get_cache_dir_auto_linux_uses_xdg(monkeypatch: MonkeyPatch, tmp_path: Path) -> None: -# """ -# Test that on Linux, the cache directory is created under XDG_CACHE_HOME. 
- -# :param monkeypatch: MonkeyPatch, pytest fixture for monkeypatching -# :param tmp_path: Path, a temporary directory provided by pytest -# """ -# # Pretend we're on Linux -# monkeypatch.setattr("platform.system", lambda: "Linux") -# xdg = tmp_path / "xdg" -# monkeypatch.setenv("XDG_CACHE_HOME", str(xdg)) -# path = get_biocracker_cache_dir() -# assert path == xdg / "biocracker" -# assert (path / ".biocracker_cache_marker").exists() - - -# def test_get_cache_dir_auto_macos(monkeypatch: MonkeyPatch, tmp_path: Path) -> None: -# """ -# Test that on macOS, the cache directory is created under ~/Library/Caches. - -# :param monkeypatch: MonkeyPatch, pytest fixture for monkeypatching -# :param tmp_path: Path, a temporary directory provided by pytest -# """ -# monkeypatch.setattr("platform.system", lambda: "Darwin") -# monkeypatch.setenv("HOME", str(tmp_path)) # control home -# path = get_biocracker_cache_dir() -# assert path == tmp_path / "Library" / "Caches" / "biocracker" -# assert (path / ".biocracker_cache_marker").exists() - - -# def test_get_cache_dir_auto_windows(monkeypatch: MonkeyPatch, tmp_path: Path) -> None: -# """ -# Test that on Windows, the cache directory is created under LOCALAPPDATA. - -# :param monkeypatch: MonkeyPatch, pytest fixture for monkeypatching -# :param tmp_path: Path, a temporary directory provided by pytest -# """ -# monkeypatch.setattr("platform.system", lambda: "Windows") -# monkeypatch.setenv("LOCALAPPDATA", str(tmp_path / "AppData" / "Local")) -# path = get_biocracker_cache_dir() -# assert path == tmp_path / "AppData" / "Local" / "biocracker" -# assert (path / ".biocracker_cache_marker").exists() - - -def test_slug_and_guess_filename() -> None: - """ - Test _guess_filename_from_url and _slug_url functions. 
- """ - url = "https://example.com/path/to/file-model-v1.zip" - assert _guess_filename_from_url(url) == "file-model-v1.zip" - slug1 = _slug_url(url) - slug2 = _slug_url(url) - assert slug1 == slug2 # deterministic - assert slug1.startswith("file-model-v1.zip-") - assert len(slug1.split("-")[-1]) == 16 # 16 hex chars - - -def test_download_non_archive_idempotent(monkeypatch: MonkeyPatch, tmp_path: Path) -> None: - """ - Test that downloading a non-archive file is idempotent and uses caching. - - :param monkeypatch: MonkeyPatch, pytest fixture for monkeypatching - :param tmp_path: Path, a temporary directory provided by pytest - """ - # Fake small payload - data = b"hello-world" - calls = {"count": 0} - - def fake_urlopen(req): - calls["count"] += 1 - return _FakeHTTPResponse(data) - - monkeypatch.setattr("biocracker.helpers.urlopen", fake_urlopen) - - # First call downloads - out = download_and_prepare( - "https://example.com/payload.bin", - cache_dir=tmp_path, - ) - assert out.is_file() - assert out.read_bytes() == data - assert calls["count"] == 1 - - # Second call should hit READY and not re-download - def fail_urlopen(_): - raise AssertionError("Should not be called on idempotent run") - - monkeypatch.setattr("biocracker.helpers.urlopen", fail_urlopen) - out2 = download_and_prepare( - "https://example.com/payload.bin", - cache_dir=tmp_path, - ) - assert out2 == out - - -def test_download_zip_single_file_returns_inner_file(monkeypatch: MonkeyPatch, tmp_path: Path) -> None: - """ - Test downloading a ZIP archive with a single file returns that file directly. 
- - :param monkeypatch: MonkeyPatch, pytest fixture for monkeypatching - :param tmp_path: Path, a temporary directory provided by pytest - """ - zip_bytes = _make_zip_bytes({"model.pt": b"MODEL-DATA"}) - monkeypatch.setattr("biocracker.helpers.urlopen", lambda req: _FakeHTTPResponse(zip_bytes)) - - url = "https://host/models/model.zip" - out = download_and_prepare(url, cache_dir=tmp_path) - # Should return the single inner file - assert out.is_file() - assert out.name == "model.pt" - assert out.read_bytes() == b"MODEL-DATA" - # Ensure the container zip was removed (only extracted contents remain) - item_dir = next((tmp_path / "downloads").rglob("model.zip-*")) - assert not (item_dir / "model.zip").exists() - - -def test_download_zip_multi_file_returns_dir(monkeypatch: MonkeyPatch, tmp_path: Path) -> None: - """ - Test downloading a ZIP archive with multiple files returns the extraction directory. - - :param monkeypatch: MonkeyPatch, pytest fixture for monkeypatching - :param tmp_path: Path, a temporary directory provided by pytest - """ - zip_bytes = _make_zip_bytes( - { - "dir/a.txt": b"A", - "dir/b.txt": b"B", - } - ) - monkeypatch.setattr("biocracker.helpers.urlopen", lambda req: _FakeHTTPResponse(zip_bytes)) - - url = "https://host/archive/data.zip" - out = download_and_prepare(url, cache_dir=tmp_path) - # Should return a directory (multiple files) - assert out.is_dir() - files = sorted(p.name for p in out.rglob("*.txt")) - assert files == ["a.txt", "b.txt"] - - -def test_resume_incomplete_extraction(tmp_path: Path, monkeypatch: MonkeyPatch) -> None: - """ - Test resuming an incomplete extraction of a ZIP file. 
- - :param tmp_path: Path, a temporary directory provided by pytest - :param monkeypatch: MonkeyPatch, pytest fixture for monkeypatching - """ - # Prepare item_dir matching the slug rule - url = "https://host/pkg/single.zip" - from biocracker.helpers import _slug_url, get_biocracker_cache_dir - - base = get_biocracker_cache_dir(tmp_path) - downloads_root = base / "downloads" - slug = _slug_url(url) - item_dir = downloads_root / slug - item_dir.mkdir(parents=True, exist_ok=True) - - # Create a zip file with a single file under the guessed name - zip_path = item_dir / "single.zip" - with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as z: - z.writestr("inner.bin", b"BIN") - - # Monkeypatch urlopen to explode if called (shouldn't download again) - monkeypatch.setattr( - "biocracker.helpers.urlopen", lambda req: (_ for _ in ()).throw(AssertionError("No re-download")) - ) - - out = download_and_prepare(url, cache_dir=tmp_path) - assert out.is_file() - assert out.name == "inner.bin" - assert out.read_bytes() == b"BIN" - assert (item_dir / ".READY").exists() - - -def test_collapse_singleton_and_resolve(tmp_path: Path) -> None: - """ - Test collapsing a singleton directory structure and resolving the return path. - - :param tmp_path: Path, a temporary directory provided by pytest - """ - # Build nested single-dir tree with one file - root = tmp_path / "root" - d1 = root / "A" - d2 = d1 / "B" - d2.mkdir(parents=True) - f = d2 / "only.txt" - f.write_text("x") - - collapsed = _collapse_singleton(root) - # root has single child dir A -> collapse to A (since root itself is not "extracted") - # but our function descends only when path.is_dir() and looks at children; - # since root contains only dir A -> _collapse_singleton(root) collapses to A, then to B - assert collapsed == d2 # fully collapsed to the deepest singleton dir - - # Now test resolve: make structure like item_dir/extracted/... 
- item_dir = tmp_path / "item" - extracted = item_dir / "extracted" - (extracted / "A" / "B").mkdir(parents=True) - inner = extracted / "A" / "B" / "only.txt" - inner.write_text("y") - - ret = _resolve_return_path(item_dir) - assert ret.is_file() - assert ret.read_text() == "y" From 0718435e9f702319912bd5bd9ccab575e54df575 Mon Sep 17 00:00:00 2001 From: David Meijer Date: Sat, 27 Dec 2025 12:16:39 +0100 Subject: [PATCH 04/12] ENH: new parsing module --- scripts/read_gbks.py | 20 +- src/biocracker/io/gbk_antismash.py | 1 + src/biocracker/model/domain.py | 4 + src/biocracker/query/modules.py | 829 ++++++++++++++++++++++++++++- 4 files changed, 847 insertions(+), 7 deletions(-) diff --git a/scripts/read_gbks.py b/scripts/read_gbks.py index d4184c3..523d61c 100644 --- a/scripts/read_gbks.py +++ b/scripts/read_gbks.py @@ -6,7 +6,7 @@ from biocracker.utils.logging import setup_logging, add_file_handler from biocracker.utils.json import iter_json from biocracker.model.region import Region -from biocracker.query.modules import linear_readout +from biocracker.query.modules import LinearReadout, linear_readout def cli() -> argparse.Namespace: @@ -31,10 +31,26 @@ def main() -> None: setup_logging(level="INFO") add_file_handler(os.path.join(args.out, "read_gbks.log"), level="INFO") + readouts: list[LinearReadout] = [] for region_record in iter_json(args.jsonl, jsonl=True): region = Region.from_dict(region_record) readout = linear_readout(region) - print(readout) + readouts.append(readout) + + print(f"Parsed {len(readouts)} linear readouts in total") + + # Sort on readout ID + readouts.sort(key=lambda r: r.id) + + # Only keep readouts with >= 2 modules + readouts = [r for r in readouts if len(r.modules) >= 2] + print(f"Parsed {len(readouts)} linear readouts with >= 2 modules") + + # Get specific readout + readout_ids = ["BGC0000055", "BGC0000336"] + specific_readouts = [r for r in readouts if r.id in readout_ids] + for specific_readout in specific_readouts: + print(f"Specific 
readout {specific_readout.id}: {specific_readout}") if __name__ == "__main__": diff --git a/src/biocracker/io/gbk_antismash.py b/src/biocracker/io/gbk_antismash.py index f1984cc..1b1565d 100644 --- a/src/biocracker/io/gbk_antismash.py +++ b/src/biocracker/io/gbk_antismash.py @@ -164,6 +164,7 @@ def _domain_rec_from_feat(feat: SeqFeature) -> Domain: start=s, end=e, sequence=aa_seq, + raw_qualifiers={k: v for k, v in feat.qualifiers.items()}, ) diff --git a/src/biocracker/model/domain.py b/src/biocracker/model/domain.py index 6f6d840..c7d49c5 100644 --- a/src/biocracker/model/domain.py +++ b/src/biocracker/model/domain.py @@ -1,6 +1,7 @@ """Domain data model.""" from dataclasses import dataclass, field, asdict +from typing import Any from biocracker.model.annotations import AnnotationSet @@ -15,6 +16,7 @@ class Domain: :param start: the starting position of the domain within the gene :param end: the ending position of the domain within the gene :param sequence: the amino acid sequence of the domain + :param raw_qualifiers: raw qualifiers or metadata associated with the domain :param annotations: the set of annotations associated with the domain """ @@ -23,6 +25,7 @@ class Domain: start: int end: int sequence: str + raw_qualifiers: dict[str, Any] = field(default_factory=dict) annotations: AnnotationSet = field(default_factory=AnnotationSet) @@ -51,5 +54,6 @@ def from_dict(cls, data: dict) -> "Domain": start=data["start"], end=data["end"], sequence=data["sequence"], + raw_qualifiers=data.get("raw_qualifiers", {}), annotations=annotations, ) diff --git a/src/biocracker/query/modules.py b/src/biocracker/query/modules.py index ba58c22..b357ee3 100644 --- a/src/biocracker/query/modules.py +++ b/src/biocracker/query/modules.py @@ -1,19 +1,838 @@ -from dataclasses import dataclass +"""Module for constructing linear readouts from genomic regions.""" + +from enum import Enum +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, 
TypeAlias, Literal from biocracker.model.region import Region +from biocracker.model.gene import Gene +from biocracker.model.domain import Domain + + +PKS_TYPES = { + "PKS_KS", + "PKS_AT", + "PKS_KR", + "PKS_DH", + "PKS_ER", +} +PKS_TE_ALIASES = {"Thioesterase", "PKS_TE", "TE"} + + +# Common NRPS domain labels found in antiSMASH outputs +NRPS_A = "AMP-binding" +NRPS_C = "Condensation" +NRPS_T_ALIASES = {"PCP", "Thiolation", "T", "Peptidyl-carrier-protein"} +NRPS_E = "Epimerization" +NRPS_MT_ALIASES = {"N-Methyltransferase", "MT"} +NRPS_OX_ALIASES = {"Oxidase", "Ox", "Oxidoreductase"} +NRPS_R_ALIASES = {"Thioester-reductase", "R", "Reductase"} +NRPS_TE = "Thioesterase" + + +class ModuleType(Enum): + """ + Enumeration of module types. + + :cvar NRPS: Nonribosomal Peptide Synthetase module + :cvar PKS: Polyketide Synthase module + """ + + NRPS = "NRPS" + PKS = "PKS" + + +class ModuleRole(Enum): + """ + Enumeration of module roles. + + :cvar STARTER: Starter module + :cvar ELONGATION: Elongation module + :cvar TERMINAL: Terminal module + :cvar STARTER_TERMINAL: Starter and terminal module + :cvar UNKNOWN: Unknown role + """ + + STARTER = "starter" + ELONGATION = "elongation" + TERMINAL = "terminal" + STARTER_TERMINAL = "starter+terminal" + UNKNOWN = "unknown" + + +@dataclass +class Module(ABC): + """ + Base class for a module in a linear readout. + + :param module_index_in_gene: index of the module within its gene + :param start: starting position of the module + :param end: ending position of the module + :param gene_id: ID of the gene containing the module + :param present_domains: list of domain types present in the module + :param role: functional role of the module + """ + module_index_in_gene: int + start: int + end: int + gene_id: str + present_domains: list[str] + role: ModuleRole + + @property + @abstractmethod + def type(self) -> ModuleType: + """ + Abstract property to get the type of the module. 
+ """ + raise NotImplementedError + + @property + @abstractmethod + def substrate(self) -> Any: + """ + Abstract property to get the substrate information for the module. + """ + raise NotImplementedError + + +@dataclass +class NRPSAnatomy: + """ + Anatomy of a Nonribosomal Peptide Synthetase (NRPS) module. + + :param has_C: presence of condensation domain + :param has_T: presence of thiolation domain + :param has_E: presence of epimerization domain + :param has_MT: presence of methyltransferase domain + :param has_Ox: presence of oxidase domain + :param has_R: presence of reductase domain + :param has_TE: presence of thioesterase domain + """ + + has_C: bool + has_T: bool + has_E: bool + has_MT: bool + has_Ox: bool + has_R: bool + has_TE: bool + + +@dataclass +class NRPSSubstrate: + """ + Substrate information for a Nonribosomal Peptide Synthetase (NRPS) module. + + :param substrate_name: name of the predicted substrate + :param substrate_smiles: SMILES representation of the substrate + :param score: confidence score of the substrate prediction + """ + + substrate_name: str | None + substrate_smiles: str | None + score: float | None + + +class ATLoadingMode(Enum): + """ + Enumeration of acyltransferase (AT) loading modes. + + :cvar CIS: cis-acting AT domain + :cvar TRANS: trans-acting AT domain + :cvar UNKNOWN: unknown AT loading mode + """ + + CIS = "cis" + TRANS = "trans" + UNKNOWN = "unknown" + + +@dataclass +class PKSAnatomy: + """ + Anatomy of a Polyketide Synthase (PKS) module. + + :param has_active_KR: presence of active ketoreductase domain + :param has_active_DH: presence of active dehydratase domain + :param has_active_ER: presence of active enoylreductase domain + :param has_AT: presence of acyltransferase domain + """ + AT_loading_mode: ATLoadingMode + + has_active_KR: bool + has_active_DH: bool + has_active_ER: bool + + +class PKSExtenderUnit(Enum): + """ + Enumeration of PKS extender unit types. 
+ + :cvar PKS_A: PKS extender unit type A + :cvar PKS_B: PKS extender unit type B + :cvar PKS_C: PKS extender unit type C + :cvar PKS_D: PKS extender unit type D + :cvar UNCLASSIFIED: unclassified extender unit type + """ + + PKS_A = "PKS_A" + PKS_B = "PKS_B" + PKS_C = "PKS_C" + PKS_D = "PKS_D" + UNCLASSIFIED = "UNCLASSIFIED" @dataclass -class Module: - ... +class PKSSubstrate: + """ + Substrate information for a Polyketide Synthase (PKS) module. + + :param extender_unit: type of extender unit used in the PKS module + """ + + extender_unit: PKSExtenderUnit + + +@dataclass +class NRPSModule(Module): + """ + Nonribosomal peptide synthetase (NRPS) module. + + :param role: functional role of the module + :param anatomy: anatomical features of the NRPS module + :param substrate: predicted substrate information for the NRPS module + """ + + role: ModuleRole + anatomy: NRPSAnatomy + predicted_substrate: NRPSSubstrate | None = None + + @property + def type(self) -> ModuleType: + """ + Get the type of the module. + + :return: ModuleType.NRPS + """ + return ModuleType.NRPS + + @property + def substrate(self) -> NRPSSubstrate | None: + """ + Get the predicted substrate information for the NRPS module. + + :return: NRPSSubstrate object containing substrate information, or None if not available + """ + return self._substrate + + +@dataclass +class PKSModule(Module): + """ + Polyketide synthase (PKS) module. + + :param type: module type (PKS) + :param role: functional role of the module + :param anatomy: anatomical features of the PKS module + """ + + role: ModuleRole + anatomy: PKSAnatomy + + @property + def type(self) -> ModuleType: + """ + Get the type of the module. + + :return: ModuleType.PKS + """ + return ModuleType.PKS + + @property + def substrate(self) -> PKSSubstrate: + """ + Get the predicted substrate information for the PKS module. 
+ + :return: PKSSubstrate object containing substrate information + """ + # Rules: + # - KS + AT with neither KR nor DH nor ER => PKS_A + # - KS + AT + KR (no DH and no ER) => PKS_B (KR after AT is naturally true in window order) + # - KS + AT + KR + DH (no ER) => PKS_C + # - KS + AT + KR + DH + ER => PKS_D + match ( + self.anatomy.has_active_KR, + self.anatomy.has_active_DH, + self.anatomy.has_active_ER, + ): + case (True, True, True ): return PKSExtenderUnit.PKS_A + case (True, True, False): return PKSExtenderUnit.PKS_B + case (True, False, False): return PKSExtenderUnit.PKS_C + case (False, False, False): return PKSExtenderUnit.PKS_D + case _: return PKSExtenderUnit.UNCLASSIFIED @dataclass class LinearReadout: - ... + """ + A linear readout consisting of a sequence of modules. + + :param id: unique identifier for the linear readout + :param start: starting position of the linear readout + :param end: ending position of the linear readout + :param qualifiers: additional metadata or qualifiers associated with the linear readout + :param modules: list of modules in the linear readout + """ + + id: str + start: int + end: int + qualifiers: dict[str, Any] = field(default_factory=dict) + + modules: list[Module] = field(default_factory=list) + + def __str__(self) -> str: + """ + String representation of the LinearReadout. + + :return: string representation of the LinearReadout + """ + return f"LinearReadout(id={self.id}, start={self.start}, end={self.end}, modules={len(self.modules)})" + + +def _domain_types(domains: list[Domain]) -> set[str]: + """ + Helper function to extract the set of domain types from a list of Domain objects. + + :param domains: List of Domain objects + :return: Set of domain type strings + """ + return {d.type for d in domains if d.type is not None} + + +def _is_domain_type(domain: Domain, label: str | set[str]) -> bool: + """ + Check if a domain matches a given type label or set of labels. 
+ + :param domain: Domain object to check + :param label: domain type label or set of labels to match against + :return: True if the domain type matches the label(s), False otherwise + """ + if not domain.type: + return False + + if isinstance(label, set): + return domain.type in label + + return domain.type == label + + +def _is_Cstarter(domain: Domain) -> bool: + """ + Determine if a condensation domain is a C-starter domain based on its qualifiers. + + :param domain: Domain object to evaluate + :return: True if the domain is a C-starter, False otherwise + """ + if not domain.type or domain.type != "Condensation": + return False + + txts = [] + if domain.id: + txts.append(domain.id) + + for _, vals in domain.raw_qualifiers.items(): + # Join lists and scalars; qualifiers may be list[str] + if isinstance(vals, (list, tuple)): + txts.extend(map(str, vals)) + else: + txts.append(str(vals)) + + blob = " ".join(txts).lower() + + return ("starter" in blob) or ("cstarter" in blob) or ("condensation_starter" in blob) + + +def _upstream_loading_cassette(all_genes: list[Gene], gene_idx: int, max_bp: int = 20_000) -> bool: + """ + Check for upstream loading cassette (CAL + ACP) in upstream genes within max_bp distance. 
+ + :param all_genes: list of all Gene objects in the region/cluster + :param gene_idx: index of the gene gene in all_genes + :param max_bp: maximum base pair distance to search upstream + :return: True if a loading cassette is found upstream within max_bp, False otherwise + """ + cur_start = all_genes[gene_idx].start + + seen_cal = False + seen_acp = False + for j in range(gene_idx - 1, -1, -1): + g = all_genes[j] + if cur_start - g.end > max_bp: + break # exceeded max distance + types = _domain_types(g.domains) + d_ids = {d.id for d in g.domains if d.id} + if ("CAL_domain" in types) or any("faal" in d_id.lower() for d_id in d_ids): + seen_cal = True + if ("PP-binding" in types) or ("ACP" in types) or any("acp" in d_id.lower() for d_id in d_ids): + seen_acp = True + if seen_cal and seen_acp: + return True + + return False + + +def _upstream_has_nrps_A(all_genes: list[Gene], gene_idx: int) -> bool: + """ + Check if there is an upstream gene with an NRPS A-domain. + + :param all_genes: list of all Gene objects in the region/cluster + :param gene_idx: index of the gene gene in all_genes + :return: True if there is an upstream NRPS A-domain, False otherwise + """ + for j in range(gene_idx - 1, -1, -1): + if any(_is_domain_type(d, NRPS_A) for d in all_genes[j].domains): + return True + + return False + + +def collect_nrps_modules(gene: Gene, gene_idx: int, all_genes: list[Gene]) -> list[NRPSModule]: + """ + Collect NRPS modules from a given gene. 
+ + :param gene: Gene object to analyze + :param gene_idx: index of the gene in the region's gene list + :param all_genes: List of all genes in the region + :return: List of NRPSModule objects""" + doms: list[Domain] = list(gene.domains) + out: list[NRPSModule] = [] + + # Indices of A domains in left-to-right order + a_idx = [i for i, d in enumerate(doms) if _is_domain_type(d, NRPS_A)] + if not a_idx: + return out # no A domains, no modules + + for mi, ai in enumerate(a_idx): + # Extend window backward by one if there is an immediately previous C (same gene) + start_i = ai + if ai - 1 >= 0 and _is_domain_type(doms[ai - 1], NRPS_C): + start_i = ai - 1 + + # Extend forward until (but not including) the next A-domain + end_i = a_idx[mi + 1] if mi + 1 < len(a_idx) else len(doms) + + window = doms[start_i:end_i] + present = _domain_types(window) + + has_C = any(_is_domain_type(d, NRPS_C) for d in window) + has_Cstarter = any(_is_Cstarter(d) for d in window) + has_T = any(_is_domain_type(d, NRPS_T_ALIASES) for d in window) + has_E = any(_is_domain_type(d, NRPS_E) for d in window) + has_MT = any(_is_domain_type(d, NRPS_MT_ALIASES) for d in window) + has_Ox = any(_is_domain_type(d, NRPS_OX_ALIASES) for d in window) + has_R = any(_is_domain_type(d, NRPS_R_ALIASES) for d in window) + has_TE = any(_is_domain_type(d, NRPS_TE) for d in window) + + # Fallback evidence of a separate loading cassette upstream + loading_upstream = _upstream_loading_cassette(all_genes, gene_idx) + upstream_has_A = _upstream_has_nrps_A(all_genes, gene_idx) + + # Role heuristic + is_first_module_in_gene = mi == 0 + + starter = ( + has_Cstarter + or (is_first_module_in_gene and loading_upstream and not upstream_has_A) + or ((not has_C) and not upstream_has_A) + ) + terminal = has_TE or has_R + + def _get_module_role(starter: bool, terminal: bool) -> ModuleRole: + match (starter, terminal): + case (True, True ): return ModuleRole.STARTER_TERMINAL + case (True, False): return ModuleRole.STARTER + case 
(False, True ): return ModuleRole.TERMINAL + case (False, False): return ModuleRole.ELONGATION + role: ModuleRole = _get_module_role(starter, terminal) + + s = min(d.start for d in window) + e = max(d.end for d in window) + + # Retrieve A domain substrate specificity prediction + A = doms[ai] + anns = A.annotations + substrate_pred: NRPSSubstrate | None = None + if anns: + preds = anns.results + + # Highest confidence first + preds_sorted = sorted(preds, key=lambda r: r.score or 0.0, reverse=True) + + # Get highest confidence prediction, if any + top_pred = preds_sorted[0] if preds_sorted else None + + if top_pred: + substrate_pred = NRPSSubstrate( + substrate_name=top_pred.label, + substrate_smiles=top_pred.metadata.get("smiles", None), + score=top_pred.score, + ) + + out.append(NRPSModule( + module_index_in_gene=mi, + start=s, + end=e, + gene_id=gene.id, + present_domains=list(present), + role=role, + anatomy=NRPSAnatomy( + has_C=has_C, + has_T=has_T, + has_E=has_E, + has_MT=has_MT, + has_Ox=has_Ox, + has_R=has_R, + has_TE=has_TE, + ), + predicted_substrate=substrate_pred, + )) + + return out + + +def _split_module_on_KS(domains: list[Domain]) -> list[list[Domain]]: + """ + Split a list of domains into windows based on PKS KS domains. + + :param domains: List of Domain objects + :return: List of lists of Domain objects, each representing a module window + """ + windows: list[list[Domain]] = [] + cur: list[Domain] = [] + + for d in domains: + if d.type == "PKS_KS": + # Start new module window anchored at this KS + if cur: + windows.append(cur) + cur = [d] + else: + if cur: # only append if we have started a module + cur.append(d) + + if cur: + windows.append(cur) + + return windows + + +def _is_active_accessory_domain(domain: Domain) -> bool: + """ + Determine if an accessory domain (KR, DH, ER) is active based on its qualifiers. 
+ + :param domain: Domain object to evaluate + :return: True if the domain is active, False if inactive + """ + if not domain.type: + return True + + if domain.type not in {"PKS_KR", "PKS_DH", "PKS_ER"}: + return True # not a reducible domain, consider active by default + + texts = [] + if domain.id: + texts.append(domain.id) + for _, vals in domain.raw_qualifiers.items(): + if isinstance(vals, list): + texts.extend(map(str, vals)) + else: + texts.append(str(vals)) + + blob = " ".join(texts).lower() + + # Common antiSMASH phrasing patterns + inactive_flags = [ + "inactive", + "nonfunctional", + "non-functional", + "inactivated", + "broken", + "truncated", + ] + return not any(flag in blob for flag in inactive_flags) + + +def _classify_pks_window(window: list[Domain]) -> tuple[set[str], bool, bool, bool, bool]: + """ + Classify a PKS module window based on the presence and activity of domains. + + :param window: list of Domain objects in the module window + :return: tuple containing: + - module type (str) + - set of present domain types (set[str]) + - has active KR (bool) + - has active DH (bool) + - has active ER (bool) + - has AT (bool) + """ + types_linear = [d.type for d in window if d.type in PKS_TYPES] + present = set(types_linear) + + has_AT = "PKS_AT" in present + has_active_KR = any("PKS_KR" in present and _is_active_accessory_domain(d) for d in window if d.type == "PKS_KR") + has_active_DH = any("PKS_DH" in present and _is_active_accessory_domain(d) for d in window if d.type == "PKS_DH") + has_active_ER = any("PKS_ER" in present and _is_active_accessory_domain(d) for d in window if d.type == "PKS_ER") + + return present, has_active_KR, has_active_DH, has_active_ER, has_AT + + +def _window_bounds(window: list[Domain]) -> tuple[int, int]: + """ + Get the start and end positions of a domain window. 
+ + :param window: list of Domain objects in the module window + :return: tuple of (start, end) positions + """ + return min(d.start for d in window), max(d.end for d in window) + + +def _is_AT_only_gene(gene: Gene) -> bool: + """ + Helper function to determine if a gene is an acyltransferase-domain-only gene. + + :param g: Gene object + :return: True if the gene is an AT-only gene, False otherwise + """ + types = _domain_types(gene.domains) + return ("PKS_AT" in types) and all(t in {"PKS_AT"} for t in types) + + +def _find_upstream_AT_only_gene(all_genes: list[Gene], gene_idx: int) -> Gene | None: + """ + Return the nearest upstream gene that is AT-only (relative to all_genes order). + + :param all_genes: list of Gene objects + :param gene_idx: index of the current gene in all_genes + :return: Gene object of the nearest upstream AT-only gene, or None if not found + """ + for j in range(gene_idx - 1, -1, -1): + if _is_AT_only_gene(all_genes[j]): + return all_genes[j] + + return None + + +def _upstream_has_pks_KS(all_genes: list[Gene], gene_idx: int, ks_start: int) -> bool: + """ + Check if there is an upstream gene with a PKS KS-domain. + + :param all_genes: list of all Gene objects in the region/cluster + :param gene_idx: index of the gene gene in all_genes + :param ks_start: start position of the KS domain in gene + :return: True if there is an upstream KS-domain, False otherwise + """ + # Genes upstream + for j in range(gene_idx -1, -1, -1): + if any(d.type == "PKS_KS" for d in all_genes[j].domains): + return True + + # Same gene, KS before this window's KS + for d in all_genes[gene_idx].domains: + if d.type == "PKS_KS" and d.start < ks_start: + return True + + return False + + +def _standalone_pks_AT_upstream( + all_genes: list[Gene], + gene_idx: int, + ks_start: int, + max_bp: int = 20_000 +) -> bool: + """ + Check for standalone PKS AT domain in upstream genes within max_bp distance. 
+ + :param all_genes: list of all Gene objects in the region/cluster + :param gene_idx: index of the gene gene in all_genes + :param ks_start: start position of the KS domain in gene + :param max_bp: maximum base pair distance to search upstream + :return: True if a standalone PKS AT domain is found upstream within max_bp, False otherwise + """ + cur_start = ks_start + + # Same gene, before ks_start + for d in all_genes[gene_idx].domains: + if d.type == "PKS_AT" and d.end < ks_start: + return True + + # Upstream genes, within distance + for j in range(gene_idx - 1, -1, -1): + gene = all_genes[j] + if cur_start - gene.end > max_bp: + break # exceeded max distance + + if any(d.type == "PKS_AT" for d in gene.domains): + return True + + return False + + +def _is_last_global_KS(all_genes: list[Gene], gene_idx: int, ks_start: int) -> bool: + """ + Check if the given KS domain is the last KS domain in the entire gene cluster/region. + + :param all_genes: list of all Gene objects in the region/cluster + :param gene_idx: index of the gene gene in all_genes + :param ks_start: start position of the KS domain in gene + :return: True if this is the last KS domain, False otherwise + """ + # Same gene, any KS after this ks_start? + for d in all_genes[gene_idx].domains: + if d.type == "PKS_KS" and d.start > ks_start: + return False + + # Downstream genes + for j in range(gene_idx + 1, len(all_genes)): + if any(d.type == "PKS_KS" for d in all_genes[j].domains): + return False + + return True + + +def _downstream_has_TE( + all_genes: list[Gene], + gene_idx: int, + from_bp: int, + max_bp: int = 20_000 +) -> bool: + """ + Check for downstream thioesterase (TE) domain in downstream genes within max_bp distance. 
+ + :param all_genes: list of all Gene objects in the region/cluster + :param gene_idx: index of the gene gene in all_genes + :param from_bp: base pair position to start searching from + :param max_bp: maximum base pair distance to search downstream + :return: True if a thioesterase domain is found downstream within max_bp, False otherwise + """ + # Same gene after from_bp + for d in all_genes[gene_idx].domains: + if d.type in PKS_TE_ALIASES and d.start > from_bp: + return True + + # Next genes within window + cur_end = from_bp + for j in range(gene_idx + 1, len(all_genes)): + gene = all_genes[j] + if gene.start - cur_end > max_bp: + break # exceeded max distance + + if any(d.type in PKS_TE_ALIASES for d in gene.domains): + return True + + return False + + +def collect_pks_modules(gene: Gene, gene_idx: int, all_genes: list[Gene]) -> list[PKSModule]: + """ + Collect PKS modules from a given gene. + + :param gene: Gene object to analyze + :param gene_idx: index of the gene in the region's gene list + :param all_genes: list of all genes in the region + :return: list of PKSModule objects + """ + out: list[PKSModule] = [] + + if all(d.type != "PKS_KS" for d in gene.domains): + return out # no KS domains, no modules + + windows = _split_module_on_KS(gene.domains) + for mi, win in enumerate(windows): + ( + present, + has_active_KR, + has_active_DH, + has_active_ER, + has_AT, + ) = _classify_pks_window(win) + + ks_start = win[0].start + s, e = _window_bounds(win) + + if has_AT: + AT_src: ATLoadingMode = ATLoadingMode.CIS + else: + AT_src: ATLoadingMode = ( + ATLoadingMode.TRANS + if _find_upstream_AT_only_gene(all_genes, gene_idx) is not None + else ATLoadingMode.UNKNOWN + ) + + # Assign provisional PKS role + has_TE_in_window = any(d.type in PKS_TE_ALIASES for d in win) + upstream_has_KS = _upstream_has_pks_KS(all_genes, gene_idx, ks_start) + starter = _standalone_pks_AT_upstream(all_genes, gene_idx, ks_start) and not upstream_has_KS + + terminal_by_TE = False + if 
_is_last_global_KS(all_genes, gene_idx, ks_start): + terminal_by_TE = has_TE_in_window or _downstream_has_TE(all_genes, gene_idx, from_bp=e) + + def _get_module_role(starter: bool, terminal_by_TE: bool) -> ModuleRole: + match (starter, terminal_by_TE): + case (True, True ): return ModuleRole.STARTER_TERMINAL + case (True, False): return ModuleRole.STARTER + case (False, True ): return ModuleRole.TERMINAL + case (False, False): return ModuleRole.ELONGATION + role: ModuleRole = _get_module_role(starter, terminal_by_TE) + + s, e = _window_bounds(win) + out.append(PKSModule( + module_index_in_gene=mi, + start=s, + end=e, + gene_id=gene.id, + present_domains=list(present), + role=role, + anatomy=PKSAnatomy( + AT_loading_mode=AT_src, + has_active_KR=has_active_KR, + has_active_DH=has_active_DH, + has_active_ER=has_active_ER, + ), + )) + + return out def linear_readout(region: Region) -> LinearReadout: """ + Construct a linear readout from the given genomic region. + + :param region: Region object representing the genomic region + :return: LinearReadout object containing the collected modules """ - return LinearReadout() + assert isinstance(region, Region), "region must be an instance of Region" + + collected: list[Module] = [] + + for gi, gene in enumerate(region.iter_genes()): + + # Collect NRPS modules + nrps_modules = collect_nrps_modules(gene, gi, region.genes) + collected.extend(nrps_modules) + + # Collect PKS modules + pks_modules = collect_pks_modules(gene, gi, region.genes) + collected.extend(pks_modules) + + return LinearReadout( + id=region.id, + start=region.start, + end=region.end, + qualifiers=region.qualifiers, + modules=collected + ) From ed753e98ca6c1257cca42e708e07209304e787c5 Mon Sep 17 00:00:00 2001 From: David Meijer Date: Sat, 27 Dec 2025 18:17:32 +0100 Subject: [PATCH 05/12] FIX: genomic vs biosynthetic order --- scripts/read_gbks.py | 13 +- src/biocracker/query/modules.py | 729 ++++++++++++++++++++++++-------- 2 files changed, 553 insertions(+), 
189 deletions(-) diff --git a/scripts/read_gbks.py b/scripts/read_gbks.py index 523d61c..5134a95 100644 --- a/scripts/read_gbks.py +++ b/scripts/read_gbks.py @@ -1,12 +1,13 @@ """Parse linear readouts from parsed GenBank files.""" import argparse +import json import os from biocracker.utils.logging import setup_logging, add_file_handler from biocracker.utils.json import iter_json from biocracker.model.region import Region -from biocracker.query.modules import LinearReadout, linear_readout +from biocracker.query.modules import LinearReadout, PKSModule, NRPSModule, linear_readout def cli() -> argparse.Namespace: @@ -47,10 +48,18 @@ def main() -> None: print(f"Parsed {len(readouts)} linear readouts with >= 2 modules") # Get specific readout - readout_ids = ["BGC0000055", "BGC0000336"] + readout_ids = ["BGC0000054", "BGC0000055", "BGC0000336"] specific_readouts = [r for r in readouts if r.id in readout_ids] for specific_readout in specific_readouts: print(f"Specific readout {specific_readout.id}: {specific_readout}") + for module in specific_readout.biosynthetic_order(): + print(f"\t{module.substrate if isinstance(module, PKSModule) else module.substrate.name}", f"{module.role}", sep="\t") + + # Write all readouts to output JSONL + out_jsonl = os.path.join(args.out, "linear_readouts.jsonl") + with open(out_jsonl, "w") as out_f: + for readout in readouts: + out_f.write(json.dumps(readout.to_dict()) + "\n") if __name__ == "__main__": diff --git a/src/biocracker/query/modules.py b/src/biocracker/query/modules.py index b357ee3..29a60c6 100644 --- a/src/biocracker/query/modules.py +++ b/src/biocracker/query/modules.py @@ -1,12 +1,17 @@ -"""Module for constructing linear readouts from genomic regions.""" +""" +Module for constructing linear readouts from genomic regions. + +Note: upstream/downstream scans are genomic (coordinate-based), not biosynthetic! 
+""" -from enum import Enum from abc import ABC, abstractmethod +from collections import Counter from dataclasses import dataclass, field -from typing import Any, TypeAlias, Literal +from enum import Enum +from typing import Any from biocracker.model.region import Region -from biocracker.model.gene import Gene +from biocracker.model.gene import Gene, Strand from biocracker.model.domain import Domain @@ -43,6 +48,7 @@ class ModuleType(Enum): PKS = "PKS" + class ModuleRole(Enum): """ Enumeration of module roles. @@ -70,6 +76,7 @@ class Module(ABC): :param start: starting position of the module :param end: ending position of the module :param gene_id: ID of the gene containing the module + :param gene_strand: strand of the gene containing the module :param present_domains: list of domain types present in the module :param role: functional role of the module """ @@ -77,6 +84,7 @@ class Module(ABC): start: int end: int gene_id: str + gene_strand: Strand present_domains: list[str] role: ModuleRole @@ -96,6 +104,24 @@ def substrate(self) -> Any: """ raise NotImplementedError + def to_dict(self) -> dict[str, Any]: + """ + Convert the Module object to a dictionary representation. + + :return: Dictionary representation of the Module + """ + raise NotImplementedError + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "Module": + """ + Create a Module object from a dictionary representation. + + :param data: Dictionary representation of the Module + :return: Module object + """ + raise NotImplementedError + @dataclass class NRPSAnatomy: @@ -119,21 +145,81 @@ class NRPSAnatomy: has_R: bool has_TE: bool + def to_dict(self) -> dict[str, bool]: + """ + Convert the NRPSAnatomy object to a dictionary representation. 
+ + :return: Dictionary representation of the NRPSAnatomy + """ + return { + "has_C": self.has_C, + "has_T": self.has_T, + "has_E": self.has_E, + "has_MT": self.has_MT, + "has_Ox": self.has_Ox, + "has_R": self.has_R, + "has_TE": self.has_TE, + } + + @classmethod + def from_dict(cls, data: dict[str, bool]) -> "NRPSAnatomy": + """ + Create a NRPSAnatomy object from a dictionary representation. + + :param data: Dictionary representation of the NRPSAnatomy + :return: NRPSAnatomy object + """ + return cls( + has_C=data.get("has_C", False), + has_T=data.get("has_T", False), + has_E=data.get("has_E", False), + has_MT=data.get("has_MT", False), + has_Ox=data.get("has_Ox", False), + has_R=data.get("has_R", False), + has_TE=data.get("has_TE", False), + ) + @dataclass class NRPSSubstrate: """ Substrate information for a Nonribosomal Peptide Synthetase (NRPS) module. - :param substrate_name: name of the predicted substrate - :param substrate_smiles: SMILES representation of the substrate + :param name: name of the predicted substrate + :param smiles: SMILES representation of the substrate :param score: confidence score of the substrate prediction """ - substrate_name: str | None - substrate_smiles: str | None + name: str | None + smiles: str | None score: float | None + def to_dict(self) -> dict[str, Any]: + """ + Convert the NRPSSubstrate object to a dictionary representation. + + :return: Dictionary representation of the NRPSSubstrate + """ + return { + "name": self.name, + "smiles": self.smiles, + "score": self.score, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "NRPSSubstrate": + """ + Create a NRPSSubstrate object from a dictionary representation. 
+ + :param data: Dictionary representation of the NRPSSubstrate + :return: NRPSSubstrate object + """ + return cls( + name=data.get("name", None), + smiles=data.get("smiles", None), + score=data.get("score", None), + ) + class ATLoadingMode(Enum): """ @@ -165,6 +251,37 @@ class PKSAnatomy: has_active_DH: bool has_active_ER: bool + def to_dict(self) -> dict[str, Any]: + """ + Convert the PKSAnatomy object to a dictionary representation. + + :return: Dictionary representation of the PKSAnatomy + """ + return { + "AT_loading_mode": self.AT_loading_mode.value, + "has_active_KR": self.has_active_KR, + "has_active_DH": self.has_active_DH, + "has_active_ER": self.has_active_ER, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "PKSAnatomy": + """ + Create a PKSAnatomy object from a dictionary representation. + + :param data: Dictionary representation of the PKSAnatomy + :return: PKSAnatomy object + """ + raw = (data.get("AT_loading_mode") or "unknown").lower() + AT_loading_mode = ATLoadingMode(raw) if raw in {"cis","trans","unknown"} else ATLoadingMode.UNKNOWN + + return cls( + AT_loading_mode=AT_loading_mode, + has_active_KR=data.get("has_active_KR", False), + has_active_DH=data.get("has_active_DH", False), + has_active_ER=data.get("has_active_ER", False), + ) + class PKSExtenderUnit(Enum): """ @@ -194,6 +311,28 @@ class PKSSubstrate: extender_unit: PKSExtenderUnit + def to_dict(self) -> dict[str, str]: + """ + Convert the PKSSubstrate object to a dictionary representation. + + :return: Dictionary representation of the PKSSubstrate + """ + return { + "extender_unit": self.extender_unit.value, + } + + @classmethod + def from_dict(cls, data: dict[str, str]) -> "PKSSubstrate": + """ + Create a PKSSubstrate object from a dictionary representation. 
+ + :param data: Dictionary representation of the PKSSubstrate + :return: PKSSubstrate object + """ + return cls( + extender_unit=PKSExtenderUnit(data.get("extender_unit", "UNCLASSIFIED")), + ) + @dataclass class NRPSModule(Module): @@ -225,7 +364,49 @@ def substrate(self) -> NRPSSubstrate | None: :return: NRPSSubstrate object containing substrate information, or None if not available """ - return self._substrate + return self.predicted_substrate + + def to_dict(self) -> dict[str, Any]: + """ + Convert the NRPSModule object to a dictionary representation. + + :return: Dictionary representation of the NRPSModule + """ + return { + "type": self.type.value, + "module_index_in_gene": self.module_index_in_gene, + "start": self.start, + "end": self.end, + "gene_id": self.gene_id, + "gene_strand": self.gene_strand.value, + "present_domains": self.present_domains, + "role": self.role.value, + "anatomy": self.anatomy.to_dict(), + "predicted_substrate": self.predicted_substrate.to_dict() if self.predicted_substrate else None, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "NRPSModule": + """ + Create a NRPSModule object from a dictionary representation. 
+ + :param data: Dictionary representation of the NRPSModule + :return: NRPSModule object + """ + anatomy_data = data.get("anatomy", {}) + substrate_data = data.get("predicted_substrate", None) + + return cls( + module_index_in_gene=data["module_index_in_gene"], + start=data["start"], + end=data["end"], + gene_id=data["gene_id"], + gene_strand=Strand(data["gene_strand"]), + present_domains=data["present_domains"], + role=ModuleRole(data["role"]), + anatomy=NRPSAnatomy.from_dict(anatomy_data), + predicted_substrate=NRPSSubstrate.from_dict(substrate_data) if substrate_data else None, + ) @dataclass @@ -267,11 +448,50 @@ def substrate(self) -> PKSSubstrate: self.anatomy.has_active_DH, self.anatomy.has_active_ER, ): - case (True, True, True ): return PKSExtenderUnit.PKS_A - case (True, True, False): return PKSExtenderUnit.PKS_B - case (True, False, False): return PKSExtenderUnit.PKS_C - case (False, False, False): return PKSExtenderUnit.PKS_D - case _: return PKSExtenderUnit.UNCLASSIFIED + case (False, False, False ): return PKSExtenderUnit.PKS_A + case (True, False, False ): return PKSExtenderUnit.PKS_B + case (True, True, False ): return PKSExtenderUnit.PKS_C + case (True, True, True ): return PKSExtenderUnit.PKS_D + case _: return PKSExtenderUnit.UNCLASSIFIED + + def to_dict(self) -> dict[str, Any]: + """ + Convert the PKSModule object to a dictionary representation. + + :return: Dictionary representation of the PKSModule + """ + return { + "type": self.type.value, + "module_index_in_gene": self.module_index_in_gene, + "start": self.start, + "end": self.end, + "gene_id": self.gene_id, + "gene_strand": self.gene_strand.value, + "present_domains": self.present_domains, + "role": self.role.value, + "anatomy": self.anatomy.to_dict(), + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "PKSModule": + """ + Create a PKSModule object from a dictionary representation. 
+ + :param data: Dictionary representation of the PKSModule + :return: PKSModule object + """ + anatomy_data = data.get("anatomy", {}) + + return cls( + module_index_in_gene=data["module_index_in_gene"], + start=data["start"], + end=data["end"], + gene_id=data["gene_id"], + gene_strand=Strand(data["gene_strand"]), + present_domains=data["present_domains"], + role=ModuleRole(data["role"]), + anatomy=PKSAnatomy.from_dict(anatomy_data), + ) @dataclass @@ -300,6 +520,103 @@ def __str__(self) -> str: :return: string representation of the LinearReadout """ return f"LinearReadout(id={self.id}, start={self.start}, end={self.end}, modules={len(self.modules)})" + + def to_dict(self) -> dict[str, Any]: + """ + Convert the LinearReadout object to a dictionary representation. + + :return: Dictionary representation of the LinearReadout + """ + return { + "id": self.id, + "start": self.start, + "end": self.end, + "qualifiers": self.qualifiers, + "modules": [module.to_dict() for module in self.modules], + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "LinearReadout": + """ + Create a LinearReadout object from a dictionary representation. + + :param data: Dictionary representation of the LinearReadout + :return: LinearReadout object + """ + modules_data = data.get("modules", []) + modules: list[Module] = [] + + for mod_data in modules_data: + mod_type = mod_data.get("type", None) + if mod_type == ModuleType.NRPS.value: + modules.append(NRPSModule.from_dict(mod_data)) + elif mod_type == ModuleType.PKS.value: + modules.append(PKSModule.from_dict(mod_data)) + else: + raise ValueError(f"Unknown module type: {mod_type}") + + return cls( + id=data["id"], + start=data["start"], + end=data["end"], + qualifiers=data.get("qualifiers", {}), + modules=modules, + ) + + def biosynthetic_order(self) -> list[Module]: + """ + Return modules in biosynthetic order. 
+ + :param orientation: 'forward' for 5' to 3', 'reverse' for 3' to 5' + :return: list of Module objects in the specified order + """ + if not self.modules: + return [] + + # Group modules by gene + by_gene: dict[str, list[Module]] = {} + for m in self.modules: + by_gene.setdefault(m.gene_id, []).append(m) + + # Infer strand per gene (sanity check) + gene_strand: dict[str, Strand] = {} + for gid, mods in by_gene.items(): + s = mods[0].gene_strand + if any(m.gene_strand is not s for m in mods): + raise ValueError(f"mixed gene_strand in gene_id={gid}") + gene_strand[gid] = s + + # Infer global biosyntehtic direction + strand_counts = Counter(gene_strand.values()) + global_reverse = strand_counts[Strand.REVERSE] > strand_counts[Strand.FORWARD] + + # Order genes along biosynthetic direction using genomic position + gene_ids = sorted( + by_gene.keys(), + key=lambda gid: min(m.start for m in by_gene[gid]), + reverse=global_reverse, + ) + + out: list[Module] = [] + for gid in gene_ids: + mods = by_gene[gid] + if gene_strand[gid] is Strand.FORWARD: + mods_sorted = sorted(mods, key=lambda m: m.start) + else: + mods_sorted = sorted(mods, key=lambda m: m.start, reverse=True) + out.extend(mods_sorted) + + return out + + +def _domain_index_by_obj(doms: list[Domain]) -> dict[int, int]: + """ + Helper function to create a mapping from Domain object IDs to their indices in a list. 
+ + :param doms: list of Domain objects + :return: dictionary mapping Domain object IDs to their indices + """ + return {id(d): i for i, d in enumerate(doms)} def _domain_types(domains: list[Domain]) -> set[str]: @@ -355,20 +672,20 @@ def _is_Cstarter(domain: Domain) -> bool: return ("starter" in blob) or ("cstarter" in blob) or ("condensation_starter" in blob) -def _upstream_loading_cassette(all_genes: list[Gene], gene_idx: int, max_bp: int = 20_000) -> bool: +def _upstream_loading_cassette(all_genes: list[Gene], gene_idx_in_genomic_order: int, max_bp: int = 20_000) -> bool: """ Check for upstream loading cassette (CAL + ACP) in upstream genes within max_bp distance. :param all_genes: list of all Gene objects in the region/cluster - :param gene_idx: index of the gene gene in all_genes + :param gene_idx_in_genomic_order: index of the gene gene in all_genes :param max_bp: maximum base pair distance to search upstream :return: True if a loading cassette is found upstream within max_bp, False otherwise """ - cur_start = all_genes[gene_idx].start + cur_start = all_genes[gene_idx_in_genomic_order].start seen_cal = False seen_acp = False - for j in range(gene_idx - 1, -1, -1): + for j in range(gene_idx_in_genomic_order - 1, -1, -1): g = all_genes[j] if cur_start - g.end > max_bp: break # exceeded max distance @@ -384,125 +701,21 @@ def _upstream_loading_cassette(all_genes: list[Gene], gene_idx: int, max_bp: int return False -def _upstream_has_nrps_A(all_genes: list[Gene], gene_idx: int) -> bool: +def _upstream_has_nrps_A(all_genes: list[Gene], gene_idx_in_genomic_order: int) -> bool: """ Check if there is an upstream gene with an NRPS A-domain. 
:param all_genes: list of all Gene objects in the region/cluster - :param gene_idx: index of the gene gene in all_genes + :param gene_idx_in_genomic_order: index of the gene gene in all_genes :return: True if there is an upstream NRPS A-domain, False otherwise """ - for j in range(gene_idx - 1, -1, -1): + for j in range(gene_idx_in_genomic_order - 1, -1, -1): if any(_is_domain_type(d, NRPS_A) for d in all_genes[j].domains): return True return False -def collect_nrps_modules(gene: Gene, gene_idx: int, all_genes: list[Gene]) -> list[NRPSModule]: - """ - Collect NRPS modules from a given gene. - - :param gene: Gene object to analyze - :param gene_idx: index of the gene in the region's gene list - :param all_genes: List of all genes in the region - :return: List of NRPSModule objects""" - doms: list[Domain] = list(gene.domains) - out: list[NRPSModule] = [] - - # Indices of A domains in left-to-right order - a_idx = [i for i, d in enumerate(doms) if _is_domain_type(d, NRPS_A)] - if not a_idx: - return out # no A domains, no modules - - for mi, ai in enumerate(a_idx): - # Extend window backward by one if there is an immediately previous C (same gene) - start_i = ai - if ai - 1 >= 0 and _is_domain_type(doms[ai - 1], NRPS_C): - start_i = ai - 1 - - # Extend forward until (but not including) the next A-domain - end_i = a_idx[mi + 1] if mi + 1 < len(a_idx) else len(doms) - - window = doms[start_i:end_i] - present = _domain_types(window) - - has_C = any(_is_domain_type(d, NRPS_C) for d in window) - has_Cstarter = any(_is_Cstarter(d) for d in window) - has_T = any(_is_domain_type(d, NRPS_T_ALIASES) for d in window) - has_E = any(_is_domain_type(d, NRPS_E) for d in window) - has_MT = any(_is_domain_type(d, NRPS_MT_ALIASES) for d in window) - has_Ox = any(_is_domain_type(d, NRPS_OX_ALIASES) for d in window) - has_R = any(_is_domain_type(d, NRPS_R_ALIASES) for d in window) - has_TE = any(_is_domain_type(d, NRPS_TE) for d in window) - - # Fallback evidence of a separate loading 
cassette upstream - loading_upstream = _upstream_loading_cassette(all_genes, gene_idx) - upstream_has_A = _upstream_has_nrps_A(all_genes, gene_idx) - - # Role heuristic - is_first_module_in_gene = mi == 0 - - starter = ( - has_Cstarter - or (is_first_module_in_gene and loading_upstream and not upstream_has_A) - or ((not has_C) and not upstream_has_A) - ) - terminal = has_TE or has_R - - def _get_module_role(starter: bool, terminal: bool) -> ModuleRole: - match (starter, terminal): - case (True, True ): return ModuleRole.STARTER_TERMINAL - case (True, False): return ModuleRole.STARTER - case (False, True ): return ModuleRole.TERMINAL - case (False, False): return ModuleRole.ELONGATION - role: ModuleRole = _get_module_role(starter, terminal) - - s = min(d.start for d in window) - e = max(d.end for d in window) - - # Retrieve A domain substrate specificity prediction - A = doms[ai] - anns = A.annotations - substrate_pred: NRPSSubstrate | None = None - if anns: - preds = anns.results - - # Highest confidence first - preds_sorted = sorted(preds, key=lambda r: r.score or 0.0, reverse=True) - - # Get highest confidence prediction, if any - top_pred = preds_sorted[0] if preds_sorted else None - - if top_pred: - substrate_pred = NRPSSubstrate( - substrate_name=top_pred.label, - substrate_smiles=top_pred.metadata.get("smiles", None), - score=top_pred.score, - ) - - out.append(NRPSModule( - module_index_in_gene=mi, - start=s, - end=e, - gene_id=gene.id, - present_domains=list(present), - role=role, - anatomy=NRPSAnatomy( - has_C=has_C, - has_T=has_T, - has_E=has_E, - has_MT=has_MT, - has_Ox=has_Ox, - has_R=has_R, - has_TE=has_TE, - ), - predicted_substrate=substrate_pred, - )) - - return out - - def _split_module_on_KS(domains: list[Domain]) -> list[list[Domain]]: """ Split a list of domains into windows based on PKS KS domains. 
@@ -537,7 +750,7 @@ def _is_active_accessory_domain(domain: Domain) -> bool: :return: True if the domain is active, False if inactive """ if not domain.type: - return True + return True # can't tell, assume active if domain.type not in {"PKS_KR", "PKS_DH", "PKS_ER"}: return True # not a reducible domain, consider active by default @@ -546,7 +759,7 @@ def _is_active_accessory_domain(domain: Domain) -> bool: if domain.id: texts.append(domain.id) for _, vals in domain.raw_qualifiers.items(): - if isinstance(vals, list): + if isinstance(vals, (list, tuple)): texts.extend(map(str, vals)) else: texts.append(str(vals)) @@ -562,7 +775,9 @@ def _is_active_accessory_domain(domain: Domain) -> bool: "broken", "truncated", ] - return not any(flag in blob for flag in inactive_flags) + is_active = not any(flag in blob for flag in inactive_flags) + + return is_active def _classify_pks_window(window: list[Domain]) -> tuple[set[str], bool, bool, bool, bool]: @@ -610,122 +825,140 @@ def _is_AT_only_gene(gene: Gene) -> bool: return ("PKS_AT" in types) and all(t in {"PKS_AT"} for t in types) -def _find_upstream_AT_only_gene(all_genes: list[Gene], gene_idx: int) -> Gene | None: +def _find_genomic_upstream_AT_only_gene(all_genes: list[Gene], gene_idx_in_genomic_order: int) -> Gene | None: """ Return the nearest upstream gene that is AT-only (relative to all_genes order). 
:param all_genes: list of Gene objects - :param gene_idx: index of the current gene in all_genes + :param gene_idx_in_genomic_order: index of the current gene in all_genes :return: Gene object of the nearest upstream AT-only gene, or None if not found """ - for j in range(gene_idx - 1, -1, -1): + for j in range(gene_idx_in_genomic_order - 1, -1, -1): if _is_AT_only_gene(all_genes[j]): return all_genes[j] return None -def _upstream_has_pks_KS(all_genes: list[Gene], gene_idx: int, ks_start: int) -> bool: +def _upstream_has_pks_KS( + all_genes: list[Gene], + gene_idx_in_genomic_order: int, + doms: list[Domain], + ks_domain: Domain, +) -> bool: """ Check if there is an upstream gene with a PKS KS-domain. :param all_genes: list of all Gene objects in the region/cluster - :param gene_idx: index of the gene gene in all_genes - :param ks_start: start position of the KS domain in gene + :param gene_idx_in_genomic_order: index of the gene gene in all_genes + :param doms: list of Domain objects in the current gene, sorted for biosynthetic order + :param ks_domain: the KS Domain object to check upstream of :return: True if there is an upstream KS-domain, False otherwise """ # Genes upstream - for j in range(gene_idx -1, -1, -1): + for j in range(gene_idx_in_genomic_order -1, -1, -1): if any(d.type == "PKS_KS" for d in all_genes[j].domains): return True - # Same gene, KS before this window's KS - for d in all_genes[gene_idx].domains: - if d.type == "PKS_KS" and d.start < ks_start: - return True - - return False + # Same gene: any KS earlier in biosynthetic order than this KS? 
+ idx = _domain_index_by_obj(doms) + ks_i = idx[id(ks_domain)] + return any(d.type == "PKS_KS" and idx[id(d)] < ks_i for d in doms) def _standalone_pks_AT_upstream( all_genes: list[Gene], - gene_idx: int, - ks_start: int, + gene_idx_in_genomic_order: int, + doms: list[Domain], + ks_domain: Domain, max_bp: int = 20_000 ) -> bool: """ Check for standalone PKS AT domain in upstream genes within max_bp distance. :param all_genes: list of all Gene objects in the region/cluster - :param gene_idx: index of the gene gene in all_genes - :param ks_start: start position of the KS domain in gene + :param gene_idx_in_genomic_order: index of the gene gene in all_genes + :param doms: list of Domain objects in the current gene, sorted for biosynthetic order + :param ks_domain: the KS Domain object to check upstream of :param max_bp: maximum base pair distance to search upstream :return: True if a standalone PKS AT domain is found upstream within max_bp, False otherwise """ - cur_start = ks_start - - # Same gene, before ks_start - for d in all_genes[gene_idx].domains: - if d.type == "PKS_AT" and d.end < ks_start: - return True - - # Upstream genes, within distance - for j in range(gene_idx - 1, -1, -1): - gene = all_genes[j] - if cur_start - gene.end > max_bp: - break # exceeded max distance + # Same gene: any AT earlier than this KS in biosynthetic order? 
+ idx = _domain_index_by_obj(doms) + ks_i = idx[id(ks_domain)] + if any(d.type == "PKS_AT" and idx[id(d)] < ks_i for d in doms): + return True - if any(d.type == "PKS_AT" for d in gene.domains): + # Other genes upstream, within distance (still genomic) + ks_start = ks_domain.start + cur_start = ks_start + for j in range(gene_idx_in_genomic_order - 1, -1, -1): + g = all_genes[j] + if cur_start - g.end > max_bp: + break + if any(d.type == "PKS_AT" for d in g.domains): return True return False -def _is_last_global_KS(all_genes: list[Gene], gene_idx: int, ks_start: int) -> bool: +def _is_last_global_KS( + all_genes: list[Gene], + gene_idx_in_genomic_order: int, + doms: list[Domain], + ks_domain: Domain, +) -> bool: """ Check if the given KS domain is the last KS domain in the entire gene cluster/region. :param all_genes: list of all Gene objects in the region/cluster - :param gene_idx: index of the gene gene in all_genes - :param ks_start: start position of the KS domain in gene + :param gene_idx_in_genomic_order: index of the gene gene in all_genes + :param doms: list of Domain objects in the current gene, sorted for biosynthetic order + :param ks_domain: the KS Domain object to check :return: True if this is the last KS domain, False otherwise """ - # Same gene, any KS after this ks_start? - for d in all_genes[gene_idx].domains: - if d.type == "PKS_KS" and d.start > ks_start: - return False + # Same gene: any KS later in biosynthetic order? 
+ idx = _domain_index_by_obj(doms) + ks_i = idx[id(ks_domain)] + if any(d.type == "PKS_KS" and idx[id(d)] > ks_i for d in doms): + return False - # Downstream genes - for j in range(gene_idx + 1, len(all_genes)): + # Downstream genes (genomic list order) + for j in range(gene_idx_in_genomic_order + 1, len(all_genes)): if any(d.type == "PKS_KS" for d in all_genes[j].domains): return False return True -def _downstream_has_TE( +def _genomic_downstream_has_TE( all_genes: list[Gene], - gene_idx: int, - from_bp: int, + gene_idx_in_genomic_order: int, + doms: list[Domain], + win: list[Domain], max_bp: int = 20_000 ) -> bool: """ Check for downstream thioesterase (TE) domain in downstream genes within max_bp distance. :param all_genes: list of all Gene objects in the region/cluster - :param gene_idx: index of the gene gene in all_genes - :param from_bp: base pair position to start searching from + :param gene_idx_in_genomic_order: index of the gene gene in all_genes + :param doms: list of Domain objects in the current gene, sorted for biosynthetic order + :param win: current module window (list of Domain objects) :param max_bp: maximum base pair distance to search downstream :return: True if a thioesterase domain is found downstream within max_bp, False otherwise """ - # Same gene after from_bp - for d in all_genes[gene_idx].domains: - if d.type in PKS_TE_ALIASES and d.start > from_bp: - return True + idx = _domain_index_by_obj(doms) + last_i = idx[id(win[-1])] + + # Same gene: any TE later than the window end in biosynthetic order? 
+ if any(d.type in PKS_TE_ALIASES and idx[id(d)] > last_i for d in doms): + return True - # Next genes within window + # Other genes downstream, within distance (still genomic) + from_bp = max(d.end for d in win) # genomic coordinate for distance window cur_end = from_bp - for j in range(gene_idx + 1, len(all_genes)): + for j in range(gene_idx_in_genomic_order + 1, len(all_genes)): gene = all_genes[j] if gene.start - cur_end > max_bp: break # exceeded max distance @@ -736,12 +969,132 @@ def _downstream_has_TE( return False -def collect_pks_modules(gene: Gene, gene_idx: int, all_genes: list[Gene]) -> list[PKSModule]: +def domains_biosynthetic(gene: Gene) -> list[Domain]: + """ + Return domains in biosynthetic order within a gene. + + :param gene: Gene object + :return: list of Domain objects in biosynthetic order + .. note:: we assume Domain.start/end are genomic coordinates + """ + doms = sorted(gene.domains, key=lambda d: d.start) + if gene.strand is Strand.REVERSE: + doms = list(reversed(doms)) + + return doms + + +def collect_nrps_modules(gene: Gene, gene_idx_in_genomic_order: int, all_genes: list[Gene]) -> list[NRPSModule]: + """ + Collect NRPS modules from a given gene. 
+ + :param gene: Gene object to analyze + :param gene_idx_in_genomic_order: index of the gene in the region's gene list + :param all_genes: List of all genes in the region + :return: List of NRPSModule objects""" + doms: list[Domain] = domains_biosynthetic(gene) + out: list[NRPSModule] = [] + + # Indices of A domains in left-to-right order + a_idx = [i for i, d in enumerate(doms) if _is_domain_type(d, NRPS_A)] + if not a_idx: + return out # no A domains, no modules + + for mi, ai in enumerate(a_idx): + # Extend window backward by one if there is an immediately previous C (same gene) + start_i = ai + if ai - 1 >= 0 and _is_domain_type(doms[ai - 1], NRPS_C): + start_i = ai - 1 + + # Extend forward until (but not including) the next A-domain + end_i = a_idx[mi + 1] if mi + 1 < len(a_idx) else len(doms) + + window = doms[start_i:end_i] + present = _domain_types(window) + + has_C = any(_is_domain_type(d, NRPS_C) for d in window) + has_Cstarter = any(_is_Cstarter(d) for d in window) + has_T = any(_is_domain_type(d, NRPS_T_ALIASES) for d in window) + has_E = any(_is_domain_type(d, NRPS_E) for d in window) + has_MT = any(_is_domain_type(d, NRPS_MT_ALIASES) for d in window) + has_Ox = any(_is_domain_type(d, NRPS_OX_ALIASES) for d in window) + has_R = any(_is_domain_type(d, NRPS_R_ALIASES) for d in window) + has_TE = any(_is_domain_type(d, NRPS_TE) for d in window) + + # Fallback evidence of a separate loading cassette upstream + loading_upstream = _upstream_loading_cassette(all_genes, gene_idx_in_genomic_order) + upstream_has_A = _upstream_has_nrps_A(all_genes, gene_idx_in_genomic_order) + + # Role heuristic + is_first_module_in_gene = mi == 0 + + starter = ( + has_Cstarter + or (is_first_module_in_gene and loading_upstream and not upstream_has_A) + or ((not has_C) and not upstream_has_A) + ) + terminal = has_TE or has_R + + def _get_module_role(starter: bool, terminal: bool) -> ModuleRole: + match (starter, terminal): + case (True, True ): return 
ModuleRole.STARTER_TERMINAL + case (True, False): return ModuleRole.STARTER + case (False, True ): return ModuleRole.TERMINAL + case (False, False): return ModuleRole.ELONGATION + role: ModuleRole = _get_module_role(starter, terminal) + + s = min(d.start for d in window) + e = max(d.end for d in window) + + # Retrieve A domain substrate specificity prediction + A = doms[ai] + anns = A.annotations + substrate_pred: NRPSSubstrate | None = None + if anns: + preds = anns.results + + # Highest confidence first + preds_sorted = sorted(preds, key=lambda r: r.score or 0.0, reverse=True) + + # Get highest confidence prediction, if any + top_pred = preds_sorted[0] if preds_sorted else None + + if top_pred: + substrate_pred = NRPSSubstrate( + name=top_pred.label, + smiles=top_pred.metadata.get("smiles", None), + score=top_pred.score, + ) + + out.append(NRPSModule( + module_index_in_gene=mi, + start=s, + end=e, + gene_id=gene.id, + gene_strand=gene.strand, + present_domains=list(present), + role=role, + anatomy=NRPSAnatomy( + has_C=has_C, + has_T=has_T, + has_E=has_E, + has_MT=has_MT, + has_Ox=has_Ox, + has_R=has_R, + has_TE=has_TE, + ), + predicted_substrate=substrate_pred, + )) + + return out + + +def collect_pks_modules(gene: Gene, gene_idx_in_genomic_order: int, all_genes: list[Gene]) -> list[PKSModule]: """ Collect PKS modules from a given gene. 
:param gene: Gene object to analyze - :param gene_idx: index of the gene in the region's gene list + :param gene_idx_in_genomic_order: index of the gene in the region's gene list :param all_genes: list of all genes in the region :return: list of PKSModule objects """ @@ -750,7 +1103,8 @@ def collect_pks_modules(gene: Gene, gene_idx: int, all_genes: list[Gene]) -> lis if all(d.type != "PKS_KS" for d in gene.domains): return out # no KS domains, no modules - windows = _split_module_on_KS(gene.domains) + doms = domains_biosynthetic(gene) + windows = _split_module_on_KS(doms) for mi, win in enumerate(windows): ( present, @@ -760,7 +1114,6 @@ def collect_pks_modules(gene: Gene, gene_idx: int, all_genes: list[Gene]) -> lis has_AT, ) = _classify_pks_window(win) - ks_start = win[0].start s, e = _window_bounds(win) if has_AT: @@ -768,18 +1121,19 @@ def collect_pks_modules(gene: Gene, gene_idx: int, all_genes: list[Gene]) -> lis else: AT_src: ATLoadingMode = ( ATLoadingMode.TRANS - if _find_upstream_AT_only_gene(all_genes, gene_idx) is not None + if _find_genomic_upstream_AT_only_gene(all_genes, gene_idx_in_genomic_order) is not None else ATLoadingMode.UNKNOWN ) # Assign provisional PKS role has_TE_in_window = any(d.type in PKS_TE_ALIASES for d in win) - upstream_has_KS = _upstream_has_pks_KS(all_genes, gene_idx, ks_start) - starter = _standalone_pks_AT_upstream(all_genes, gene_idx, ks_start) and not upstream_has_KS + KS_domain = win[0] # first domain in window is KS since we split on KS + upstream_has_KS = _upstream_has_pks_KS(all_genes, gene_idx_in_genomic_order, doms, KS_domain) + starter = _standalone_pks_AT_upstream(all_genes, gene_idx_in_genomic_order, doms, KS_domain) and not upstream_has_KS terminal_by_TE = False - if _is_last_global_KS(all_genes, gene_idx, ks_start): - terminal_by_TE = has_TE_in_window or _downstream_has_TE(all_genes, gene_idx, from_bp=e) + if _is_last_global_KS(all_genes, gene_idx_in_genomic_order, doms, KS_domain): + terminal_by_TE = 
has_TE_in_window or _genomic_downstream_has_TE(all_genes, gene_idx_in_genomic_order, doms, win) def _get_module_role(starter: bool, terminal_by_TE: bool) -> ModuleRole: match (starter, terminal_by_TE): @@ -795,6 +1149,7 @@ def _get_module_role(starter: bool, terminal_by_TE: bool) -> ModuleRole: start=s, end=e, gene_id=gene.id, + gene_strand=gene.strand, present_domains=list(present), role=role, anatomy=PKSAnatomy( From 14b2e5e8571f7ef22bd24509ff7bc19d72392789 Mon Sep 17 00:00:00 2001 From: David Meijer Date: Sun, 28 Dec 2025 10:56:32 +0100 Subject: [PATCH 06/12] UPD: add substituent type placeholder --- src/biocracker/query/modules.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/biocracker/query/modules.py b/src/biocracker/query/modules.py index 29a60c6..494b8ad 100644 --- a/src/biocracker/query/modules.py +++ b/src/biocracker/query/modules.py @@ -310,6 +310,7 @@ class PKSSubstrate: """ extender_unit: PKSExtenderUnit + substituent_type: int | None = None def to_dict(self) -> dict[str, str]: """ @@ -319,6 +320,7 @@ def to_dict(self) -> dict[str, str]: """ return { "extender_unit": self.extender_unit.value, + "substituent_type": self.substituent_type, } @classmethod @@ -331,6 +333,7 @@ def from_dict(cls, data: dict[str, str]) -> "PKSSubstrate": """ return cls( extender_unit=PKSExtenderUnit(data.get("extender_unit", "UNCLASSIFIED")), + substituent_type=data.get("substituent_type", None), ) @@ -438,6 +441,10 @@ def substrate(self) -> PKSSubstrate: :return: PKSSubstrate object containing substrate information """ + # Configure factory type + def setup_substrate(extender_unit: PKSExtenderUnit) -> PKSSubstrate: + return PKSSubstrate(extender_unit=extender_unit) + # Rules: # - KS + AT with neither KR nor DH nor ER => PKS_A # - KS + AT + KR (no DH and no ER) => PKS_B (KR after AT is naturally true in window order) @@ -448,11 +455,11 @@ def substrate(self) -> PKSSubstrate: self.anatomy.has_active_DH, self.anatomy.has_active_ER, ): 
- case (False, False, False ): return PKSExtenderUnit.PKS_A - case (True, False, False ): return PKSExtenderUnit.PKS_B - case (True, True, False ): return PKSExtenderUnit.PKS_C - case (True, True, True ): return PKSExtenderUnit.PKS_D - case _: return PKSExtenderUnit.UNCLASSIFIED + case (False, False, False ): return setup_substrate(PKSExtenderUnit.PKS_A) + case (True, False, False ): return setup_substrate(PKSExtenderUnit.PKS_B) + case (True, True, False ): return setup_substrate(PKSExtenderUnit.PKS_C) + case (True, True, True ): return setup_substrate(PKSExtenderUnit.PKS_D) + case _: return setup_substrate(PKSExtenderUnit.UNCLASSIFIED) def to_dict(self) -> dict[str, Any]: """ From a24fdd4b24fe50f887e4b3bffac6d51fc614dfa0 Mon Sep 17 00:00:00 2001 From: David Meijer Date: Sun, 28 Dec 2025 13:06:59 +0100 Subject: [PATCH 07/12] UPD: make download correctly parse and extract .gz files --- .gitignore | 3 +- src/biocracker/utils/download.py | 47 ++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 3040041..348b161 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ __pycache__/ .vscode/ dist/ scratch/ -*cache*/ \ No newline at end of file +*cache*/ +*pfam*/ \ No newline at end of file diff --git a/src/biocracker/utils/download.py b/src/biocracker/utils/download.py index 10bcf00..ff82efb 100644 --- a/src/biocracker/utils/download.py +++ b/src/biocracker/utils/download.py @@ -6,6 +6,7 @@ import tarfile import tempfile import zipfile +import gzip from hashlib import sha256 from pathlib import Path from urllib.request import Request, urlopen @@ -223,6 +224,7 @@ def download_and_prepare(url: str, cache_dir: str | Path | None = None, *, force .. note:: function is idempotent unless `force=True` .. note:: handles ZIP archives; other archive formats are not auto-detected .. 
note:: intermediate files are removed on success + :raises RuntimeError: if extraction fails """ base_cache = get_biocracker_cache_dir(cache_dir) downloads_root = base_cache / "downloads" @@ -253,39 +255,42 @@ def download_and_prepare(url: str, cache_dir: str | Path | None = None, *, force if need_download: _download_with_progress(url, archive_path) - # If it's a ZIP archive, extract and remove the archive afterwards - # (We consider "auto-clean" to mean removing the downloaded container if we extracted.) - if zipfile.is_zipfile(archive_path): - # If it's an archive, extract and remove the archive afterwards - extract_dir = item_dir / "extracted" - if extract_dir.exists() and (force or extract_dir.stat().st_size == 0): - shutil.rmtree(extract_dir) - if zipfile.is_zipfile(archive_path) or tarfile.is_tarfile(archive_path): - extract_dir.mkdir(parents=True, exist_ok=True) - # Try auto; if that fails, fall back to explicit formats + extract_dir = item_dir / "extracted" + + is_zip = zipfile.is_zipfile(archive_path) + is_tar = tarfile.is_tarfile(archive_path) + is_gz = archive_path.suffix.lower() == ".gz" and not is_tar + + if is_zip or is_tar or is_gz: + if extract_dir.exists() and force: + shutil.rmtree(extract_dir, ignore_errors=True) + extract_dir.mkdir(parents=True, exist_ok=True) + + if is_gz: + # gunzip + out_path = extract_dir / archive_path.with_suffix("").name + with gzip.open(archive_path, "rb") as fin, open(out_path, "wb") as fout: + shutil.copyfileobj(fin, fout, length=1024 * 1024) + + else: try: shutil.unpack_archive(str(archive_path), str(extract_dir)) except shutil.ReadError: - if zipfile.is_zipfile(archive_path): + if is_zip: with zipfile.ZipFile(archive_path, "r") as zf: zf.extractall(extract_dir) - elif tarfile.is_tarfile(archive_path): + elif is_tar: with tarfile.open(archive_path, "r:*") as tf: tf.extractall(extract_dir) else: - raise - # Remove the container after successful extraction - try: - archive_path.unlink() - except Exception: - pass + 
raise RuntimeError("unrecognized archive format despite prior checks") + else: - # Non-archive: keep the file in place as the final payload - # (No-op) + # Non-archive: keep the file pass # Mark URL and READY url_txt.write_text(url) ready_marker.touch() - return _resolve_return_path(item_dir) \ No newline at end of file + return _resolve_return_path(item_dir) From 5259dc270ea7967b1dab6a94f71588b7ad7ca12e Mon Sep 17 00:00:00 2001 From: David Meijer Date: Mon, 29 Dec 2025 21:45:48 +0100 Subject: [PATCH 08/12] UPD --- scripts/01_create_hmm_model.py | 174 ++++++++++++++++++++ scripts/{parse_gbks.py => 02_parse_gbks.py} | 33 +++- scripts/{read_gbks.py => 03_read_gbks.py} | 29 ++-- scripts/README.md | 4 + scripts/example.py | 92 +++++++++++ src/biocracker/inference/model_pfam.py | 114 +++++++++++++ src/biocracker/query/modules.py | 47 +++++- 7 files changed, 472 insertions(+), 21 deletions(-) create mode 100644 scripts/01_create_hmm_model.py rename scripts/{parse_gbks.py => 02_parse_gbks.py} (53%) rename scripts/{read_gbks.py => 03_read_gbks.py} (67%) create mode 100644 scripts/README.md create mode 100644 scripts/example.py create mode 100644 src/biocracker/inference/model_pfam.py diff --git a/scripts/01_create_hmm_model.py b/scripts/01_create_hmm_model.py new file mode 100644 index 0000000..a8cc7bb --- /dev/null +++ b/scripts/01_create_hmm_model.py @@ -0,0 +1,174 @@ +"""Script to create a custom HMM model from selected PFAM families or clans.""" + +import argparse +import logging +import subprocess +from pathlib import Path + +from biocracker.utils.logging import setup_logging, add_file_handler +from biocracker.utils.download import download_and_prepare + + +PFAM_A_HMM_URL = r"https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz" +CLAN_MAP_URL = r"https://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam38.0/Pfam-A.clans.tsv.gz" + +LOG_LVL = "INFO" +log = logging.getLogger(__name__) + + +def run_cmd(cmd: list[str]) -> None: + """ + Runs a command using 
subprocess and raises an error if it fails. + + :param cmd: command to run as a list of strings + """ + res = subprocess.run(cmd, capture_output=True, text=True) + if res.returncode != 0: + raise RuntimeError(f"command '{' '.join(cmd)}' failed ({res.returncode}):\n{res.stderr}") + + +def ensure_hmmer_available() -> None: + """ + Ensures that HMMER suite is available in the system PATH. + + :raises RuntimeError: if HMMER is not available + """ + try: + subprocess.run(["hmmpress", "-h"], capture_output=True, check=True) + subprocess.run(["hmmfetch", "-h"], capture_output=True, check=True) + except (FileNotFoundError, subprocess.CalledProcessError): + raise RuntimeError("HMMER suite is not available; please install it to use this script") + + +def normalize_pfam_hmm_path(p: Path) -> Path: + """ + Normalizes the PFAM HMM path to ensure it points to the Pfam-A.hmm file. + + :param p: path to PFAM HMM file or directory containing it + :return: path to Pfam-A.hmm file + :raises FileNotFoundError: if the Pfam-A.hmm file does not exist + """ + if p.is_dir(): + p = p / "Pfam-A.hmm" + if not p.exists(): + raise FileNotFoundError(f"PFAM HMM file not found at {p}") + return p + + +def ensure_hmmpressed(hmm_file: Path) -> None: + """ + Ensures that the HMM file is pressed using hmmpress. + + :param hmm_file: path to HMM file + """ + needed = [".h3f", ".h3i", ".h3m", ".h3p"] + if all(Path(str(hmm_file) + s).exists() for s in needed): + return + + run_cmd(["hmmpress", "-f", str(hmm_file)]) + + +def load_clan_map(clan_map_path: Path) -> dict[str, set[str]]: + """ + Loads the clan map from the given file. 
+ + :param clan_map_path: path to clan map file + :return: mapping of clan IDs to sets of PFAM model names + """ + clan_map: dict[str, set[str]] = {} + with open(clan_map_path, "r") as f: + for line in f: + parts = line.rstrip("\n").split("\t") + if len(parts) < 4: + continue + _, clan_id, _, model_name = parts[:4] + if clan_id: + clan_map.setdefault(clan_id, set()).add(model_name) + + return clan_map + + +def resolve_model_names(inputs: list[str], clan_map: dict[str, set[str]]) -> set[str]: + """ + Resolves PFAM model names from given inputs, which can be PFAM model names or clan IDs. + + :param inputs: list of PFAM model names or clan IDs + :param clan_map: mapping of clan IDs to sets of PFAM model names + :return: set of resolved PFAM model names + """ + targets = [t.upper() for t in inputs] + model_names: set[str] = set() + + for t in targets: + if t.startswith("PF"): + err_msg = "please provide PFAM model names; not PFAM IDs (e.g., use 'SH3_1' instead of 'PF00018')" + log.error(err_msg) + raise ValueError(err_msg) + if t.startswith("CL"): + model_names |= clan_map.get(t, set()) + if t not in clan_map: + log.warning(f"clan ID {t} not found in clan map") + else: + # allow direct model names too (e.g., SH3_1) + model_names.add(t) + + model_names.discard("") # remove empty strings if any + return model_names + + +def cli() -> argparse.Namespace: + """ + Configures command line interface for HMM model creation script. 
+ + :return: parsed command line arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument("--output", type=str, required=True, help="path to output HMM file") + parser.add_argument("--model-names", nargs="+", type=str, required=True, help="PFAM model and/or clan IDs to include in the HMM model") + parser.add_argument("--cache-dir", type=str, default=None, help="directory to cache downloaded PFAM database files") + parser.add_argument("--name", type=str, default="custom_pfam_model", help="name for the created HMM model") + return parser.parse_args() + + +def main() -> None: + ensure_hmmer_available() + + args = cli() + out_dir = Path(args.output).expanduser().resolve() + out_dir.mkdir(parents=True, exist_ok=True) + + setup_logging(LOG_LVL) + add_file_handler(str(out_dir / "create_hmm_model.log"), level=LOG_LVL) + + log.info("command line arguments:") + for arg, val in vars(args).items(): + log.info(f"\t{arg}: {val}") + + cache_dir = Path(args.cache_dir).expanduser().resolve() if args.cache_dir else out_dir / "cache" + cache_dir.mkdir(parents=True, exist_ok=True) + + pfam_hmm_path = Path(download_and_prepare(PFAM_A_HMM_URL, cache_dir)) + clan_map_path = Path(download_and_prepare(CLAN_MAP_URL, cache_dir)) + + pfam_hmm_file = normalize_pfam_hmm_path(pfam_hmm_path) + ensure_hmmpressed(pfam_hmm_file) + + clan_map = load_clan_map(clan_map_path) + model_names = resolve_model_names(args.model_names, clan_map) + + if not model_names: + raise ValueError("no PFAM model names resolved from the provided inputs") + + log.info(f"collected {len(model_names)} PFAM model names for HMM model creation") + + keys_path = out_dir / f"{args.name}.keys.txt" + keys_path.write_text("\n".join(sorted(model_names)) + "\n") + + output_hmm_path = out_dir / f"{args.name}.hmm" + run_cmd(["hmmfetch", "-f", "-o", str(output_hmm_path), str(pfam_hmm_file), str(keys_path)]) + + log.info(f"created custom HMM model at {output_hmm_path}") + + +if __name__ == "__main__": + main() diff --git 
a/scripts/parse_gbks.py b/scripts/02_parse_gbks.py similarity index 53% rename from scripts/parse_gbks.py rename to scripts/02_parse_gbks.py index ea8082a..107965d 100644 --- a/scripts/parse_gbks.py +++ b/scripts/02_parse_gbks.py @@ -4,15 +4,22 @@ import os import json import glob +import time +import logging +from pathlib import Path from biocracker.utils.logging import setup_logging, add_file_handler from biocracker.io.readers import load_regions from biocracker.io.options import AntiSmashOptions -from biocracker.inference.registry import register_domain_model +from biocracker.inference.registry import GENE_MODELS, DOMAIN_MODELS, register_domain_model, register_gene_model from biocracker.inference.model_paras import ParasModel +from biocracker.inference.model_pfam import PfamModel from biocracker.pipelines.annotate_region import annotate_region +log = logging.getLogger(__name__) + + def cli() -> argparse.Namespace: """ Command line interface for parsing and annotating GenBank files. @@ -22,6 +29,8 @@ def cli() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument("--gbks", type=str, required=True) parser.add_argument("--out", type=str, required=True, help="output directory") + parser.add_argument("--cache", type=str, required=True, help="cache directory") + parser.add_argument("--hmms", type=str, required=False, help="directory with HMMs for gene models") return parser.parse_args() @@ -29,13 +38,25 @@ def main() -> None: """ Main function to parse and annotate GenBank files. 
""" + t0 = time.time() + args = cli() os.makedirs(args.out, exist_ok=True) setup_logging(level="INFO") add_file_handler(os.path.join(args.out, "parse_gbks.log"), level="INFO") - register_domain_model(ParasModel(cache_dir=None, threshold=0.1, keep_top=3)) + register_domain_model(ParasModel(cache_dir=args.cache, threshold=0.1, keep_top=3)) + + if args.hmms: + # Find all .hmm files in the provided directory; use filename (without extension) as label + hmm_files = glob.glob(os.path.join(args.hmms, "*.hmm")) + for hmm_file in hmm_files: + label = Path(os.path.basename(hmm_file)).stem + register_gene_model(PfamModel(hmm_path=hmm_file, label=label)) + + log.info(f"registered domain models: {list(DOMAIN_MODELS)}") + log.info(f"registered gene models: {list(GENE_MODELS)}") options = AntiSmashOptions(readout_level="cand_cluster") @@ -49,6 +70,14 @@ def main() -> None: annotate_region(region) out_f.write(json.dumps(region.to_dict()) + "\n") + te = time.time() + elapsed = te - t0 + elapsed_mins = elapsed / 60.0 + elapsed_hrs = elapsed_mins / 60.0 + log.info(f"total time elapsed: {elapsed:.2f} seconds") + log.info(f"total time elapsed: {elapsed_mins:.2f} minutes") + log.info(f"total time elapsed: {elapsed_hrs:.2f} hours") + if __name__ == "__main__": main() diff --git a/scripts/read_gbks.py b/scripts/03_read_gbks.py similarity index 67% rename from scripts/read_gbks.py rename to scripts/03_read_gbks.py index 5134a95..1142193 100644 --- a/scripts/read_gbks.py +++ b/scripts/03_read_gbks.py @@ -3,11 +3,15 @@ import argparse import json import os +import logging from biocracker.utils.logging import setup_logging, add_file_handler from biocracker.utils.json import iter_json from biocracker.model.region import Region -from biocracker.query.modules import LinearReadout, PKSModule, NRPSModule, linear_readout +from biocracker.query.modules import LinearReadout, linear_readout + + +log = logging.getLogger(__name__) def cli() -> argparse.Namespace: @@ -38,22 +42,25 @@ def main() -> None: 
readout = linear_readout(region) readouts.append(readout) - print(f"Parsed {len(readouts)} linear readouts in total") + log.info(f"parsed {len(readouts)} linear readouts in total") # Sort on readout ID readouts.sort(key=lambda r: r.id) # Only keep readouts with >= 2 modules readouts = [r for r in readouts if len(r.modules) >= 2] - print(f"Parsed {len(readouts)} linear readouts with >= 2 modules") - - # Get specific readout - readout_ids = ["BGC0000054", "BGC0000055", "BGC0000336"] - specific_readouts = [r for r in readouts if r.id in readout_ids] - for specific_readout in specific_readouts: - print(f"Specific readout {specific_readout.id}: {specific_readout}") - for module in specific_readout.biosynthetic_order(): - print(f"\t{module.substrate if isinstance(module, PKSModule) else module.substrate.name}", f"{module.role}", sep="\t") + log.info(f"parsed {len(readouts)} linear readouts with >= 2 modules") + + # Report on how many readouts contain each found modifier + modifier_counts: dict[str, int] = {} + for readout in readouts: + for modifier_name in set(readout.modifiers): + if modifier_name not in modifier_counts: + modifier_counts[modifier_name] = 0 + modifier_counts[modifier_name] += 1 + log.info(f"modifier presence across all {len(readouts)} readouts:") + for modifier_name, count in modifier_counts.items(): + log.info(f"\t{modifier_name}: {count}") # Write all readouts to output JSONL out_jsonl = os.path.join(args.out, "linear_readouts.jsonl") diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..0be9ac8 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,4 @@ +# Gene models + +glycosylation: CL0113 +oxidoreduction: CL0063 (detects ctoA halogenase chlorotonil) \ No newline at end of file diff --git a/scripts/example.py b/scripts/example.py new file mode 100644 index 0000000..e244ca3 --- /dev/null +++ b/scripts/example.py @@ -0,0 +1,92 @@ +"""Parses antiSMASH GBK file with BioCracker.""" + +import argparse +import logging +import 
glob +import os +from pathlib import Path + +from biocracker.utils.logging import setup_logging +from biocracker.io.readers import load_regions +from biocracker.io.options import AntiSmashOptions +from biocracker.inference.registry import DOMAIN_MODELS, GENE_MODELS, register_domain_model, register_gene_model +from biocracker.inference.model_paras import ParasModel +from biocracker.inference.model_pfam import PfamModel +from biocracker.pipelines.annotate_region import annotate_region +from biocracker.query.modules import PKSModule, NRPSModule, linear_readout + + +log = logging.getLogger(__name__) + + +def cli() -> argparse.Namespace: + """ + Command line interface for example script. + + :return: parsed command line arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument("--gbk", type=str, required=True, help="input antiSMASH GBK region file") + parser.add_argument("--cache", type=str, required=True, help="path to cache directory") + parser.add_argument("--hmms", type=str, required=False, help="path to custom HMM directory") + parser.add_argument("--by-orf", action="store_true", help="group modules by their originating gene (ORF)") + return parser.parse_args() + + +def pprint_module(module: PKSModule | NRPSModule) -> str: + """ + Pretty print a module. 
+ + :param module: module to print + :return: string representation of the module + """ + if isinstance(module, PKSModule): + return f"PKS Module: role={module.role}, extender_unit={module.substrate.extender_unit.value}, AT_loading_mode={module.anatomy.AT_loading_mode}" + elif isinstance(module, NRPSModule): + return f"NRPS Module: role={module.role}, substrate={module.substrate.name}" + else: + return "Unknown Module" + + +def main() -> None: + args = cli() + + setup_logging(logging.INFO) + + # Register domain and gene models + register_domain_model(ParasModel(cache_dir=args.cache, threshold=0.1, keep_top=3)) + + if args.hmms: + hmm_files = glob.glob(os.path.join(args.hmms, "*.hmm")) + for hmm_file in hmm_files: + label = Path(os.path.basename(hmm_file)).stem + register_gene_model(PfamModel(hmm_path=hmm_file, label=label)) + + log.info(f"registered domain models: {list(DOMAIN_MODELS)}") + log.info(f"registered gene models: {list(GENE_MODELS)}") + + # Configure options for loading regions + options = AntiSmashOptions(readout_level="cand_cluster") + + # Load regions from GBK file; and annotate regions + regions = load_regions(args.gbk, options) + for region in regions: + annotate_region(region) + + # Output some information about the parsed regions + for region in regions: + readout = linear_readout(region) + log.info(readout) + log.info(set(readout.modifiers)) + result = readout.biosynthetic_order(by_orf=args.by_orf) + if args.by_orf: + for gene_id, modules in result: + for mi, module in enumerate(modules): + log.info(f"{gene_id}\t{mi+1}\t{pprint_module(module)}") + else: + for mi, module in enumerate(result): + log.info(f"{mi+1}\t{pprint_module(module)}") + + +if __name__ == "__main__": + main() diff --git a/src/biocracker/inference/model_pfam.py b/src/biocracker/inference/model_pfam.py new file mode 100644 index 0000000..889f7ef --- /dev/null +++ b/src/biocracker/inference/model_pfam.py @@ -0,0 +1,114 @@ +"""Module for PFAM domain inference model.""" + +import 
logging +import math +import os + +from pyhmmer import easel, plan7, hmmer + +from biocracker.inference.base import GeneInferenceModel +from biocracker.model.gene import Gene +from biocracker.model.inference import InferenceResult + + +log = logging.getLogger(__name__) + + +def evalue_to_score(evalue: float, cap: float = 50.0) -> float: + """ + Convert E-value to [0, 1] confidence score. + + :param evalue: E-value from HMMER + :param cap: value at which score is capped to 1.0 + :return: confidence score between 0 and 1 + """ + if evalue <= 0: + return 1.0 + + score = -math.log10(evalue) + + return min(score / cap, 1.0) + + +class PfamModel(GeneInferenceModel): + """ + PFAM domain inference model. + + :param hmm_path: path to the PFAM HMM file + """ + + evalue_cutoff: float = 1e-5 + use_gathering_cutoff: bool = True + + _alphabet: easel.Alphabet | None = None + _pipeline: plan7.Pipeline | None = None + _hmm: plan7.HMM | None = None + + def __init__(self, hmm_path: str, label: str) -> None: + """ + Initialize the PFAM model. + + :param hmm_path: path to the PFAM HMM file + :param label: label to emit on hit + """ + super().__init__() + + self.name = f"pfam_{label}" + + self.hmm_path = hmm_path + self.label = label + + def __post_init__(self) -> None: + """ + Post-initialization checks. + + :raises FileNotFoundError: if the HMM file does not exist + """ + super().__post_init__() + + # Make sure hmm_path is valid + if not os.path.isfile(self.hmm_path): + log.error(f"PFAM HMM file not found at {self.hmm_path}") + raise FileNotFoundError(f"PFAM HMM file not found at {self.hmm_path}") + + def _init_hmmer(self) -> None: + """ + Initialize HMMER components. 
+ + :raises ValueError: if no HMM is found in the specified file + """ + if self._hmm is not None: + return + + self._alphabet = easel.Alphabet.amino() + self._pipeline = plan7.Pipeline(self._alphabet, bit_cutoffs="gathering" if self.use_gathering_cutoff else None) + + with plan7.HMMFile(self.hmm_path) as hmm_file: + self._hmm = list(hmm_file) + + def predict(self, gene: Gene) -> list[InferenceResult]: + """ + Make predictions for a given gene. + + :param gene: the gene to make predictions for + :return: a list of InferenceResult objects containing the predictions. + """ + self._init_hmmer() + + protein = gene.sequence + if not protein: + return [] + + text_seq = easel.TextSequence(name=gene.id.encode(), sequence=protein) + dig_seq = text_seq.digitize(self._alphabet) + + hits_iter = hmmer.hmmscan([dig_seq], self._hmm, cpus=1, E=self.evalue_cutoff) + + query_hits = next(hits_iter) # only one sequence + + for hit in query_hits: + evalue = hit.evalue + score = evalue_to_score(evalue) + return [self.result(label=self.label, score=score, metadata={"evalue": evalue})] + + return [] diff --git a/src/biocracker/query/modules.py b/src/biocracker/query/modules.py index 494b8ad..e6d9c0b 100644 --- a/src/biocracker/query/modules.py +++ b/src/biocracker/query/modules.py @@ -8,7 +8,7 @@ from collections import Counter from dataclasses import dataclass, field from enum import Enum -from typing import Any +from typing import Any, Literal, overload from biocracker.model.region import Region from biocracker.model.gene import Gene, Strand @@ -450,15 +450,17 @@ def setup_substrate(extender_unit: PKSExtenderUnit) -> PKSSubstrate: # - KS + AT + KR (no DH and no ER) => PKS_B (KR after AT is naturally true in window order) # - KS + AT + KR + DH (no ER) => PKS_C # - KS + AT + KR + DH + ER => PKS_D + # - else UNCLASSIFIED + # Note: assumes that presence of AT domain is already established match ( self.anatomy.has_active_KR, self.anatomy.has_active_DH, self.anatomy.has_active_ER, ): - case 
(False, False, False ): return setup_substrate(PKSExtenderUnit.PKS_A) - case (True, False, False ): return setup_substrate(PKSExtenderUnit.PKS_B) - case (True, True, False ): return setup_substrate(PKSExtenderUnit.PKS_C) case (True, True, True ): return setup_substrate(PKSExtenderUnit.PKS_D) + case (True, True, False ): return setup_substrate(PKSExtenderUnit.PKS_C) + case (True, _, _ ): return setup_substrate(PKSExtenderUnit.PKS_B) # presence of ER doesn't matter if DH is not present + case (False, _, _ ): return setup_substrate(PKSExtenderUnit.PKS_A) # presence of DH/ER doesn't matter if no KR case _: return setup_substrate(PKSExtenderUnit.UNCLASSIFIED) def to_dict(self) -> dict[str, Any]: @@ -519,6 +521,7 @@ class LinearReadout: qualifiers: dict[str, Any] = field(default_factory=dict) modules: list[Module] = field(default_factory=list) + modifiers: list[str] = field(default_factory=list) def __str__(self) -> str: """ @@ -540,6 +543,7 @@ def to_dict(self) -> dict[str, Any]: "end": self.end, "qualifiers": self.qualifiers, "modules": [module.to_dict() for module in self.modules], + "modifiers": self.modifiers, } @classmethod @@ -568,14 +572,20 @@ def from_dict(cls, data: dict[str, Any]) -> "LinearReadout": end=data["end"], qualifiers=data.get("qualifiers", {}), modules=modules, + modifiers=data.get("modifiers", []), ) - def biosynthetic_order(self) -> list[Module]: + @overload + def biosynthetic_order(self, by_orf: Literal[False] = False) -> list[Module]: ... + @overload + def biosynthetic_order(self, by_orf: Literal[True] = True) -> list[tuple[str, list[Module]]]: ... + + def biosynthetic_order(self, by_orf: bool = False): """ Return modules in biosynthetic order. 
- :param orientation: 'forward' for 5' to 3', 'reverse' for 3' to 5' - :return: list of Module objects in the specified order + :param by_orf: if True, group modules by their originating gene (ORF) + :return: list of Module objects in biosynthetic order, or list of tuples (gene_id, list of Module) if by_orf is True """ if not self.modules: return [] @@ -604,6 +614,19 @@ def biosynthetic_order(self) -> list[Module]: reverse=global_reverse, ) + if by_orf: + grouped: list[tuple[str, list[Module]]] = [] + for gid in gene_ids: + mods = by_gene[gid] + if gene_strand[gid] is Strand.FORWARD: + mods_sorted = sorted(mods, key=lambda m: m.start) + else: + mods_sorted = sorted(mods, key=lambda m: m.start, reverse=True) + grouped.append((gid, mods_sorted)) + + return grouped + + # Flatten modules in biosynthetic order out: list[Module] = [] for gid in gene_ids: mods = by_gene[gid] @@ -1180,6 +1203,7 @@ def linear_readout(region: Region) -> LinearReadout: assert isinstance(region, Region), "region must be an instance of Region" collected: list[Module] = [] + modifiers: list[str] = [] for gi, gene in enumerate(region.iter_genes()): @@ -1191,10 +1215,17 @@ def linear_readout(region: Region) -> LinearReadout: pks_modules = collect_pks_modules(gene, gi, region.genes) collected.extend(pks_modules) + # Check if there are any gene-level modifiers + if gene.annotations: + for result in gene.annotations.results: + label = result.label + modifiers.append(label) + return LinearReadout( id=region.id, start=region.start, end=region.end, qualifiers=region.qualifiers, - modules=collected + modules=collected, + modifiers=modifiers, ) From 850e1d776c8018177ef57191c404eb754799f765 Mon Sep 17 00:00:00 2001 From: David Meijer Date: Mon, 29 Dec 2025 23:16:20 +0100 Subject: [PATCH 09/12] ENH: make pks module readout cross-gene regionwide --- scripts/example.py | 4 +- src/biocracker/query/modules.py | 510 +++++++++----------------------- 2 files changed, 150 insertions(+), 364 deletions(-) diff --git 
a/scripts/example.py b/scripts/example.py index e244ca3..4b5208b 100644 --- a/scripts/example.py +++ b/scripts/example.py @@ -41,9 +41,9 @@ def pprint_module(module: PKSModule | NRPSModule) -> str: :return: string representation of the module """ if isinstance(module, PKSModule): - return f"PKS Module: role={module.role}, extender_unit={module.substrate.extender_unit.value}, AT_loading_mode={module.anatomy.AT_loading_mode}" + return f"PKS Module: extender_unit={module.substrate.extender_unit.value}, AT_loading_mode={module.anatomy.AT_loading_mode}" elif isinstance(module, NRPSModule): - return f"NRPS Module: role={module.role}, substrate={module.substrate.name}" + return f"NRPS Module: substrate={module.substrate.name}" else: return "Unknown Module" diff --git a/src/biocracker/query/modules.py b/src/biocracker/query/modules.py index e6d9c0b..171d4a6 100644 --- a/src/biocracker/query/modules.py +++ b/src/biocracker/query/modules.py @@ -14,15 +14,15 @@ from biocracker.model.gene import Gene, Strand from biocracker.model.domain import Domain - -PKS_TYPES = { - "PKS_KS", - "PKS_AT", - "PKS_KR", - "PKS_DH", - "PKS_ER", -} +DH_TYPES = {"PKS_DH", "PKS_DHt", "PKS_DH2"} +KR_TYPES = {"PKS_KR"} +ER_TYPES = {"PKS_ER"} +KS_TYPES = {"PKS_KS"} +AT_TYPES = {"PKS_AT"} +PKS_TYPES = KS_TYPES | AT_TYPES | KR_TYPES | DH_TYPES | ER_TYPES PKS_TE_ALIASES = {"Thioesterase", "PKS_TE", "TE"} +PKS_ACCESSORY = KR_TYPES | DH_TYPES | ER_TYPES +PKS_ANCHOR = KS_TYPES # Common NRPS domain labels found in antiSMASH outputs @@ -36,6 +36,19 @@ NRPS_TE = "Thioesterase" +@dataclass(frozen=True) +class DomainRef: + """ + Reference to a domain within a gene. + + :param gene: Gene object containing the domain + :param domain: Domain object within the gene + """ + + gene: Gene + domain: Domain + + class ModuleType(Enum): """ Enumeration of module types. @@ -48,25 +61,6 @@ class ModuleType(Enum): PKS = "PKS" - -class ModuleRole(Enum): - """ - Enumeration of module roles. 
- - :cvar STARTER: Starter module - :cvar ELONGATION: Elongation module - :cvar TERMINAL: Terminal module - :cvar STARTER_TERMINAL: Starter and terminal module - :cvar UNKNOWN: Unknown role - """ - - STARTER = "starter" - ELONGATION = "elongation" - TERMINAL = "terminal" - STARTER_TERMINAL = "starter+terminal" - UNKNOWN = "unknown" - - @dataclass class Module(ABC): """ @@ -78,7 +72,6 @@ class Module(ABC): :param gene_id: ID of the gene containing the module :param gene_strand: strand of the gene containing the module :param present_domains: list of domain types present in the module - :param role: functional role of the module """ module_index_in_gene: int start: int @@ -86,7 +79,6 @@ class Module(ABC): gene_id: str gene_strand: Strand present_domains: list[str] - role: ModuleRole @property @abstractmethod @@ -342,12 +334,10 @@ class NRPSModule(Module): """ Nonribosomal peptide synthetase (NRPS) module. - :param role: functional role of the module :param anatomy: anatomical features of the NRPS module :param substrate: predicted substrate information for the NRPS module """ - role: ModuleRole anatomy: NRPSAnatomy predicted_substrate: NRPSSubstrate | None = None @@ -383,7 +373,6 @@ def to_dict(self) -> dict[str, Any]: "gene_id": self.gene_id, "gene_strand": self.gene_strand.value, "present_domains": self.present_domains, - "role": self.role.value, "anatomy": self.anatomy.to_dict(), "predicted_substrate": self.predicted_substrate.to_dict() if self.predicted_substrate else None, } @@ -406,7 +395,6 @@ def from_dict(cls, data: dict[str, Any]) -> "NRPSModule": gene_id=data["gene_id"], gene_strand=Strand(data["gene_strand"]), present_domains=data["present_domains"], - role=ModuleRole(data["role"]), anatomy=NRPSAnatomy.from_dict(anatomy_data), predicted_substrate=NRPSSubstrate.from_dict(substrate_data) if substrate_data else None, ) @@ -418,11 +406,9 @@ class PKSModule(Module): Polyketide synthase (PKS) module. 
:param type: module type (PKS) - :param role: functional role of the module :param anatomy: anatomical features of the PKS module """ - role: ModuleRole anatomy: PKSAnatomy @property @@ -452,16 +438,22 @@ def setup_substrate(extender_unit: PKSExtenderUnit) -> PKSSubstrate: # - KS + AT + KR + DH + ER => PKS_D # - else UNCLASSIFIED # Note: assumes that presence of AT domain is already established - match ( - self.anatomy.has_active_KR, - self.anatomy.has_active_DH, - self.anatomy.has_active_ER, - ): - case (True, True, True ): return setup_substrate(PKSExtenderUnit.PKS_D) - case (True, True, False ): return setup_substrate(PKSExtenderUnit.PKS_C) - case (True, _, _ ): return setup_substrate(PKSExtenderUnit.PKS_B) # presence of ER doesn't matter if DH is not present - case (False, _, _ ): return setup_substrate(PKSExtenderUnit.PKS_A) # presence of DH/ER doesn't matter if no KR - case _: return setup_substrate(PKSExtenderUnit.UNCLASSIFIED) + + # True activity from qualifiers + KR = self.anatomy.has_active_KR + DH = self.anatomy.has_active_DH + ER = self.anatomy.has_active_ER + + # Product state logic + eff_DH = DH and KR # DH needs KR product to act in canonical cycle + eff_ER = ER and KR and DH # ER typically needs DH product (enoyl) + + match (KR, eff_DH, eff_ER): + case (False, _, _ ): return setup_substrate(PKSExtenderUnit.PKS_A) + case (True, False, _ ): return setup_substrate(PKSExtenderUnit.PKS_B) + case (True, True, False): return setup_substrate(PKSExtenderUnit.PKS_C) + case (True, True, True ): return setup_substrate(PKSExtenderUnit.PKS_D) + case _: return setup_substrate(PKSExtenderUnit.UNCLASSIFIED) def to_dict(self) -> dict[str, Any]: """ @@ -477,7 +469,6 @@ def to_dict(self) -> dict[str, Any]: "gene_id": self.gene_id, "gene_strand": self.gene_strand.value, "present_domains": self.present_domains, - "role": self.role.value, "anatomy": self.anatomy.to_dict(), } @@ -498,7 +489,6 @@ def from_dict(cls, data: dict[str, Any]) -> "PKSModule": 
gene_id=data["gene_id"], gene_strand=Strand(data["gene_strand"]), present_domains=data["present_domains"], - role=ModuleRole(data["role"]), anatomy=PKSAnatomy.from_dict(anatomy_data), ) @@ -637,16 +627,6 @@ def biosynthetic_order(self, by_orf: bool = False): out.extend(mods_sorted) return out - - -def _domain_index_by_obj(doms: list[Domain]) -> dict[int, int]: - """ - Helper function to create a mapping from Domain object IDs to their indices in a list. - - :param doms: list of Domain objects - :return: dictionary mapping Domain object IDs to their indices - """ - return {id(d): i for i, d in enumerate(doms)} def _domain_types(domains: list[Domain]) -> set[str]: @@ -676,101 +656,25 @@ def _is_domain_type(domain: Domain, label: str | set[str]) -> bool: return domain.type == label -def _is_Cstarter(domain: Domain) -> bool: - """ - Determine if a condensation domain is a C-starter domain based on its qualifiers. - - :param domain: Domain object to evaluate - :return: True if the domain is a C-starter, False otherwise - """ - if not domain.type or domain.type != "Condensation": - return False - - txts = [] - if domain.id: - txts.append(domain.id) - - for _, vals in domain.raw_qualifiers.items(): - # Join lists and scalars; qualifiers may be list[str] - if isinstance(vals, (list, tuple)): - txts.extend(map(str, vals)) - else: - txts.append(str(vals)) - - blob = " ".join(txts).lower() - - return ("starter" in blob) or ("cstarter" in blob) or ("condensation_starter" in blob) - - -def _upstream_loading_cassette(all_genes: list[Gene], gene_idx_in_genomic_order: int, max_bp: int = 20_000) -> bool: +def _is_pks_ks(d: Domain) -> bool: """ - Check for upstream loading cassette (CAL + ACP) in upstream genes within max_bp distance. 
- - :param all_genes: list of all Gene objects in the region/cluster - :param gene_idx_in_genomic_order: index of the gene gene in all_genes - :param max_bp: maximum base pair distance to search upstream - :return: True if a loading cassette is found upstream within max_bp, False otherwise - """ - cur_start = all_genes[gene_idx_in_genomic_order].start - - seen_cal = False - seen_acp = False - for j in range(gene_idx_in_genomic_order - 1, -1, -1): - g = all_genes[j] - if cur_start - g.end > max_bp: - break # exceeded max distance - types = _domain_types(g.domains) - d_ids = {d.id for d in g.domains if d.id} - if ("CAL_domain" in types) or any("faal" in d_id.lower() for d_id in d_ids): - seen_cal = True - if ("PP-binding" in types) or ("ACP" in types) or any("acp" in d_id.lower() for d_id in d_ids): - seen_acp = True - if seen_cal and seen_acp: - return True + Check if a domain is a PKS KS domain. - return False - - -def _upstream_has_nrps_A(all_genes: list[Gene], gene_idx_in_genomic_order: int) -> bool: + :param d: Domain object to check + :return: True if the domain is a PKS KS domain, False otherwise """ - Check if there is an upstream gene with an NRPS A-domain. - - :param all_genes: list of all Gene objects in the region/cluster - :param gene_idx_in_genomic_order: index of the gene gene in all_genes - :return: True if there is an upstream NRPS A-domain, False otherwise - """ - for j in range(gene_idx_in_genomic_order - 1, -1, -1): - if any(_is_domain_type(d, NRPS_A) for d in all_genes[j].domains): - return True - - return False + return d.type == "PKS_KS" -def _split_module_on_KS(domains: list[Domain]) -> list[list[Domain]]: +def _is_pks_domain(d: Domain) -> bool: """ - Split a list of domains into windows based on PKS KS domains. 
- - :param domains: List of Domain objects - :return: List of lists of Domain objects, each representing a module window - """ - windows: list[list[Domain]] = [] - cur: list[Domain] = [] - - for d in domains: - if d.type == "PKS_KS": - # Start new module window anchored at this KS - if cur: - windows.append(cur) - cur = [d] - else: - if cur: # only append if we have started a module - cur.append(d) - - if cur: - windows.append(cur) + Check if a domain is a PKS domain. - return windows - + :param d: Domain object to check + :return: True if the domain is a PKS domain, False otherwise + """ + return d.type in PKS_TYPES or (d.type in PKS_TE_ALIASES) + def _is_active_accessory_domain(domain: Domain) -> bool: """ @@ -782,7 +686,7 @@ def _is_active_accessory_domain(domain: Domain) -> bool: if not domain.type: return True # can't tell, assume active - if domain.type not in {"PKS_KR", "PKS_DH", "PKS_ER"}: + if domain.type not in PKS_ACCESSORY: return True # not a reducible domain, consider active by default texts = [] @@ -827,23 +731,13 @@ def _classify_pks_window(window: list[Domain]) -> tuple[set[str], bool, bool, bo present = set(types_linear) has_AT = "PKS_AT" in present - has_active_KR = any("PKS_KR" in present and _is_active_accessory_domain(d) for d in window if d.type == "PKS_KR") - has_active_DH = any("PKS_DH" in present and _is_active_accessory_domain(d) for d in window if d.type == "PKS_DH") - has_active_ER = any("PKS_ER" in present and _is_active_accessory_domain(d) for d in window if d.type == "PKS_ER") + has_active_KR = any(d.type in KR_TYPES and _is_active_accessory_domain(d) for d in window) + has_active_DH = any(d.type in DH_TYPES and _is_active_accessory_domain(d) for d in window) + has_active_ER = any(d.type in ER_TYPES and _is_active_accessory_domain(d) for d in window) return present, has_active_KR, has_active_DH, has_active_ER, has_AT -def _window_bounds(window: list[Domain]) -> tuple[int, int]: - """ - Get the start and end positions of a domain 
window. - - :param window: list of Domain objects in the module window - :return: tuple of (start, end) positions - """ - return min(d.start for d in window), max(d.end for d in window) - - def _is_AT_only_gene(gene: Gene) -> bool: """ Helper function to determine if a gene is an acyltransferase-domain-only gene. @@ -870,133 +764,17 @@ def _find_genomic_upstream_AT_only_gene(all_genes: list[Gene], gene_idx_in_genom return None -def _upstream_has_pks_KS( - all_genes: list[Gene], - gene_idx_in_genomic_order: int, - doms: list[Domain], - ks_domain: Domain, -) -> bool: +def genes_biosynthetic(region: Region) -> list[Gene]: """ - Check if there is an upstream gene with a PKS KS-domain. - - :param all_genes: list of all Gene objects in the region/cluster - :param gene_idx_in_genomic_order: index of the gene gene in all_genes - :param doms: list of Domain objects in the current gene, sorted for biosynthetic order - :param ks_domain: the KS Domain object to check upstream of - :return: True if there is an upstream KS-domain, False otherwise - """ - # Genes upstream - for j in range(gene_idx_in_genomic_order -1, -1, -1): - if any(d.type == "PKS_KS" for d in all_genes[j].domains): - return True - - # Same gene: any KS earlier in biosynthetic order than this KS? - idx = _domain_index_by_obj(doms) - ks_i = idx[id(ks_domain)] - return any(d.type == "PKS_KS" and idx[id(d)] < ks_i for d in doms) - - -def _standalone_pks_AT_upstream( - all_genes: list[Gene], - gene_idx_in_genomic_order: int, - doms: list[Domain], - ks_domain: Domain, - max_bp: int = 20_000 -) -> bool: - """ - Check for standalone PKS AT domain in upstream genes within max_bp distance. 
- - :param all_genes: list of all Gene objects in the region/cluster - :param gene_idx_in_genomic_order: index of the gene gene in all_genes - :param doms: list of Domain objects in the current gene, sorted for biosynthetic order - :param ks_domain: the KS Domain object to check upstream of - :param max_bp: maximum base pair distance to search upstream - :return: True if a standalone PKS AT domain is found upstream within max_bp, False otherwise - """ - # Same gene: any AT earlier than this KS in biosynthetic order? - idx = _domain_index_by_obj(doms) - ks_i = idx[id(ks_domain)] - if any(d.type == "PKS_AT" and idx[id(d)] < ks_i for d in doms): - return True - - # Other genes upstream, within distance (still genomic) - ks_start = ks_domain.start - cur_start = ks_start - for j in range(gene_idx_in_genomic_order - 1, -1, -1): - g = all_genes[j] - if cur_start - g.end > max_bp: - break - if any(d.type == "PKS_AT" for d in g.domains): - return True - - return False + Return genes in biosynthetic order within a region. - -def _is_last_global_KS( - all_genes: list[Gene], - gene_idx_in_genomic_order: int, - doms: list[Domain], - ks_domain: Domain, -) -> bool: - """ - Check if the given KS domain is the last KS domain in the entire gene cluster/region. - - :param all_genes: list of all Gene objects in the region/cluster - :param gene_idx_in_genomic_order: index of the gene gene in all_genes - :param doms: list of Domain objects in the current gene, sorted for biosynthetic order - :param ks_domain: the KS Domain object to check - :return: True if this is the last KS domain, False otherwise + :param region: Region object + :return: list of Gene objects in biosynthetic order """ - # Same gene: any KS later in biosynthetic order? 
- idx = _domain_index_by_obj(doms) - ks_i = idx[id(ks_domain)] - if any(d.type == "PKS_KS" and idx[id(d)] > ks_i for d in doms): - return False - - # Downstream genes (genomic list order) - for j in range(gene_idx_in_genomic_order + 1, len(all_genes)): - if any(d.type == "PKS_KS" for d in all_genes[j].domains): - return False - - return True - - -def _genomic_downstream_has_TE( - all_genes: list[Gene], - gene_idx_in_genomic_order: int, - doms: list[Domain], - win: list[Domain], - max_bp: int = 20_000 -) -> bool: - """ - Check for downstream thioesterase (TE) domain in downstream genes within max_bp distance. - - :param all_genes: list of all Gene objects in the region/cluster - :param gene_idx_in_genomic_order: index of the gene gene in all_genes - :param doms: list of Domain objects in the current gene, sorted for biosynthetic order - :param win: current module window (list of Domain objects) - :param max_bp: maximum base pair distance to search downstream - :return: True if a thioesterase domain is found downstream within max_bp, False otherwise - """ - idx = _domain_index_by_obj(doms) - last_i = idx[id(win[-1])] - - # Same gene: any TE later than the window end in biosynthetic order? 
- if any(d.type in PKS_TE_ALIASES and idx[id(d)] > last_i for d in doms): - return True - - # Other genes downstream, within distance (still genomic) - from_bp = max(d.end for d in win) # genomic coordinate for distance window - cur_end = from_bp - for j in range(gene_idx_in_genomic_order + 1, len(all_genes)): - gene = all_genes[j] - if gene.start - cur_end > max_bp: - break # exceeded max distance - - if any(d.type in PKS_TE_ALIASES for d in gene.domains): - return True - - return False + genes = list(region.iter_genes()) + strand_counts = Counter(g.strand for g in genes) + global_reverse = strand_counts[Strand.REVERSE] > strand_counts[Strand.FORWARD] + return sorted(genes, key=lambda g: g.start, reverse=global_reverse) def domains_biosynthetic(gene: Gene) -> list[Domain]: @@ -1014,14 +792,27 @@ def domains_biosynthetic(gene: Gene) -> list[Domain]: return doms -def collect_nrps_modules(gene: Gene, gene_idx_in_genomic_order: int, all_genes: list[Gene]) -> list[NRPSModule]: +def region_domain_stream(region: Region) -> list[DomainRef]: + """ + Return domains in biosynthetic order within a region. + + :param region: Region object + :return: list of DomainRef objects in biosynthetic order + """ + out: list[DomainRef] = [] + for g in genes_biosynthetic(region): + for d in domains_biosynthetic(g): + out.append(DomainRef(gene=g, domain=d)) + + return out + + +def collect_nrps_modules(gene: Gene) -> list[NRPSModule]: """ Collect NRPS modules from a given gene. 
:param gene: Gene object to analyze - :param gene_idx_in_genomic_order: index of the gene in the region's gene list - :param all_genes: List of all genes in the region - :return: List of NRPSModule objects""" + :return: list of NRPSModule objects""" doms: list[Domain] = domains_biosynthetic(gene) out: list[NRPSModule] = [] @@ -1043,7 +834,6 @@ def collect_nrps_modules(gene: Gene, gene_idx_in_genomic_order: int, all_genes: present = _domain_types(window) has_C = any(_is_domain_type(d, NRPS_C) for d in window) - has_Cstarter = any(_is_Cstarter(d) for d in window) has_T = any(_is_domain_type(d, NRPS_T_ALIASES) for d in window) has_E = any(_is_domain_type(d, NRPS_E) for d in window) has_MT = any(_is_domain_type(d, NRPS_MT_ALIASES) for d in window) @@ -1051,28 +841,6 @@ def collect_nrps_modules(gene: Gene, gene_idx_in_genomic_order: int, all_genes: has_R = any(_is_domain_type(d, NRPS_R_ALIASES) for d in window) has_TE = any(_is_domain_type(d, NRPS_TE) for d in window) - # Fallback evidence of a separate loading cassette upstream - loading_upstream = _upstream_loading_cassette(all_genes, gene_idx_in_genomic_order) - upstream_has_A = _upstream_has_nrps_A(all_genes, gene_idx_in_genomic_order) - - # Role heuristic - is_first_module_in_gene = mi == 0 - - starter = ( - has_Cstarter - or (is_first_module_in_gene and loading_upstream and not upstream_has_A) - or ((not has_C) and not upstream_has_A) - ) - terminal = has_TE or has_R - - def _get_module_role(starter: bool, terminal: bool) -> ModuleRole: - match (starter, terminal): - case (True, True ): return ModuleRole.STARTER_TERMINAL - case (True, False): return ModuleRole.STARTER - case (False, True ): return ModuleRole.TERMINAL - case (False, False): return ModuleRole.ELONGATION - role: ModuleRole = _get_module_role(starter, terminal) - s = min(d.start for d in window) e = max(d.end for d in window) @@ -1103,7 +871,6 @@ def _get_module_role(starter: bool, terminal: bool) -> ModuleRole: gene_id=gene.id, 
gene_strand=gene.strand, present_domains=list(present), - role=role, anatomy=NRPSAnatomy( has_C=has_C, has_T=has_T, @@ -1119,69 +886,90 @@ def _get_module_role(starter: bool, terminal: bool) -> ModuleRole: return out -def collect_pks_modules(gene: Gene, gene_idx_in_genomic_order: int, all_genes: list[Gene]) -> list[PKSModule]: +def collect_pks_modules(region: Region, max_cross_gene_bp: int = 20_000) -> list[PKSModule]: """ - Collect PKS modules from a given gene. - - :param gene: Gene object to analyze - :param gene_idx_in_genomic_order: index of the gene in the region's gene list - :param all_genes: list of all genes in the region - :return: list of PKSModule objects + Collect PKS modules across a genomic region, allowing for cross-gene module assembly. + + :param region: Region object representing the genomic region + :param max_cross_gene_bp: maximum base pair distance to search across genes for module assembly + :return: list of PKSModule objects collected across the region """ - out: list[PKSModule] = [] + stream = region_domain_stream(region) - if all(d.type != "PKS_KS" for d in gene.domains): - return out # no KS domains, no modules + # Locate all KS anchors in the stream + ks_pos = [i for i, ref in enumerate(stream) if _is_pks_ks(ref.domain)] + if not ks_pos: + return [] # no KS domains, no modules - doms = domains_biosynthetic(gene) - windows = _split_module_on_KS(doms) - for mi, win in enumerate(windows): + out: list[PKSModule] = [] + module_index_by_gene: dict[str, int] = Counter() + + for k_i, start_idx in enumerate(ks_pos): + end_idx = ks_pos[k_i + 1] if k_i + 1 < len(ks_pos) else len(stream) + ks_ref = stream[start_idx] + ks = ks_ref.domain + + # Cancidate window: KS -> next KS (exclusive) + window_refs = stream[start_idx:end_idx] + + # Don't vaccum up far-away stuff + filtered: list[DomainRef] = [] + ks_end = ks.end + for ref in window_refs: + d = ref.domain + if d is ks: + filtered.append(ref) + continue + if abs(d.start - ks_end) <= 
max_cross_gene_bp: + filtered.append(ref) + else: + # Too far away; stop early + break + + # Collect PKS domains in the window + window_domains = [r.domain for r in filtered if _is_pks_domain(r.domain)] ( present, has_active_KR, has_active_DH, has_active_ER, - has_AT, - ) = _classify_pks_window(win) - - s, e = _window_bounds(win) + has_AT + ) = _classify_pks_window(window_domains) + # Determine AT mode (cis or trans) + # Note that upstream should be upstream in gene list here (genomic order) + genes = list(region.iter_genes()) + gene_idx = genes.index(ks_ref.gene) if has_AT: AT_src: ATLoadingMode = ATLoadingMode.CIS else: - AT_src: ATLoadingMode = ( - ATLoadingMode.TRANS - if _find_genomic_upstream_AT_only_gene(all_genes, gene_idx_in_genomic_order) is not None + AT_src = ( + ATLoadingMode.TRANS + if _find_genomic_upstream_AT_only_gene(genes, gene_idx) else ATLoadingMode.UNKNOWN ) - # Assign provisional PKS role - has_TE_in_window = any(d.type in PKS_TE_ALIASES for d in win) - KS_domain = win[0] # first domain in window is KS since we split on KS - upstream_has_KS = _upstream_has_pks_KS(all_genes, gene_idx_in_genomic_order, doms, KS_domain) - starter = _standalone_pks_AT_upstream(all_genes, gene_idx_in_genomic_order, doms, KS_domain) and not upstream_has_KS - - terminal_by_TE = False - if _is_last_global_KS(all_genes, gene_idx_in_genomic_order, doms, KS_domain): - terminal_by_TE = has_TE_in_window or _genomic_downstream_has_TE(all_genes, gene_idx_in_genomic_order, doms, win) - - def _get_module_role(starter: bool, terminal_by_TE: bool) -> ModuleRole: - match (starter, terminal_by_TE): - case (True, True ): return ModuleRole.STARTER_TERMINAL - case (True, False): return ModuleRole.STARTER - case (False, True ): return ModuleRole.TERMINAL - case (False, False): return ModuleRole.ELONGATION - role: ModuleRole = _get_module_role(starter, terminal_by_TE) - - s, e = _window_bounds(win) + # DHt is more commonly found in trans PKS modules, so we treat it as inactive in cis 
modules + if AT_src is ATLoadingMode.CIS: + present_DH_types = present.intersection(DH_TYPES) + if len(present_DH_types) == 1 and "PKS_DHt" in present_DH_types: + has_active_DH = False + + # Use window_domains bounds for start/end + s = min(r.domain.start for r in filtered) + e = max(r.domain.end for r in filtered) + + gid = ks_ref.gene.id + mi = module_index_by_gene[gid] + module_index_by_gene[gid] += 1 + out.append(PKSModule( module_index_in_gene=mi, start=s, end=e, - gene_id=gene.id, - gene_strand=gene.strand, + gene_id=gid, + gene_strand=ks_ref.gene.strand, present_domains=list(present), - role=role, anatomy=PKSAnatomy( AT_loading_mode=AT_src, has_active_KR=has_active_KR, @@ -1205,17 +993,15 @@ def linear_readout(region: Region) -> LinearReadout: collected: list[Module] = [] modifiers: list[str] = [] - for gi, gene in enumerate(region.iter_genes()): - - # Collect NRPS modules - nrps_modules = collect_nrps_modules(gene, gi, region.genes) - collected.extend(nrps_modules) + # Collect NRPS modules (gene-level) + for gene in region.iter_genes(): + collected.extend(collect_nrps_modules(gene)) - # Collect PKS modules - pks_modules = collect_pks_modules(gene, gi, region.genes) - collected.extend(pks_modules) + # Collect PKS modules region-wide (cross-gene) + collected.extend(collect_pks_modules(region)) - # Check if there are any gene-level modifiers + # Check if there are any gene-level modifiers + for gene in region.iter_genes(): if gene.annotations: for result in gene.annotations.results: label = result.label From 52c59a3cc9f159fc065ea22470ba543f9f4b819f Mon Sep 17 00:00:00 2001 From: David Meijer Date: Tue, 30 Dec 2025 00:18:07 +0100 Subject: [PATCH 10/12] UPD --- pyproject.toml | 3 +- scripts/02_parse_gbks.py | 145 +++++++++++++++++++++++++++++++-------- 2 files changed, 118 insertions(+), 30 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b101214..27c2529 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,8 @@ keywords = ["parser", 
"antismash"] dependencies = [ "biopython", "joblib", - "tqdm" + "tqdm", + "pyhmmer" ] classifiers = [ diff --git a/scripts/02_parse_gbks.py b/scripts/02_parse_gbks.py index 107965d..657f185 100644 --- a/scripts/02_parse_gbks.py +++ b/scripts/02_parse_gbks.py @@ -7,6 +7,8 @@ import time import logging from pathlib import Path +from concurrent.futures import ProcessPoolExecutor, as_completed +from typing import Generator from biocracker.utils.logging import setup_logging, add_file_handler from biocracker.io.readers import load_regions @@ -20,6 +22,10 @@ log = logging.getLogger(__name__) +_WORKER_OPTIONS = None +_WORKER_READY = False + + def cli() -> argparse.Namespace: """ Command line interface for parsing and annotating GenBank files. @@ -31,52 +37,133 @@ def cli() -> argparse.Namespace: parser.add_argument("--out", type=str, required=True, help="output directory") parser.add_argument("--cache", type=str, required=True, help="cache directory") parser.add_argument("--hmms", type=str, required=False, help="directory with HMMs for gene models") + parser.add_argument("--workers", type=int, default=1, help="number of parallel workers to use") return parser.parse_args() -def main() -> None: +def iter_gbks(folder: str) -> Generator[str, None, None]: """ - Main function to parse and annotate GenBank files. + Iterate over all GenBank files in a folder. + + :param folder: folder path + :yield: paths to GenBank files """ - t0 = time.time() + with os.scandir(folder) as it: + for e in it: + if e.is_file() and e.name.endswith(".gbk"): + yield e.path - args = cli() - os.makedirs(args.out, exist_ok=True) - setup_logging(level="INFO") - add_file_handler(os.path.join(args.out, "parse_gbks.log"), level="INFO") +def _init_worker(cache_dir: str, hmms_dir: str | None) -> None: + """ + Initialize a worker process by setting up its own cache and registering models. 
+ + :param cache_dir: base cache directory + :param hmms_dir: directory containing HMM files for gene models (or None) + """ + global _WORKER_OPTIONS, _WORKER_READY - register_domain_model(ParasModel(cache_dir=args.cache, threshold=0.1, keep_top=3)) + per_worker_cache = os.path.join(cache_dir, f"worker_{os.getpid()}") + os.makedirs(per_worker_cache, exist_ok=True) - if args.hmms: - # Find all .hmm files in the provided directory; use filename (without extension) as label - hmm_files = glob.glob(os.path.join(args.hmms, "*.hmm")) + register_domain_model(ParasModel(cache_dir=per_worker_cache, threshold=0.1, keep_top=3)) + + if hmms_dir: + hmm_files = glob.glob(os.path.join(hmms_dir, "*.hmm")) for hmm_file in hmm_files: label = Path(os.path.basename(hmm_file)).stem register_gene_model(PfamModel(hmm_path=hmm_file, label=label)) - log.info(f"registered domain models: {list(DOMAIN_MODELS)}") - log.info(f"registered gene models: {list(GENE_MODELS)}") + _WORKER_OPTIONS = AntiSmashOptions(readout_level="cand_cluster") + _WORKER_READY = True + + +def _process_one_gbk(gbk_file: str) -> list[str]: + """ + Process a single GenBank file: load regions, annotate them, and return JSON lines. + + :param gbk_file: path to the GenBank file + :return: list of JSON strings representing annotated regions + """ + if not _WORKER_READY: + raise RuntimeError("worker not initialized; call _init_worker first") + + regions = load_regions(gbk_file, _WORKER_OPTIONS) - options = AntiSmashOptions(readout_level="cand_cluster") + lines: list[str] = [] + for region in regions: + annotate_region(region) + lines.append(json.dumps(region.to_dict())) - gbk_iter = glob.iglob(f"{args.gbks}/*.gbk") + return lines + + +def main() -> None: + """ + Main function to parse and annotate GenBank files. 
+ """ + t0 = time.time() + args = cli() + + os.makedirs(args.out, exist_ok=True) + os.makedirs(args.cache, exist_ok=True) + + setup_logging(level="INFO") + add_file_handler(os.path.join(args.out, "parse_gbks.log"), level="INFO") out_jsonl = os.path.join(args.out, "regions.jsonl") - with open(out_jsonl, "w") as out_f: - for gbk_file in gbk_iter: - regions = load_regions(gbk_file, options) - for region in regions: - annotate_region(region) - out_f.write(json.dumps(region.to_dict()) + "\n") - - te = time.time() - elapsed = te - t0 - elapsed_mins = elapsed / 60.0 - elapsed_hrs = elapsed_mins / 60.0 - log.info(f"total time elapsed: {elapsed:.2f} seconds") - log.info(f"total time elapsed: {elapsed_mins:.2f} minutes") - log.info(f"total time elapsed: {elapsed_hrs:.2f} hours") + + log.info(f"workers: {args.workers}") + log.info(f"gbk dir: {args.gbks}") + log.info(f"out: {args.out}") + log.info(f"cache: {args.cache}") + + completed_files = 0 + + # Tune chunks by letting executor pull tasks naturally; keep a bounded number in flight + max_in_flight = args.workers * 5 + + gbk_iter = iter_gbks(args.gbks) + + with open(out_jsonl, "w") as out_f, ProcessPoolExecutor( + max_workers=args.workers, + initializer=_init_worker, + initargs=(args.cache, args.hmms), + ) as ex: + futures = set() + + # Prime the queue + try: + for _ in range(max_in_flight): + gbk_file = next(gbk_iter) + futures.add(ex.submit(_process_one_gbk, gbk_file)) + except StopIteration: + pass + + while futures: + # Write each result as it completes, and add a new task if available + for fut in as_completed(futures): + futures.remove(fut) + lines = fut.result() + if lines: + out_f.write("\n".join(lines) + "\n") + completed_files += 1 + + # Submit next + try: + gbk = next(gbk_iter) + futures.add(ex.submit(_process_one_gbk, gbk)) + except StopIteration: + pass + + if completed_files % 1000 == 0: + log.info(f"completed {completed_files} GenBank files") + + break # exit for-loop to re-evaluate futures + + t1 = 
time.time() + log.info(f"process files total: {completed_files}") + log.info(f"total time: {t1 - t0:.1f} seconds ({(t1 - t0) / completed_files:.2f} sec/file)") if __name__ == "__main__": From 4d56aa69f778072a5ce151f31c4a2570bbe8a517 Mon Sep 17 00:00:00 2001 From: David Meijer Date: Tue, 30 Dec 2025 01:35:00 +0100 Subject: [PATCH 11/12] UPD: multiprocessing --- pyproject.toml | 7 ++- scripts/02_parse_gbks.py | 29 ++++++--- scripts/example.py | 2 +- src/biocracker/inference/model_paras.py | 83 ++++++++++++++++++++----- 4 files changed, 93 insertions(+), 28 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 27c2529..d34f695 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "biocracker" version = "1.1.1" description = "BioCracker is a parser for antiSMASH output GenBank files" readme = "README.md" -requires-python = ">=3.10" +requires-python = ">=3.10,<3.12" license = "MIT" license-files = [ "LICENSE", @@ -14,10 +14,12 @@ authors = [ keywords = ["parser", "antismash"] dependencies = [ + "paras @ git+https://github.com/bthedragonmaster/parasect.git@v2.0.0", "biopython", "joblib", "tqdm", - "pyhmmer" + "pyhmmer", + "ijson" ] classifiers = [ @@ -28,7 +30,6 @@ classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", "Topic :: Scientific/Engineering :: Bio-Informatics" ] diff --git a/scripts/02_parse_gbks.py b/scripts/02_parse_gbks.py index 657f185..907acb3 100644 --- a/scripts/02_parse_gbks.py +++ b/scripts/02_parse_gbks.py @@ -13,7 +13,7 @@ from biocracker.utils.logging import setup_logging, add_file_handler from biocracker.io.readers import load_regions from biocracker.io.options import AntiSmashOptions -from biocracker.inference.registry import GENE_MODELS, DOMAIN_MODELS, register_domain_model, register_gene_model +from biocracker.inference.registry import register_domain_model, register_gene_model from 
biocracker.inference.model_paras import ParasModel from biocracker.inference.model_pfam import PfamModel from biocracker.pipelines.annotate_region import annotate_region @@ -35,7 +35,8 @@ def cli() -> argparse.Namespace: parser = argparse.ArgumentParser() parser.add_argument("--gbks", type=str, required=True) parser.add_argument("--out", type=str, required=True, help="output directory") - parser.add_argument("--cache", type=str, required=True, help="cache directory") + parser.add_argument("--paras", type=str, required=False, help="path to all-substrates PARAS model file") + parser.add_argument("--cache", type=str, required=False, help="cache directory") parser.add_argument("--hmms", type=str, required=False, help="directory with HMMs for gene models") parser.add_argument("--workers", type=int, default=1, help="number of parallel workers to use") return parser.parse_args() @@ -54,11 +55,16 @@ def iter_gbks(folder: str) -> Generator[str, None, None]: yield e.path -def _init_worker(cache_dir: str, hmms_dir: str | None) -> None: +def _init_worker( + cache_dir: str, + paras_model_path: str | None, + hmms_dir: str | None +) -> None: """ Initialize a worker process by setting up its own cache and registering models. 
:param cache_dir: base cache directory + :param paras_model_path: path to the PARAS model file (or None) :param hmms_dir: directory containing HMM files for gene models (or None) """ global _WORKER_OPTIONS, _WORKER_READY @@ -66,7 +72,8 @@ def _init_worker(cache_dir: str, hmms_dir: str | None) -> None: per_worker_cache = os.path.join(cache_dir, f"worker_{os.getpid()}") os.makedirs(per_worker_cache, exist_ok=True) - register_domain_model(ParasModel(cache_dir=per_worker_cache, threshold=0.1, keep_top=3)) + pm = ParasModel(threshold=0.1, keep_top=3, cache_dir=per_worker_cache, model_path=paras_model_path) + register_domain_model(pm) if hmms_dir: hmm_files = glob.glob(os.path.join(hmms_dir, "*.hmm")) @@ -106,17 +113,25 @@ def main() -> None: args = cli() os.makedirs(args.out, exist_ok=True) - os.makedirs(args.cache, exist_ok=True) setup_logging(level="INFO") add_file_handler(os.path.join(args.out, "parse_gbks.log"), level="INFO") out_jsonl = os.path.join(args.out, "regions.jsonl") + # If cache dir is not given, set output dir as cache + if args.cache is None: + args.cache = args.out + + os.makedirs(args.cache, exist_ok=True) + log.info(f"workers: {args.workers}") log.info(f"gbk dir: {args.gbks}") log.info(f"out: {args.out}") log.info(f"cache: {args.cache}") + log.info(f"paras: {args.paras}") + log.info(f"hmms: {args.hmms}") + log.info(f"out jsonl: {out_jsonl}") completed_files = 0 @@ -128,7 +143,7 @@ def main() -> None: with open(out_jsonl, "w") as out_f, ProcessPoolExecutor( max_workers=args.workers, initializer=_init_worker, - initargs=(args.cache, args.hmms), + initargs=(args.cache, args.paras, args.hmms) ) as ex: futures = set() @@ -156,7 +171,7 @@ def main() -> None: except StopIteration: pass - if completed_files % 1000 == 0: + if completed_files % 100 == 0: log.info(f"completed {completed_files} GenBank files") break # exit for-loop to re-evaluate futures diff --git a/scripts/example.py b/scripts/example.py index 4b5208b..127ac94 100644 --- a/scripts/example.py 
+++ b/scripts/example.py @@ -54,7 +54,7 @@ def main() -> None: setup_logging(logging.INFO) # Register domain and gene models - register_domain_model(ParasModel(cache_dir=args.cache, threshold=0.1, keep_top=3)) + register_domain_model(ParasModel(threshold=0.1, keep_top=3, cache_dir=args.cache)) if args.hmms: hmm_files = glob.glob(os.path.join(args.hmms, "*.hmm")) diff --git a/src/biocracker/inference/model_paras.py b/src/biocracker/inference/model_paras.py index 954c920..d7a5cfe 100644 --- a/src/biocracker/inference/model_paras.py +++ b/src/biocracker/inference/model_paras.py @@ -10,7 +10,6 @@ import joblib import numpy as np from pyhmmer import easel, plan7, hmmer -from sklearn.ensemble import RandomForestClassifier import biocracker.data from biocracker.inference.base import DomainInferenceModel @@ -26,7 +25,8 @@ PARAS_DOWNLOAD_URL = "https://zenodo.org/records/17224548/files/all_substrates_model.paras.gz?download=1" -_PARAS_MODEL_CACHE: dict[str, object] = {} +_PARAS_MODEL_PATH_CACHE: dict[str, Path] = {} +_PARAS_MODEL_OBJ_CACHE: dict[str, object] = {} HMM_DB_PATH = str(files(biocracker.data).joinpath("AMP-binding_converted.hmm")) @@ -683,23 +683,63 @@ def featurize_signature(sig: str) -> np.ndarray: return features.flatten() # shape (n_positions * n_features,) -def load_paras_model(cache_dir: Path) -> RandomForestClassifier: +def get_paras_model_path(cache_dir: Path) -> Path: """ - Load the PARAS model from disk (cached in memory for reuse). + Get the path to the cached PARAS model, downloading it if necessary. 
:param cache_dir: Path to the cache directory - :return: loaded PARAS model + :return: Path to the PARAS model file """ - global _PARAS_MODEL_CACHE + cache_dir = Path(cache_dir) + cache_dir.mkdir(parents=True, exist_ok=True) - # If model already loaded, return it immediately - if PARAS_DOWNLOAD_URL in _PARAS_MODEL_CACHE: - return _PARAS_MODEL_CACHE[PARAS_DOWNLOAD_URL] - - # Otherwise, ensure the file is downloaded and load it + # If we already know the path and it still exists, reuse it + cached = _PARAS_MODEL_PATH_CACHE.get(PARAS_DOWNLOAD_URL) + if cached is not None and cached.exists(): + return cached + + # Otherwise download/prepare and remember the path model_path = download_and_prepare(PARAS_DOWNLOAD_URL, cache_dir) - model = joblib.load(model_path) - _PARAS_MODEL_CACHE[PARAS_DOWNLOAD_URL] = model + _PARAS_MODEL_PATH_CACHE[PARAS_DOWNLOAD_URL] = model_path + return model_path + + +def resolve_paras_model_path(model_path: Path | None, cache_dir: Path) -> Path: + """ + Resolve the PARAS model path, downloading it to cache if necessary. + + :param model_path: user-specified model path or None + :param cache_dir: Path to the cache directory + :return: Path to the PARAS model file + """ + cache_dir = Path(cache_dir) + cache_dir.mkdir(parents=True, exist_ok=True) + + if model_path is not None: + model_path = Path(model_path).expanduser().resolve() + if not model_path.exists(): + raise FileNotFoundError(f"specified PARAS model path does not exist: {model_path}") + return model_path + + # No model given -> download/prepare in cache dir + return Path(download_and_prepare(PARAS_DOWNLOAD_URL, cache_dir)) + + +def load_paras_model(model_path: Path) -> object: + """ + Load the PARAS model from the given path. 
+ + :param model_path: Path to the PARAS model file + :return: loaded RandomForestClassifier model + """ + key = str(model_path) + model = _PARAS_MODEL_OBJ_CACHE.get(key) + + if model is None: + log.info(f"loading PARAS model: {model_path}") + model = joblib.load(model_path) + _PARAS_MODEL_OBJ_CACHE[key] = model + return model @@ -714,12 +754,20 @@ class ParasModel(DomainInferenceModel): name: str = "paras" - def __init__(self, cache_dir: Path | str | None = None, threshold: float = 0.1, keep_top: int = 3) -> None: + def __init__( + self, + threshold: float = 0.1, + keep_top: int = 3, + cache_dir: Path | str | None = None, + model_path: Path | str | None = None, + ) -> None: """ Initialize the ParasModel. - :param cache_dir: directory to cache the model :param threshold: probability threshold for predictions + :param keep_top: number of top predictions to keep + :param cache_dir: directory to cache the model + :param model_path: path to a custom PARAS model file """ super().__init__() @@ -727,6 +775,7 @@ def __init__(self, cache_dir: Path | str | None = None, threshold: float = 0.1, if cache_dir is None: cache_dir = PARAS_CACHE_DIR self.cache_dir = Path(cache_dir) + self.model_path = Path(model_path) if model_path is not None else None # Set other parameters self.threshold = threshold @@ -758,8 +807,8 @@ def predict(self, domain: Domain) -> list[InferenceResult]: """ if domain.type == "AMP-binding": # Prepare model - cache_dir = Path(self.cache_dir) - model = load_paras_model(cache_dir) + model_file = resolve_paras_model_path(self.model_path, self.cache_dir) + model = load_paras_model(model_file) # Find A domains in the sequence a_domains = find_a_domains(seq_id=domain.id, protein_seq=domain.sequence) From 51c9bb5527308808815f8c81965dd8323d785c41 Mon Sep 17 00:00:00 2001 From: David Meijer Date: Tue, 30 Dec 2025 01:36:26 +0100 Subject: [PATCH 12/12] BMP: bump major version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pyproject.toml b/pyproject.toml index d34f695..c5f1b15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "biocracker" -version = "1.1.1" +version = "2.0.0" description = "BioCracker is a parser for antiSMASH output GenBank files" readme = "README.md" requires-python = ">=3.10,<3.12"