|
3 | 3 | from collections.abc import Iterable |
4 | 4 | from dataclasses import asdict, dataclass, field |
5 | 5 | from io import StringIO |
6 | | -from typing import Any |
| 6 | +from typing import Any, Literal |
7 | 7 |
|
8 | 8 | from Bio import SeqIO |
9 | 9 | from Bio.SeqFeature import FeatureLocation, SeqFeature |
@@ -136,6 +136,37 @@ def to_dict(self) -> dict[str, Any]: |
136 | 136 | return d |
137 | 137 |
|
138 | 138 |
|
| 139 | +@dataclass |
| 140 | +class CandidateClusterRec: |
| 141 | + """ |
| 142 | + Data class representing a candidate cluster record parsed from antiSMASH output. |
| 143 | +
|
| 144 | + :param record_id: cluster record ID |
| 145 | + :param accession: cluster number/accession in gbk file |
| 146 | + :param start: cluster start position |
| 147 | + :param end: cluster end position |
| 148 | + :param product_tags: list of product tags |
| 149 | + :param genes: list of GeneRec instances |
| 150 | + """ |
| 151 | + |
| 152 | + record_id: str |
| 153 | + accession: int | None |
| 154 | + start: int |
| 155 | + end: int |
| 156 | + product_tags: list[str] |
| 157 | + genes: list[GeneRec] = field(default_factory=list) |
| 158 | + |
| 159 | + def to_dict(self) -> dict[str, Any]: |
| 160 | + """ |
| 161 | + Convert CandidateClusterRec instance to dictionary. |
| 162 | +
|
| 163 | + :return: dictionary representation of CandidateClusterRec |
| 164 | + """ |
| 165 | + d = asdict(self) |
| 166 | + d["genes"] = [g.to_dict() for g in self.genes] |
| 167 | + return d |
| 168 | + |
| 169 | + |
139 | 170 | @dataclass |
140 | 171 | class RegionRec: |
141 | 172 | """ |
@@ -177,6 +208,16 @@ def _iter_regions(record: SeqRecord) -> list[SeqFeature]: |
177 | 208 | return [f for f in record.features if f.type == "region"] |
178 | 209 |
|
179 | 210 |
|
| 211 | +def _iter_candidate_clusters(record: SeqRecord) -> list[SeqFeature]: |
| 212 | + """ |
| 213 | + Iterate over candidate cluster features in a Biopython SeqRecord. |
| 214 | +
|
| 215 | + :param record: Biopython SeqRecord object |
| 216 | + :return: list of candidate cluster SeqFeature objects |
| 217 | + """ |
| 218 | + return [f for f in record.features if f.type == "cand_cluster"] |
| 219 | + |
| 220 | + |
180 | 221 | def _iter_cds(record: SeqRecord) -> list[SeqFeature]: |
181 | 222 | """ |
182 | 223 | Iterate over CDS features in a Biopython SeqRecord. |
@@ -269,6 +310,63 @@ def _gene_rec_from_feat(feat: SeqFeature) -> GeneRec: |
269 | 310 | ) |
270 | 311 |
|
271 | 312 |
|
| 313 | +def _collect_candidate_cluster(record: SeqRecord) -> list[CandidateClusterRec]: |
| 314 | + """ """ |
| 315 | + clusters = _iter_candidate_clusters(record) |
| 316 | + cds_list = _iter_cds(record) |
| 317 | + dom_list = _iter_domains(record) |
| 318 | + |
| 319 | + cluster_recs: list[CandidateClusterRec] = [] |
| 320 | + |
| 321 | + for cc in clusters: |
| 322 | + _, cs, ce = _start_end(cc) |
| 323 | + |
| 324 | + # Try several common qualifier names for the index |
| 325 | + acc_vals = ( |
| 326 | + cc.qualifiers.get("candidate_cluster_number") |
| 327 | + or cc.qualifiers.get("cand_cluster_number") |
| 328 | + or cc.qualifiers.get("cluster_number") |
| 329 | + or cc.qualifiers.get("cluster_idx") |
| 330 | + or [0] |
| 331 | + ) |
| 332 | + accession = int(acc_vals[0]) if acc_vals else None |
| 333 | + |
| 334 | + products = cc.qualifiers.get("product", []) or [] |
| 335 | + |
| 336 | + # Genes inside cluster, sorted by coordinate |
| 337 | + gene_feats = [g for g in cds_list if _in_bounds(g, cc)] |
| 338 | + gene_feats.sort(key=lambda gf: (int(gf.location.start), int(gf.location.end))) |
| 339 | + |
| 340 | + genes: list[GeneRec] = [] |
| 341 | + for gf in gene_feats: |
| 342 | + g = _gene_rec_from_feat(gf) |
| 343 | + |
| 344 | + # Domains inside this gene, sorted by genomic start |
| 345 | + gene_doms = [df for df in dom_list if _in_bounds(df, gf)] |
| 346 | + gene_doms.sort(key=lambda df: (int(df.location.start), int(df.location.end))) |
| 347 | + dom_recs = [_domain_rec_from_feat(dd) for dd in gene_doms] |
| 348 | + |
| 349 | + # Oritentation normalization |
| 350 | + if g.strand == -1: |
| 351 | + dom_recs = dom_recs[::-1] |
| 352 | + |
| 353 | + g.domains = dom_recs |
| 354 | + genes.append(g) |
| 355 | + |
| 356 | + cluster_recs.append( |
| 357 | + CandidateClusterRec( |
| 358 | + record_id=record.id, |
| 359 | + accession=accession, |
| 360 | + start=cs, |
| 361 | + end=ce, |
| 362 | + product_tags=products, |
| 363 | + genes=genes, |
| 364 | + ) |
| 365 | + ) |
| 366 | + |
| 367 | + return cluster_recs |
| 368 | + |
| 369 | + |
272 | 370 | def _collect_region(record: SeqRecord) -> list[RegionRec]: |
273 | 371 | """ |
274 | 372 | Collect region records from a Biopython SeqRecord. |
@@ -326,33 +424,49 @@ def _collect_region(record: SeqRecord) -> list[RegionRec]: |
326 | 424 | return region_recs |
327 | 425 |
|
328 | 426 |
|
329 | | -def parse_region_gbk_string(src: str) -> list[RegionRec]: |
| 427 | +def parse_region_gbk_string( |
| 428 | + src: str, |
| 429 | + top_level: Literal["region", "cand_cluster"] = "region", |
| 430 | +) -> list[RegionRec]: |
330 | 431 | """ |
331 | 432 | Parse antiSMASH region GenBank string into RegionRec instances. |
332 | 433 |
|
333 | 434 | :param src: GenBank formatted string |
| 435 | + :param top_level: top-level feature to parse ('region' or 'cand_cluster') |
334 | 436 | :return: list of RegionRec instances |
| 437 | + :raises AssertionError: if top_level is not 'region' or 'cand_cluster' |
335 | 438 | """ |
336 | 439 | handle = StringIO(src) |
337 | 440 | out: list[RegionRec] = [] |
338 | 441 |
|
339 | 442 | for record in SeqIO.parse(handle, "genbank"): |
340 | | - out.extend(_collect_region(record)) |
| 443 | + if top_level == "region": |
| 444 | + out.extend(_collect_region(record)) |
| 445 | + elif top_level == "cand_cluster": |
| 446 | + out.extend(_collect_candidate_cluster(record)) |
| 447 | + else: |
| 448 | + raise ValueError(f"Unknown top_level '{top_level}'; must be 'region' or 'cand_cluster'") |
341 | 449 |
|
342 | 450 | return out |
343 | 451 |
|
344 | 452 |
|
345 | | -def parse_region_gbk_file(filepath: str) -> list[RegionRec]: |
| 453 | +def parse_region_gbk_file(filepath: str, top_level: Literal["region", "cand_cluster"] = "region") -> list[RegionRec]: |
346 | 454 | """ |
347 | 455 | Parse antiSMASH region GenBank file into RegionRec instances. |
348 | 456 |
|
349 | 457 | :param filepath: path to GenBank file |
| 458 | + :param top_level: top-level feature to parse ('region' or 'cand_cluster') |
350 | 459 | :return: list of RegionRec instances |
351 | 460 | """ |
352 | 461 | out: list[RegionRec] = [] |
353 | 462 |
|
354 | 463 | with open(filepath) as handle: |
355 | 464 | for record in SeqIO.parse(handle, "genbank"): |
356 | | - out.extend(_collect_region(record)) |
| 465 | + if top_level == "region": |
| 466 | + out.extend(_collect_region(record)) |
| 467 | + elif top_level == "cand_cluster": |
| 468 | + out.extend(_collect_candidate_cluster(record)) |
| 469 | + else: |
| 470 | + raise ValueError(f"Unknown top_level '{top_level}'; must be 'region' or 'cand_cluster'") |
357 | 471 |
|
358 | 472 | return out |
0 commit comments