Skip to content

Commit 6dd940f

Browse files
committed
ENH: add cand_cluster readout as top level next to region
1 parent d68ed59 commit 6dd940f

2 files changed

Lines changed: 131 additions & 16 deletions

File tree

examples/parse_antismash_gbk.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -32,22 +32,23 @@ def main() -> None:
3232
"""
3333
args = cli()
3434
gbk_path = args.gbk
35-
regions = parse_region_gbk_file(gbk_path)
36-
logger.info(f" > Parsed {len(regions)} region(s) from {gbk_path}")
35+
target_name = "cand_cluster"
36+
targets = parse_region_gbk_file(gbk_path, top_level=target_name)
37+
logger.info(f" > Parsed {len(targets)} {target_name}(s) from {gbk_path}")
3738

38-
for region in regions:
39-
region_accession = region.accession if region.accession is not None else 0
39+
for target in targets:
40+
region_accession = target.accession if target.accession is not None else 0
4041
logger.info(
4142
f" "
42-
f"> (Region at {region.start} - {region.end}) "
43-
f"{region.record_id}.region{region_accession:03d} : {region.product_tags}"
43+
f"> ({target_name} at {target.start} - {target.end}) "
44+
f"{target.record_id}.{target_name}{region_accession:03d} : {target.product_tags}"
4445
)
45-
len_start = max([len(str(gene.start)) for gene in region.genes])
46-
len_end = max([len(str(gene.end)) for gene in region.genes])
47-
for gene in region.genes:
46+
len_start = max([len(str(gene.start)) for gene in target.genes])
47+
len_end = max([len(str(gene.end)) for gene in target.genes])
48+
for gene in target.genes:
4849
logger.info(
4950
f" "
50-
f"> (Gene at {gene.start:>{len_start}} - {gene.end:>{len_end}} on strand {gene.strand:>2}) "
51+
f"> (gene at {gene.start:>{len_start}} - {gene.end:>{len_end}} on strand {gene.strand:>2}) "
5152
f"{gene.name} : {gene.product}"
5253
)
5354
for domain in gene.domains:
@@ -60,7 +61,7 @@ def main() -> None:
6061
repr_domain_preds = ""
6162
logger.info(
6263
f" "
63-
f"> (Domain at {domain.start:>{domain_len_start}} - {domain.end:>{domain_len_end}}) "
64+
f"> (domain at {domain.start:>{domain_len_start}} - {domain.end:>{domain_len_end}}) "
6465
f"{domain.kind} {repr_domain_preds}"
6566
)
6667

src/biocracker/antismash.py

Lines changed: 119 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from collections.abc import Iterable
44
from dataclasses import asdict, dataclass, field
55
from io import StringIO
6-
from typing import Any
6+
from typing import Any, Literal
77

88
from Bio import SeqIO
99
from Bio.SeqFeature import FeatureLocation, SeqFeature
@@ -136,6 +136,37 @@ def to_dict(self) -> dict[str, Any]:
136136
return d
137137

138138

139+
@dataclass
140+
class CandidateClusterRec:
141+
"""
142+
Data class representing a candidate cluster record parsed from antiSMASH output.
143+
144+
:param record_id: cluster record ID
145+
:param accession: cluster number/accession in gbk file
146+
:param start: cluster start position
147+
:param end: cluster end position
148+
:param product_tags: list of product tags
149+
:param genes: list of GeneRec instances
150+
"""
151+
152+
record_id: str
153+
accession: int | None
154+
start: int
155+
end: int
156+
product_tags: list[str]
157+
genes: list[GeneRec] = field(default_factory=list)
158+
159+
def to_dict(self) -> dict[str, Any]:
160+
"""
161+
Convert CandidateClusterRec instance to dictionary.
162+
163+
:return: dictionary representation of CandidateClusterRec
164+
"""
165+
d = asdict(self)
166+
d["genes"] = [g.to_dict() for g in self.genes]
167+
return d
168+
169+
139170
@dataclass
140171
class RegionRec:
141172
"""
@@ -177,6 +208,16 @@ def _iter_regions(record: SeqRecord) -> list[SeqFeature]:
177208
return [f for f in record.features if f.type == "region"]
178209

179210

211+
def _iter_candidate_clusters(record: SeqRecord) -> list[SeqFeature]:
212+
"""
213+
Iterate over candidate cluster features in a Biopython SeqRecord.
214+
215+
:param record: Biopython SeqRecord object
216+
:return: list of candidate cluster SeqFeature objects
217+
"""
218+
return [f for f in record.features if f.type == "cand_cluster"]
219+
220+
180221
def _iter_cds(record: SeqRecord) -> list[SeqFeature]:
181222
"""
182223
Iterate over CDS features in a Biopython SeqRecord.
@@ -269,6 +310,63 @@ def _gene_rec_from_feat(feat: SeqFeature) -> GeneRec:
269310
)
270311

271312

313+
def _collect_candidate_cluster(record: SeqRecord) -> list[CandidateClusterRec]:
314+
""" """
315+
clusters = _iter_candidate_clusters(record)
316+
cds_list = _iter_cds(record)
317+
dom_list = _iter_domains(record)
318+
319+
cluster_recs: list[CandidateClusterRec] = []
320+
321+
for cc in clusters:
322+
_, cs, ce = _start_end(cc)
323+
324+
# Try several common qualifier names for the index
325+
acc_vals = (
326+
cc.qualifiers.get("candidate_cluster_number")
327+
or cc.qualifiers.get("cand_cluster_number")
328+
or cc.qualifiers.get("cluster_number")
329+
or cc.qualifiers.get("cluster_idx")
330+
or [0]
331+
)
332+
accession = int(acc_vals[0]) if acc_vals else None
333+
334+
products = cc.qualifiers.get("product", []) or []
335+
336+
# Genes inside cluster, sorted by coordinate
337+
gene_feats = [g for g in cds_list if _in_bounds(g, cc)]
338+
gene_feats.sort(key=lambda gf: (int(gf.location.start), int(gf.location.end)))
339+
340+
genes: list[GeneRec] = []
341+
for gf in gene_feats:
342+
g = _gene_rec_from_feat(gf)
343+
344+
# Domains inside this gene, sorted by genomic start
345+
gene_doms = [df for df in dom_list if _in_bounds(df, gf)]
346+
gene_doms.sort(key=lambda df: (int(df.location.start), int(df.location.end)))
347+
dom_recs = [_domain_rec_from_feat(dd) for dd in gene_doms]
348+
349+
# Oritentation normalization
350+
if g.strand == -1:
351+
dom_recs = dom_recs[::-1]
352+
353+
g.domains = dom_recs
354+
genes.append(g)
355+
356+
cluster_recs.append(
357+
CandidateClusterRec(
358+
record_id=record.id,
359+
accession=accession,
360+
start=cs,
361+
end=ce,
362+
product_tags=products,
363+
genes=genes,
364+
)
365+
)
366+
367+
return cluster_recs
368+
369+
272370
def _collect_region(record: SeqRecord) -> list[RegionRec]:
273371
"""
274372
Collect region records from a Biopython SeqRecord.
@@ -326,33 +424,49 @@ def _collect_region(record: SeqRecord) -> list[RegionRec]:
326424
return region_recs
327425

328426

329-
def parse_region_gbk_string(src: str) -> list[RegionRec]:
427+
def parse_region_gbk_string(
428+
src: str,
429+
top_level: Literal["region", "cand_cluster"] = "region",
430+
) -> list[RegionRec]:
330431
"""
331432
Parse antiSMASH region GenBank string into RegionRec instances.
332433
333434
:param src: GenBank formatted string
435+
:param top_level: top-level feature to parse ('region' or 'cand_cluster')
334436
:return: list of RegionRec instances
437+
:raises AssertionError: if top_level is not 'region' or 'cand_cluster'
335438
"""
336439
handle = StringIO(src)
337440
out: list[RegionRec] = []
338441

339442
for record in SeqIO.parse(handle, "genbank"):
340-
out.extend(_collect_region(record))
443+
if top_level == "region":
444+
out.extend(_collect_region(record))
445+
elif top_level == "cand_cluster":
446+
out.extend(_collect_candidate_cluster(record))
447+
else:
448+
raise ValueError(f"Unknown top_level '{top_level}'; must be 'region' or 'cand_cluster'")
341449

342450
return out
343451

344452

345-
def parse_region_gbk_file(filepath: str) -> list[RegionRec]:
453+
def parse_region_gbk_file(filepath: str, top_level: Literal["region", "cand_cluster"] = "region") -> list[RegionRec]:
346454
"""
347455
Parse antiSMASH region GenBank file into RegionRec instances.
348456
349457
:param filepath: path to GenBank file
458+
:param top_level: top-level feature to parse ('region' or 'cand_cluster')
350459
:return: list of RegionRec instances
351460
"""
352461
out: list[RegionRec] = []
353462

354463
with open(filepath) as handle:
355464
for record in SeqIO.parse(handle, "genbank"):
356-
out.extend(_collect_region(record))
465+
if top_level == "region":
466+
out.extend(_collect_region(record))
467+
elif top_level == "cand_cluster":
468+
out.extend(_collect_candidate_cluster(record))
469+
else:
470+
raise ValueError(f"Unknown top_level '{top_level}'; must be 'region' or 'cand_cluster'")
357471

358472
return out

0 commit comments

Comments
 (0)