mTRAc/mtrac.py at master · SushiLab/mTRAc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python


# ============================================================================ #
#
# Authors: Hans-Joachim Ruscheweyh (hansr@ethz.ch),
#          Lilith Feer,
#          Melanie Staeubli
#          Anna Sintsova
#          Shinichi Sunagawa
#
# Type "mtrac" for usage help
#
# Copyright (c) 2025 SunagawaLab.
#
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# ============================================================================ #

__author__ = ('Hans-Joachim Ruscheweyh (hansr@ethz.ch), '
              'Lilith Feer, '
              'Melanie Staeubli, '
              'Anna Sintsova, '
              'Shinichi Sunagawa')
__version__ = '0.0.5'
__date__ = '22 November 2025'
__license__ = "GPL - v3"
__maintainer__ = "Hans-Joachim Ruscheweyh & Anna Sintsova"
__tool_name__ = 'mTRAc'

import csv
import logging
import argparse
import sys
import pathlib
from typing import List, Tuple, Dict, Union
import os
import gzip
import pysam
import Bio.SeqIO.FastaIO as FastaIO
import Bio.SeqIO.QualityIO as QualityIO
import subprocess
import shutil
import pyrodigal
import pyhmmer
from pyhmmer.easel import SequenceFile
from pyhmmer.plan7 import HMMFile
import tempfile
from fetchmgs import fetchmgs
import collections
import numpy as np
import pickle
import pandas as pd
from pathlib import Path
from skbio.stats import composition


R1IDENTIFIER = '1'
R2IDENTIFIER = '2'
SIDENTIFIER = 'S'
ROOT_DB_FOLDER = pathlib.Path(__file__).resolve().parent.joinpath('databases')


def detect_databases(DATABASES):
    '''Searches through the default database
    location and detects databases based on
    their files. A database is detected as
    present if the folder in the
    databases folder contains 3 files named
    the following:
    databases/[name]/[name].fasta.gz
    databases/[name]/[name].gff3.gz
    databases/[name]/[name].map.gz

    :return:
    '''
    db_folders = [str(db).split('/')[-2]
                  for db in ROOT_DB_FOLDER.glob('*/*fasta.gz')]
    DATABASES.clear()
    for db_folder in db_folders:
        fastagz_file = ROOT_DB_FOLDER.joinpath(
            db_folder).joinpath(db_folder + '.fasta.gz')
        gff3gz_file = ROOT_DB_FOLDER.joinpath(
            db_folder).joinpath(db_folder + '.gff3.gz')
        mapgz_file = ROOT_DB_FOLDER.joinpath(
            db_folder).joinpath(db_folder + '.map.gz')

        if fastagz_file.exists() and gff3gz_file.exists() and mapgz_file.exists():
            DATABASES.append(db_folder)


DATABASES = []
detect_databases(DATABASES)
TIGRFAM_HMM = ROOT_DB_FOLDER.joinpath('hmm/tigrfam.hmm')
PFAM_HMM = ROOT_DB_FOLDER.joinpath('hmm/Pfam-A.hmm')
MG129_file = ROOT_DB_FOLDER.joinpath('hmm/129MGs')

TPM_MODEL_FILE = ROOT_DB_FOLDER.joinpath(
    'models/25-10-21TPM_KNN_3comp_9neighbors_euclidean_distance.1.5.1.pkl')
# MG_MODEL_FILE = ROOT_DB_FOLDER.joinpath(
#     'models/25-07-29MG_KNN_3comp_9neighbors_euclidean_distance.pkl')


def index_database(db_name):

    if db_name not in DATABASES:
        logging.error(f'Unknown database: {db_name}. Quitting ...')
        shutdown(1)
    marker_db_prepapred = ROOT_DB_FOLDER.joinpath(
        db_name).joinpath(db_name + '.prepared')
    fastafile_gz = ROOT_DB_FOLDER.joinpath(
        db_name).joinpath(db_name + '.fasta.gz')
    fastafile = ROOT_DB_FOLDER.joinpath(
        db_name).joinpath(db_name + '.fasta')
    mapfile_gz = ROOT_DB_FOLDER.joinpath(
        db_name).joinpath(db_name + '.map.gz')
    mapfile = ROOT_DB_FOLDER.joinpath(db_name).joinpath(db_name + '.map')
    gff3file_gz = ROOT_DB_FOLDER.joinpath(
        db_name).joinpath(db_name + '.gff3.gz')
    gff3file = ROOT_DB_FOLDER.joinpath(db_name).joinpath(db_name + '.gff3')

    if not marker_db_prepapred.exists():
        logging.info(
            f'Database {db_name} exists but is not built yet. Start building: ')
        for fgz, f in [(fastafile_gz, fastafile), (mapfile_gz, mapfile), (gff3file_gz, gff3file)]:
            with gzip.open(fgz, 'rb') as f_in:
                with open(f, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

        try:
            subprocess.check_call(f'bwa index {fastafile}', shell=True)
        except subprocess.CalledProcessError as e:
            logging.error(
                'Command {} failed with message:\t{}'.format(e.cmd, e.stderr))
            shutdown(1)
        marker_db_prepapred.touch()
    else:
        logging.info(f'Index of database {db_name} already exists.')

class GpredParameters:
    _forward_files: List[pathlib.Path] = []
    _reverse_files: List[pathlib.Path] = []
    _unpaired_files: List[pathlib.Path] = []
    _read_files_were_checked: bool = False
    _alignment_file: pathlib.Path = None
    _temp_alignment_file: pathlib.Path = None
    _fcnt_file: pathlib.Path = None
    _mgs_file: pathlib.Path = None
    _threads: int = 1
    _db_genome_file = None
    _db_gff_file = None
    _db_map_file = None

    def get_mgs_file(self):
        self._mgs_file.parent.mkdir(exist_ok=True, parents=True)
        return self._mgs_file

    def get_featurecounts_file(self):
        self._fcnt_file.parent.mkdir(exist_ok=True, parents=True)
        return self._fcnt_file

    def get_gff_file(self):
        return self._db_gff_file

    def is_paired_mode(self):
        if len(self._forward_files) != 0:
            return True
        else:
            return False

    def get_bwa_index(self):
        return self._db_genome_file

    def yield_paired_read_files(self):
        for r1_file, r2_file in zip(self._forward_files, self._reverse_files):
            yield r1_file, r2_file

    def get_singleend_read_files(self):
        return self._unpaired_files

    def set_threads(self, threads: int):
        threads = int(threads)
        if threads < 1:
            logging.error('Threads have to be at least 1')
            shutdown(1)
        if threads > os.cpu_count():
            logging.warning(
                'Number of threads exceeds the total number of CPU cores.')
        self._threads = threads

    def get_threads(self) -> int:
        return self._threads

    def get_db_map_file(self):
        return self._db_map_file

    def set_db_name(self, db_name: str) -> None:

        if db_name not in DATABASES:
            logging.error(f'Unknown database: {db_name}. Quitting ...')
            shutdown(1)
        marker_db_prepapred = ROOT_DB_FOLDER.joinpath(
            db_name).joinpath(db_name + '.prepared')
        fastafile_gz = ROOT_DB_FOLDER.joinpath(
            db_name).joinpath(db_name + '.fasta.gz')
        fastafile = ROOT_DB_FOLDER.joinpath(
            db_name).joinpath(db_name + '.fasta')
        mapfile_gz = ROOT_DB_FOLDER.joinpath(
            db_name).joinpath(db_name + '.map.gz')
        mapfile = ROOT_DB_FOLDER.joinpath(db_name).joinpath(db_name + '.map')
        gff3file_gz = ROOT_DB_FOLDER.joinpath(
            db_name).joinpath(db_name + '.gff3.gz')
        gff3file = ROOT_DB_FOLDER.joinpath(db_name).joinpath(db_name + '.gff3')

        if not marker_db_prepapred.exists():
            logging.error(f'Database {db_name} exists but is not built yet. Use "mtrac index {db_name}"')
            shutdown(1)

        self._db_genome_file = fastafile
        self._db_gff_file = gff3file
        self._db_map_file = mapfile
        if not self._db_map_file.exists():
            logging.error(
                f'Selected database incomplete. File not found: {self._db_map_file}')
            shutdown(1)
        if not self._db_gff_file.exists():
            logging.error(
                f'Selected database incomplete. File not found: {self._db_gff_file}')
            shutdown(1)
        if not self._db_genome_file.exists():
            logging.error(
                f'Selected database incomplete. File not found: {self._db_genome_file}')
            shutdown(1)

    def set_alignment_file(self, align_bam_file: pathlib.Path, align_fcnt_file: pathlib.Path, align_mgs_file: pathlib.Path, required_to_exist=True) -> None:
        self._alignment_file = align_bam_file
        self._fcnt_file = align_fcnt_file
        self._mgs_file = align_mgs_file
        self._temp_alignment_file = pathlib.Path(
            str(align_bam_file) + '_tmp.bam')
        if required_to_exist:
            if not align_bam_file.exists():
                logging.error(
                    f'Alignment file {align_bam_file} does not exist. Shutting down ...')
                shutdown(1)
            if not align_fcnt_file.exists():
                logging.error(
                    f'Alignment file {align_fcnt_file} does not exist. Shutting down ...')
                shutdown(1)
            if not align_mgs_file.exists():
                logging.error(
                    f'Alignment file {align_mgs_file} does not exist. Shutting down ...')
                shutdown(1)

        if not str(align_bam_file).endswith('.bam'):
            logging.error(
                f'Alignment file {align_bam_file} is/will be a BAM formatted file. Please set file suffix accordingly. Shutting down ...')
            shutdown(1)
        if not str(align_fcnt_file).endswith('.fcnt'):
            logging.error(
                f'FeatureCounts file {align_fcnt_file} is/will be a fcnt formatted file. Please set file suffix accordingly. Shutting down ...')
            shutdown(1)
        if not str(align_mgs_file).endswith('.mgs'):
            logging.error(
                f'FeatureCounts file {align_mgs_file} is/will be a fcnt.mgs formatted file. Please set file suffix accordingly. Shutting down ...')
            shutdown(1)

    def get_read_files(self) -> List[Tuple[pathlib.Path, str]]:
        read_files = []
        # , strict=True):
        for (r1_file, r2_file) in zip(self._forward_files, self._reverse_files):
            read_files.append((r1_file, f'/{R1IDENTIFIER}'))
            read_files.append((r2_file, f'/{R2IDENTIFIER}'))
        for u_file in self._unpaired_files:
            read_files.append((u_file, f'/{SIDENTIFIER}'))
        return read_files

    def get_temporary_alignment_file(self) -> pathlib.Path:
        self._temp_alignment_file.parent.mkdir(exist_ok=True, parents=True)
        return self._temp_alignment_file

    def delete_temporary_alignment_file(self) -> None:
        self._temp_alignment_file.unlink(missing_ok=True)

    def get_alignment_file(self) -> pathlib.Path:
        self._alignment_file.parent.mkdir(exist_ok=True, parents=True)
        return self._alignment_file

    def get_first_1000_reads(self, reads_file: pathlib.Path) -> List[Tuple[str, str]]:
        """ Read the first thousand reads
        and check if the file endings are correct.

        Params:
            reads_file: The file with the short read sequencing data

        Returns:
            A list with the first 1000 reads of the file as tuples of
                header and sequence

        """
        # self._forward_files = forward_files
        # self._reverse_files = reverse_files
        # self._unpaired_files = unpaired_files

        allowed_file_fq_endings = ['fq.gz', 'fq', 'fastq', 'fastq.gz']
        allowed_file_fa_endings = ['fa', 'fa.gz',
                                   'fasta', 'fasta.gz', 'fna', 'fna.gz']
        is_fq = False
        is_fa = False
        is_gz = False
        if str(reads_file).endswith('.gz'):
            is_gz = True
        for allowed_file_fa_ending in allowed_file_fa_endings:
            if str(reads_file).endswith(allowed_file_fa_ending):
                is_fa = True
        for allowed_file_fq_ending in allowed_file_fq_endings:
            if str(reads_file).endswith(allowed_file_fq_ending):
                is_fq = True

        reads = []
        if is_gz:
            of = gzip.open(reads_file, 'rt')
        else:
            of = open(reads_file, 'r')
        if is_fa:
            for (header, sequence) in FastaIO.SimpleFastaParser(of):
                if len(reads) >= 1000:
                    break
                reads.append((header.strip().split()[0], sequence))
        elif is_fq:
            for header, sequence, qual in QualityIO.FastqGeneralIterator(of):
                if len(reads) >= 1000:
                    break
                reads.append((header.strip().split()[0], sequence))
        else:
            logging.error(f'Unknown file format: {reads_file}')
            shutdown(1)
        of.close()
        return reads

    def set_read_files(self, forward_files: List[pathlib.Path], reverse_files: List[pathlib.Path],
                       unpaired_files: List[pathlib.Path], check_files: bool = True) -> None:
        """ Define set of read files
        that we should align against the
        database. This step can/will also check
        if the files exist and to check if foward
        and reverse read files have the same read
        headers

        Params:
            forward_files: A list of pathlike objects
                which are forward read files. Can be fasta
                or fastq. Can be gzipped or uncompressed
            reverse_files: A list of pathlike objects
                which are reverse read files. Can be fasta
                or fastq. Can be gzipped or uncompressed
            unpaired_files: A list of pathlike objects
                which are unpaired read files. Can be fasta
                or fastq. Can be gzipped or uncompressed
        """

        if check_files:
            files_that_dont_exist = []
            if len(forward_files + reverse_files + unpaired_files) == 0:
                logging.error(
                    'No input files defined with -f -r or -s. Quitting ...')
                shutdown(1)
            for f in forward_files + reverse_files + unpaired_files:
                if not f.exists():
                    files_that_dont_exist.append(f)
            if len(files_that_dont_exist) != 0:
                logging.error(
                    f'Some read files dont exist: {files_that_dont_exist}')
                for f in files_that_dont_exist:
                    logging.error(f'\t{f}')
                shutdown(1)
            if len(set(forward_files + reverse_files + unpaired_files)) != len(
                    forward_files + reverse_files + unpaired_files):
                logging.error(
                    f'Duplicated read files. Please submit every file only once. Shutting down ...')
                shutdown(1)
            if len(forward_files) != len(reverse_files):
                logging.error(
                    'Unequal number of files submitted with -r and -f. Quitting ...')
                shutdown(1)
            # , strict=True):
            for (r1_file, r2_file) in zip(forward_files, reverse_files):
                r1_reads = self.get_first_1000_reads(r1_file)
                r2_reads = self.get_first_1000_reads(r2_file)
                r1_header = set([r[0] for r in r1_reads])
                r2_header = set([r[0] for r in r2_reads])
                if len(r1_header.symmetric_difference(r2_header)) != 0:
                    logging.error(
                        f'Headers of reads are not identical. Shutting down ...')
                    logging.error(
                        f'Differing read headers: {r1_header.symmetric_difference(r2_header)}')
                    logging.error(f'Differing read headers file 1: {r1_file}')
                    logging.error(f'Differing read headers file 1: {r2_file}')
                    shutdown(1)

            for u_file in unpaired_files:
                u_reads = self.get_first_1000_reads(u_file)
        if len(forward_files) != 0 and len(unpaired_files) != 0:
            logging.error(
                'Submit either paired end or single-end files. A combination is not valid. Quitting ...')
            shutdown(1)
        self._forward_files = forward_files
        self._reverse_files = reverse_files
        self._unpaired_files = unpaired_files


def shutdown(exitcode: int) -> None:
    """
    Secure Shutdown
    Args:
        exitcode: The exitcode with which mOTU should shutdown.

    Returns:
        None
    """
    logging.info(
        f'{__tool_name__ } tool shutting down with exitcode {exitcode}')
    sys.exit(exitcode)


def startup() -> None:
    """
    A method to group all functions that should be
    executed during startup of the tool
    Returns:
        None
    """
    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                        level=logging.INFO, datefmt='%Y-%m-%d,%H:%M:%S')
    logging.info(f'{__tool_name__ } tool starting')


def filter_bam(in_bam_file_handle, bam_file_handle, minlength=45, min_perc_id=97.0, min_qcov=80.0) -> None:
    """Filter the alignments by alignment length, percent identity
    and query coverage and writes passing alignments to the output
    handle.

    Params:
        in_bam_file_handle: A generator with raw alignments from bwa

        bam_file_handle: A file handle where filtered alignments can
            be written to

        minlength: The minimal alignment length. default=45

        min_perc_id: The minimal percent identity of the alignment.
            Ranges between 0.0 and 100.0. default=97.0

        min_qcov: The minimal query coverage of the alignment.
            Ranges between 0.0 and 100.0. default=80.0

    Return:
        None
    """
    total_records = 0
    returned_records = 0
    for record in in_bam_file_handle:
        total_records += 1

        if record.is_unmapped:
            continue
        else:
            alnlength: int = sum(record.get_cigar_stats()[0][0:3])
            if alnlength < minlength:
                continue
            query_covered_bases: int = sum(record.get_cigar_stats()[0][0:2])
            query_length: int = record.infer_read_length()
            mismatches: int = record.get_tag('NM')
            percid: float = (alnlength - mismatches) / float(alnlength) * 100.0
            percid: float = round(percid, 2)
            if min_perc_id > percid:
                continue
            qcov: float = (100.0 * query_covered_bases) / float(query_length)
            if qcov < min_qcov:
                continue
            record.set_tag('id', percid, 'f')
            record.set_tag('qc', qcov, 'f')
            record.set_tag('al', alnlength, 'i')
            bam_file_handle.write(record)
            returned_records += 1
    return total_records, returned_records


def align(gpredparameter: GpredParameters) -> None:
    """Routine in which reads are aligned against the
    provided reference. Genes are then quantified
    using featureCounts and then filtered to represent
    the 129 genes of interest.

    Params:
        gpredparameter: Data class with information on
            read files, databases and parameters
    """

    logging.info(f'Start align command')
    temp_bam_file = gpredparameter.get_temporary_alignment_file()
    temp_bam_file_handle = None
    logging.info(f'\tStart alignment')
    if True:  # just for debugging so that I can run fcnt without alignment first
        if gpredparameter.is_paired_mode():
            for (r1_file, r2_file) in gpredparameter.yield_paired_read_files():
                bwa_command: str = f'bwa mem -a -t {gpredparameter.get_threads()} {gpredparameter.get_bwa_index()} {r1_file} {r2_file}'
                logging.info(f'\t\tExecuting: {bwa_command}')
                process = subprocess.Popen(
                    bwa_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
                in_bam_file_handle = pysam.AlignmentFile(process.stdout, 'rb')
                if not temp_bam_file_handle:
                    temp_bam_file_handle = pysam.AlignmentFile(
                        temp_bam_file, "wbu", header=in_bam_file_handle.header.to_dict())
                total_records, returned_records = filter_bam(
                    in_bam_file_handle, temp_bam_file_handle)
                process.stdout.close()
                return_code: int = process.wait()
                if return_code != 0:
                    logging.error(
                        f'BWA command failed with return code {return_code}. Quitting ...')
                    shutdown(1)
                if returned_records == 0:
                    logging.error(
                        f'No alignments left after filtering. Using the correct database? Quitting ...')
                    shutdown(1)
                if (returned_records * 100.0) / total_records < 50.0:
                    logging.warn(
                        f'<50% of the alignments make it through the filter. Aligning against the correct database?')
        else:
            logging.info('Single-end mode not implemented yet. Quitting ...')
            shutdown(1)

        temp_bam_file_handle.close()
        logging.info(f'\tFinished alignment. Start sorting.')
        maxmem = 8000
        maxsort_threads = 8
        sort_threads = 1
        sort_mem_per_thread = 8000
        if gpredparameter.get_threads() > maxsort_threads:
            sort_threads = maxsort_threads
            sort_mem_per_thread = int(maxmem/sort_threads)
        elif gpredparameter.get_threads() > 1:
            sort_threads = gpredparameter.get_threads()
            sort_mem_per_thread = int(maxmem / sort_threads)
        else:
            sort_threads = 1
            sort_mem_per_thread = 8000
        # print(f'sort_threads={sort_threads}, mem={sort_mem_per_thread}')

        pysam.sort('-n', '-m', str(sort_mem_per_thread) + 'M', '-@', str(sort_threads),
                   '-o', str(gpredparameter.get_alignment_file()), str(temp_bam_file))
        temp_bam_file.unlink(missing_ok=True)
        logging.info(f'\tFinished sorting. Start featureCounts.')
        if gpredparameter.is_paired_mode():
            fcnt_command: str = f'featureCounts -O -M --fraction -t gene -a {gpredparameter.get_gff_file()} -o {gpredparameter.get_featurecounts_file()} -F GTF -g locus_tag -p -B --verbose {gpredparameter.get_alignment_file()} --countReadPairs -T {gpredparameter.get_threads()}'
        else:
            logging.info('Single-end mode not implemented yet. Quitting ...')
            shutdown(1)

        logging.info(f'Executing: {fcnt_command}')
        try:
            subprocess.check_call(fcnt_command, shell=True)
        except subprocess.CalledProcessError as e:
            logging.error(
                'Command {} failed with message:\t{}'.format(e.cmd, e.stderr))
            shutdown(1)

    logging.info(f'\tFinished featureCounts. Start marker gene extraction.')
    gene_2_metadata = {}
    seen_genes = set()
    with open(gpredparameter.get_db_map_file()) as handle:
        for line in handle:
            splits = line.strip().split('\t')
            genome = splits[0]
            gene = splits[1]
            annotation = splits[2]
            gene_2_metadata[gene] = (genome, annotation)
    # Geneid	        Chr	        Start	End	    Strand	Length	test.bam
    # MJ392_00005	CP092639.1	1	    1404	+	    1404	1028.00
    with open(gpredparameter.get_featurecounts_file()) as fcnt_handle, open(gpredparameter.get_mgs_file(), 'w') as mgs_handle:
        mgs_handle.write(
            '#GENOME\tANNOTATION\tGENE\tCHROMOSOME\tLENGTH\tCOUNT\n')
        for line in fcnt_handle:
            if line.startswith('#'):
                continue
            if line.startswith('Geneid'):
                continue
            [geneid, chr, start, end, strand, length,
                count] = line.strip().split('\t')
            if geneid in gene_2_metadata:
                seen_genes.add(geneid)
                (genome, annotation) = gene_2_metadata[geneid]
                tmp = [genome, annotation, geneid, chr, length, count]
                tmp = '\t'.join(tmp)
                mgs_handle.write(f'{tmp}\n')
    if len(seen_genes) != len(gene_2_metadata):
        logging.error(
            f'Not all genes in the database mapping file ({gpredparameter.get_db_map_file()}) have been found in the featureCounts output. Using the wrong gff or map file? Quitting ...')
        gpredparameter.get_mgs_file().unlink()
        logging.error('Missing Genes:')
        for gene in gene_2_metadata.keys():
            if gene not in seen_genes:
                logging.error(f'\t\t{gene}')
        shutdown(1)
    logging.info(f'\tFinished marker gene extraction.')
    logging.info(f'Output file: {gpredparameter.get_mgs_file()}')
    logging.info(f'Finished align command')


def parse_align():

    db_str = '\n'.join(['\t\t\t - ' + db for db in DATABASES])
    parser = argparse.ArgumentParser(usage=f'''Program: {__tool_name__} - metatranscriptomic growth classifier
    Version: {__version__}


    {__tool_name__} align [options]

    Input options:
       -f   FILE[ FILE]  input file(s) for reads in forward orientation, fastq(.gz)-formatted

       -r   FILE[ FILE]  input file(s) for reads in reverse orientation, fastq(.gz)-formatted

       -db  STR          genome database to use. Choices:
    {db_str}

    Output options:
       -o   FILE         output file prefix. Will create 3 files:
                            - prefix.bam
                            - prefix.fcnt
                            - prefix.fcnt.mgs

    Algorithm options:
       -t   INT          number of threads [1]
          ''', formatter_class=CapitalisedHelpFormatter, add_help=False)

    # Input options
    # input files(s) for reads in forward orientation, fastq(.gz)-formatted
    parser.add_argument("-f", nargs="+", default=[])
    # input files(s) for reads in reverse orientation, fastq(.gz)-formatted
    parser.add_argument("-r", nargs="+", default=[])
    # input files(s) for unpaired reads, fastq(.gz)-formatted
    parser.add_argument("-s", nargs="+", default=[])
    parser.add_argument("-db", required=True, choices=DATABASES)  # database
    parser.add_argument("-o", required=True)  # output file name
    parser.add_argument("-t", type=int, default=1)  # number of threads

    args = parser.parse_args(sys.argv[2:])
    if sys.argv[2:] == []:
        parser.print_usage()
        shutdown(1)

    forward_files = [pathlib.Path(el) for el in args.f]
    reverse_files = [pathlib.Path(el) for el in args.r]
    unpaired_files = [pathlib.Path(el) for el in args.s]
    db = args.db
    align_bam_file = pathlib.Path(args.o + '.bam')
    align_fcnt_file = pathlib.Path(args.o + '.fcnt')
    align_mgs_file = pathlib.Path(args.o + '.mgs')
    threads = args.t

    gpredparameters = GpredParameters()
    gpredparameters.set_read_files(
        forward_files, reverse_files, unpaired_files, check_files=True)
    gpredparameters.set_alignment_file(
        align_bam_file, align_fcnt_file, align_mgs_file, required_to_exist=False)
    gpredparameters.set_threads(threads)
    gpredparameters.set_db_name(db)
    align(gpredparameters)


def predict_genes(genome_file: pathlib.Path, tmp_gff_file: pathlib.Path, tmp_faa_file: pathlib.Path):

    scaffold_2_sequence = {}
    if is_gzipped(genome_file):
        infile = gzip.open(genome_file, 'rt')
    else:
        infile = open(genome_file, 'r')
    for (header, sequence) in FastaIO.SimpleFastaParser(infile):
        header = header.split()[0]
        scaffold_2_sequence[header] = sequence

    gene_finder = pyrodigal.GeneFinder(meta=False, closed=True, mask=True)
    training_info = gene_finder.train(
        *(seq for seq in scaffold_2_sequence.values()))
    infile.close()
    tmp_gff_file2_handle = tempfile.NamedTemporaryFile(delete=False)
    tmp_gff_file2 = tmp_gff_file2_handle.name
    tmp_gff_file2_handle.close()
    with open(tmp_gff_file2, 'w') as fi, open(tmp_faa_file, 'w') as fo:
        for header, sequence in scaffold_2_sequence.items():
            genes = gene_finder.find_genes(sequence)
            genes.write_gff(fi, sequence_id=header)
            genes.write_translations(fo, sequence_id=header)
    gene_2_genome = {}
    with open(tmp_gff_file2, 'r') as inhandle, open(tmp_gff_file, 'w') as outhandle:
        for line in inhandle:
            if line.startswith('#'):
                outhandle.write(line)
            else:
                # CP092643.1      pyrodigal_v3.6.3        CDS     13      1362    172.0   +       0       ID=CP092643.1_1;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.438;conf=100.00;score=171.98;cscore=170.45;sscore=1.53;rscore=-1.64;uscore=-0.01;tscore=3.18;
                [genome, tool, cds, start, end, score, strand,
                    number, rest] = line.strip().split('\t')
                feature_id = rest.split(';')[0].split('=')[1]
                gene_2_genome[feature_id] = genome
                rest = rest + 'locus_tag=' + feature_id + ';'
                cds_line = '\t'.join(
                    [genome, tool, cds, start, end, score, strand, number, rest])
                outhandle.write(f'{cds_line}\n')
                gene_line = '\t'.join(
                    [genome, tool, 'gene', start, end, score, strand, number, rest])
                outhandle.write(f'{gene_line}\n')
    return gene_2_genome


def extract_mgs(tmp_faa_file: pathlib.Path):
    # top_level_tmp_folder = pathlib.Path('predict_data/hmm_stuff/')
    # tigrfam + pfam
    alphabet = pyhmmer.easel.Alphabet.amino()
    with SequenceFile(str(tmp_faa_file), digital=True, alphabet=alphabet) as seq_file:
        sequences = seq_file.read_block()

    gene_hmmhits = collections.defaultdict(list)
    for (f, cutoff) in [(TIGRFAM_HMM, 'noise'), (PFAM_HMM, 'gathering')]:
        with HMMFile(f) as hmm_file:
            for hmm_query_hits in pyhmmer.hmmsearch(hmm_file, sequences, bit_cutoffs='noise', cpus=1):
                hmm_id = hmm_query_hits.query.accession.decode()
                if hmm_id.startswith('PF'):
                    hmm_id = hmm_id.rsplit('.', 1)[0]
                for hmm_query_hit in hmm_query_hits.reported:
                    aa_identifier = hmm_query_hit.name.decode()
                    bitscore = hmm_query_hit.score
                    gene_hmmhits[aa_identifier].append((hmm_id, bitscore))

    # 40MGs
    fmg_output_folder = pathlib.Path(tempfile.mkdtemp())
    fetchmgs.extraction_genes([tmp_faa_file], [None],
                              fmg_output_folder, 1, True, False)
    scores_file = fmg_output_folder.joinpath(
        str(tmp_faa_file).split('/')[-1] + '.fetchMGs.scores')
    with open(scores_file, 'r') as inhandle:
        inhandle.readline()
        # protein_sequence_id    HMM bit score   COG
        for line in inhandle:
            [aa_identifier, bitscore, cog] = line.strip().split('\t')
            gene_hmmhits[aa_identifier].append((cog, float(bitscore)))

    # summarise
    all_129_mgs = set()
    with open(MG129_file) as handle:
        for line in handle:
            all_129_mgs.add(line.strip())
    gene_2_best_hit = {}
    for aa_identifier, hits in gene_hmmhits.items():
        hits2 = []
        for (target, bitscore) in hits:
            if target in all_129_mgs:
                hits2.append((target, bitscore, 'yes'))
        if len(hits2) == 0:
            continue
        elif len(hits2) == 1:
            gene_2_best_hit[aa_identifier] = hits2[0]
        else:
            sorted_hits = sorted(hits2, key=lambda x: x[1], reverse=True)
            gene_2_best_hit[aa_identifier] = sorted_hits[0]
    annotation_2_hits = collections.defaultdict(list)
    for aa_identifier, (annotation, bitscore, _) in gene_2_best_hit.items():
        annotation_2_hits[annotation].append((aa_identifier, bitscore))
    annotation_2_best_hit = {}
    for annotation, hits in annotation_2_hits.items():
        if len(hits) == 0:
            continue
        elif len(hits) == 1:
            annotation_2_best_hit[annotation] = hits[0]
        else:
            sorted_hits = sorted(hits, key=lambda x: x[1], reverse=True)
            annotation_2_best_hit[annotation] = sorted_hits[0]

    return annotation_2_best_hit


def normalize_counts(df: pd.DataFrame, marker_gene_length_col: str = 'gene_length_bases',
                     method: str = 'log_tpm') -> pd.DataFrame:
    '''
    df:: count_data produced by load_mg_count_data. This df already should have 'marker_gene_length_col'. Assumes the indes are mg ids.
    tpm is rpk normalized by depth and scaled to 1e6
    options raw, rpk, tpm, log_tpm, clr

    '''

    motus_mgs = ['COG0012', 'COG0016', 'COG0018', 'COG0172', 'COG0215',
                 'COG0495', 'COG0525', 'COG0533', 'COG0541', 'COG0552']

    if method == 'raw':
        return df
    if marker_gene_length_col not in df.columns:
        raise ValueError(
            f"Missing column {marker_gene_length_col} from count dataframe")
    rpk = df.copy()
    gene_length = rpk.pop(marker_gene_length_col)
    counts_rpk = rpk.div((gene_length / 1000), axis=0)
    if method == "rpk":
        return counts_rpk
    if method == 'mg':
        counts_motus = counts_rpk.T
        counts_motus['mgmed'] = counts_motus[motus_mgs].median(axis=1)
        counts_motus = counts_motus.div(
            counts_motus['mgmed'], axis=0).drop(columns=['mgmed']).T
        return np.log2(counts_motus + 0.5)
    counts_tpm = counts_rpk / counts_rpk.sum() * 1e6
    if method == "tpm":
        return counts_tpm
    if method == 'log_tpm':
        return np.log2(counts_tpm + 0.5)
    elif method == 'clr':
        counts_tpm_nz = composition.multi_replace(
            counts_tpm.transpose().to_numpy())  # multi_replace or multiplicative_replacement
        counts_clr = pd.DataFrame(composition.clr(counts_tpm_nz).transpose(),
                                  columns=counts_tpm.columns, index=counts_tpm.index)
        return counts_clr

    raise ValueError("Unknown transfromation method")


def load_model(pkl_file: Union[str, Path]) -> Tuple[object, list]:
    """Load pickled model and feature names."""
    if pkl_file is None:
        raise ValueError("Model file path is required")
    pkl_file = Path(pkl_file)
    if not pkl_file.exists():
        raise FileNotFoundError(f"Model file not found: {pkl_file}")
    try:
        with open(pkl_file, 'rb') as f:
            model = pickle.load(f)
        return model, model.feature_names_in_.tolist()
    except Exception as e:
        raise RuntimeError(f"Failed to load model from {pkl_file}: {str(e)}")


def predict_growth_state(
        mg_counts: Union[pd.DataFrame, str, Path],
        model_pkl: Union[str, Path],
        output_file: Union[str, Path] = None,
        norm_method: str = 'log_tpm',
        marker_gene_length_col: str = 'LENGTH',
        cutoff: float = 0.5,
        sep: str = '\t',
        min_counts: int = 1000,
        max_missing_mgs: int = 2
) -> pd.DataFrame:
    """
    Predict growth state from marker gene counts.

    Returns DataFrame with probability estimates and binary classifications.
    """
    # Input validation
    if not 0 <= cutoff <= 1:
        raise ValueError("Cutoff must be between 0 and 1")

    # Load model
    model, expected_features = load_model(model_pkl)
    # Load data if needed
    if isinstance(mg_counts, (str, Path)):
        mg_counts = pd.read_table(mg_counts, sep=sep, index_col=0)
    samples = [sample for sample in mg_counts.columns if sample != marker_gene_length_col]
    original_sample_count = len(samples)
    logging.info(f"Loading the counts. Found {original_sample_count} samples")
    # Process data
    sample_totals = mg_counts[samples].sum(axis=0)
    low_count_samples = sample_totals[sample_totals < min_counts].index.tolist()
    if len(low_count_samples) == len(samples):
        logging.warning("None of the samples contain sufficient counts for this genome. Exiting.")
        return None
    if low_count_samples:
        logging.info(f"Dropping {len(low_count_samples)} samples with < {min_counts} total counts")
        mg_counts = mg_counts.drop(columns=low_count_samples)
    # Ensure feature alignment
    missing_features = set(expected_features) - set(mg_counts.index)
    if len(missing_features) > max_missing_mgs:
        logging.warning("Not enough MG detected for this genome.")
        return None
    if len(missing_features) > 0:
        logging.info(
            f"Warning: Missing features found, will impute with median values: {missing_features}")
        median_values = mg_counts.median(axis=0)
        for feature in missing_features:
            mg_counts.loc[feature] = median_values
    logging.info(f"Normalising counts using {norm_method} method.")
    mg_norm = normalize_counts(
        mg_counts, method=norm_method, marker_gene_length_col=marker_gene_length_col).T
    mg_norm = mg_norm[expected_features]
    logging.info(f"Making predictions for {len(mg_norm.index)} samples")
    scores = model.predict_proba(mg_norm)[:, 1]
    predictions = (scores >= cutoff)
    results = pd.DataFrame({
        'Probability estimate': np.round(scores, decimals=2),
        'Classification': predictions
    }, index=mg_norm.index)
    results['Classification'] = results['Classification'].map({
        True: 'Growth',
        False: 'No growth'
    })
    if output_file:
        results.to_csv(output_file, sep=sep)
    return results


def predict_multiple_genomes_old_version(counts_file, model_file, output_file, method='TPM'):
    genome_2_counts = collections.defaultdict(list)
    samples = None
    with open(counts_file, 'r') as handle:
        for entry in csv.DictReader(handle, delimiter='\t'):
            genome = entry.pop('#GENOME')
            annotation = entry.pop('ANNOTATION')
            length = entry.pop('LENGTH')
            entry.pop('GENE')
            entry.pop('CHROMOSOME')
            genome_2_counts[genome].append((annotation, length, entry))
            samples = entry.keys()

    samples = sorted(list(samples))
    logging.info(
        f'Found abundances for {len(samples)} samples in {len(genome_2_counts)} genomes')
    for genome, counts in genome_2_counts.items():
        logging.info(f'\tProcessing genome: {genome}')
        tmp = tempfile.NamedTemporaryFile(
            mode='w', suffix='.csv', delete=False, newline='')
        tmp_str = ['marker_gene_id', 'gene_length_bases'] + samples
        tmp_str = ','.join(tmp_str)
        tmp.write(f'{tmp_str}\n')

        for annotation, length, entry in counts:
            tmp_str = [annotation, length]
            for sample in samples:
                val = entry[sample]
                tmp_str.append(val)
            tmp_str = ','.join(tmp_str)
            tmp.write(f'{tmp_str}\n')

        fname = tmp.name
        tmp.close()
        genome_of = output_file + '.' + genome + '.csv'
        print(fname)
        results = predict_growth_state(
            fname, model_file, output_file=genome_of, norm_method=method)
        return results


def predict_multiple_genomes(counts_file, model_file, output_file, marker_gene_length_col='LENGTH',
                              method='log_tpm'):
    counts = pd.read_table(counts_file).drop(['GENE', 'CHROMOSOME'], axis=1)
    for genome, counts_genome in counts.groupby('#GENOME'):
        logging.info(f'GENOME: {genome}')
        counts_genome = counts_genome.drop('#GENOME', axis=1)
        genome_of = output_file + '.' + genome + '.csv'
        results = predict_growth_state(counts_genome.set_index('ANNOTATION'),
                                       model_file, genome_of,
                                       marker_gene_length_col=marker_gene_length_col,
                                       norm_method=method)
        if results is None:
            logging.info(f'Finished processing genome {genome}. No predictions made.')
        else:
            logging.info(
                f'Finished processing genome {genome}. Results written to {genome_of}')


def parse_predict():
    parser = argparse.ArgumentParser(usage=f'''Program: {__tool_name__} - metatranscriptomic growth classifier
        Version: {__version__}


        {__tool_name__} predict [options]

        Input options:
           -i   FILE        Input CSV file with count(s) produced by extract/merge functions.

        Output options:
           -o  STR          Output prefix


              ''', formatter_class=CapitalisedHelpFormatter, add_help=False)

    # Input options
    parser.add_argument("-i", default=None, required=True)