diff --git a/Dockerfile b/Dockerfile index f9d93d47..d043be0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ RUN git clone git://github.com/samtools/htslib.git RUN cd htslib && make install # bcftools -RUN git clone git://github.com/samtools/bcftools.git +RUN git clone git://github.com/samtools/bcftools.git RUN cd bcftools && make # samtools @@ -33,7 +33,7 @@ RUN cd augustus && make # HDF5 RUN wget -q http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz RUN tar xzf hdf5-1.10.1.tar.gz -RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr +RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr RUN cd hdf5-1.10.1 && make && make install # sonLib @@ -62,7 +62,7 @@ RUN tar xvjf sambamba_v0.6.7_linux.tar.bz2 FROM ubuntu:18.04 RUN apt-get update -RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev +RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev exonerate RUN add-apt-repository -y ppa:deadsnakes/ppa RUN apt-get install -y python3.7 python3-pip # Kent diff --git a/README.md b/README.md index fe5b39eb..1bdafb5a 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ Above is a flowchart schematic of the functionality of the `CAT` pipeline. # Installation -The pipeline can be installed by a simple `pip` install: +The pipeline can be installed by a simple `pip` install: `pip install git+https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit.git` @@ -25,7 +25,19 @@ If you want to do the direct pip installation, you can grab the config files fro Either form of `pip` installation will install all of the python dependencies. -## Dependencies (if not using Docker) +## Docker/Singularity + +If you can use Docker or Singularity, we highly recommend doing so. CAT will automatically pull the Docker container and use it for all of the binaries. If you use Docker or Singularity, all you need to do is install CAT into your Python environment. + +### Environment Variables + +There are two environment variables to consider when running Docker/Singularity. + +`$TMPDIR`: Set this to a specific path if you are worried about running out of space in your tmp directory. CAT will automatically mount this location in the container and point temporary files to it. + +`$SINGULARITY_IMAGE`: If you want to use your own Singularity image, you can specify the image path here. + +## Compiling dependencies Below is a full breakdown of the required dependencies if you are not using Docker. Some of these can be challenging to compile and configure properly. In addition to the breakdown below, you may find guidance in the [Dockerfile](https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit/blob/master/Dockerfile) or in [this list of installation commands](https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit/issues/118#issuecomment-434910846) generated by a helpful user. @@ -36,9 +48,9 @@ By default, you don't need to worry about installing any of these. However, ther 3. [samtools](http://www.htslib.org/) (1.3 or greater). 4. [Augustus](http://bioinf.uni-greifswald.de/augustus/binaries/). Make sure you are installing `augustus >= 3.3.1`. If you want to use Augustus CGP, install the development version from the [Github repository](https://github.com/Gaius-Augustus/Augustus).
You need to follow the instructions to compile `augustus` in comparative augustus mode. This requires that you modify a few lines in the `common.mk` file, and you also need to have `sqlite3`, `lp-solve`, `bamtools`, and `libboost` installed. If you are using Ubuntu, this should work: `apt-get install libboost-all-dev libboost sqlite3 libsqlite3-0 libsqlite3-dev libgsl0-dev lp-solve liblpsolve55-dev bamtools libbamtools-dev` - - After you have the primary `augustus` binaries compiled, add the directory to your path. Note that if you move the `augustus` binaries from their original location, you will need to set the `AUGUSTUS_CONFIG_PATH` global variable to point to the species directory. - + + After you have the primary `augustus` binaries compiled, add the directory to your path. Note that if you move the `augustus` binaries from their original location, you will need to set the `AUGUSTUS_CONFIG_PATH` environment variable to point to the species directory. + You will also need to put the contents of the `scripts` directory on your path. Next, you need to compile the following auxiliary programs from the folder `auxprogs`: 1. `joingenes`. Compiling this program will place it in the `augustus` binary directory. 2. `bam2hints`. Compiling this program will place it in the `augustus` binary directory. Requires `bamtools` to be installed. If the `bamtools` headers are not at `/usr/include/bamtools`, you will need to modify the makefile. @@ -46,7 +58,7 @@ By default, you don't need to worry about installing any of these. However, ther 4. `bam2wig`. Compiling this program will NOT place it in the `augustus` binary directory; you must do so yourself. This program requires you modify the makefile to explicitly point to your installation of `htslib`, `bcftools`, `samtools`, and `tabix`. `Tabix` is now packaged with `htslib`, and both are included in your `kent` directory at `$kent/src/htslib/`. 5. `homGeneMapping`. This program must also have its makefile at `$augustus/trunks/auxprogs/homGeneMapping/src/Makefile` modified to turn on the `BOOST = true` and `SQLITE = true` flags. Then run `make clean && make` to recompile. 6. There are a series of perl scripts that you need to place on your path from the `$augustus/trunks/scripts` directory: `wig2hints.pl`, `exonerate2hints.pl`, `transMap2hints.pl`, and `join_mult_hints.pl`. -5. [HAL toolkit](https://github.com/glennhickey/hal). To install the HAL toolkit, you must also have the [sonLib](https://github.com/benedictpaten/sonLib) repository in the same parent directory. Compile sonLib first, then compile hal. Once hal is compiled, you need to have the binaries on your path. +5. [HAL toolkit](https://github.com/glennhickey/hal). To install the HAL toolkit, you must also have the [sonLib](https://github.com/benedictpaten/sonLib) repository in the same parent directory. Compile sonLib first, then compile hal. Once hal is compiled, you need to have the binaries on your path. 6. [wiggletools](https://github.com/Ensembl/WiggleTools). Used to combine RNA-seq expression in assembly hubs. 7. [sambamba](https://github.com/lomereiter/sambamba/releases). Used to name sort faster than samtools for hints building. 8. [exonerate](https://www.ebi.ac.uk/about/vertebrate-genomics/software/exonerate). Used for protein alignments in hints building. @@ -58,11 +70,12 @@ Many of the above dependencies are on `conda`. However, you will still need to i 2. `toil`: The version on conda is too old (not python3 compatible). 3.
`cat`: You will need to install this repo with `pip install` after you enter the conda env (`conda activate cattest`). 4. `HAL`: Not available on conda. + 5. The version of `augustus` on conda does not have the required flags set in its compilation. ``` conda create -y -n cattest -c conda-forge -c bioconda -c defaults python=3.7 pyfasta luigi seaborn pandas \ ete3 pysam numpy scipy bx-python bcbio-gff biopython parasail-python configobj sqlalchemy \ -samtools bamtools augustus exonerate wiggletools bedtools \ +samtools bamtools exonerate wiggletools bedtools \ ucsc-fatotwobit ucsc-gff3togenepred ucsc-genepredtobed ucsc-genepredtofakepsl ucsc-bamtopsl ucsc-transmappsltogenepred \ ucsc-pslpostarget ucsc-axtchain ucsc-chainmergesort ucsc-pslmap ucsc-pslrecalcmatch ucsc-pslmappostchain \ ucsc-gtftogenepred ucsc-genepredtogtf ucsc-pslcdnafilter ucsc-psltobigpsl \ @@ -71,16 +84,16 @@ ucsc-bedsort ucsc-bedtobigbed In total, you must have all of the binaries and scripts listed below on your path. The pipeline will check for them before executing steps. -`hal2fasta halStats halLiftover exonerate faToTwoBit pyfasta gff3ToGenePred genePredToBed genePredToFakePsl bamToPsl exonerate2hints.pl blat2hints.pl transMapPslToGenePred join_mult_hints.pl pslPosTarget axtChain chainMergeSort pslMap pslRecalcMatch pslMapPostChain augustus transMap2hints.pl joingenes hal2maf gtfToGenePred genePredToGtf bedtools homGeneMapping pslCDnaFilter clusterGenes pslToBigPsl bedSort bedToBigBed sambamba wig2hints.pl pal2nal.pl` -# Running the pipeline - -This pipeline makes use of [Luigi](https://github.com/spotify/luigi) to link the various steps together. First, start the `luigid` daemon: - -`luigid --background --logdir luigi_logs` +``` +hal2fasta halStats halLiftover exonerate faToTwoBit pyfasta gff3ToGenePred genePredToBed \ +genePredToFakePsl bamToPsl exonerate2hints.pl blat2hints.pl transMapPslToGenePred \ +join_mult_hints.pl pslPosTarget axtChain chainMergeSort pslMap pslRecalcMatch pslMapPostChain \ +augustus transMap2hints.pl joingenes hal2maf gtfToGenePred genePredToGtf bedtools homGeneMapping \ +pslCDnaFilter clusterGenes pslToBigPsl bedSort bedToBigBed sambamba wig2hints.pl pal2nal.pl +``` -Which provides the central scheduler as well as the web UI, which can be accessed at `localhost:8082`. -If you don't want to use the daemon, *which is highly recommended* add the flag `--local-scheduler` to the invocation. +# Running the pipeline To run the test data, change directories to the CAT installation folder and do the following: @@ -109,7 +122,7 @@ reserved_keys = ['gene_biotype', The keys `ID`, `Name` and `Parent` are required for valid GFF3 and define the hierarchical relationship. The remaining keys, `gene_biotype`, `transcript_biotype`, `gene_name`, `gene_id`, `transcript_id` and `transcript_name` are also all required. In many cases you will not have common names, and so it is fine for `transcript_name` to equal `transcript_id` and for `gene_name` to equal `gene_id`. The biotypes can be whatever you want, but `protein_coding` is a special biotype that tells CAT this gene or transcript is coding. You *must* flag your protein coding genes *and transcripts* as `protein_coding`! -You may have any arbitrary set of keys and values in the GFF3 that are not the reserved keys. These keys and values will be propagated on to the resulting transcript in the CAT output GFF3. +You may have any arbitrary set of keys and values in the GFF3 that are not the reserved keys.
These keys and values will be propagated on to the resulting transcript in the CAT output GFF3. If your GFF3 has duplicate transcript names, the pipeline will complain. One common cause of this is PAR locus genes. You will want to remove PAR genes -- If your GFF3 came from GENCODE, you should be able to do this: `grep -v PAR_Y $gff > $gff.fixed`. @@ -125,7 +138,7 @@ As described above, the primary method to executing the pipeline is to follow th `--config`: Path to the config file with annotations and extrinsic hints. See [the config section](#config-file) for more information. (REQUIRED). -`--binary-mode`: How should CAT run its binary dependencies? Valid choices are "docker" (the default, in which case you must have a Docker daemon running) or "local" (in which case ensure your PATH contains the necessary dependencies). +`--binary-mode`: How should CAT run its binary dependencies? Valid choices are "docker" (the default, in which case you must have a Docker daemon running), "singularity", or "local" (in which case ensure your PATH contains the necessary dependencies). `--out-dir`: Output directory. Defaults to `./cat_output`. @@ -136,11 +149,11 @@ As described above, the primary method to executing the pipeline is to follow th `--workers`: Number of local cores to use. If running `toil` in singleMachine mode, care must be taken with the balance of this value and the `--maxCores` parameter. ## transMap options -`--global-near-best`: Adjusts the `globalNearBest` parameter passed to `pslCDnaFilter`. Defaults to 0.15. The `globalNearBest` algorithm determines which set of alignments are within a certain distance of the highest scoring alignment for a given source transcript. Making this value smaller will increase the number of alignments filtered out, decreasing the apparent paralogous alignment rate. Alignments which survive this filter are putatively paralogous. +`--global-near-best`: Adjusts the `globalNearBest` parameter passed to `pslCDnaFilter`. Defaults to 0.15. The `globalNearBest` algorithm determines which set of alignments are within a certain distance of the highest scoring alignment for a given source transcript. Making this value smaller will increase the number of alignments filtered out, decreasing the apparent paralogous alignment rate. Alignments which survive this filter are putatively paralogous. ## AugustusTM(R) options -`--augustus`: Run AugustusTM(R)? +`--augustus`: Run AugustusTM(R)? `--augustus-species`: What Augustus species do we want to use? If your species is not a mammal, please choose [one of the species listed here](http://bioinf.uni-greifswald.de/augustus/). @@ -168,7 +181,7 @@ As described above, the primary method to executing the pipeline is to follow th ## Filtering and consensus finding options -`--filter-overlapping-genes`: Should genes that get flagged as overlapping be removed? After consensus finding is finished, instances of gene family collapse or paralog mis-assignment may lead to overlapping CDS intervals on different genes. This also in some instances may be a feature of the original annotation set. However, some annotation databases do not like this, so this flag will remove all such instances and resolve them down to one gene. +`--filter-overlapping-genes`: Should genes that get flagged as overlapping be removed? After consensus finding is finished, instances of gene family collapse or paralog mis-assignment may lead to overlapping CDS intervals on different genes. This also in some instances may be a feature of the original annotation set. 
However, some annotation databases do not like this, so this flag will remove all such instances and resolve them down to one gene. `--overlapping-gene-distance`: This controls the amount of *exonic* overlap two genes must have to be flagged as overlapping. Setting this value higher than 1 is recommended for smaller genomes that likely have real overlapping genes. Default is 1. @@ -230,7 +243,7 @@ Follow these [instructions](https://toilpipelines.wordpress.com/2018/01/18/runni `--provisioner`: The provisioner of the cloud system. For AWS it's just `aws`. -`--maxNodes`: Maximum number of nodes running at once. +`--maxNodes`: Maximum number of nodes running at once. # Config file @@ -335,7 +348,7 @@ This step performs the preliminary classification of `transMap` transcripts. Thi 10. ValidStart -- start with ATG? 11. ValidStop -- valid stop codon (in frame)? 12. ProperOrf -- is the orf a multiple of 3? - + This module will populate the folder `--work-dir/transMap`. ## FilterTransMap @@ -383,7 +396,7 @@ This module will populate the folder `--work-dir/augustus_cgp`. ## AugustusPb -Running `AugustusPB` requires that IsoSeq data be provided. This mode runs on single genomes, and attempts to discover new isoforms. Transcripts predicted in this process undergo the same parental gene assignment described above. +Running `AugustusPB` requires that IsoSeq data be provided. This mode runs on single genomes, and attempts to discover new isoforms. Transcripts predicted in this process undergo the same parental gene assignment described above. This module will populate the folder `--work-dir/augustus_pb`. @@ -409,7 +422,7 @@ Which can be interpreted as 'species 0 had 6273 extrinsic hints (RNA-seq coverag ## AlignTranscripts -Transcript alignment allows for `AugustusTM(R)` transcripts to be compared to their parental `transMap`. As a result, only protein coding transcripts are aligned. For each transcripts, alignment is performed by parasail two ways -- CDS alignment, and mRNA alignment. The results of these alignments are saved in the folder `--work-dir/transcript_alignment`. These alignments are used to create functional annotations of transcripts in the [EvaluateTranscripts](#evaluatetranscripts) module. +Transcript alignment allows for `AugustusTM(R)` transcripts to be compared to their parental `transMap`. As a result, only protein coding transcripts are aligned. For each transcript, alignment is performed by parasail in two ways -- CDS alignment, and mRNA alignment. The results of these alignments are saved in the folder `--work-dir/transcript_alignment`. These alignments are used to create functional annotations of transcripts in the [EvaluateTranscripts](#evaluatetranscripts) module. ## EvaluateTranscripts @@ -455,7 +468,7 @@ The evaluation tables will be loaded as tracks in the final [assembly hub](#asse The consensus finding process takes in transcripts from every mode and attempts to find the highest quality ortholog for a source transcript. The de-novo transcript modes are also evaluated for providing novel isoforms or novel loci. The final gene set is output with a series of features measuring how confident the prediction is. To evaluate `transMap`, `AugustusTM` and `AugustusTMR` transcripts a consensus score is assigned to each. This score is the sum of the alignment goodness, intron/exon annotation support, original intron support, and intron/exon RNA-seq/IsoSeq support if extrinsic data were provided. The transcript with the highest consensus score is chosen.
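To make the scoring concrete, here is a minimal sketch of the sum described above. The function name, argument names, and equal weighting are illustrative assumptions; the authoritative computation lives in `cat/consensus.py`.

```python
# Illustrative sketch only -- the equal weighting and the names used here are
# assumptions, not the exact computation performed by cat/consensus.py.
def consensus_score(alignment_goodness, intron_annot_support, exon_annot_support,
                    original_intron_support, intron_extrinsic_support=None,
                    exon_extrinsic_support=None):
    """All inputs are percentages in [0, 100]; the extrinsic (RNA-seq/IsoSeq)
    terms only contribute when such data were provided."""
    score = (alignment_goodness + intron_annot_support +
             exon_annot_support + original_intron_support)
    if intron_extrinsic_support is not None:
        score += intron_extrinsic_support
    if exon_extrinsic_support is not None:
        score += exon_extrinsic_support
    return score

# The candidate transcript with the highest score is then chosen, e.g.:
# best = max(candidates, key=lambda tx: consensus_score(*tx.support_vector))
# (`candidates` and `support_vector` are hypothetical names for illustration)
```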
- + If one of the de-novo `augustus` modes is run, then those transcripts are evaluated for providing novel information. If a prediction did not overlap any transMap projections, then it is tagged as putative novel and incorporated into the gene set. If a prediction overlaps a `transMap` projection that was filtered out during paralog resolution, then it is tagged as a possible paralog as well as with the names of overlapping transcripts and incorporated into the gene set. If a prediction overlaps a transMap projection and contains a splice junction not seen in the reference annotation, then it is tagged as a novel isoform and incorporated into the gene set as a member of the gene it overlapped with. After consensus finding is complete, a final filtering process is performed. This filtering process deduplicates the transcript set. Duplicates most often occur when the `augustus` execution modes create an identical transcript model from different input isoforms. In this case, the duplicates are removed and the remaining transcript tagged with the names of alternative source transcripts. Strand resolution throws out transcripts that are on opposite strands. The correct strand is chosen by looking at which strand contains the most high quality transcripts. Finally, the transcripts are again clustered using `clusterGenes` on CDS intervals to resolve the case where incorporating novel predictions leads to different gene IDs sharing CDS bases. diff --git a/cat/__init__.py b/cat/__init__.py index 09332363..a1ac296c 100644 --- a/cat/__init__.py +++ b/cat/__init__.py @@ -53,7 +53,6 @@ from .exceptions import * logger = logging.getLogger('cat') -logger.setLevel('INFO') ### @@ -185,7 +184,7 @@ def get_pipeline_args(self): args.set('global_near_best', self.global_near_best, True) args.set('filter_overlapping_genes', self.filter_overlapping_genes, True) args.set('overlapping_gene_distance', self.overlapping_gene_distance, True) - + # user specified flags for consensus finding args.set('intron_rnaseq_support', self.intron_rnaseq_support, False) args.set('exon_rnaseq_support', self.exon_rnaseq_support, False) @@ -398,6 +397,8 @@ def load_docker(self): 'Either install it or use a different option for --binary-mode.') os.environ['SINGULARITY_PULLFOLDER'] = os.path.abspath(self.work_dir) os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(self.work_dir) + if os.environ.get('SINGULARITY_IMAGE'): + return tools.fileOps.ensure_dir(self.work_dir) if not os.path.isfile(os.path.join(self.work_dir, 'cat.img')): subprocess.check_call(['singularity', 'pull', '--name', 'cat.img', @@ -565,7 +566,7 @@ def success(task): os.remove(os.path.abspath(task.job_store)) except OSError: pass - else: + else: cmd = ['toil', 'stats', '--raw', os.path.abspath(task.job_store)] raw = tools.procOps.call_proc(cmd) parsed = raw[raw.index('{'):raw.rfind('}') + 1] @@ -1620,14 +1621,13 @@ def get_args(pipeline_args, mode): args = tools.misc.HashableNamespace() if mode == 'augPB': args.tablename = tools.sqlInterface.AugPbAlternativeGenes.__tablename__ + pb_genomes = set(pipeline_args.target_genomes) & (pipeline_args.isoseq_genomes - {pipeline_args.ref_genome}) args.gps = {genome: AugustusPb.get_args(pipeline_args, genome).augustus_pb_gp - for genome in pipeline_args.isoseq_genomes} + for genome in pb_genomes} args.filtered_tm_gps = {genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp - for genome in pipeline_args.isoseq_genomes - {pipeline_args.ref_genome}} + for genome in pb_genomes} args.unfiltered_tm_gps = {genome:
TransMap.get_args(pipeline_args, genome).tm_gp - for genome in pipeline_args.isoseq_genomes - {pipeline_args.ref_genome}} - args.chrom_sizes = {genome: GenomeFiles.get_args(pipeline_args, genome).sizes - for genome in pipeline_args.isoseq_genomes} + for genome in pb_genomes} # add the reference annotation as a pseudo-transMap to assign parents in reference args.filtered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp args.unfiltered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp @@ -1643,23 +1643,20 @@ def get_args(pipeline_args, mode): unfiltered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp args.filtered_tm_gps = filtered_tm_gp_files args.unfiltered_tm_gps = unfiltered_tm_gp_files - args.chrom_sizes = {genome: GenomeFiles.get_args(pipeline_args, genome).sizes - for genome in list(pipeline_args.target_genomes) + [pipeline_args.ref_genome]} elif mode == 'exRef': + exref_genomes = set(pipeline_args.target_genomes) & set(pipeline_args.external_ref_genomes) args.tablename = tools.sqlInterface.ExRefAlternativeGenes.__tablename__ args.gps = {genome: ExternalReferenceFiles.get_args(pipeline_args, genome).annotation_gp - for genome in pipeline_args.external_ref_genomes} + for genome in exref_genomes} filtered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp - for genome in pipeline_args.external_ref_genomes} + for genome in exref_genomes} unfiltered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).tm_gp - for genome in pipeline_args.external_ref_genomes} + for genome in exref_genomes} # add the reference annotation as a pseudo-transMap to assign parents in reference filtered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp unfiltered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp args.filtered_tm_gps = filtered_tm_gp_files args.unfiltered_tm_gps = unfiltered_tm_gp_files - args.chrom_sizes = {genome: GenomeFiles.get_args(pipeline_args, genome).sizes - for genome in list(pipeline_args.target_genomes) + [pipeline_args.ref_genome]} else: raise Exception('Invalid mode passed to FindDenovoParents') return args @@ -1696,8 +1693,7 @@ def run(self): table_target = self.get_table_targets(genome, denovo_args.tablename, pipeline_args) filtered_tm_gp = denovo_args.filtered_tm_gps[genome] unfiltered_tm_gp = denovo_args.unfiltered_tm_gps[genome] - chrom_sizes = denovo_args.chrom_sizes[genome] - df = assign_parents(filtered_tm_gp, unfiltered_tm_gp, chrom_sizes, denovo_gp) + df = assign_parents(filtered_tm_gp, unfiltered_tm_gp, denovo_gp) db = pipeline_args.dbs[genome] with tools.sqlite.ExclusiveSqlConnection(db) as engine: df.to_sql(denovo_args.tablename, engine, if_exists='replace') @@ -2298,7 +2294,7 @@ def output(self): def run(self): pipeline_args = self.get_pipeline_args() luigi_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, 'stats') - + try: toil_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, 'toil_stats') except ValueError: @@ -3184,7 +3180,7 @@ def construct_consensus_gp_as(has_rna, has_pb): ''' -snake_composite = '''track hubCentral +snake_composite = '''track cactus compositeTrack on shortLabel Cactus longLabel Cactus Alignment Tracks @@ -3199,11 +3195,11 @@ def construct_consensus_gp_as(has_rna, has_pb): visibility full type bigBed 3 - track hubCentralAlignments + track cactusAlignments shortLabel 
Alignments view Alignments visibility full - subTrack hubCentral + subTrack cactus ''' @@ -3212,7 +3208,7 @@ def construct_consensus_gp_as(has_rna, has_pb): shortLabel {genome} otherSpecies {genome} visibility {visibility} - parent hubCentralAlignments off + parent cactusAlignments off priority 3 bigDataUrl {hal_path} type halSnake diff --git a/cat/chaining.py b/cat/chaining.py index e9a00ff3..eb4533d6 100644 --- a/cat/chaining.py +++ b/cat/chaining.py @@ -1,5 +1,8 @@ """" -Toil program to generate UCSC chains and nets between two genomes in a HAL file. +Toil program to generate UCSC chains and nets between one reference genome and N target genomes in a HAL file. + +The reference genome is split up by contig/chromosome, and those are chained against all the things they align to +in each genome specified. """ import argparse import collections diff --git a/cat/classify.py b/cat/classify.py index 3bc003c2..88bf75f4 100644 --- a/cat/classify.py +++ b/cat/classify.py @@ -42,8 +42,6 @@ 2) mRNA """ -import itertools - import pandas as pd import tools.bio @@ -244,19 +242,17 @@ def find_indels(tx, psl, aln_mode): """ def convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, strand): """convert alignment coordinates to target chromosome coordinates, inverting if negative strand""" - left_chrom_pos = coordinate_fn(left_pos) - assert left_chrom_pos is not None - right_chrom_pos = coordinate_fn(right_pos) - if right_chrom_pos is None: - right_chrom_pos = coordinate_fn(right_pos - 1) - if strand == '-': - left_chrom_pos += 1 - else: - left_chrom_pos -= 1 - assert right_chrom_pos is not None - if strand == '-': - left_chrom_pos, right_chrom_pos = right_chrom_pos, left_chrom_pos - assert right_chrom_pos >= left_chrom_pos + # this is a deletion, and we don't need to adjust + if left_pos == right_pos: + left_chrom_pos = right_chrom_pos = coordinate_fn(left_pos) + # for + strand insertion, just translate + elif strand == '+': + right_chrom_pos = coordinate_fn(right_pos) + left_chrom_pos = coordinate_fn(left_pos) + # for - strand insertion, translate but move outwards + else: + right_chrom_pos = coordinate_fn(left_pos) + 1 + left_chrom_pos = coordinate_fn(right_pos - 1) return left_chrom_pos, right_chrom_pos def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): @@ -264,9 +260,10 @@ def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): left_chrom_pos, right_chrom_pos = convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, tx.strand) if left_chrom_pos is None or right_chrom_pos is None: - assert aln_mode == 'CDS' return None + assert right_chrom_pos >= left_chrom_pos + if left_chrom_pos > tx.thick_start and right_chrom_pos < tx.thick_stop: indel_type = 'CodingMult3' if offset % 3 == 0 else 'Coding' else: @@ -295,13 +292,20 @@ def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): t_offset = t_start - block_size - t_pos assert (q_offset >= 0 and t_offset >= 0) if q_offset != 0: # query insertion -> insertion in target sequence - left_pos = q_start - q_offset - right_pos = q_start + if tx.strand == '+': + left_pos = q_start - q_offset + right_pos = q_start + else: + left_pos = psl.q_size - q_start - 1 + right_pos = psl.q_size - q_start + q_offset row = parse_indel(left_pos, right_pos, coordinate_fn, tx, q_offset, 'Insertion') if row is not None: r.append(row) if t_offset != 0: # target insertion -> insertion in reference sequence - left_pos = right_pos = q_start + if tx.strand == '+': + left_pos = right_pos = q_start + 
else: + left_pos = right_pos = psl.q_size - q_start row = parse_indel(left_pos, right_pos, coordinate_fn, tx, t_offset, 'Deletion') if row is not None: r.append(row) diff --git a/cat/consensus.py b/cat/consensus.py index e5443880..f38808fb 100644 --- a/cat/consensus.py +++ b/cat/consensus.py @@ -648,9 +648,11 @@ def add_exref_ids(s): 'source_gene_common_name': s.CommonName} # bring in extra tags for exRef - if 'exRef' in denovo_tx_modes: - for key, val in tools.misc.parse_gff_attr_line(exref_annot.loc[aln_id].ExtraTags).items(): - denovo_tx_dict[aln_id][key] = val + if tools.nameConversions.aln_id_is_exref(aln_id): + tags = exref_annot.loc[aln_id].ExtraTags + if len(tags) > 0: + for key, val in tools.misc.parse_gff_attr_line(tags).items(): + denovo_tx_dict[aln_id][key] = val # record some metrics metrics['denovo'][tx_mode][s.TranscriptClass.replace('_', ' ').capitalize()] += 1 @@ -907,12 +909,15 @@ def convert_attrs(attrs, id_field): else: attrs['Name'] = attrs['gene_id'] # convert empty strings into nan - attrs = {x: y if y != '' else 'nan' for x, y in attrs.items()} - # don't include the support vectors in the string, they will be placed in their respective places - attrs_str = ['='.join([key, str(val).replace('=', '_')]) for key, val in sorted(attrs.items()) if 'support' not in key] - # explicitly escape any semicolons that may exist in the input strings - attrs_str = [x.replace(';', '%3B') for x in attrs_str] - return score, ';'.join(attrs_str) + attrs_str = [] + for key, val in attrs.items(): + val = str(val) + if len(val) == 0: + val = 'nan' + val = val.replace('=', '%3D').replace(';', '%3B') + key = key.replace('=', '%3D').replace(';', '%3B') + attrs_str.append(f"{key}={val}") + return score, ";".join(attrs_str) def find_feature_support(attrs, feature, i): """Extracts the boolean value from the comma delimited string""" @@ -929,7 +934,7 @@ def find_all_tx_modes(attrs_list): for attrs in attrs_list: tx_modes.update(attrs['transcript_modes'].split(',')) return ','.join(tx_modes) - + intervals = set() for tx in tx_objs: intervals.update(tx.exon_intervals) diff --git a/cat/hints_db.py b/cat/hints_db.py index 8b73cfc0..3c049c94 100644 --- a/cat/hints_db.py +++ b/cat/hints_db.py @@ -307,7 +307,7 @@ def run_protein_aln(job, protein_subset, genome_fasta_file_id): # perform alignment tmp_exonerate = tools.fileOps.get_tmp_toil_file() cmd = ['exonerate', '--model', 'protein2genome', '--showvulgar', 'no', '--showalignment', 'no', - '--showquerygff', 'yes', genome_fasta, protein_fasta] + '--showquerygff', 'yes', protein_fasta, genome_fasta] tools.procOps.run_proc(cmd, stdout=tmp_exonerate) return job.fileStore.writeGlobalFile(tmp_exonerate) diff --git a/cat/parent_gene_assignment.py b/cat/parent_gene_assignment.py index e5ec1ec0..661258ca 100644 --- a/cat/parent_gene_assignment.py +++ b/cat/parent_gene_assignment.py @@ -4,47 +4,68 @@ import pandas as pd import itertools import collections -from tools.defaultOrderedDict import DefaultOrderedDict import tools.procOps import tools.fileOps import tools.mathOps import tools.transcripts import tools.intervals +import tools.nameConversions -def assign_parents(filtered_tm_gp, unfiltered_tm_gp, chrom_sizes, denovo_gp, min_distance=0.9, stranded=True): +def assign_parents(filtered_tm_gp, unfiltered_tm_gp, denovo_gp, min_distance=0.75, + tm_jaccard_distance=0.2, stranded=True): """ Main function for assigning parental genes. Parental gene assignment methodology: - A) Each denovo transcript is evaluated for overlapping any transMap transcripts. 
- Overlap is defined as having at least 1 exonic base shared. + A) clusterGenes is used to cluster the unfiltered transMap transcripts together with the denovo transcripts. B) If a denovo transcript is assigned to more than one gene, then we attempt to resolve it. Resolution occurs by looking first at the transMap themselves. If any transMap projections overlap each other - with a Jaccard metric > 0.001, then we call this as a badAnnotOrTm. These will be discarded unless all splices - are supported. - Next, we look at the asymmetric distance between this prediction and the gene intervals. If this difference + with a Jaccard metric > tm_jaccard_distance, then we call this badAnnotOrTm. + These will be discarded unless all splices are supported. + C) Next, we look at the asymmetric distance between this prediction and the gene intervals. If this difference in these distances is over min_distance for all comparisons, we call this rescued and it can be incorporated. Otherwise, this transcript is tagged ambiguousOrFusion. + + Additionally, we look at all of the transMap projections that were filtered out and apply those + gene names to the AlternativeGeneIds column. This is a marker of possible paralogy. """ + def assign_type(s): + if tools.nameConversions.aln_id_is_denovo(s.gene): + return True + return False + filtered_transmap_dict = tools.transcripts.get_gene_pred_dict(filtered_tm_gp, stranded) unfiltered_transmap_dict = tools.transcripts.get_gene_pred_dict(unfiltered_tm_gp, stranded) - filtered_ids = unfiltered_transmap_dict.keys() - filtered_transmap_dict.keys() - - tm_chrom_dict = create_chrom_dict(unfiltered_transmap_dict, chrom_sizes) denovo_dict = tools.transcripts.get_gene_pred_dict(denovo_gp, stranded) - denovo_chrom_dict = create_chrom_dict(denovo_dict) - # begin parent gene assignment + with tools.fileOps.TemporaryFilePath() as tmp: + cmd = ['clusterGenes', '-minOverlappingBases=30', tmp, 'no', unfiltered_tm_gp, denovo_gp] + if not stranded: + cmd.append('-ignoreStrand') + tools.procOps.run_proc(cmd) + cluster_df = pd.read_csv(tmp, sep='\t') + + cluster_df['is_denovo'] = cluster_df.apply(assign_type, axis=1) + r = [] - for chrom, tm_tx_by_chromosome in tm_chrom_dict.items(): - for denovo_tx_id, denovo_tx in denovo_chrom_dict[chrom].items(): - # find the names of both filtered and unfiltered transMap transcript IDs that overlap - unfiltered_overlapping_tm_txs = find_tm_overlaps(denovo_tx, tm_tx_by_chromosome) - filtered_overlapping_tm_txs = {tx for tx in unfiltered_overlapping_tm_txs if tx.name not in filtered_ids} - # extract only gene names for the filtered set - filtered_gene_ids = {tx.name2 for tx in filtered_overlapping_tm_txs} + for _, d in cluster_df.groupby('#cluster'): + if not any(d.is_denovo): + continue + unfiltered_overlapping_tm_txs = set() + filtered_overlapping_tm_txs = set() + denovo_txs = set() + for tx_id, is_denovo in zip(d.gene, d.is_denovo): + if is_denovo: + denovo_txs.add(denovo_dict[tx_id]) + elif tx_id in filtered_transmap_dict: + filtered_overlapping_tm_txs.add(filtered_transmap_dict[tx_id]) + else: + unfiltered_overlapping_tm_txs.add(unfiltered_transmap_dict[tx_id]) + # extract only gene names for the filtered set + filtered_gene_ids = {tx.name2 for tx in filtered_overlapping_tm_txs} + for denovo_tx in denovo_txs: if len(filtered_gene_ids) > 1: # we have more than one match, so resolve it resolved_name, resolution_method = resolve_multiple_genes(denovo_tx, filtered_overlapping_tm_txs, - min_distance) + min_distance, tm_jaccard_distance) elif len(filtered_gene_ids) == 1: # yay, we
have exactly one match resolved_name = list(filtered_gene_ids)[0] resolution_method = None @@ -61,45 +82,20 @@ def assign_parents(filtered_tm_gp, unfiltered_tm_gp, chrom_sizes, denovo_gp, min return combined_alternatives -def create_chrom_dict(tx_dict, chrom_sizes=None): - """ - Split up a dictionary of Transcript objects by chromosome. Add in extra chromosomes based on a sizes file - """ - chrom_dict = collections.defaultdict(dict) - for tx_id, tx in tx_dict.items(): - chrom_dict[tx.chromosome][tx_id] = tx - if chrom_sizes is not None: - for chrom, size in tools.fileOps.iter_lines(chrom_sizes): - if chrom not in chrom_dict: - chrom_dict[chrom] = {} - return chrom_dict - - -def find_tm_overlaps(denovo_tx, tm_tx_dict, cutoff=100): - """Find overlap with transMap transcripts first on a genomic scale then an exonic scale""" - r = DefaultOrderedDict(int) - for tx in tm_tx_dict.values(): - for tx_exon in tx.exon_intervals: - for denovo_exon in denovo_tx.exon_intervals: - i = tx_exon.intersection(denovo_exon) - if i is not None: - r[tx] += len(i) - return [tx_id for tx_id, num_bases in r.items() if num_bases >= cutoff] - - -def resolve_multiple_genes(denovo_tx, overlapping_tm_txs, min_distance): +def resolve_multiple_genes(denovo_tx, overlapping_tm_txs, min_distance, tm_jaccard_distance): """ Resolve multiple assignments based on the following rules: """ # use Jaccard metric to determine if the problem lies with transMap or annotation tm_txs_by_gene = tools.transcripts.group_transcripts_by_name2(overlapping_tm_txs) tm_jaccards = [find_highest_gene_jaccard(x, y) for x, y in itertools.combinations(list(tm_txs_by_gene.values()), 2)] - if any(x > 0.001 for x in tm_jaccards): + if any(x > tm_jaccard_distance for x in tm_jaccards): return None, 'badAnnotOrTm' # calculate asymmetric difference for this prediction scores = collections.defaultdict(list) for tx in overlapping_tm_txs: - scores[tx.name2].append(calculate_asymmetric_closeness(denovo_tx, tx)) + scores[tx.name2].append(tools.intervals.calculate_bed12_asymmetric_jaccard(denovo_tx.exon_intervals, + tx.exon_intervals)) best_scores = {gene_id: max(scores[gene_id]) for gene_id in scores} high_score = max(best_scores.values()) if all(high_score - x >= min_distance for x in best_scores.values() if x != high_score): @@ -125,15 +121,3 @@ def find_interval(gene_list): b_interval = find_interval(gene_list_b) return tools.intervals.calculate_bed12_jaccard(a_interval, b_interval) - -def calculate_asymmetric_closeness(denovo_tx, tm_tx): - """ - Calculates the asymmetric closeness between two transcripts. This allows for denovo predictions that are subsets - of existing annotations to get more weight. 
- - closeness = length(intersection) / length(denovo) in chromosome space - """ - intersection = denovo_tx.interval.intersection(tm_tx.interval) - if intersection is None: - return 0 - return tools.mathOps.format_ratio(len(intersection), len(denovo_tx.interval)) diff --git a/logging.cfg b/logging.cfg index 304ec597..9cb5d68f 100644 --- a/logging.cfg +++ b/logging.cfg @@ -1,21 +1,33 @@ [loggers] -keys=root +keys=root,luigi,cat [handlers] keys=consoleHandler [formatters] -keys=simpleFormatter +keys=consoleFormatter [logger_root] +level=WARNING +handlers=consoleHandler + +[logger_luigi] +level=INFO +handlers=consoleHandler +qualname=luigi-interface +propagate=0 + +[logger_cat] level=INFO handlers=consoleHandler +qualname=cat +propagate=0 [handler_consoleHandler] class=StreamHandler level=INFO -formatter=simpleFormatter +formatter=consoleFormatter args=(sys.stdout,) -[formatter_simpleFormatter] -format=%(levelname)s: %(message)s \ No newline at end of file +[formatter_consoleFormatter] +format=%(levelname)s: %(asctime)-15s - %(message)s \ No newline at end of file diff --git a/programs/convert_cupcake_gff b/programs/convert_cupcake_gff new file mode 100755 index 00000000..746c8fe1 --- /dev/null +++ b/programs/convert_cupcake_gff @@ -0,0 +1,40 @@ +#!/usr/bin/env python +import argparse +from tools.fileOps import TemporaryFilePath, print_row +from tools.transcripts import gene_pred_iterator, group_transcripts_by_name2 +from tools.procOps import run_proc + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('cupcake_gff', help='Cupcake GFF file') + parser.add_argument('gff3', help='Output GFF3') + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + with TemporaryFilePath() as tmp_gp: + cmd = ['gtfToGenePred', '-genePredExt', args.cupcake_gff, tmp_gp] + run_proc(cmd) + d = group_transcripts_by_name2(gene_pred_iterator(tmp_gp)) + with open(args.gff3, "w") as fh: + fh.write('##gff-version 3\n') + for gene_id, tx_list in d.items(): + for tx in tx_list: + attrs = {'transcript_id': tx.name, 'gene_id': gene_id, 'gene_type': 'IsoSeq_Cupcake', + 'transcript_type': 'IsoSeq_Cupcake'} + attrs['gene_name'] = attrs['gene_id'] + attrs['transcript_name'] = attrs['transcript_id'] + attrs['ID'] = attrs['transcript_id'] + # first write transcript line + tx_l = [tx.chromosome, 'PacBio', 'transcript', tx.start + 1, tx.stop, '.', tx.strand, '.', + ';'.join(['='.join([x, y]) for x, y in attrs.items()])] + print_row(fh, tx_l) + # exon lines + for e in tx.exon_intervals: + attrs['Parent'] = attrs['transcript_id'] + attrs['ID'] = 'exon:{}'.format(attrs['transcript_id']) + ex_l = [e.chromosome, 'PacBio', 'exon', e.start + 1, e.stop, '.', e.strand, '.', + ';'.join(['='.join([x, y]) for x, y in attrs.items()])] + print_row(fh, ex_l) diff --git a/programs/convert_ncbi_gff3 b/programs/convert_ncbi_gff3 new file mode 100755 index 00000000..5c297608 --- /dev/null +++ b/programs/convert_ncbi_gff3 @@ -0,0 +1,126 @@ +#!/usr/bin/env python +"""Developed against Rnor6 RefSeq""" +import argparse +from BCBio import GFF +from collections import Counter, OrderedDict + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('input_gff3', help='RefSeq GFF3 file') + parser.add_argument('output_gff3', help='Output GFF3') + return parser.parse_args() + + +def get_unique_id(i, c, update_id=True): + if i not in c: + c[i] = 0 + return i + if update_id is True: + c[i] += 1 + if c[i] > 0: + return f"{i}.{c[i]}" + return i + + +def add_tags_to_feature(feature, 
noid_count, tx_counts, gene_counts, parent_feature=None): + if feature.type not in ['ncRNA', 'tRNA', 'rRNA', 'pseudogene', 'lnc_RNA', + 'mRNA', 'snRNA', 'snoRNA', 'scRNA', 'antisense_RNA', 'guide_RNA', + 'C_gene_segment', 'J_gene_segment', 'V_gene_segment', + 'primary_transcript', 'miRNA', 'transcript', 'CDS', 'exon', 'gene', + 'SRP_RNA', 'telomerase_RNA']: + return + if 'dbxref' in feature.qualifiers: + xref = feature.qualifiers['dbxref'] + elif 'Dbxref' in feature.qualifiers: + xref = feature.qualifiers['Dbxref'] + else: # hacky fix; noid_count is a one-element list so the increment is visible to the caller + xref = ['GeneID:NoID-{}'.format(noid_count[0])] + noid_count[0] += 1 + ids = dict([x.split(':', 1) for x in xref]) + if feature.type == 'gene': + assert feature == parent_feature + feature.qualifiers['gene_id'] = [get_unique_id(ids['GeneID'], gene_counts)] + feature.qualifiers['gene_name'] = feature.qualifiers.get('gene', feature.qualifiers.get('product', feature.qualifiers['gene_id'])) + assert 'gene_biotype' in feature.qualifiers + return + feature.qualifiers['gene_id'] = [get_unique_id(ids['GeneID'], gene_counts, update_id=False)] + feature.qualifiers['gene_name'] = feature.qualifiers.get('gene', feature.qualifiers.get('product', feature.qualifiers['gene_id'])) + tx_id = feature.qualifiers.get('transcript_id', feature.qualifiers['ID'])[0] + if '-' in tx_id: + tx_id = tx_id.split('-', 1)[1] + if feature.type not in ['CDS', 'exon']: + # is a transcript; check for uniqueness + tx_id = get_unique_id(tx_id, tx_counts) + else: + tx_id = get_unique_id(tx_id, tx_counts, update_id=False) + feature.qualifiers['transcript_id'] = [tx_id] + feature.qualifiers['transcript_name'] = feature.qualifiers.get('Name', feature.qualifiers['transcript_id']) + # figure out gene biotype + if 'gene_biotype' in parent_feature.qualifiers: + gene_biotype = parent_feature.qualifiers['gene_biotype'] + else: + gene_biotype = [feature.type] + + # figure out transcript biotype + if feature.type == 'primary_transcript': + transcript_biotype = ['miRNA'] + elif 'segment' in feature.type: + transcript_biotype = [feature.type] + feature.type = 'ncRNA' + elif feature.type == 'lnc_RNA': + feature.type = 'lncRNA' + transcript_biotype = [feature.type] + else: + transcript_biotype = gene_biotype + feature.qualifiers['gene_biotype'] = gene_biotype + feature.qualifiers['transcript_biotype'] = transcript_biotype + + +def construct_new_qualifiers(feature): + new_qualifiers = OrderedDict() + for key, val in feature.qualifiers.items(): + # no upper case keys unless it is ID or Parent or Name + if key not in ['ID', 'Parent', 'Name']: + key = key.lower() + # collapse to a single item + # replace all semicolons + if len(val) > 1: + val = [' '.join([x.replace(';', '%3B').replace('=', '%3D') for x in val])] + new_qualifiers[key] = val + # clean up and make parseable + for key, val in new_qualifiers.items(): + if sum(len(x) for x in val) == 0: + new_qualifiers[key] = 'True' + return new_qualifiers + + +def feature_traversal(feature): + yield feature + for sub_feature in feature.sub_features: + yield from feature_traversal(sub_feature) + + +if __name__ == '__main__': + args = parse_args() + noid_count = [1] # one-element list so add_tags_to_feature can increment it in place + tx_counts = Counter() + gene_counts = Counter() + records = list(GFF.parse(args.input_gff3)) + for seqrecord in records: + for parent_feature in seqrecord.features: + for feature in feature_traversal(parent_feature): + try: + add_tags_to_feature(feature, noid_count, tx_counts, gene_counts, parent_feature) + except KeyError: + assert False, feature.qualifiers + if feature.type == 'CDS' and
parent_feature.type == 'gene': + # convert gene into fake transcript; we lack a parent + parent_feature.qualifiers['transcript_biotype'] = ['protein_coding'] + parent_feature.qualifiers['transcript_id'] = feature.qualifiers['transcript_id'] + parent_feature.qualifiers['transcript_name'] = feature.qualifiers['transcript_name'] + new_qualifiers = construct_new_qualifiers(feature) + feature.qualifiers = new_qualifiers + + with open(args.output_gff3, 'w') as fh: + GFF.write(records, fh) diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes b/programs/convert_ncbi_refseq_genbank_chromosomes new file mode 100755 index 00000000..f5a127d6 --- /dev/null +++ b/programs/convert_ncbi_refseq_genbank_chromosomes @@ -0,0 +1,84 @@ +#!/usr/bin/env python +"""Convert RefSeq to GenBank or vice versa""" +import argparse +import pandas as pd +import numpy as np +from tools.fileOps import print_row + +header = ['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molecule-Location/Type', +'GenBank-Accn', 'Relationship', 'RefSeq-Accn', 'Assembly-Unit', 'Sequence-Length', 'UCSC-style-name'] + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('conversion_table', help='Conversion table') + parser.add_argument('input_gff3', help='Input GFF3') + parser.add_argument('output_gff3', help='Output GFF3') + e = parser.add_mutually_exclusive_group(required=True) + e.add_argument('--refseq-to-genbank', action='store_true', help='Set this to convert RefSeq -> GenBank') + e.add_argument('--genbank-to-refseq', action='store_true', help='Set this to convert GenBank -> RefSeq') + e.add_argument('--refseq-to-ucsc', action='store_true', help='Set this to convert RefSeq -> UCSC') + e.add_argument('--genbank-to-ucsc', action='store_true', help='Set this to convert GenBank -> UCSC') + e.add_argument('--ucsc-to-genbank', action='store_true', help='Set this to convert UCSC -> GenBank') + e.add_argument('--ucsc-to-refseq', action='store_true', help='Set this to convert UCSC -> RefSeq') + parser.add_argument('--no-retain-na', action='store_true', help='Default is to retain NA targets; drop them instead?') + return parser.parse_args() + + +column_map = {'refseq': 'RefSeq-Accn', + 'genbank': 'GenBank-Accn', + 'ucsc': 'UCSC-style-name'} + + +if __name__ == "__main__": + args = parse_args() + df = pd.read_csv(args.conversion_table, sep='\t', comment='#', na_values='na', header=None) + df.columns = header + + if args.refseq_to_genbank: + from_col = column_map['refseq'] + to_col = column_map['genbank'] + elif args.genbank_to_refseq: + from_col = column_map['genbank'] + to_col = column_map['refseq'] + elif args.refseq_to_ucsc: + from_col = column_map['refseq'] + to_col = column_map['ucsc'] + elif args.genbank_to_ucsc: + from_col = column_map['genbank'] + to_col = column_map['ucsc'] + elif args.ucsc_to_genbank: + from_col = column_map['ucsc'] + to_col = column_map['genbank'] + else: + from_col = column_map['ucsc'] + to_col = column_map['refseq'] + + source_is_na = 0 + tgt_is_na = 0 + m = {} + for f, t in zip(df[from_col], df[to_col]): + if not isinstance(f, str) and np.isnan(f): + # if source is NA, we have no hope here + source_is_na += 1 + continue + elif not isinstance(t, str) and np.isnan(t): + tgt_is_na += 1 + if args.no_retain_na: + continue + else: + t = f + m[f] = t + + with open(args.output_gff3, 'w') as fh: + for row in open(args.input_gff3): + if row.startswith('#'): + if row.startswith('##gff'): + fh.write(row) + continue + row = row.rstrip().split('\t') + if row[0] in m: + row[0] =
m[row[0]] + else: + print('Sequence name not found in conversion table: {}'.format(row)) + print_row(fh, row) \ No newline at end of file diff --git a/programs/convert_ncbi_refseq_genbank_chromosomes_hal b/programs/convert_ncbi_refseq_genbank_chromosomes_hal new file mode 100644 index 00000000..7d37396e --- /dev/null +++ b/programs/convert_ncbi_refseq_genbank_chromosomes_hal @@ -0,0 +1,75 @@ +#!/usr/bin/env python +"""Convert RefSeq to GenBank or vice versa""" +import argparse +import pandas as pd +import numpy as np +from tools.fileOps import print_rows + +header = ['Sequence-Name', 'Sequence-Role', 'Assigned-Molecule', 'Assigned-Molecule-Location/Type', +'GenBank-Accn', 'Relationship', 'RefSeq-Accn', 'Assembly-Unit', 'Sequence-Length', 'UCSC-style-name'] + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('conversion_table', help='Conversion table') + parser.add_argument('output_table', help='Output table for halRenameSequences') + e = parser.add_mutually_exclusive_group(required=True) + e.add_argument('--refseq-to-genbank', action='store_true', help='Set this to convert RefSeq -> GenBank') + e.add_argument('--genbank-to-refseq', action='store_true', help='Set this to convert GenBank -> RefSeq') + e.add_argument('--refseq-to-ucsc', action='store_true', help='Set this to convert RefSeq -> UCSC') + e.add_argument('--genbank-to-ucsc', action='store_true', help='Set this to convert GenBank -> UCSC') + e.add_argument('--ucsc-to-genbank', action='store_true', help='Set this to convert UCSC -> GenBank') + e.add_argument('--ucsc-to-refseq', action='store_true', help='Set this to convert UCSC -> RefSeq') + parser.add_argument('--no-retain-na', action='store_true', help='Default is to retain NA targets; drop them instead?') + return parser.parse_args() + + +column_map = {'refseq': 'RefSeq-Accn', + 'genbank': 'GenBank-Accn', + 'ucsc': 'UCSC-style-name'} + + +if __name__ == "__main__": + args = parse_args() + df = pd.read_csv(args.conversion_table, sep='\t', comment='#', na_values='na', header=None) + df.columns = header + + if args.refseq_to_genbank: + from_col = column_map['refseq'] + to_col = column_map['genbank'] + elif args.genbank_to_refseq: + from_col = column_map['genbank'] + to_col = column_map['refseq'] + elif args.refseq_to_ucsc: + from_col = column_map['refseq'] + to_col = column_map['ucsc'] + elif args.genbank_to_ucsc: + from_col = column_map['genbank'] + to_col = column_map['ucsc'] + elif args.ucsc_to_genbank: + from_col = column_map['ucsc'] + to_col = column_map['genbank'] + else: + from_col = column_map['ucsc'] + to_col = column_map['refseq'] + + source_is_na = 0 + tgt_is_na = 0 + m = {} + for f, t in zip(df[from_col], df[to_col]): + if not isinstance(f, str) and np.isnan(f): + # if source is NA, we have no hope here + source_is_na += 1 + continue + elif not isinstance(t, str) and np.isnan(t): + tgt_is_na += 1 + if args.no_retain_na: + continue + else: + t = f + if f == t: + continue + m[f] = t + + with open(args.output_table, 'w') as fh: + print_rows(fh, m.items()) diff --git a/programs/validate_gff3 b/programs/validate_gff3 index 1d16f1bf..ad5354ea 100755 --- a/programs/validate_gff3 +++ b/programs/validate_gff3 @@ -1,7 +1,6 @@ #!/usr/bin/env python """ This script allows you to test your GFF3 for compatibility with CAT before running the pipeline.
- """ import argparse @@ -29,7 +28,7 @@ if __name__ == '__main__': l = l.split() c[l[0]] += 1 duplicates = {x for x, y in c.items() if y > 1} - assert len(duplicates) == 0, 'Found {} duplicate genes: {}'.format(len(duplicates), '\n'.join(duplicates)) + assert len(duplicates) == 0, 'Found {} duplicate transcript IDs: {}'.format(len(duplicates), '\n'.join(duplicates)) df = tools.gff3.parse_gff3(attrs, gp) tx_dict = tools.transcripts.get_gene_pred_dict(gp) assert len(tx_dict) == len(df) @@ -63,4 +62,4 @@ if __name__ == '__main__': if len(mixed_genes) > 0: print('Found {} genes where one gene name maps to multiple gene IDs:'.format(len(mixed_genes))) for x, y in mixed_genes.items(): - print('{}: {}'.format(x, y)) + print('{}: {}'.format(x, y)) \ No newline at end of file diff --git a/scripts/summarize_cat_output.py b/scripts/summarize_cat_output.py new file mode 100644 index 00000000..ffd5fb5c --- /dev/null +++ b/scripts/summarize_cat_output.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python + +import collections +import pandas as pd + +import tools.intervals +import tools.misc +import tools.mathOps +import tools.fileOps +import tools.sqlInterface +import tools.transcripts +import tools.nameConversions +import tools.procOps +from cat.consensus import * +from argparse import ArgumentParser + +def get_df(db_path, ref_db_path, chrXOnly=False): + tm_eval_df = load_transmap_evals(db_path) + ref_df = tools.sqlInterface.load_annotation(ref_db_path) + tx_modes = ['transMap'] + mrna_metrics_df = pd.concat([load_metrics_from_db(db_path, tx_mode, 'mRNA') for tx_mode in tx_modes]) + cds_metrics_df = pd.concat([load_metrics_from_db(db_path, tx_mode, 'CDS') for tx_mode in tx_modes]) + eval_df = pd.concat([load_evaluations_from_db(db_path, tx_mode) for tx_mode in tx_modes]).reset_index() + hgm_df = pd.concat([load_hgm_vectors(db_path, tx_mode) for tx_mode in tx_modes]) + + + # if chrXOnly: + # cmd = [['grep', 'chrX', 'gencode.v30.annotation.gp'], ['cut', '-f', '1']] + # chrx_txs = set(tools.procOps.call_proc_lines(cmd)) + # ref_df = ref_df[ref_df.TranscriptId.isin(chrx_txs)] + # else: + # # remove chrY + # cmd = [['grep', 'chrY', 'gencode.v30.annotation.gp'], ['cut', '-f', '1']] + # chry_txs = set(tools.procOps.call_proc_lines(cmd)) + # ref_df = ref_df[~ref_df.TranscriptId.isin(chry_txs)] + + + num_txs = len(set(ref_df[ref_df.TranscriptBiotype == 'protein_coding'].TranscriptId)) + num_genes = len(set(ref_df[ref_df.TranscriptBiotype == 'protein_coding'].GeneId)) + + + ## code below is from the consensus module. I ripped out from the combine_and_filter_dfs method + ## because it needs the genePred, but the info is also present elsewhere. + + #add the reference information to gain biotype information + hgm_ref_df = pd.merge(hgm_df, ref_df, on=['GeneId', 'TranscriptId']) + # combine in homGeneMapping results + hgm_ref_tm_df = pd.merge(hgm_ref_df, tm_eval_df, on=['GeneId', 'TranscriptId']) + # split merged_df into coding and noncoding + coding_df = hgm_ref_tm_df[hgm_ref_tm_df.TranscriptBiotype == 'protein_coding'] + non_coding_df = hgm_ref_tm_df[hgm_ref_tm_df.TranscriptBiotype != 'protein_coding'] + # add metrics information to coding df + metrics_df = pd.merge(mrna_metrics_df, cds_metrics_df, on='AlignmentId', suffixes=['_mRNA', '_CDS']) + coding_df = pd.merge(coding_df, metrics_df, on='AlignmentId') + # add evaluation information to coding df, where possible. This adds information on frame shifts. 
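+    # NOTE: how='left' keeps coding transcripts that have no evaluation row; their
+    # evaluation columns (e.g. Frameshift) come through as NaN, which the
+    # `Frameshift == True` / `!= True` filters below treat as "no frameshift".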
+ coding_df = pd.merge(coding_df, eval_df, on='AlignmentId', how='left') + # fill the original intron values to 100 so we don't filter them out -- means a no-intron gene + coding_df['OriginalIntronsPercent_mRNA'] = coding_df.OriginalIntronsPercent_mRNA.fillna(100) + coding_df['OriginalIntronsPercent_CDS'] = coding_df.OriginalIntronsPercent_CDS.fillna(100) + non_coding_df['TransMapOriginalIntronsPercent'] = non_coding_df.TransMapOriginalIntronsPercent.fillna(100) + + + # rawest counts. homGeneMapping was ran on the unfiltered dataset, so use that + # do this only on coding transcripts for now + unique_genes = 0 + unique_txs = 0 + tmp = hgm_ref_df[hgm_ref_df.TranscriptBiotype == 'protein_coding'] + num_coding_genes = len(set(tmp.GeneId)) + num_coding_txs = len(set(tmp.TranscriptId)) + for gene_id, d in tmp.groupby('GeneId'): + paralogy = collections.Counter(x.split('-')[0] for x in d.AlignmentId) + if sum(paralogy.values()) == len(paralogy): + unique_genes += 1 + for tx_id, dd in d.groupby('TranscriptId'): + if len(dd) == 1: + unique_txs += 1 + + data = {} + data['GenesFound'] = num_coding_genes + data['GenesFoundPercent'] = 100.0 * num_coding_genes / num_genes + data['GenesMultiplyMapping'] = num_genes - unique_genes + data['GenesMultiplyMappingPercent'] = 100.0 * (num_genes - unique_genes) / num_genes + data['TranscriptsFound'] = num_coding_txs + data['TranscriptsFoundPercent'] = 100.0 * num_coding_txs / num_txs + data['TranscriptsMultiplyMapping'] = num_txs - unique_txs + data['TranscriptsMultiplyMappingPercent'] = 100.0 * (num_txs - unique_txs) / num_txs + + # full coverage + full_cov_mrna = len(coding_df[coding_df.AlnCoverage_mRNA == 100]) + full_cov_cds = len(coding_df[coding_df.AlnCoverage_CDS == 100]) + data['FullmRNACoverage'] = full_cov_mrna + data['FullmRNACoveragePercent'] = 100.0 * full_cov_mrna / num_txs + data['FullCDSCoverage'] = full_cov_cds + data['FullCDSCoveragePercent'] = 100.0 * full_cov_cds / num_txs + + # construct a stringent filter that requires the following: + # 1) Has all original introns + # 2) Full CDS Coverage + # 3) No Frame-shift + frameshift = len(coding_df[coding_df.Frameshift == True]) + original_introns = len(coding_df[coding_df.OriginalIntronsPercent_mRNA == 100]) + cov = len(coding_df[coding_df.AlnCoverage_CDS == 100]) + cov_frameshift = len(coding_df[(coding_df.AlnCoverage_CDS == 100) & + (coding_df.Frameshift != True)]) + cov_frameshift_original_introns = len(coding_df[(coding_df.AlnCoverage_CDS == 100) & + (coding_df.Frameshift != True) & + (coding_df.OriginalIntronsPercent_mRNA == 100)]) + data['TranscriptsWithFrameshift'] = frameshift + data['TranscriptsWithFrameshiftPercent'] = 100.0 * frameshift / num_txs + data['TranscriptsWithOriginalIntrons'] = original_introns + data['TranscriptsWithOriginalIntronsPercent'] = 100.0 * original_introns / num_txs + data['TranscriptsWithFullCDSCoverage'] = cov + data['TranscriptsWithFullCDSCoveragePercent'] = 100.0 * cov / num_txs + data['TranscriptsWithFullCDSCoverageAndNoFrameshifts'] = cov_frameshift + data['TranscriptsWithFullCDSCoverageAndNoFrameshiftsPercent'] = 100.0 * cov_frameshift / num_txs + data['TranscriptsWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons'] = cov_frameshift_original_introns + data['TranscriptsWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntronsPercent'] = 100.0 * cov_frameshift_original_introns / num_txs + + # naive gene level + frameshift = len(set(coding_df[coding_df.Frameshift == True].GeneId)) + original_introns = len(set(coding_df[coding_df.OriginalIntronsPercent_mRNA == 
+    cov = len(set(coding_df[(coding_df.ProperOrf == True) & (coding_df.AlnCoverage_CDS == 100)].GeneId))
+    cov_frameshift = len(set(coding_df[(coding_df.AlnCoverage_CDS == 100) &
+                                       (coding_df.Frameshift != True)].GeneId))
+    cov_frameshift_original_introns = len(set(coding_df[(coding_df.AlnCoverage_CDS == 100) &
+                                                        (coding_df.Frameshift != True) &
+                                                        (coding_df.OriginalIntronsPercent_mRNA == 100)].GeneId))
+    data['GenesWithFrameshift'] = frameshift
+    data['GenesWithFrameshiftPercent'] = 100.0 * frameshift / num_genes
+    num_genes_all_shifted = 0
+    for gene_id, d in coding_df.groupby('GeneId'):
+        if len(d[d.Frameshift == True]) == len(d):
+            num_genes_all_shifted += 1
+    data['GenesWithFrameshiftAllIsoforms'] = num_genes_all_shifted
+    data['GenesWithFrameshiftAllIsoformsPercent'] = 100.0 * num_genes_all_shifted / num_genes
+    data['GenesWithOriginalIntrons'] = original_introns
+    data['GenesWithOriginalIntronsPercent'] = 100.0 * original_introns / num_genes
+    data['GenesWithFullCDSCoverage'] = cov
+    data['GenesWithFullCDSCoveragePercent'] = 100.0 * cov / num_genes
+    data['GenesWithFullCDSCoverageAndNoFrameshifts'] = cov_frameshift
+    data['GenesWithFullCDSCoverageAndNoFrameshiftsPercent'] = 100.0 * cov_frameshift / num_genes
+    data['GenesWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons'] = cov_frameshift_original_introns
+    data['GenesWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntronsPercent'] = 100.0 * cov_frameshift_original_introns / num_genes
+
+    missing = set(ref_df[ref_df.TranscriptBiotype == 'protein_coding'].GeneId) - set(tmp.GeneId)
+
+    data['MissingGenes'] = len(missing)
+    data['MissingGenesPercent'] = (100.0 * len(missing)) / num_genes
+
+    data['Name'] = db_path.replace('.db', '')
+
+    return data
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument('ref_db')
+    parser.add_argument('target_db', nargs='+')
+    parser.add_argument('--chrXOnly', action='store_true')
+    opts = parser.parse_args()
+    columns = ['Name', 'GenesFound', 'TranscriptsFound', 'GenesMultiplyMapping', 'TranscriptsMultiplyMapping',
+               'FullmRNACoverage', 'FullCDSCoverage', 'TranscriptsWithFrameshift', 'TranscriptsWithOriginalIntrons',
+               'TranscriptsWithFullCDSCoverage', 'TranscriptsWithFullCDSCoverageAndNoFrameshifts',
+               'TranscriptsWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons', 'GenesWithFrameshift',
+               'GenesWithOriginalIntrons', 'GenesWithFullCDSCoverage', 'GenesWithFullCDSCoverageAndNoFrameshifts',
+               'GenesWithFullCDSCoverageAndNoFrameshiftsAndOriginalIntrons', 'MissingGenes']
+    columns_w_percent = []
+    for column in columns:
+        if column != 'Name':
+            columns_w_percent.extend([column, column + 'Percent'])
+        else:
+            columns_w_percent.append(column)
+    df = pd.DataFrame(columns=columns_w_percent)
+    for target_db in opts.target_db:
+        df = df.append(get_df(target_db, opts.ref_db, opts.chrXOnly), ignore_index=True)
+    print(df.to_csv())
\ No newline at end of file
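The script writes a CSV to stdout, one row per target database. As a rough sketch of intended downstream use (the database and file names here are illustrative only, not part of this changeset):

    # hypothetical invocation:
    #   python scripts/summarize_cat_output.py ref.db target1.db target2.db > summary.csv
    import pandas as pd

    df = pd.read_csv('summary.csv', index_col=0)
    # each count column is paired with a '<name>Percent' column; show the percent view
    percent_cols = ['Name'] + [c for c in df.columns if c.endswith('Percent')]
    print(df[percent_cols].to_string(index=False))
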
diff --git a/setup.py b/setup.py
index 94706655..4ef1e0ad 100644
--- a/setup.py
+++ b/setup.py
@@ -4,10 +4,10 @@
     name='cat',
     version='2.0',
     packages=['cat', 'tools'],
-    python_requires='>=3.7.0',
+    python_requires='>=3.6.0',
    install_requires=[
        'pyfasta>=0.5.2',
-       'toil>=3.5',
+       'toil>=4.1',
        'luigi>=2.5',
        'seaborn>=0.7',
        'pandas>=1.0',
@@ -25,7 +25,10 @@
     ],
     scripts=['programs/cat_to_ncbi_submit', 'programs/translate_gene_pred',
              'programs/validate_gff3', 'programs/cat_parse_ncbi_genbank',
-             'programs/cat_parse_ncbi_refseq', 'programs/cat_parse_prokka_gff3'],
+             'programs/cat_parse_ncbi_refseq', 'programs/cat_parse_prokka_gff3',
+             'programs/convert_ncbi_gff3', 'programs/convert_cupcake_gff',
+             'programs/convert_ncbi_refseq_genbank_chromosomes',
+             'programs/convert_ncbi_refseq_genbank_chromosomes_hal'],
     author='Ian Fiddes',
     description='Comparative Annotation Toolkit',
     url='https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit',
diff --git a/tools/gff3.py b/tools/gff3.py
index a424af6f..4f049ef2 100644
--- a/tools/gff3.py
+++ b/tools/gff3.py
@@ -7,6 +7,8 @@
 reserved_keys = ['gene_biotype',
                  'transcript_biotype',
+                 'gene_type',
+                 'transcript_type',
                  'gene_name',
                  'gene_id',
                  'transcript_id',
@@ -29,17 +31,23 @@ def parse_attrs(attrs):
     results = []
     for tx_id, gene_id in tx_name_map.items():
         d = attrs_dict[tx_id]
-        gene_biotype = d['gene_biotype']
-        tx_biotype = d['transcript_biotype']
+        gene_biotype = d.get('gene_biotype', d.get('gene_type', None))
+        if gene_biotype is None:
+            raise Exception("Did not find a gene biotype or gene type for {} (attrs={})".format(gene_id, d))
+        tx_biotype = d.get('transcript_biotype', d.get('transcript_type', None))
+        if tx_biotype is None:
+            raise Exception("Did not find a transcript biotype or type for {} (attrs={})".format(tx_id, d))
         gene_name = d['gene_name']
         gene_id = d['gene_id']
         tx_id = d['transcript_id']
         tx_name = d['transcript_name']
-        extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B')) for x, y in d.items() if x not in reserved_keys])
-        try:
-            misc.parse_gff_attr_line(extra_tags)
-        except:
-            raise Exception('Error parsing extra tags in input GFF3')
+        extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B').replace('=', '%3D'))
+                               for x, y in d.items() if x not in reserved_keys])
+        if len(extra_tags) > 0:
+            try:
+                misc.parse_gff_attr_line(extra_tags)
+            except:
+                raise Exception(f'Error parsing extra tags in input GFF3 {extra_tags}')
         if is_external_reference is True:
             # hack to fix names
             gene_id = f'exRef-{gene_id}'
@@ -53,5 +61,5 @@ def parse_attrs(attrs):
 
 def convert_gff3_cmd(annotation_attrs, annotation):
     cmd = ['gff3ToGenePred', '-rnaNameAttr=transcript_id', '-geneNameAttr=gene_id', '-honorStartStopCodons',
-           '-attrsOut={}'.format(annotation_attrs), annotation, '/dev/stdout']
+           '-refseqHacks', '-attrsOut={}'.format(annotation_attrs), annotation, '/dev/stdout']
     return cmd
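The gene_type/transcript_type fallback lets parse_attrs accept GENCODE-style attribute keys alongside the gene_biotype/transcript_biotype keys it already understood, and extra tags now escape '=' as well as ';'. A minimal sketch of both behaviors, using made-up attribute values:

    # GENCODE-style files carry gene_type; other sources carry gene_biotype
    d = {'gene_type': 'protein_coding', 'tag': 'appris_principal=1'}
    gene_biotype = d.get('gene_biotype', d.get('gene_type', None))
    assert gene_biotype == 'protein_coding'

    # ';' and '=' inside extra tag values are percent-encoded so the re-serialized
    # key=value string survives a round trip through misc.parse_gff_attr_line()
    escaped = d['tag'].replace(';', '%3B').replace('=', '%3D')
    assert escaped == 'appris_principal%3D1'
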
diff --git a/tools/intervals.py b/tools/intervals.py
index 14e8dce6..f59b4291 100644
--- a/tools/intervals.py
+++ b/tools/intervals.py
@@ -341,6 +341,22 @@ def calculate_bed12_jaccard(intervals_a, intervals_b):
     return mathOps.format_ratio(intersection, (tot_a + tot_b) - intersection)
 
+
+def calculate_bed12_asymmetric_jaccard(intervals_a, intervals_b):
+    """
+    calculates an asymmetric Jaccard metric for two iterables of intervals. Will fail if they are not on the same
+    chromosome.
+    length(intersection) / length(intervals_a)
+    """
+    intersection = 0
+    tot_a = sum(len(x) for x in intervals_a)
+    for a_interval in intervals_a:
+        for b_interval in intervals_b:
+            ins = a_interval.intersection(b_interval)
+            if ins is not None:
+                intersection += len(ins)
+    return mathOps.format_ratio(intersection, tot_a)
+
 
 def calculate_jaccard(interval_a, interval_b):
     """
     calculates the Jaccard similarity metric for two intervals
diff --git a/tools/misc.py b/tools/misc.py
index 668a57df..1f3d5f73 100644
--- a/tools/misc.py
+++ b/tools/misc.py
@@ -102,6 +102,8 @@ def sort_gff(input_file, output_file):
 
 def parse_gtf_attr_line(attr_line):
     """parse a GTF attributes line"""
+    if len(attr_line) == 0:
+        return {}
     attr_line = [x.split(' ') for x in re.split('; +', attr_line.replace('"', ''))]
     attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '')
     return dict(attr_line)
@@ -109,6 +111,8 @@ def parse_gtf_attr_line(attr_line):
 
 def parse_gff_attr_line(attr_line):
     """parse a GFF attributes line"""
+    if len(attr_line) == 0:
+        return {}
     attr_line = [x.split('=') for x in re.split('; *', attr_line.replace('"', ''))]
     attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '')
     return dict(attr_line)
diff --git a/tools/nameConversions.py b/tools/nameConversions.py
index 05e9d147..203c13cf 100644
--- a/tools/nameConversions.py
+++ b/tools/nameConversions.py
@@ -75,7 +75,7 @@ def aln_id_is_exref(aln_id):
 
 
 def aln_id_is_denovo(aln_id):
-    return aln_id_is_pb(aln_id) or aln_id_is_cgp(aln_id)
+    return aln_id_is_pb(aln_id) or aln_id_is_cgp(aln_id) or aln_id_is_exref(aln_id)
 
 
 def alignment_type(aln_id):
diff --git a/tools/procOps.py b/tools/procOps.py
index 9d83b490..af75be52 100644
--- a/tools/procOps.py
+++ b/tools/procOps.py
@@ -26,7 +26,10 @@ def cmdLists(cmd):
         else:
             return getDockerCommand('quay.io/ucsc_cgl/cat', cmd)
     elif os.environ.get('CAT_BINARY_MODE') == 'singularity':
-        img = os.path.join(os.environ['SINGULARITY_PULLFOLDER'], 'cat.img')
+        if os.environ.get('SINGULARITY_IMAGE'):
+            img = os.environ['SINGULARITY_IMAGE']
+        else:
+            img = os.path.join(os.environ['SINGULARITY_PULLFOLDER'], 'cat.img')
         assert os.path.exists(img)
         if isinstance(cmd[0], list):
             return list([get_singularity_command(img, c) for c in cmd])
@@ -107,7 +110,7 @@ def popen_catch(command, stdin=None):
 def mrca_path(path1, path2):
     """
     Gives the Most Recent Common Ancestor directory that contains both paths.
-    
+
     >>> mrca_path('/usr/lib/python2.7', '/usr/bin/python')
     '/usr'
     >>> mrca_path('/usr/', '/usr/')
@@ -165,6 +168,10 @@ def getDockerCommand(image, cmd):
         cmd: list of arguments
     """
     dockerPreamble = ['docker', 'run', '-i', '--rm', '-u', "%s:%s" % (os.getuid(), os.getgid())]
+    if "TMPDIR" in os.environ:
+        tmpdir = os.environ["TMPDIR"]
+        dockerPreamble.extend(["--env", "TMPDIR={}".format(tmpdir)])
+        dockerPreamble.extend(['-v', tmpdir + ':' + tmpdir])
     work_dirs = []
     for i, arg in enumerate(cmd):
         if arg.startswith('-') and '=' in arg:
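The $TMPDIR handling means the container sees the same temporary-directory path as the host, so temp files written inside the container land where CAT expects them. A minimal sketch of the preamble getDockerCommand now builds (the standalone helper name is illustrative; the image name matches the one used in cmdLists):

    import os

    def docker_preamble(image='quay.io/ucsc_cgl/cat'):
        # run as the calling user; when TMPDIR is set, export it into the
        # container and bind-mount the same host path so temp files line up
        preamble = ['docker', 'run', '-i', '--rm',
                    '-u', '%s:%s' % (os.getuid(), os.getgid())]
        if 'TMPDIR' in os.environ:
            tmpdir = os.environ['TMPDIR']
            preamble.extend(['--env', 'TMPDIR={}'.format(tmpdir)])
            preamble.extend(['-v', tmpdir + ':' + tmpdir])
        return preamble + [image]

    # e.g. TMPDIR=/scratch/tmp yields [..., '--env', 'TMPDIR=/scratch/tmp',
    #                                  '-v', '/scratch/tmp:/scratch/tmp', ...]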