diff --git a/pipelines/pipeline_scrnaseq.py b/pipelines/pipeline_scrnaseq.py
index 728410b..2797e14 100644
--- a/pipelines/pipeline_scrnaseq.py
+++ b/pipelines/pipeline_scrnaseq.py
@@ -934,22 +934,57 @@ def featureCounts(infiles, outfile):
     P.run(statement)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @merge(featureCounts,
-       "featureCounts.dir/featurecounts.load")
-def loadFeatureCounts(infiles, outfile):
+       "featureCounts.dir/featurecounts.txt.gz")
+def concatenateFeatureCounts(infiles, outfile):
+    '''
+    Concatenate the per-sample featureCounts tables.
+    '''
+
+    infiles = " ".join(infiles)
+
+    statement = '''python -m cgatcore.tables
+                   --cat=track
+                   --missing-value=na
+                   --regex-filename='.*/(.*).counts.gz'
+                   --no-titles
+                   %(infiles)s
+                   | gzip -c
+                   > %(outfile)s
+                '''
+
+    P.run(statement, job_memory=PARAMS["sql_himem"])
+
+
+@transform(concatenateFeatureCounts,
+           regex(r"featureCounts.dir/(.*).txt.gz"),
+           r"featureCounts.dir/\1.load")
+def loadFeatureCounts(infile, outfile):
     '''
     Combine and load count data in the project database.
     '''
 
-    P.concatenate_and_load(infiles, outfile,
-                           regex_filename=".*/(.*).counts.gz",
-                           has_titles=False,
-                           cat="track",
-                           header="track,gene_id,counts",
-                           options='-i "gene_id"',
-                           job_memory=PARAMS["sql_himem"])
+    tablename = os.path.basename(outfile).replace(".load", "")
+
+    database_url = PARAMS["database"]["url"]
+
+    statement = '''zcat %(infile)s
+                   | python -m cgatcore.csv2db
+                     --retry
+                     --database-url=%(database_url)s
+                     --add-index=track --add-index=gene_id
+                     --header-names=track,gene_id,counts
+                     --table=%(tablename)s
+                   > %(outfile)s
+                '''
+
+    to_cluster = False  # run the database load locally
+
+    P.run(statement)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @files(loadFeatureCounts,
        "featureCounts.dir/featurecounts_counts.txt")
 def featurecountsGeneCounts(infile, outfile):
@@ -970,6 +1005,7 @@ def featurecountsGeneCounts(infile, outfile):
     df.to_csv(outfile, sep="\t", index=True, index_label="gene_id")
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @transform(featurecountsGeneCounts,
            suffix(".txt"),
            ".load")
@@ -1034,6 +1070,7 @@ def salmon(infiles, outfile):
     P.run(statement)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @active_if(fastqMode)
 @merge(salmon, "salmon.dir/salmon.transcripts.load")
 def loadSalmonTranscriptQuant(infiles, outfile):
@@ -1050,6 +1087,7 @@ def loadSalmonTranscriptQuant(infiles, outfile):
                            job_memory=PARAMS["sql_himem"])
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @active_if(fastqMode)
 @merge(salmon, "salmon.dir/salmon.genes.load")
 def loadSalmonGeneQuant(infiles, outfile):
@@ -1303,6 +1341,7 @@ def loadCuffNormUQ(infile, outfile):
 run_copy_number_estimation = False
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @active_if(run_copy_number_estimation)
 @follows(mkdir("copy.number.dir"), loadSalmonTPMs)
 @files("salmon.dir/salmon.genes.tpms.txt",
@@ -1324,6 +1363,7 @@ def estimateCopyNumber(infile, outfile):
     P.run(statement)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @active_if(run_copy_number_estimation)
 @transform(estimateCopyNumber,
            suffix(".txt"),
@@ -1408,6 +1448,7 @@ def collectRnaSeqMetrics(infiles, outfile):
     P.run(statement)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @merge(collectRnaSeqMetrics,
        "qc.dir/qc_rnaseq_metrics.load")
 def loadCollectRnaSeqMetrics(infiles, outfile):
@@ -1448,6 +1489,7 @@ def threePrimeBias(infile, outfile):
         out_file.write("%.2f\n" % bias)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @merge(threePrimeBias,
        "qc.dir/qc_three_prime_bias.load")
 def loadThreePrimeBias(infiles, outfile):
@@ -1498,6 +1540,7 @@ def estimateLibraryComplexity(infile, outfile):
     P.run(statement)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @active_if(PAIRED)
 @merge(estimateLibraryComplexity,
        "qc.dir/qc_library_complexity.load")
@@ -1553,6 +1596,7 @@ def alignmentSummaryMetrics(infile, outfile):
     P.run(statement)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @merge(alignmentSummaryMetrics,
        "qc.dir/qc_alignment_summary_metrics.load")
 def loadAlignmentSummaryMetrics(infiles, outfile):
@@ -1624,6 +1668,7 @@ def insertSizeMetricsAndHistograms(infile, outfiles):
     P.run(statement)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @merge(insertSizeMetricsAndHistograms,
        "qc.dir/qc_insert_size_metrics.load")
 def loadInsertSizeMetrics(infiles, outfile):
@@ -1647,6 +1692,7 @@ def loadInsertSizeMetrics(infiles, outfile):
     P.run(statement)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @merge(insertSizeMetricsAndHistograms,
        "qc.dir/qc_insert_size_histogram.load")
 def loadInsertSizeHistograms(infiles, outfile):
@@ -1702,6 +1748,7 @@ def spikeVsGenome(infile, outfile):
     P.run(statement)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @merge(spikeVsGenome,
        "qc.dir/qc_spike_vs_genome.load")
 def loadSpikeVsGenome(infiles, outfile):
@@ -1718,6 +1765,7 @@ def loadSpikeVsGenome(infiles, outfile):
 # ------------------------- No. genes detected ------------------------------ #
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @active_if(fastqMode)
 @follows(mkdir("qc.dir/"), loadSalmonTPMs, loadEnsemblAnnotations)
 @files("salmon.dir/salmon.genes.tpms.load",
@@ -1753,6 +1801,7 @@ def numberGenesDetectedSalmon(infile, outfile):
     count_df.to_csv(outfile, index=False, sep="\t")
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @active_if(fastqMode)
 @follows(annotations)
 @files(numberGenesDetectedSalmon,
@@ -1766,6 +1815,7 @@ def loadNumberGenesDetectedSalmon(infile, outfile):
            options='-i "sample_id"')
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @follows(annotations)
 @files(loadFeatureCounts,
        "qc.dir/number.genes.detected.featurecounts")
@@ -1799,6 +1849,7 @@ def numberGenesDetectedFeatureCounts(infile, outfile):
     count_df.to_csv(outfile, index=False, sep="\t")
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @files(numberGenesDetectedFeatureCounts,
        "qc.dir/qc_no_genes_featurecounts.load")
 def loadNumberGenesDetectedFeatureCounts(infile, outfile):
@@ -1836,6 +1887,7 @@ def fractionReadsSpliced(infile, outfile):
     P.run(statement)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @merge(fractionReadsSpliced,
        "qc.dir/qc_fraction_spliced.load")
 def loadFractionReadsSpliced(infiles, outfile):
@@ -1873,6 +1925,7 @@ def loadSampleInformation(infile, outfile):
     P.load(infile, outfile)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @merge([loadSampleInformation,
         loadCollectRnaSeqMetrics,
         loadThreePrimeBias,
@@ -1968,6 +2021,7 @@ def qcSummary(infiles, outfile):
     df.to_csv(outfile, sep="\t", index=False)
 
 
+@active_if(PARAMS["sql_load_concatenated_table"])
 @transform(qcSummary,
            suffix(".txt"),
            ".load")
diff --git a/pipelines/pipeline_scrnaseq/pipeline.yml b/pipelines/pipeline_scrnaseq/pipeline.yml
index 61047e4..6a444a8 100644
--- a/pipelines/pipeline_scrnaseq/pipeline.yml
+++ b/pipelines/pipeline_scrnaseq/pipeline.yml
@@ -77,6 +77,7 @@ strandedness: none
 # location of the local sqlite3 database
 database:
     file: csvdb
+    url: sqlite:///./csvdb
 
 
 # Spike-in options
@@ -287,3 +288,5 @@ sql:
 
     # RAM required for high memory operations (e.g. 5000M)
     himem: 10000M
+
+    load_concatenated_table: True