From 648c1340b4ed08bc8fb066206c96729d3dab6527 Mon Sep 17 00:00:00 2001 From: Jeltje Date: Fri, 24 Feb 2017 16:54:58 +0000 Subject: [PATCH 1/3] upgraded cwl to v1.0, add option to input gzipped genome, change default docker container to quay.io, adding test input Dockerfile.json --- Dockerfile | 2 +- Dockstore.json | 26 ++++ pindel-somatic.cwl.yaml => pindel-somatic.cwl | 130 +++++++++--------- pindel.py | 17 ++- 4 files changed, 111 insertions(+), 64 deletions(-) create mode 100644 Dockstore.json rename pindel-somatic.cwl.yaml => pindel-somatic.cwl (59%) mode change 100644 => 100755 pindel.py diff --git a/Dockerfile b/Dockerfile index 2de846a..404b92c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ FROM ubuntu RUN apt-get update RUN apt-get install -y git wget make gcc zlib1g-dev ncurses-dev g++ python python-pip -RUN pip install PyVCF +RUN pip install PyVCF subprocess32 WORKDIR /opt diff --git a/Dockstore.json b/Dockstore.json new file mode 100644 index 0000000..f38a5f9 --- /dev/null +++ b/Dockstore.json @@ -0,0 +1,26 @@ +{ + "reference": { + "path": "http://hgwdev.cse.ucsc.edu/~jeltje/public_data/genome.fa.gz", + "class": "File" + }, + "normal": { + "path":"https://dcc.icgc.org/api/v1/download?fn=/PCAWG/reference_data/data_for_testing/HCC1143_ds/HCC1143_BL.bam", + "class": "File" + }, + "tumor": { + "path":"https://dcc.icgc.org/api/v1/download?fn=/PCAWG/reference_data/data_for_testing/HCC1143_ds/HCC1143.bam", + "class": "File" + }, + "somatic_vcf": { + "path": "/tmp/somatic.vcf", + "class": "File" + }, + "vcf": { + "path": "/tmp/pindel.vcf", + "class": "File" + }, + "rawFile": { + "path": "/tmp/pindel.raw", + "class": "File" + } +} diff --git a/pindel-somatic.cwl.yaml b/pindel-somatic.cwl similarity index 59% rename from pindel-somatic.cwl.yaml rename to pindel-somatic.cwl index b019eac..620cceb 100644 --- a/pindel-somatic.cwl.yaml +++ b/pindel-somatic.cwl @@ -1,190 +1,196 @@ - -class: CommandLineTool -label: Pindel-Somatic cwlVersion: v1.0 +class: CommandLineTool + +doc: "pindel somatic mutation calling" + baseCommand: [python, /opt/pindel.py, -t, NORMAL, -t TUMOR] -requirements: - - class: "DockerRequirement" - dockerPull: "opengenomics/pindel:0.2.5b8" + +hints: + DockerRequirement: + dockerPull: quay.io/opengenomics/pindel inputs: - - id: "normal" + normal: type: File inputBinding: prefix: -b - - id: "tumor" + tumor: type: File inputBinding: prefix: -b - - id: "reference" + reference: type: File + doc: | + the genome reference file inputBinding: prefix: -r - - id: "centromere" - type: File + centromere: + type: File? + doc: | + list of regions to exclude (chr, start, end) inputBinding: prefix: -J - - id: "referenceName" - type: string - default: HG19 + referenceName: + type: string? + doc: | + the genome reference ID, e.g. HG19 (default: genome) inputBinding: prefix: -R - - id: "windowSize" - type: int + windowSize: + type: int? default: 2 inputBinding: prefix: --window_size - - id: "procs" - type: int + procs: + type: int? default: 2 inputBinding: prefix: --number_of_procs - - id: "normal_insert_size" - type: ['null', int] + normal_insert_size: + type: int? inputBinding: prefix: -s - - id: "tumor_insert_size" - type: ['null', int] + tumor_insert_size: + type: int? inputBinding: prefix: -s - - id: "report_inversions" + report_inversions: type: boolean? default: false inputBinding: prefix: --report_inversions - - id: "report_duplications" + report_duplications: type: boolean? default: false inputBinding: prefix: --report_duplications - - id: "report_long_insertions" + report_long_insertions: type: boolean? default: false inputBinding: prefix: --report_long_insertions - - id: "report_breakpoints" + report_breakpoints: type: boolean? default: false inputBinding: prefix: --report_breakpoints - - id: "report_only_close_mapped_reads" + report_only_close_mapped_reads: type: boolean? default: false inputBinding: prefix: -S - - id: "outputRawFile" - type: string + outputRawFile: + type: string? default: pindel.raw inputBinding: prefix: -o1 - - id: "outputVcfFile" - type: string + outputVcfFile: + type: string? default: pindel.vcf inputBinding: prefix: -o2 - - id: "outputSomaticVcfFile" - type: string + outputSomaticVcfFile: + type: string? default: pindel_somatic.vcf inputBinding: prefix: -o3 - - id: "somatic_vaf" - type: float + somatic_vaf: + type: float? default: 0.08 inputBinding: prefix: --somatic_vaf - - id: "somatic_cov" - type: int + somatic_cov: + type: int? default: 20 inputBinding: prefix: --somatic_cov - - id: "somatic_hom" - type: int + somatic_hom: + type: int? default: 6 inputBinding: prefix: --somatic_hom - - id: "min_inversion_size" - type: int + min_inversion_size: + type: int? default: 50 inputBinding: prefix: --min_inversion_size - - id: min_num_matched_bases - type: int + min_num_matched_bases: + type: int? default: 30 inputBinding: prefix: -d - - id: max_range_index - type: int + max_range_index: + type: int? default: 4 inputBinding: prefix: -x - - id: additional_mismatch - type: int + additional_mismatch: + type: int? default: 1 inputBinding: prefix: -a - - id: min_perfect_match_around_BP - type: int + min_perfect_match_around_BP: + type: int? default: 3 inputBinding: prefix: -m - - id: sequencing_error_rate - type: float + sequencing_error_rate: + type: float? default: 0.01 inputBinding: prefix: --sequencing_error_rate - - id: maximum_allowed_mismatch_rate - type: float + maximum_allowed_mismatch_rate: + type: float? default: 0.02 inputBinding: prefix: -u - - id: sensitivity - type: float + sensitivity: + type: float? default: 0.95 inputBinding: prefix: --sensitivity outputs: - - id: "vcf" + vcf: type: File outputBinding: - glob: pindel.vcf + glob: $(inputs.outputVcfFile) - - id: "somatic_vcf" + somatic_vcf: type: File outputBinding: - glob: pindel_somatic.vcf + glob: $(inputs.outputSomaticVcfFile) - - id: "rawFile" + rawFile: type: File outputBinding: - glob: pindel.raw + glob: $(inputs.outputRawFile) - diff --git a/pindel.py b/pindel.py old mode 100644 new mode 100755 index c3ddd24..1b87dec --- a/pindel.py +++ b/pindel.py @@ -11,6 +11,14 @@ import datetime from multiprocessing import Pool import vcf +import gzip + +def gunzip(infile, outfile): + inF = gzip.GzipFile(infile, 'rb') + s = inF.read() + inF.close() + with open(outfile, 'wb') as outF: + outF.write(s) def execute(cmd, output=None): import subprocess, sys, shlex @@ -181,7 +189,7 @@ def __main__(): logging.basicConfig(level=logging.INFO) time.sleep(1) #small hack, sometimes it seems like docker file systems aren't avalible instantly parser = argparse.ArgumentParser(description='') - parser.add_argument('-r', dest='inputFastaFile', required=True, help='the reference file') + parser.add_argument('-r', dest='inputFastaFile', required=True, help='the reference file, can be gzipped') parser.add_argument('-R', dest='inputFastaName', default="genome", help='the reference name') parser.add_argument('-b', dest='inputBamFiles', default=[], action="append", help='the bam file') @@ -268,6 +276,13 @@ def __main__(): seq_hash = {} newInputFiles = [] i = 0 + # unzip input genome if necessary + if args.inputFastaFile.endswith('.gz'): + new_ref = os.path.join(args.workdir, "reference.fa") + gunzip(args.inputFastaFile, new_ref) + subprocess.check_call( ["samtools", "faidx", new_ref] ) + args.inputFastaFile = new_ref + #make sure the BAMs are indexed and get the mean insert sizes for inputBamFile, inputBamIndex, insertSize, sampleTag in zip(inputBamFiles, inputBamFileIndexes, insertSizes, sampleTags ): inputFastaFile, inputBamFile = indexBam(args.workdir, args.inputFastaFile, inputBamFile, i, inputBamIndex) From f389ed7b211784f0d35b83c848889a703c5f24e3 Mon Sep 17 00:00:00 2001 From: Jeltje Date: Wed, 1 Mar 2017 02:27:54 +0000 Subject: [PATCH 2/3] changed gunzip to zcat to avoid memory issues with larger files --- Dockerfile | 2 +- pindel.py | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 404b92c..ef39a5e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,7 @@ RUN pip install PyVCF subprocess32 WORKDIR /opt -RUN mkdir /opt/bin +RUN mkdir /opt/bin RUN wget https://github.com/samtools/samtools/releases/download/1.2/samtools-1.2.tar.bz2 RUN tar xvjf samtools-1.2.tar.bz2 RUN cd /opt/samtools-1.2 && make && make install diff --git a/pindel.py b/pindel.py index 1b87dec..8f1442b 100755 --- a/pindel.py +++ b/pindel.py @@ -11,14 +11,15 @@ import datetime from multiprocessing import Pool import vcf -import gzip def gunzip(infile, outfile): - inF = gzip.GzipFile(infile, 'rb') - s = inF.read() - inF.close() - with open(outfile, 'wb') as outF: - outF.write(s) + cmd = (' ').join(['zcat', infile]) + with open(outfile, 'w') as outF: + p = subprocess.Popen(cmd, shell=True, stdout=outF, stderr=subprocess.PIPE) + stdout,stderr = p.communicate() + if len(stderr): + print "unzip command failed:", stderr + raise Exception("unzip failed") def execute(cmd, output=None): import subprocess, sys, shlex From 3ef0b10e02e3c685df59815f94bf9aad8b5030b7 Mon Sep 17 00:00:00 2001 From: Jeltje Date: Tue, 4 Apr 2017 17:36:28 +0000 Subject: [PATCH 3/3] removed redundant parentheses --- pindel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pindel.py b/pindel.py index 8f1442b..6e5640e 100755 --- a/pindel.py +++ b/pindel.py @@ -13,7 +13,7 @@ import vcf def gunzip(infile, outfile): - cmd = (' ').join(['zcat', infile]) + cmd = ' '.join(['zcat', infile]) with open(outfile, 'w') as outF: p = subprocess.Popen(cmd, shell=True, stdout=outF, stderr=subprocess.PIPE) stdout,stderr = p.communicate()