diff --git a/Dockerfile b/Dockerfile index 2de846a..ef39a5e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,11 +3,11 @@ FROM ubuntu RUN apt-get update RUN apt-get install -y git wget make gcc zlib1g-dev ncurses-dev g++ python python-pip -RUN pip install PyVCF +RUN pip install PyVCF subprocess32 WORKDIR /opt -RUN mkdir /opt/bin +RUN mkdir /opt/bin RUN wget https://github.com/samtools/samtools/releases/download/1.2/samtools-1.2.tar.bz2 RUN tar xvjf samtools-1.2.tar.bz2 RUN cd /opt/samtools-1.2 && make && make install diff --git a/Dockstore.json b/Dockstore.json new file mode 100644 index 0000000..f38a5f9 --- /dev/null +++ b/Dockstore.json @@ -0,0 +1,26 @@ +{ + "reference": { + "path": "http://hgwdev.cse.ucsc.edu/~jeltje/public_data/genome.fa.gz", + "class": "File" + }, + "normal": { + "path":"https://dcc.icgc.org/api/v1/download?fn=/PCAWG/reference_data/data_for_testing/HCC1143_ds/HCC1143_BL.bam", + "class": "File" + }, + "tumor": { + "path":"https://dcc.icgc.org/api/v1/download?fn=/PCAWG/reference_data/data_for_testing/HCC1143_ds/HCC1143.bam", + "class": "File" + }, + "somatic_vcf": { + "path": "/tmp/somatic.vcf", + "class": "File" + }, + "vcf": { + "path": "/tmp/pindel.vcf", + "class": "File" + }, + "rawFile": { + "path": "/tmp/pindel.raw", + "class": "File" + } +} diff --git a/pindel-somatic.cwl.yaml b/pindel-somatic.cwl similarity index 59% rename from pindel-somatic.cwl.yaml rename to pindel-somatic.cwl index b019eac..620cceb 100644 --- a/pindel-somatic.cwl.yaml +++ b/pindel-somatic.cwl @@ -1,190 +1,196 @@ - -class: CommandLineTool -label: Pindel-Somatic cwlVersion: v1.0 +class: CommandLineTool + +doc: "pindel somatic mutation calling" + baseCommand: [python, /opt/pindel.py, -t, NORMAL, -t TUMOR] -requirements: - - class: "DockerRequirement" - dockerPull: "opengenomics/pindel:0.2.5b8" + +hints: + DockerRequirement: + dockerPull: quay.io/opengenomics/pindel inputs: - - id: "normal" + normal: type: File inputBinding: prefix: -b - - id: "tumor" + tumor: type: File inputBinding: prefix: -b - - id: "reference" + reference: type: File + doc: | + the genome reference file inputBinding: prefix: -r - - id: "centromere" - type: File + centromere: + type: File? + doc: | + list of regions to exclude (chr, start, end) inputBinding: prefix: -J - - id: "referenceName" - type: string - default: HG19 + referenceName: + type: string? + doc: | + the genome reference ID, e.g. HG19 (default: genome) inputBinding: prefix: -R - - id: "windowSize" - type: int + windowSize: + type: int? default: 2 inputBinding: prefix: --window_size - - id: "procs" - type: int + procs: + type: int? default: 2 inputBinding: prefix: --number_of_procs - - id: "normal_insert_size" - type: ['null', int] + normal_insert_size: + type: int? inputBinding: prefix: -s - - id: "tumor_insert_size" - type: ['null', int] + tumor_insert_size: + type: int? inputBinding: prefix: -s - - id: "report_inversions" + report_inversions: type: boolean? default: false inputBinding: prefix: --report_inversions - - id: "report_duplications" + report_duplications: type: boolean? default: false inputBinding: prefix: --report_duplications - - id: "report_long_insertions" + report_long_insertions: type: boolean? default: false inputBinding: prefix: --report_long_insertions - - id: "report_breakpoints" + report_breakpoints: type: boolean? default: false inputBinding: prefix: --report_breakpoints - - id: "report_only_close_mapped_reads" + report_only_close_mapped_reads: type: boolean? default: false inputBinding: prefix: -S - - id: "outputRawFile" - type: string + outputRawFile: + type: string? default: pindel.raw inputBinding: prefix: -o1 - - id: "outputVcfFile" - type: string + outputVcfFile: + type: string? default: pindel.vcf inputBinding: prefix: -o2 - - id: "outputSomaticVcfFile" - type: string + outputSomaticVcfFile: + type: string? default: pindel_somatic.vcf inputBinding: prefix: -o3 - - id: "somatic_vaf" - type: float + somatic_vaf: + type: float? default: 0.08 inputBinding: prefix: --somatic_vaf - - id: "somatic_cov" - type: int + somatic_cov: + type: int? default: 20 inputBinding: prefix: --somatic_cov - - id: "somatic_hom" - type: int + somatic_hom: + type: int? default: 6 inputBinding: prefix: --somatic_hom - - id: "min_inversion_size" - type: int + min_inversion_size: + type: int? default: 50 inputBinding: prefix: --min_inversion_size - - id: min_num_matched_bases - type: int + min_num_matched_bases: + type: int? default: 30 inputBinding: prefix: -d - - id: max_range_index - type: int + max_range_index: + type: int? default: 4 inputBinding: prefix: -x - - id: additional_mismatch - type: int + additional_mismatch: + type: int? default: 1 inputBinding: prefix: -a - - id: min_perfect_match_around_BP - type: int + min_perfect_match_around_BP: + type: int? default: 3 inputBinding: prefix: -m - - id: sequencing_error_rate - type: float + sequencing_error_rate: + type: float? default: 0.01 inputBinding: prefix: --sequencing_error_rate - - id: maximum_allowed_mismatch_rate - type: float + maximum_allowed_mismatch_rate: + type: float? default: 0.02 inputBinding: prefix: -u - - id: sensitivity - type: float + sensitivity: + type: float? default: 0.95 inputBinding: prefix: --sensitivity outputs: - - id: "vcf" + vcf: type: File outputBinding: - glob: pindel.vcf + glob: $(inputs.outputVcfFile) - - id: "somatic_vcf" + somatic_vcf: type: File outputBinding: - glob: pindel_somatic.vcf + glob: $(inputs.outputSomaticVcfFile) - - id: "rawFile" + rawFile: type: File outputBinding: - glob: pindel.raw + glob: $(inputs.outputRawFile) - diff --git a/pindel.py b/pindel.py old mode 100644 new mode 100755 index c3ddd24..6e5640e --- a/pindel.py +++ b/pindel.py @@ -12,6 +12,15 @@ from multiprocessing import Pool import vcf +def gunzip(infile, outfile): + cmd = ' '.join(['zcat', infile]) + with open(outfile, 'w') as outF: + p = subprocess.Popen(cmd, shell=True, stdout=outF, stderr=subprocess.PIPE) + stdout,stderr = p.communicate() + if len(stderr): + print "unzip command failed:", stderr + raise Exception("unzip failed") + def execute(cmd, output=None): import subprocess, sys, shlex # function to execute a cmd and report if an error occur @@ -181,7 +190,7 @@ def __main__(): logging.basicConfig(level=logging.INFO) time.sleep(1) #small hack, sometimes it seems like docker file systems aren't avalible instantly parser = argparse.ArgumentParser(description='') - parser.add_argument('-r', dest='inputFastaFile', required=True, help='the reference file') + parser.add_argument('-r', dest='inputFastaFile', required=True, help='the reference file, can be gzipped') parser.add_argument('-R', dest='inputFastaName', default="genome", help='the reference name') parser.add_argument('-b', dest='inputBamFiles', default=[], action="append", help='the bam file') @@ -268,6 +277,13 @@ def __main__(): seq_hash = {} newInputFiles = [] i = 0 + # unzip input genome if necessary + if args.inputFastaFile.endswith('.gz'): + new_ref = os.path.join(args.workdir, "reference.fa") + gunzip(args.inputFastaFile, new_ref) + subprocess.check_call( ["samtools", "faidx", new_ref] ) + args.inputFastaFile = new_ref + #make sure the BAMs are indexed and get the mean insert sizes for inputBamFile, inputBamIndex, insertSize, sampleTag in zip(inputBamFiles, inputBamFileIndexes, insertSizes, sampleTags ): inputFastaFile, inputBamFile = indexBam(args.workdir, args.inputFastaFile, inputBamFile, i, inputBamIndex)