diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index 5f48a2e..0000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,13 +0,0 @@ -[bumpversion] -current_version = 1.0.2 -commit = True -tag = False - -[bumpversion:file:setup.py] - -[bumpversion:file:discoplot/DiscoPlot.py] - -[bumpversion:file:do_release.sh] - -[bumpversion:file:docs/conf.py] - diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..b50bc04 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,15 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..3b31283 --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,7 @@ + + + + \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index ce220d3..0000000 --- a/.travis.yml +++ /dev/null @@ -1,14 +0,0 @@ -language: python -python: - - 2.7 -before_install: - # We do this to make sure we get dependencies - - sudo apt-get update -qq -install: - - pip install -r requirements.txt --use-mirrors - - pip install . -notifications: - email: true -script: - # cd tests && sh TEST.sh - - echo 'Not testing ATM' diff --git a/CITATION.txt b/CITATION.txt index 40186c1..63ba3f1 100644 --- a/CITATION.txt +++ b/CITATION.txt @@ -1,3 +1,2 @@ -Mitchell J Sullivan & Scott A Beatson* -DiscoPlot: Visualising discordant reads. -https://github.com/BeatsonLab-MicrobialGenomics/DiscoPlot +Sullivan MJ, Beatson SA. (2015) DiscoPlot: Discordant read visualisation. 
#!/usr/bin/env python
# DiscoPlot version 1.0.4
# Written by Mitchell Sullivan, supervisor: Scott Beatson
# License: GPLv3
# Citation: Sullivan MJ, Beatson SA. (2015) DiscoPlot: Discordant read
# visualisation. PeerJ PrePrints 3:e1274
# https://dx.doi.org/10.7287/peerj.preprints.1038v1

import argparse
import numpy
import sys
import subprocess
import random
from collections import namedtuple as ntup


# Stream a SAM file - emulates pysam for Windows
class readSam(object):
    """Minimal streaming SAM reader exposing the attributes DiscoPlot uses.

    After construction:
      header     -- every leading '@' line, concatenated verbatim
      references -- SN: values of the @SQ lines, in file order
      lengths    -- LN: values of the @SQ lines, as ints
      read       -- namedtuple type of the records produced by iteration
      sam        -- the underlying open file object

    Iterating yields one record per alignment line with the fields
    pos, pnext, rname, rnext, is_reverse, mate_is_reverse, is_read1,
    is_unmapped, mate_is_unmapped and line (the raw text of the line).
    pos/pnext are the integers found in the file, unchanged.
    """

    # FLAG bits, as defined by the SAM specification.
    _UNMAPPED = 0x4
    _MATE_UNMAPPED = 0x8
    _REVERSE = 0x10
    _MATE_REVERSE = 0x20
    _READ1 = 0x40

    def __init__(self, sam_file):
        self.header = ''
        self.references = []
        self.lengths = []
        self.read = ntup('blast', 'pos pnext rname rnext is_reverse mate_is_reverse is_read1 is_unmapped mate_is_unmapped line')
        self.sam = open(sam_file)
        line = self.sam.readline()
        while line.startswith('@'):
            self.header += line
            if line.startswith('@SQ'):
                for field in line.split():
                    if field.startswith('SN:'):
                        self.references.append(field[3:])
                    elif field.startswith('LN:'):
                        self.lengths.append(int(field[3:]))
            line = self.sam.readline()
        # The first non-header line has already been consumed; keep it so
        # iteration starts there.  (Re-scanning the file for the last header
        # line by value stops too early when that line occurs twice.)
        self._pending = line

    def __iter__(self):
        return self

    def next(self):
        """Return the next alignment record; raise StopIteration at EOF."""
        line = self._pending
        self._pending = None
        if line is None:
            line = '' if self.sam.closed else self.sam.readline()
        # Blank lines carry no alignment and would break the field unpack.
        while line and not line.strip():
            line = self.sam.readline()
        if not line:
            self.close()
            raise StopIteration
        name, flag, rname, pos, mapq, cigar, rnext, pnext = line.split()[:8]
        if rnext == '=' or rnext == '*':
            rnext = rname
        flag = int(flag)
        return self.read(int(pos), int(pnext), rname, rnext,
                         bool(flag & self._REVERSE),
                         bool(flag & self._MATE_REVERSE),
                         bool(flag & self._READ1),
                         bool(flag & self._UNMAPPED),
                         bool(flag & self._MATE_UNMAPPED),
                         line)

    # Python 3 spells the iterator method __next__.
    __next__ = next

    def close(self):
        """Close the underlying file; safe to call more than once."""
        if not self.sam.closed:
            self.sam.close()


# write to sam file
class writeSam(object):
    """Write SAM records produced by readSam back to a file."""

    def __init__(self, samfile, header):
        self.out = open(samfile, 'w')
        self.out.write(header)

    def write(self, read):
        """Append the raw text of one record (read.line) to the output."""
        self.out.write(read.line)

    def close(self):
        """Flush and close the output file; safe to call more than once."""
        if not self.out.closed:
            self.out.close()


# read in a Sam or Bam file, use pysam if available otherwise use readSam (Sam files only)
def _bump(counts, key):
    """Add one to counts[key], creating the entry on first use."""
    counts[key] = counts.get(key, 0) + 1


def _bump_pair(grid, outer, inner):
    """Add one to the nested counter grid[outer][inner]."""
    row = grid.setdefault(outer, {})
    row[inner] = row.get(inner, 0) + 1


def _finish(handle):
    """Close a reader/writer if it offers a way to do so."""
    closer = getattr(handle, 'close', None)
    if closer is None:
        # writeSam-style objects may only expose the raw file as .out
        closer = getattr(getattr(handle, 'out', None), 'close', None)
    if closer is not None:
        closer()


def _layout(args, references, reflengths, pad):
    """Set the globals cuta/cutb/refpos and derive args.bin_size or args.size.

    references/reflengths are parallel sequences of reference names and
    lengths.  pad is added to the derived bin size / bin count (the paired
    read path uses 1, the BLAST path uses 0).  args.subsection selects
    [ref], [min max] or [ref min max]; without it every reference is laid
    out one after another, args.gap bins apart.
    """
    global refpos, cuta, cutb
    cuta = 0
    cutb = float('inf')
    refpos = {}
    sub = args.subsection
    derived = args.bin_size is None
    if sub is not None:
        if len(sub) == 1:
            refpos[sub[0]] = 0
            totallength = None
            for name, length in zip(references, reflengths):
                if name == sub[0]:
                    totallength = length
            if totallength is None:
                sys.stderr.write('Selected reference not found.')
                sys.exit()
        elif len(sub) == 2:
            refpos[references[0]] = 0
            cuta = int(sub[0])
            cutb = int(sub[1])
            totallength = cutb - cuta
        elif len(sub) == 3:
            # sub[0] is the reference name; the cutoffs follow it.
            refpos[sub[0]] = 0
            cuta = int(sub[1])
            cutb = int(sub[2])
            totallength = cutb - cuta
        else:
            sys.stderr.write('Too many arguments given for subsection')
            sys.exit()
        if derived:
            args.bin_size = totallength // args.size + pad
        else:
            args.size = totallength // args.bin_size + pad
    else:
        gaps = len(reflengths) - 1
        if derived:
            args.bin_size = sum(reflengths) // (args.size - gaps * (args.gap + 1)) + pad
    if derived and args.bin_size < 1:
        # A region shorter than the requested number of bins would give a
        # bin size of 0 and a division by zero further on.
        args.bin_size = 1
    if sub is None:
        if not derived:
            args.size = sum(length // args.bin_size for length in reflengths) + gaps * args.gap + pad
        currpos = 0
        for name, length in zip(references, reflengths):
            refpos[name] = currpos
            currpos += length // args.bin_size + args.gap


def read_sbam(args):
    """Bin paired-end alignments from a SAM/BAM file into the global grids.

    Reads args.bam_file (needs pysam) or args.sam_file (pysam if installed,
    otherwise the built-in readSam).  Fills the module globals dirgrid,
    invgrid, unmapped_for and unmapped_rev, sets refpos/cuta/cutb and
    completes args.bin_size/args.size.  With args.write_reads
    (four cutoffs + output name) reads whose two positions fall inside the
    rectangle are also copied to that file.
    """
    global invgrid, dirgrid, unmapped_for, unmapped_rev
    try:
        import pysam
        havepysam = True
    except ImportError:
        havepysam = False
    if args.bam_file is not None and havepysam:
        sam = pysam.Samfile(args.bam_file, 'rb')
    elif args.sam_file is not None and havepysam:
        sam = pysam.Samfile(args.sam_file)
    elif args.sam_file is not None:
        sam = readSam(args.sam_file)
    else:
        # BAM input without pysam (previously a NameError).  Exit rather than
        # return: the caller goes on to plot grids that would not exist.
        sys.stderr.write('Please install pysam to read bam files (pysam not needed for sam files).')
        sys.exit()
    _layout(args, sam.references, sam.lengths, 1)
    unmapped_rev = {}
    unmapped_for = {}
    invgrid = {}
    dirgrid = {}
    newsam = None
    if args.write_reads is not None:
        if len(args.write_reads) != 5:
            sys.stderr.write('write_reads needs four cutoffs followed by an output file name.')
            sys.exit()
        if havepysam:
            if args.bam_file is None:
                newsam = pysam.Samfile(args.write_reads[4], 'w', template=sam)
            else:
                newsam = pysam.Samfile(args.write_reads[4], 'wb', template=sam)
        else:
            newsam = writeSam(args.write_reads[4], sam.header)
        refpos[sam.references[0]] = 0
        # The first pair of cutoffs bounds one coordinate, the second pair
        # the other (cutw <= a <= cutx and cuty <= b <= cutz).
        cutw = int(args.write_reads[0])
        cutx = int(args.write_reads[1])
        cuty = int(args.write_reads[2])
        cutz = int(args.write_reads[3])
    for read in sam:
        if newsam is not None:
            pos1 = read.pos
            pos2 = read.pnext
            # Choose which position is tested against which cutoff pair
            # from the orientation of the two mates.
            if read.is_reverse:
                if read.mate_is_reverse and pos1 > pos2:
                    first, second = pos1, pos2
                else:
                    first, second = pos2, pos1
            else:
                if read.mate_is_reverse or pos1 <= pos2:
                    first, second = pos1, pos2
                else:
                    first, second = pos2, pos1
            if cutw <= first <= cutx and cuty <= second <= cutz:
                newsam.write(read)
        if havepysam:
            if read.tid >= 0:
                ref = sam.getrname(read.tid)
            else:
                ref = ''
        else:
            ref = read.rname
        if ref not in refpos:
            continue
        if read.is_read1:
            if not (cuta <= read.pos <= cutb) or read.is_unmapped:
                continue
            pos1 = (read.pos - cuta) // args.bin_size + refpos[ref]
            if read.mate_is_unmapped:
                if read.is_reverse:
                    _bump(unmapped_rev, pos1)
                else:
                    _bump(unmapped_for, pos1)
                continue
            if havepysam:
                mref = sam.getrname(read.rnext)
            else:
                mref = read.rnext
            if mref not in refpos or not (cuta <= read.pnext <= cutb):
                continue
            pos2 = (read.pnext - cuta) // args.bin_size + refpos[mref]
            if read.is_reverse:
                if read.mate_is_reverse:
                    if pos1 < pos2:
                        _bump_pair(dirgrid, pos2, pos1)
                    else:
                        _bump_pair(dirgrid, pos1, pos2)
                else:
                    _bump_pair(invgrid, pos2, pos1)
            else:
                if read.mate_is_reverse:
                    _bump_pair(invgrid, pos1, pos2)
                elif pos1 < pos2:
                    _bump_pair(dirgrid, pos1, pos2)
                else:
                    _bump_pair(dirgrid, pos2, pos1)
        elif read.mate_is_unmapped:
            # NOTE(review): taken to be the second-in-pair case (the else of
            # "is_read1"); the flattened source does not show which "if"
            # this branch belongs to - confirm against the original file.
            if cuta <= read.pos <= cutb:
                pos = (read.pos - cuta) // args.bin_size + refpos[ref]
                if read.is_reverse:
                    _bump(unmapped_rev, pos)
                else:
                    _bump(unmapped_for, pos)
    if newsam is not None:
        # Unclosed output can be left incomplete on disk.
        _finish(newsam)
    _finish(sam)


def _read_lengths(path):
    """Return {read name: sequence length} for a FASTA or FASTQ file.

    The format is chosen from the first line ('@' means FASTQ).  FASTQ is
    read as four-line records so a quality line starting with '>' or '@' is
    never mistaken for a header.
    """
    lengths = {}
    name = None
    fastq = None
    with open(path) as reads:
        for number, line in enumerate(reads):
            if fastq is None:
                fastq = line.startswith('@')
            if fastq:
                if number % 4 == 0:
                    if line.strip():
                        name = line.split()[0][1:]
                elif number % 4 == 1:
                    lengths[name] = len(line.rstrip())
            elif number == 0 or line.startswith('>'):
                name = line.split()[0][1:]
                lengths[name] = 0
            else:
                lengths[name] += len(line.rstrip())
    return lengths


def _fasta_lengths(path):
    """Return (names, lengths) of the records in a FASTA file, in order."""
    names = []
    lengths = []
    with open(path) as ref:
        for line in ref:
            if line.startswith('>'):
                names.append(line.split()[0][1:])
                lengths.append(0)
            elif names:
                # rstrip: line terminators are not part of the sequence
                lengths[-1] += len(line.rstrip())
    return names, lengths


def _blast_extents(path):
    """Return (subjects, largest subject coordinate seen) from a BLAST table."""
    names = []
    ends = {}
    with open(path) as blast:
        for line in blast:
            fields = line.split()
            if not fields:
                continue
            subject = fields[1]
            end = max(int(fields[8]), int(fields[9]))
            if subject not in ends:
                names.append(subject)
                ends[subject] = end
            elif end > ends[subject]:
                ends[subject] = end
    return names, [ends[name] for name in names]


def _place_hits(hits, query, readlen, args):
    """Bin the hits of one read into dirgrid/invgrid/unmapped_for/unmapped_rev.

    hits is a non-empty list of
    (bitscore, length, qstart, qstop, rstart, rstop, subject) tuples that
    already passed the identity/length filter; query is the read name used
    to look up its length in readlen.
    """
    hits.sort(reverse=True)
    newhits = [hits[0]]
    qtaken = set(range(hits[0][2], hits[0][3] + 1))
    hitsizes = set([hits[0][:4]])
    for hit in hits[1:]:
        if hit[:-3] == newhits[-1][:-3]:
            # same score, length and query span as the previous kept hit
            newhits.append(hit)
            hitsizes.add(hit[:4])
        else:
            span = set(range(hit[2], hit[3] + 1))
            if not span <= qtaken:
                # covers query bases no better hit has covered yet
                qtaken |= span
                newhits.append(hit)
                hitsizes.add(hit[:4])
    if len(hitsizes) == 1 and len(newhits) != 1:
        # only equally good placements of one span: pick one at random
        newhits = [random.choice(newhits)]
    lowest = min(qtaken)
    highest = max(qtaken)
    anchor = None
    revseq = None
    for hit in newhits:
        bitscore2, length2, qstart2, qstop2, rstart2, rstop2, subject2 = hit
        # Filter on the hit's own subject, not on the subject of whatever
        # line of the BLAST file happens to be current.
        if subject2 not in refpos:
            continue
        if not (cuta <= rstart2 <= cutb or cuta <= rstop2 <= cutb):
            continue
        qlen = readlen[query]
        base = refpos[subject2]
        if anchor is None:
            if rstart2 < rstop2:
                anchor = base * args.bin_size + rstart2 - qstart2
                revseq = False
            else:
                anchor = base * args.bin_size + rstop2 - (qlen - qstop2)
                revseq = True
        if lowest >= args.unmapped and lowest == qstart2:
            # the start of the read is left unaligned
            if revseq:
                _bump(unmapped_for, (anchor + qlen - qstart2 - cuta) // args.bin_size + base)
            else:
                _bump(unmapped_rev, (anchor + qstart2 - cuta) // args.bin_size)
        if highest <= qlen - args.unmapped and highest == qstop2:
            # the end of the read is left unaligned
            if revseq:
                _bump(unmapped_rev, (anchor + qlen - qstop2 - cuta) // args.bin_size + base)
            else:
                _bump(unmapped_for, (anchor + qstop2 - cuta) // args.bin_size)
        if revseq:
            rstart2, rstop2 = rstop2, rstart2
            qstart2, qstop2 = qlen - qstop2, qlen - qstart2
        lastcell = None
        for j in range(qstart2, qstop2):
            xpos = (anchor + j - cuta) // args.bin_size
            ypos = base + ((rstart2 + int(((j - qstart2) * 1.0 / (qstop2 - qstart2)) * (rstop2 - rstart2))) - cuta) // args.bin_size
            # count each bin once per run of consecutive bases
            if (xpos, ypos) != lastcell:
                if rstart2 < rstop2:
                    _bump_pair(dirgrid, xpos, ypos)
                else:
                    _bump_pair(invgrid, xpos, ypos)
            lastcell = (xpos, ypos)


# read in a BLAST alignment file for single end reads
def read_sing(args):
    """Bin BLAST hits of long single-end reads into the global grids.

    Needs args.blast_file (tabular, 12 columns) and args.read_file
    (FASTA/FASTQ, for the read lengths).  Reference names and lengths come
    from args.reference_file when given, otherwise from the largest subject
    coordinate in the BLAST file.  Hits are grouped by consecutive query
    name, so the BLAST file is expected to list each read's hits together.
    Fills dirgrid, invgrid, unmapped_for, unmapped_rev and
    refpos/cuta/cutb, and completes args.bin_size/args.size.
    """
    global invgrid, dirgrid, unmapped_for, unmapped_rev
    if args.read_file is None:
        # Read lengths are needed for every plotted hit; without them the
        # first hit used to fail with a TypeError.
        sys.stderr.write('Please provide a read file (FASTA)')
        sys.exit()
    readlen = _read_lengths(args.read_file)
    if args.reference_file is not None:
        references, reflengths = _fasta_lengths(args.reference_file)
    else:
        references, reflengths = _blast_extents(args.blast_file)
    _layout(args, references, reflengths, 0)
    unmapped_rev = {}
    unmapped_for = {}
    invgrid = {}
    dirgrid = {}
    lastquery = ''
    hits = []
    with open(args.blast_file) as blast:
        for line in blast:
            if not line.strip():
                continue
            query, subject, ident, length, mm, indel, qstart, qstop, rstart, rstop, evalue, bitscore = line.split()
            qstart, qstop, rstart, rstop, length = map(int, [qstart, qstop, rstart, rstop, length])
            if query != lastquery:
                if lastquery != '' and hits:
                    _place_hits(hits, lastquery, readlen, args)
                hits = []
            # ident arrives as text; compare it as a number
            if float(ident) >= args.min_ident and length >= args.min_length:
                hits.append((float(bitscore), length, qstart, qstop, rstart, rstop, subject))
            lastquery = query
    if hits:
        # the final read has no following query line to trigger it
        _place_hits(hits, lastquery, readlen, args)
+ + +# read in a histogram file - This is for Hi-C data +def read_hist(args): + histFile = open(args.heatmap) + global unmapped_for + global unmapped_rev + global dirgrid + global invgrid + global cuta + global cutb + global refpos + unmapped_for, unmapped_rev, dirgrid, invgrid = {}, {}, {}, {} + cuta = 0 + cutb = float('inf') + refpos = {} + header = True + args.bin_size = None + for line in histFile: + if not line.startswith('#'): + if header: + header = False + headpos = [] + args.size = len(line.split()) + for i in line.split(): + headpos.append(int(i.split(':')[1].split('-')[0])) + args.bin_size = headpos[1] - headpos[0] + else: + name = line.split()[0] + vals = line.split()[1:] + pos = int(name.split(':')[1].split('-')[0]) + for i in range(len(vals)): + if vals[i] != 0.0: + if pos/args.bin_size in dirgrid: + dirgrid[pos/args.bin_size][headpos[i]/args.bin_size] = float(vals[i]) + else: + dirgrid[pos/args.bin_size] = {headpos[i]/args.bin_size:float(vals[i])} + +# run BLAST +def generate_blast(args): + subprocess.Popen('makeblastdb -dbtype nucl -out ' + args.gen_blast + '.db -in ' + + args.reference_file, shell=True, stdout=subprocess.PIPE).wait() + subprocess.Popen('blastn -db ' + args.gen_blast + '.db -outfmt 6 -query ' + + args.read_file + ' -out ' + args.gen_blast + '.out', shell=True).wait() + args.blast_file = args.gen_blast + '.out' + + +# Draw the dotplot +def draw_dotplot(args): + global refpos + global cuta + global cutb + numvals = 0 + vals = [] + diagdict = {} + for i in invgrid: + for j in invgrid[i]: + if args.max_hits >= invgrid[i][j] >= args.min_hits: + numvals += 1 + if i - j in diagdict: + diagdict[i-j].append(invgrid[i][j]) + else: + diagdict[i-j] = [invgrid[i][j]] + thesum = 0 + for i in diagdict: + thesum += sum(diagdict[i]) + for i in dirgrid: + for j in dirgrid[i]: + if args.max_hits >= dirgrid[i][j] >= args.min_hits: + numvals += 1 + if i - j in diagdict: + diagdict[i-j].append(dirgrid[i][j]) + else: + diagdict[i-j] = [dirgrid[i][j]] + 
thesum2 = 0 + for i in diagdict: + thesum2 += sum(diagdict[i]) + if thesum2 / 2 < thesum: + args.switch = not args.switch + maxsum = 0 + for i in diagdict: + if sum(diagdict[i]) > maxsum: + maxsum = sum(diagdict[i]) + vals = diagdict[i] + numvals2 = 0 + for i in unmapped_rev: + if args.max_hits >= unmapped_rev[i] >= args.min_hits: + numvals2 += 1 + for i in unmapped_for: + if args.max_hits >= unmapped_for[i] >= args.min_hits: + numvals2 += 1 + if args.log: + vals = map(numpy.log10, vals) + if args.m_count != -1: + med = args.m_count + else: + med = numpy.median(vals) + sizemod = (864.0 / args.size * args.m_size) ** 2 / med + themew = 864.0 / args.size * args.marker_edge_width + if args.split_graph is None: + ax = plt.subplot(aspect=1) + else: + if args.split_graph[0].isdigit(): + start = True + starts = [] + stops = [] + for i in args.split_graph: + if start: + starts.append(int(i)) + else: + stops.append(int(i)) + start = not start + else: + count = 0 + starts = [] + stops = [] + for i in args.split_graph(): + if count % 3 == 0: + name = i + elif count % 3 == 1: + starts.append(refpos[name] * args.bin_size + int(i)) + else: + stops.append(refpos[name] * args.bin_size + int(i)) + widths = [a - b for a, b in zip(stops, starts)] + heights = widths[::-1] + gs = gridspec.GridSpec(len(starts), len(starts), width_ratios=widths, height_ratios=heights) + axgrid = [[None for i in range(len(starts))] for i in range(len(starts))] + for i in range(len(starts) * len(starts)): + if i % len(starts) == 0: + axgrid[i%len(starts)][i/len(starts)] = plt.subplot(gs[i], aspect=1) + else: + axgrid[i%len(starts)][i/len(starts)] = plt.subplot(gs[i], aspect=1)#, sharey=axgrid[0][i/len(starts)]) + if not args.highlight is None: + hstarts = [] + hstops = [] + halpha = float(args.highlight[0]) + if args.highlight[1].isdigit(): + start = True + for i in args.highlight[1:]: + if start: + hstarts.append(int(i)) + else: + hstops.append(int(i)) + start = not start + else: + count = 0 + for i in 
args.highlight[1:]: + if count % 3 == 0: + name = i + elif count % 3 == 1: + hstarts.append(refpos[name] * args.bin_size + int(i)) + else: + hstops.append(refpos[name] * args.bin_size + int(i)) + if args.split_graph is None: + for i in range(len(hstarts)): + ax.axhspan(hstarts[i], hstops[i], facecolor='g', alpha=halpha) + ax.axvspan(hstarts[i], hstops[i], facecolor='g', alpha=halpha) + + x = numpy.zeros(numvals, dtype='u4') + y = numpy.zeros(numvals, dtype='u4') + sizes = numpy.zeros(numvals, dtype='f4') + colours = numpy.array(['x' for i in range(numvals)]) + count = 0 + if args.switch: + for i in invgrid: + for j in invgrid[i]: + if args.max_hits >= invgrid[i][j] >= args.min_hits: + x[count] = i * args.bin_size + cuta + y[count] = j * args.bin_size + cuta + if args.log: + sizes[count] = numpy.log10(invgrid[i][j]) * sizemod + else: + sizes[count] = invgrid[i][j] * sizemod + colours[count] = 'b' + count += 1 + for i in dirgrid: + for j in dirgrid[i]: + if args.max_hits >= dirgrid[i][j] >= args.min_hits: + x[count] = i * args.bin_size + cuta + y[count] = j * args.bin_size + cuta + if args.log: + sizes[count] = numpy.log10(dirgrid[i][j]) * sizemod + else: + sizes[count] = dirgrid[i][j] * sizemod + colours[count] = 'r' + count += 1 + if not args.switch: + for i in invgrid: + for j in invgrid[i]: + if args.max_hits >= invgrid[i][j] >= args.min_hits: + x[count] = i * args.bin_size + cuta + y[count] = j * args.bin_size + cuta + if args.log: + sizes[count] = numpy.log10(invgrid[i][j]) * sizemod + else: + sizes[count] = invgrid[i][j] * sizemod + colours[count] = 'b' + count += 1 + if args.split_graph is None: + ax.scatter(x, y, s=sizes, c=colours, alpha=args.alpha, marker='x', lw=themew) + else: + for i in range(len(starts)): + for j in range(len(starts)): + axgrid[i][j].scatter(x, y, s=sizes, c=colours, alpha=args.alpha, marker='x', lw=themew) + count = 0 + x = numpy.zeros(numvals2, dtype='u4') + y = numpy.zeros(numvals2, dtype='u4') + sizes = numpy.zeros(numvals2, 
dtype='f4') + colours = numpy.array(['x' for i in range(numvals2)]) + for i in unmapped_for: + if args.max_hits >= unmapped_for[i] >= args.min_hits: + x[count] = cuta + y[count] = i * args.bin_size + cuta + if args.log: + sizes[count] = numpy.log10(unmapped_for[i]) * sizemod + else: + sizes[count] = unmapped_for[i] * sizemod + colours[count] = 'g' + count += 1 + for i in unmapped_rev: + if args.max_hits >= unmapped_rev[i] >= args.min_hits: + x[count] = i * args.bin_size + cuta + y[count] = cuta + if args.log: + sizes[count] = numpy.log10(unmapped_rev[i]) * sizemod + else: + sizes[count] = unmapped_rev[i] * sizemod + colours[count] = 'g' + count += 1 + if args.split_graph is None: + ax.scatter(x, y, s=sizes, c=colours, alpha=args.alpha2, marker='+', lw=themew) + else: + for i in range(len(starts)): + for j in range(len(starts)): + axgrid[i][j].scatter(x, y, s=sizes, c=colours, alpha=args.alpha2, marker='+', lw=themew) + sizes = [] + names = [] + for i in [10, 25, 50, 75, 90]: + if args.log: + sizes.append(10**numpy.percentile(vals, i)) + else: + sizes.append(numpy.percentile(vals, i)) + names.append(str(i) + '% Normal ' + str(sizes[-1])) + names.append('50% Inverted ' + str(sizes[2])) + if args.log: + a = plt.scatter([], [], s=numpy.log10(sizes[2]) * sizemod, c='b', marker='x', lw=themew) + b = plt.scatter([], [], s=numpy.log10(sizes[0]) * sizemod, c='r', marker='x', lw=themew) + c = plt.scatter([], [], s=numpy.log10(sizes[1]) * sizemod, c='r', marker='x', lw=themew) + d = plt.scatter([], [], s=numpy.log10(sizes[2]) * sizemod, c='r', marker='x', lw=themew) + e = plt.scatter([], [], s=numpy.log10(sizes[3]) * sizemod, c='r', marker='x', lw=themew) + f = plt.scatter([], [], s=numpy.log10(sizes[4]) * sizemod, c='r', marker='x', lw=themew) + else: + a = plt.scatter([], [], s=sizes[2] * sizemod, c='b', marker='x', lw=themew) + b = plt.scatter([], [], s=sizes[0] * sizemod, c='r', marker='x', lw=themew) + c = plt.scatter([], [], s=sizes[1] * sizemod, c='r', marker='x', 
lw=themew) + d = plt.scatter([], [], s=sizes[2] * sizemod, c='r', marker='x', lw=themew) + e = plt.scatter([], [], s=sizes[3] * sizemod, c='r', marker='x', lw=themew) + f = plt.scatter([], [], s=sizes[4] * sizemod, c='r', marker='x', lw=themew) + if args.split_graph is None: + if args.no_legend: + leg = ax.legend([b, c, d, e, f, a], names, loc=4) + leg.draggable(state=True) + for i in refpos: + if not refpos[i] == 0: + ax.axhspan(refpos[i] * args.bin_size, refpos[i] * args.bin_size - args.gap * args.bin_size, facecolor='g', alpha=args.alpha) + ax.axvspan(refpos[i] * args.bin_size, refpos[i] * args.bin_size - args.gap * args.bin_size, facecolor='g', alpha=args.alpha) + if cutb == float('inf'): + cutb = args.size * args.bin_size + cuta + if args.split_graph is None: + plt.xlim([cuta - args.bin_size, cutb]) + plt.ylim([cuta - args.bin_size, cutb]) + if args.no_gridlines: + plt.grid(True) + if args.no_label: + ax.xaxis.set_visible(False) + ax.yaxis.set_visible(False) + else: + gs.update(wspace=0.05, hspace=0.05)#, left=0.05, top=0.85, bottom=0.05) + for i in range(len(starts)): + for j in range(len(starts)): + axgrid[i][j].set_xlim((starts[i], stops[i])) + axgrid[i][j].set_ylim((starts[-1-j], stops[-1-j])) + if j != len(starts) - 1: + axgrid[i][j].set_xticklabels([]) + if i != 0: + axgrid[i][j].set_yticklabels([]) + if args.no_gridlines: + axgrid[i][j].grid(True) + if args.no_label: + axgrid[i][j].xaxis.set_visible(False) + axgrid[i][j].yaxis.set_visible(False) + if not args.output_file is None: + fig = plt.gcf() + fig.set_size_inches(12,12) + plt.savefig(args.output_file, dpi=args.image_quality) + else: + plt.show() + + + +parser = argparse.ArgumentParser(prog='DiscoPlot.py', formatter_class=argparse.RawDescriptionHelpFormatter, description=''' +DiscoPlot.py - read mapping visualisation in the large + +Version 1.0.4 + +USAGE: DiscoPlot.py -bam bamfile.bam -o output_file.bmp -size 5000 + Create a bmp file from a bamfile of paired-end reads with 5000 bins + DiscoPlot.py 
-r reads.fa -B blast_prefix -r reference -o output_file.png -bin 10000 + Create a png file using reads.fa aligned to the reference, automatically generate blast file. Use a bin size of 10,000bp. +''', epilog="Thanks for using DiscoPlot.py") +parser.add_argument('-r', '--read_file', action='store', default=None, help='read file - provide DiscoPlot with a read file to BLAST (long read mode).') +parser.add_argument('-ref', '--reference_file', action='store', default=None, help='Reference file - Reference for generating long reads alignments.') +parser.add_argument('-bam', '--bam_file', action='store', default=None, help='bam file - paired read mode. (Requires pysam).') +parser.add_argument('-sam', '--sam_file', action='store', default=None, help='sam file - paired read mode. (pysam not required)') +parser.add_argument('-hm', '--heatmap', action='store', default=None, help='Heatmap file - provide DiscoPlot with custom generated heatmap.') +parser.add_argument('-B', '--gen_blast', action='store', default=None, help='Generate blast files, use argument as prefix for output.') +parser.add_argument('-b', '--blast_file', action='store', default=None, help='Provide DiscoPlot with alignment file (long read mode) (BLAST tab delimited file - output format 6)') +parser.add_argument('-o', '--output_file', action='store', default=None, help='output file [gif/bmp/png]') +parser.add_argument('-s', '--size', action='store', type=int, default=None, help='Number of bins.') +parser.add_argument('-bin', '--bin_size', action='store', type=int, default=None, help='Bin size (in bp)') +parser.add_argument('-g', '--gap', action='store', type=int, default=5, help='Gap size - gap size between entries in reference.') +parser.add_argument('-sub', '--subsection', nargs='+', action='store', default=None, help='Only display subection of genome [ref]/[min_cutoff max_cutoff]/[ref min_cutoff max_cutoff]') +parser.add_argument('-wb', '--write_reads', nargs='+', action='store', default=None, help='Write 
reads in rectangle to bam/sam [x1 y1 x2 y2 out.bam]') +parser.add_argument('-c', '--min_hits', action='store', type=int, default=1, help='Only show bins with more than this number of hits.') +parser.add_argument('-m', '--max_hits', action='store', type=float, default=float('inf'), help='Only show bins with less hits than this.') +parser.add_argument('-dpi', '--image_quality', action='store', type=int, default=1600, help='Image quality (in DPI)') +parser.add_argument('-i', '--min_ident', action='store', type=float, default=85.0, help='Min. idenity of hits to draw (long read mode).') +parser.add_argument('-l', '--min_length', action='store', type=int, default=50, help='Min. length of hits to draw (long read mode).') +parser.add_argument('-d', '--unmapped', action='store', type=int, default=100, help='Unmapped bases on edge for RMaD to consider read partially unmapped.') +parser.add_argument('-a', '--alpha', action='store', type=float, default=0.1, help='Transparency of mapped read markers') +parser.add_argument('-a2', '--alpha2', action='store', type=float, default=0.8, help='Transparency of unmapped read markers') +parser.add_argument('-mc', '--m_count', action='store', type=int, default=-1, help='The count of a bin to be used as the median value for calculating the size of the dot [auto]') +parser.add_argument('-ms', '--m_size', action='store', type=float, default=20, help='Set the width (in bins) of a marker with a median count.') +parser.add_argument('-log', '--log', action='store_true', default=False, help='Log10 bin counts. 
(For data with highly variable coverage).') +parser.add_argument('-sw', '--switch', action='store_true', default=False, help='Draw most common (inverted/direct) hits first.') +parser.add_argument('-nl', '--no_legend', action='store_false', default=True, help='Don\'t create legend.') +parser.add_argument('-ng', '--no_gridlines', action='store_false', default=True, help='Don\'t draw gridlines.') +parser.add_argument('-na', '--no_label', action='store_true', default=False, help='No axis labels.') +parser.add_argument('-split', '--split_graph', nargs='+', action='store', default=None, help='Show multiple subsections of graph [start1 stop1 start2 stop2 etc.] or [ref1 start1 stop1 ref2 start2 stop2 etc.]') +parser.add_argument('-hl', '--highlight', nargs='+', action='store', default=None, help='Highlight subsections of graph [alpha start1 stop1 start2 stop2 etc.] or [alpha ref1 start1 stop1 ref2 start2 stop2 etc.]') +parser.add_argument('-mw', '--marker_edge_width', action='store', type=int, default=20, help='Marker width (default is roughly 20x bin size)') + + + +args = parser.parse_args() +if args.size is None and args.bin_size is None and args.heatmap is None: + sys.stderr.write('Please give a image size or bin size.') + sys.exit() + +if not args.gen_blast is None: + if args.reference_file is None: + sys.stderr.write('Please provide a reference file') + sys.exit() + if args.read_file is None: + sys.stderr.write('Please provide a read file (FASTA)') + sys.exit() + generate_blast(args) + +if not args.output_file is None: + import matplotlib + matplotlib.use('Agg') + +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec + +if not args.size is None and not args.bin_size is None: + sys.stderr.write('Only provide bin size or image size, not both.') + sys.exit() +if not args.sam_file is None or not args.bam_file is None: + read_sbam(args) +elif not args.heatmap is None: + read_hist(args) +elif args.blast_file is None: + sys.stderr.write('Please either 
generate or provide a BLAST comparison') + sys.exit() +else: + read_sing(args) +if args.write_reads is None: + draw_dotplot(args) \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index a594fd6..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include LICENSE -include requirements.txt -include README.rst -recursive-include docs/_build/html * diff --git a/README.rst b/README.rst index 8b6408b..c0c233c 100644 --- a/README.rst +++ b/README.rst @@ -23,9 +23,11 @@ DiscoPlot is freely available (under a GPL license) for download (Mac OS X, Unix and Windows) at: https://github.com/BeatsonLab-MicrobialGenomics/DiscoPlot/releases. -.. image:: https://raw.githubusercontent.com/BeatsonLab-MicrobialGenomics/DiscoPlot/master/pictures/Figure_3_lowres.gif - :target: https://raw.githubusercontent.com/BeatsonLab-MicrobialGenomics/DiscoPlot/master/pictures/Figure_3.gif +.. image:: https://raw.githubusercontent.com/mjsull/DiscoPlot/master/pictures/Figure_3.png + :target: https://raw.githubusercontent.com/mjsull/DiscoPlot/master/pictures/Figure_3.png :alt: DiscoPlot figure + :width: 800px + :align: center **DiscoPlot of a mock genome.** A mock genome was created by adding genomic rearrangements to the chromosome of E. coli str. UTI89. Paired-end reads @@ -33,6 +35,8 @@ generated from the mock genome (query) with GemSim and mapped back to UTI89 (reference). The first ~500 Kbp were then visualised using DiscoPlot. +.. contents:: **Table of Contents** + Documentation ------------- @@ -54,7 +58,6 @@ TODO ---- On the roadmap: - * Sam compatibility * Print selected read names, alignments or sequences @@ -155,18 +158,31 @@ pip install DiscoPlot:: $ pip install --user DiscoPlot -We use the --user option of pip_ to put DiscoPlot in: /home/$USER/.local/bin/ +We use the --user option of pip_ to put DiscoPlot in: /Users/$HOME/Library/Python/2.7/bin You need to add this location to you ~/.bash_profile. 
Add DiscoPlot to your path:: - $ echo 'export PATH=$PATH:/home/$USER/.local/bin/' >> ~/.bash_profile + $ echo 'export PATH=$PATH:/Users/$HOME/Library/Python/2.7/bin/' >> ~/.bash_profile Finally install BLAST+:: $ sudo brew install blast +Windows +~~~~~~~ +Download and install numpy and matplotlib. +To make this process easier you can download a distribution of python with matplotlib and numpy already installed +such as anaconda_. + +pip install DiscoPlot:: + + $ pip install DiscoPlot + +Finally download and install BLAST_. + + Testing DiscoPlot Installation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -190,12 +206,13 @@ You can upgrade like this:: DiscoPlot version.** -Example of figures produced by DiscoPlot ----------------------------------------- +Examples +-------- -.. image:: https://raw.githubusercontent.com/BeatsonLab-MicrobialGenomics/DiscoPlot/master/pictures/Figure_3_lowres.gif - :target: https://raw.githubusercontent.com/BeatsonLab-MicrobialGenomics/DiscoPlot/master/pictures/Figure_3.gif +.. image:: https://raw.githubusercontent.com/mjsull/DiscoPlot/master/pictures/Figure_3.png + :target: https://raw.githubusercontent.com/mjsull/DiscoPlot/master/pictures/Figure_3.png :alt: DiscoPlot figure + :width: 800px :align: center **DiscoPlot of a mock genome.** A mock genome was created by adding genomic @@ -203,10 +220,11 @@ rearrangements to the chromosome of E. coli str. UTI89. Paired-end reads generated from the mock genome (query) with GemSim (ref) and mapped back to UTI89 (reference). The first ~500 Kbp were then visualised using DiscoPlot. -.. image:: https://raw.githubusercontent.com/BeatsonLab-MicrobialGenomics/DiscoPlot/master/pictures/Figure_4_lowres.gif - :target: https://raw.githubusercontent.com/BeatsonLab-MicrobialGenomics/DiscoPlot/master/pictures/Figure_4.gif +..
image:: https://raw.githubusercontent.com/mjsull/DiscoPlot/master/pictures/Figure_4.png + :target: https://raw.githubusercontent.com/mjsull/DiscoPlot/master/pictures/Figure_4.png :alt: DiscoPlots of structural variants :align: center + :width: 800px **DiscoPlots of common structural variants.** Each box shows a common genomic rearrangement represented by a DiscoPlot. Rows A and B were created using @@ -214,42 +232,151 @@ rearrangement represented by a DiscoPlot. Rows A and B were created using created using single-end reads with an average length of 1000bp. For each box the rearrangement in the sequenced genome is listed, followed by the scale of the gridlines in brackets. -A1, C1: 300 bp deletion (400 bp). A2, C2: 300 bp insertion (400 bp). +A1, C1: 300 bp deletion (400 bp). +A2, C2: 300 bp insertion (400 bp). A3, C3: 300 bp inversion (400 bp). A4, C4: 300 bp sequence translocated 50 Kbp upstream (10 Kbp). B1, D1: 3000 bp deletion (1000 bp). -B2, D2: 3000 bp insertion (500 bp). B3, D3: 3000 bp inversion (1000 bp). -B4, D4: 3000 bp sequence translocated 50 Kbp upstream (10 Kbp). C1) +B2, D2: 3000 bp insertion (500 bp). +B3, D3: 3000 bp inversion (1000 bp). +B4, D4: 3000 bp sequence translocated 50 Kbp upstream (10 Kbp). -.. image:: https://raw.githubusercontent.com/BeatsonLab-MicrobialGenomics/DiscoPlot/master/pictures/Figure_5_lowres.png - :target: https://raw.githubusercontent.com/BeatsonLab-MicrobialGenomics/DiscoPlot/master/pictures/Figure_5.png +.. image:: https://raw.githubusercontent.com/mjsull/DiscoPlot/master/pictures/Figure_5.png + :target: https://raw.githubusercontent.com/mjsull/DiscoPlot/master/pictures/Figure_5.png :alt: DiscoPlot of E. coli genome + :width: 800px :align: center -**The dynamic nature of the genome of Escherichia coli str. UTI89.** Discoplot -of paired-end reads from a clonal culture of UTI89 mapped back to the -published reference chromosome and plasmid (top). 
A) Zoomed region of the -DiscoPlot, a small inversion exists in some of the sequenced bacteria. Four -of these sites, corresponding to known prophage regions, were identified -using DiscoPlot. B) Close up of the plasmid in the DiscoPlot. Each entry -in the alignment file is separated by an opaque green line. A large inversion -has been identified, this region corresponds to an inverted repeat found in the -plasmid. The cross in the lower right corner indicates that this region -circularises. +**The dynamic nature of the genome of Escherichia coli str. UTI89.** +Discoplot of paired-end reads from a clonal culture of UTI89 mapped back +to the published reference chromosome and plasmid. Coordinates from +0 to 5,065,741 represent the chromosome of E. coli UTI89, +coordinates ≥ 5,066,000 represent the plasmid of E. coli UTI89 -Tutorials ---------- - -**Coming Soon** - +.. image:: https://raw.githubusercontent.com/mjsull/DiscoPlot/master/pictures/Figure_6.png + :target: https://raw.githubusercontent.com/mjsull/DiscoPlot/master/pictures/Figure_6.png + :alt: DiscoPlot of E. coli genome + :width: 800px + :align: center -Commands --------- +**Discordant reads in E. coli str. UTI89.** a) Read alignment indicates inversion of bases 919,638..922,323. 12bp inverted repeat present at terminals of region. Start and stop of inverted region occurs in two probable tail fibre proteins. Two additional tail fibre assembly proteins are encoded within the boundaries of this region. Region is immediately downstream of a putative DNA invertase gene. b, f, h, i) Reads are misaligned as they map equally well in a concordant position. c) Read alignment indicates circularisation of bases 1,653,000..1,662,603. 17bp direct repeats present at terminals of this region. Region also encodes five putative phage-related membrane proteins, two putative phage proteins, three phage hypothetical proteins, four hypothetical proteins and a single putative phage related secreted protein. 
Size of crosses indicates coverage of this region is higher than average. Only a single read (indicated by the cross, top left) indicates potential excision of this region. d) Read alignments indicate inversion of bases 2,109,690..2,114,003. Region contains ~100bp inverted repeat at terminals which encodes a tRNA. Region contains 3 hypothetical proteins and an additional tRNA identical to the repeats. A P4-phage integrase is present immediately downstream of the inversion. The lack of concordantly mapping reads at prophage boundary indicates that the inverted phage has reached fixation in the population. e) Reads indicate inversion of bases 2,906,008..2,906,936. 15bp inverted repeats present at terminals of this region. The 3’ end of a putative tail fibre assembly gene is encoded by this region. g) Read alignments indicate inversion of bases 4,907,424..4,907,737. Region has 9bp inverted repeat at terminals. It is located in a non-coding region between fimA and fimE which encode the type I fimbriae. -To see a full list of flags type DiscoPlot --help -Detailed descriptions coming soon +Tutorials +--------- +Quick Start - paired-end/mate-pair short reads. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Align reads with your favourite short read aligner (e.g. BWA, Bowtie2) + +Create DiscoPlot from sam file with 5000 bins - open in a matplotlib window:: + + DiscoPlot.py -sam sam_file.sam -s 5000 + +Create a DiscoPlot from a bam file using a bin size of 10,000bp - save as .png file:: + + DiscoPlot.py -bam bam_file.bam -bin 10000 -o discoplot.png + +Quick Start - long reads +~~~~~~~~~~~~~~~~~~~~~~~~ +To automatically generate a BLAST alignment using BLAST+ run:: + + DiscoPlot.py -r reads.fa -ref reference.fasta -B blast_prefix -s 5000 + +To provide DiscoPlot with an alignment file (BLAST tab delimited format):: + + DiscoPlot -ref reference.fasta -b alignment.out -s 5000 + + +**More Coming Soon** + + +Using DiscoPlot +--------------- + +DiscoPlot.py - Visualising discordant reads.
+ + +In paired read mode DiscoPlot must be provided with a BAM or SAM file. +In single read mode DiscoPlot must be provided with an alignment file (BLAST tab delimited format) or reads and a reference (in FASTA format). + +-bin (size of bins in bp) or -s (number of bins) must be specified. + +**additional arguments**:: + + -h, --help show this help message and exit + -r READ_FILE, --read_file READ_FILE + read file - provide DiscoPlot with a read file to + BLAST (long read mode). + -ref REFERENCE_FILE, --reference_file REFERENCE_FILE + Reference file - Reference for generating long reads + alignments. + -bam BAM_FILE, --bam_file BAM_FILE + bam file - paired read mode. (Requires pysam). + -sam SAM_FILE, --sam_file SAM_FILE + sam file - paired read mode. (pysam not required) + -hm HEATMAP, --heatmap HEATMAP + Heatmap file - provide DiscoPlot with custom generated + heatmap. + -B GEN_BLAST, --gen_blast GEN_BLAST + Generate blast files, use argument as prefix for + output. + -b BLAST_FILE, --blast_file BLAST_FILE + Provide DiscoPlot with alignment file (long read mode) + (BLAST tab delimited file - output format 6) + -o OUTPUT_FILE, --output_file OUTPUT_FILE + output file [gif/bmp/png] + -s SIZE, --size SIZE Number of bins. + -bin BIN_SIZE, --bin_size BIN_SIZE + Bin size (in bp) + -g GAP, --gap GAP Gap size - gap size between entries in reference. + -sub SUBSECTION [SUBSECTION ...], --subsection SUBSECTION [SUBSECTION ...] + Only display subection of genome [ref]/[min_cutoff + max_cutoff]/[ref min_cutoff max_cutoff] + -wb WRITE_READS [WRITE_READS ...], --write_reads WRITE_READS [WRITE_READS ...] + Write reads in rectangle to bam/sam [x1 y1 x2 y2 + out.bam] + -c MIN_HITS, --min_hits MIN_HITS + Only show bins with more than this number of hits. + -m MAX_HITS, --max_hits MAX_HITS + Only show bins with less hits than this. + -dpi IMAGE_QUALITY, --image_quality IMAGE_QUALITY + Image quality (in DPI) + -i MIN_IDENT, --min_ident MIN_IDENT + Min.
idenity of hits to draw (long read mode). + -l MIN_LENGTH, --min_length MIN_LENGTH + Min. length of hits to draw (long read mode). + -d UNMAPPED, --unmapped UNMAPPED + Unmapped bases on edge for RMaD to consider read + partially unmapped. + -a ALPHA, --alpha ALPHA + Transparency of mapped read markers + -a2 ALPHA2, --alpha2 ALPHA2 + Transparency of unmapped read markers + -mc M_COUNT, --m_count M_COUNT + The count of a bin to be used as the median value for + calculating the size of the dot [auto] + -ms M_SIZE, --m_size M_SIZE + Set the width (in bins) of a marker with a median + count. + -log, --log Log10 bin counts. (For data with highly variable + coverage). + -sw, --switch Draw most common (inverted/direct) hits first. + -nl, --no_legend Don't create legend. + -ng, --no_gridlines Don't draw gridlines. + -na, --no_label No axis labels. + -split SPLIT_GRAPH [SPLIT_GRAPH ...], --split_graph SPLIT_GRAPH [SPLIT_GRAPH ...] + Show multiple subsections of graph [start1 stop1 + start2 stop2 etc.] or [ref1 start1 stop1 ref2 start2 + stop2 etc.] + -hl HIGHLIGHT [HIGHLIGHT ...], --highlight HIGHLIGHT [HIGHLIGHT ...] + Highlight subsections of graph [alpha start1 stop1 + start2 stop2 etc.] or [alpha ref1 start1 stop1 ref2 + start2 stop2 etc.] + -mw MARKER_EDGE_WIDTH, --marker_edge_width MARKER_EDGE_WIDTH + Marker width (default is roughly 20x bin size) + +Thanks for using DiscoPlot.py @@ -258,3 +385,5 @@ Detailed descriptions coming soon .. _libraries: https://github.com/BeatsonLab-MicrobialGenomics/DiscoPlot/blob/master/requirements.txt .. _gist: https://gist.github.com/mscook/ef7499fc9d2138f17c7f .. _pip installation instructions: http://pip.readthedocs.org/en/latest/installing.html +.. _anaconda: http://continuum.io/downloads +.. 
_BLAST: http://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ diff --git a/discoplot/DiscoPlot b/discoplot/DiscoPlot deleted file mode 120000 index 5601ede..0000000 --- a/discoplot/DiscoPlot +++ /dev/null @@ -1 +0,0 @@ -DiscoPlot.py \ No newline at end of file diff --git a/discoplot/DiscoPlot.py b/discoplot/DiscoPlot.py deleted file mode 100644 index a179a4a..0000000 --- a/discoplot/DiscoPlot.py +++ /dev/null @@ -1,541 +0,0 @@ -#!/usr/bin/env python - -# DiscoPlot: identify genomic rearrangements, misassemblies and sequencing -# artefacts in NGS data -# Copyright (C) 2013-2015 Mitchell Sullivan -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -# Mitchell Sullivan -# mjsull@gmail.com -# School of Chemistry & Molecular Biosciences -# The University of Queensland -# Brisbane, QLD 4072. 
-# Australia - -__title__ = 'DiscoPlot' -__version__ = '1.0.2' -__description__ = ("DiscoPlot: identify genomic rearrangements, misassemblies " - "and sequencing artefacts in NGS data") -__author__ = 'Mitchell Sullivan' -__license__ = 'GPLv3' -__author_email__ = "mjsull@gmail.com" -__url__ = 'https://github.com/BeatsonLab-MicrobialGenomics/DiscoPlot' - -import argparse -import numpy -import sys -import subprocess - - -def read_sbam(args): - import pysam - if not args.bam_file is None: - sam = pysam.Samfile(args.bam_file, 'rb') - elif not args.sam_file: - sam = pysam.Samfile(args.sam_file) - global refpos - global cuta - global cutb - cuta = 0 - cutb = float('inf') - refpos = {} - if not args.subsection is None: - if len(args.subsection) == 1: - refpos[args.subsection[0]] = 0 - totallength = None - for i in range(0, len(sam.references)): - if sam.references[i] == args.subsection[0]: - totallength = sam.lengths[i] - if totallength is None: - sys.stderr.write('Selected reference not found.') - sys.exit() - elif len(args.subsection) == 2: - refpos[sam.references[0]] = 0 - cuta = int(args.subsection[0]) - cutb = int(args.subsection[1]) - totallength = cutb - cuta - elif len(args.subsection) == 3: - refpos[args.subsection[0]] = 0 - cuta = int(args.subsection[1]) - cutb = int(args.subsection[2]) - totallength = cutb - cuta - else: - sys.stderr.write('Too many arguments given for subsection') - sys.exit() - if args.bin_size is None: - args.bin_size = totallength / args.size + 1 - else: - args.size = totallength / args.bin_size + 1 - else: - references = sam.references - reflengths = sam.lengths - currpos = 0 - if args.bin_size is None: - args.bin_size = sum(reflengths) / (args.size - (len(reflengths) -1) * (args.gap + 1)) + 1 - else: - args.size = sum(map(lambda x: x/args.bin_size, reflengths)) + (len(reflengths) -1) * args.gap + 1 - for i in range(len(references)): - refpos[references[i]] = currpos - currpos += reflengths[i] / args.bin_size + args.gap - global invgrid, 
dirgrid, unmapped_for, unmapped_rev - unmapped_rev = {} - unmapped_for = {} - invgrid = {} - dirgrid = {} - for read in sam.fetch(): - ref = sam.getrname(read.tid) - if ref in refpos: - if read.is_read1: - if cuta <= read.pos <= cutb: - pos1 = (read.pos - cuta) / args.bin_size + refpos[ref] - if read.mate_is_unmapped: - if read.is_reverse: - if pos1 in unmapped_rev: - unmapped_rev[pos1] += 1 - else: - unmapped_rev[pos1] = 1 - else: - if pos1 in unmapped_for: - unmapped_for[pos1] += 1 - else: - unmapped_for[pos1] = 1 - else: - mref = sam.getrname(read.rnext) - if mref in refpos: - if cuta <= read.pnext <= cutb: - pos2 = (read.pnext - cuta) / args.bin_size + refpos[mref] - if read.is_reverse: - if read.mate_is_reverse: - if pos1 < pos2: - if pos2 in dirgrid and pos1 in dirgrid[pos2]: - dirgrid[pos2][pos1] += 1 - elif pos2 in dirgrid: - dirgrid[pos2][pos1] = 1 - else: - dirgrid[pos2] = {pos1:1} - else: - if pos1 in dirgrid and pos2 in dirgrid[pos1]: - dirgrid[pos1][pos2] += 1 - elif pos1 in dirgrid: - dirgrid[pos1][pos2] = 1 - else: - dirgrid[pos1] = {pos2:1} - else: - if pos2 in invgrid and pos1 in invgrid[pos2]: - invgrid[pos2][pos1] += 1 - elif pos2 in invgrid: - invgrid[pos2][pos1] = 1 - else: - invgrid[pos2] = {pos1:1} - else: - if read.mate_is_reverse: - if pos1 in invgrid and pos2 in invgrid[pos1]: - invgrid[pos1][pos2] += 1 - elif pos1 in invgrid: - invgrid[pos1][pos2] = 1 - else: - invgrid[pos1] = {pos2:1} - else: - if pos1 < pos2: - if pos1 in dirgrid and pos2 in dirgrid[pos1]: - dirgrid[pos1][pos2] += 1 - elif pos1 in dirgrid: - dirgrid[pos1][pos2] = 1 - else: - dirgrid[pos1] = {pos2:1} - else: - if pos2 in dirgrid and pos1 in dirgrid[pos2]: - dirgrid[pos2][pos1] += 1 - elif pos2 in dirgrid: - dirgrid[pos2][pos1] = 1 - else: - dirgrid[pos2] = {pos1:1} - else: - if read.mate_is_unmapped: - ref = sam.getrname(read.tid) - if ref in refpos: - if cuta <= read.pos <= cutb: - pos = (read.pos - cuta) / args.bin_size + refpos[ref] - if read.is_reverse: - if pos in 
unmapped_rev: - unmapped_rev[pos] += 1 - else: - unmapped_rev[pos] = 1 - else: - if pos in unmapped_for: - unmapped_for[pos] += 1 - else: - unmapped_for[pos] = 1 - - -def read_sing(args): - readlen = None - if not args.read_file is None: - reads = open(args.read_file) - first = True - getfq = 0 - readlen = {} - for line in reads: - if first: - first = False - if line.startswith('@'): - getfq = 2 - name = line.rstrip()[1:] - seq = '' - elif line.startswith('>'): - readlen[name] = len(seq) - name = line.rstrip()[1:] - seq = '' - elif getfq == 0: - seq += line.rstrip() - elif getfq == 1: - readlen[name] = len(seq) - name = line.rstrip() - seq = '' - elif getfq == 2: - seq += line.rstrip() - getfq = 3 - elif getfq == 3: - getfq = 4 - elif getfq == 4: - getfq = 1 - readlen[name] = len(seq) - if not args.reference_file is None: - ref = open(args.reference_file) - first = True - references = [] - reflengths = [] - for line in ref: - if line.startswith('>'): - if first: - first = False - else: - references.append(name) - reflengths.append(len(seq)) - name = line.rstrip()[1:] - seq = '' - else: - seq += line - references.append(name) - reflengths.append(len(seq)) - else: - blast = open(args.blast_file) - refdict = {} - for line in blast: - if line.split()[1] in refdict: - if max([int(line.split()[8]), int(line.split()[9])]) > refdict[line.split()[1]]: - refdict[line.split()[1]] = max([int(line.split()[8]), int(line.split()[9])]) - else: - refdict[line.split()[1]] = max([int(line.split()[8]), int(line.split()[9])]) - blast.close() - references = [] - reflengths = [] - for i in refdict: - references.append(i) - reflengths.append(refdict[i]) - cuta = 0 - cutb = float('inf') - refpos = {} - if not args.subsection is None: - if len(args.subsection) == 1: - refpos[args.subsection[0]] = 0 - totallength = None - for i in range(0, len(references)): - if references[i] == args.subsection[0]: - totallength = reflengths[i] - if totallength is None: - sys.stderr.write('Selected reference 
not found.') - sys.exit() - elif len(args.subsection) == 2: - refpos[references[0]] = 0 - cuta = int(args.subsection[0]) - cutb = int(args.subsection[1]) - totallength = cutb - cuta - elif len(args.subsection) == 3: - refpos[args.subsection[0]] = 0 - cuta = int(args.subsection[0]) - cutb = int(args.subsection[1]) - totallength = cutb - cuta - else: - sys.stderr.write('Too many arguments given for subsection') - sys.exit() - if args.bin_size is None: - args.bin_size = totallength / args.size - else: - args.size = totallength / args.bin_size - else: - currpos = 0 - if args.bin_size is None: - args.bin_size = sum(reflengths) / (args.size - (len(reflengths) -1) * (args.gap + 1)) - else: - args.size = sum(map(lambda x: x/args.bin_size, reflengths)) + (len(reflengths) -1) * args.gap - for i in range(len(references)): - refpos[references[i]] = currpos - currpos += reflengths[i] / args.bin_size + args.gap - global invgrid, dirgrid, unmapped_for, unmapped_rev - unmapped_rev = {} - unmapped_for = {} - invgrid = {} - dirgrid = {} - blast = open(args.blast_file) - lastquery = '' - hits = [] - for line in blast: - query, subject, ident, length, mm, indel, qstart, qstop, rstart, rstop, eval, bitscore = line.split() - qstart, qstop, rstart, rstop, length, mm, indel = map(int, [qstart, qstop, rstart, rstop, length, mm, indel]) - if query != lastquery and lastquery != '': - hits.sort(reverse=True) - newhits = [hits[0]] - qtaken = set() - for i in range(hits[2], hits[3] + 1): - qtaken.add(i) - for i in hits[1:]: - if i[:-3] == newhits[-1][:-3]: - newhits.append(i) - else: - getit = False - for j in range(hits[2], hits[3] + 1): - if not j in qtaken: - getit = True - qtaken.add(j) - if getit: - newhits.append(i) - anchor = None - revseq = None - for i in newhits: - bitscore, length, qstart, qstop, rstart, rstop, subject = i - if anchor is None: - if rstart < rstop: - anchor = rstart - revseq = False - else: - anchor = rstop - revseq = True - if min(qtaken) >= args.unmapped: - if 
revseq: - if anchor in unmapped_for: - unmapped_for[anchor] += 1 - else: - unmapped_for[anchor] = 1 - else: - if anchor in unmapped_rev: - unmapped_rev[anchor] += 1 - else: - unmapped_rev[anchor] = 1 - if max(qtaken) <= readlen[lastquery] - args.unmapped: - if revseq: - if anchor in unmapped_rev: - unmapped_rev[anchor] += 1 - else: - unmapped_rev[anchor] = 1 - else: - if anchor in unmapped_for: - unmapped_for[anchor] += 1 - else: - unmapped_for[anchor] = 1 - lastxpos = None - lastypos = None - oldstart, oldstop = qstart, qstop - if revseq: - rstart, rstop = rstop, rstart - qstart = readlen[lastquery] - qstop - qstop = readlen[lastquery] - oldstart - for j in range(qstart, qstop): - xpos = refpos[subject] + (anchor + j - cuta) / args.bin_size - ypos = refpos[subject] + (rstart + int(((j - qstart) * 1.0 / (qstop - qstart)) * (rstop - rstart))) / args.bin_size - if xpos != lastxpos or ypos != lastypos: - if rstart < rstop: - if xpos in dirgrid: - if ypos in dirgrid[xpos]: - dirgrid[xpos][ypos] += 1 - else: - dirgrid[xpos][ypos] = 1 - else: - dirgrid[xpos] = {ypos:1} - else: - if xpos in invgrid: - if ypos in invgrid[xpos]: - invgrid[xpos][ypos] += 1 - else: - invgrid[xpos][ypos] = 1 - else: - invgrid[xpos] = {ypos:1} - lastxpos, lastypos = xpos, ypos - - if ident >= args.min_ident and length >= args.min_length and subject in refpos and ((cuta <= rstart <= cutb) or (cuta <= rstop <= cutb)): - hits.append((float(bitscore), length, qstart, qstop, rstart, rstop, subject)) - lastquery = query - - -def generate_blast(args): - subprocess.Popen('makeblastdb -dbtype nucl -out ' + args.gen_blast + '.db -in ' + - args.reference_file, shell=True, stdout=subprocess.PIPE).wait() - subprocess.Popen('blastn -db ' + args.gen_blast + '.db -outfmt 6 -query ' + - args.read_file + ' -out ' + args.gen_blast + '.out', shell=True).wait() - args.blast_file = args.gen_blast + '.out' - - -def draw_dotplot(args): - global refpos - global cuta - global cutb - vals1, vals2 = [], [] - for i in 
invgrid: - for j in invgrid[i]: - vals1.append(invgrid[i][j]) - vals2.append(invgrid[i][j]) - for i in dirgrid: - for j in dirgrid[i]: - vals1.append(dirgrid[i][j]) - vals2.append(dirgrid[i][j]) - vals2 = numpy.array(vals2) - for i in unmapped_rev: - vals1.append(unmapped_rev[i]) - for i in unmapped_for: - vals1.append(unmapped_for[i]) - vals1 = numpy.array(vals1) - med = numpy.median(vals2) - numvals = numpy.size(vals1) - sizemod = 2000.0 / args.size / med - fig = plt.figure(figsize=(10,10)) - ax = fig.add_subplot(111, aspect='equal') - x = numpy.zeros(numvals, dtype='u4') - y = numpy.zeros(numvals, dtype='u4') - sizes = numpy.zeros(numvals, dtype='f4') - colours = numpy.array(['x' for i in range(numvals)]) - count = 0 - for i in dirgrid: - for j in dirgrid[i]: - if args.max_hits >= dirgrid[i][j] >= args.min_hits: - x[count] = i * args.bin_size + cuta - y[count] = j * args.bin_size + cuta - sizes[count] = dirgrid[i][j] * sizemod - colours[count] = 'r' - count += 1 - for i in invgrid: - for j in invgrid[i]: - if args.max_hits >= invgrid[i][j] >= args.min_hits: - x[count] = i * args.bin_size + cuta - y[count] = j * args.bin_size + cuta - sizes[count] = invgrid[i][j] * sizemod - colours[count] = 'b' - count += 1 - for i in unmapped_for: - if args.max_hits >= unmapped_for[i] >= args.min_hits: - x[count] = cuta - y[count] = i * args.bin_size + cuta - sizes[count] = unmapped_for[i] * sizemod - colours[count] = 'g' - count += 1 - for i in unmapped_rev: - if args.max_hits >= unmapped_rev[i] >= args.min_hits: - x[count] = i * args.bin_size + cuta - y[count] = cuta - sizes[count] = unmapped_rev[i] * sizemod - colours[count] = 'g' - count += 1 - count1, count2, count3 = 0, 0, 0 - for i in colours: - if i == 'b': - count1 += 1 - elif i == 'r': - count2 += 1 - elif i == 'g': - count3 += 1 - ax.scatter(x, y, s=sizes, c=colours, edgecolor='none', alpha=0.3) - sizes = [] - names = [] - for i in [10, 25, 50, 75, 90]: - sizes.append(numpy.percentile(vals2, i)) - names.append(str(i) 
+ '% Normal ' + str(sizes[-1])) - names.append('50% Inverted ' + str(sizes[2])) - a = plt.scatter(-100, -100, s=sizes[2] * sizemod, c='b', edgecolor='none', alpha=0.3) - b = plt.scatter(-100, -100, s=sizes[0] * sizemod, c='r', edgecolor='none', alpha=0.3) - c = plt.scatter(-100, -100, s=sizes[1] * sizemod, c='r', edgecolor='none', alpha=0.3) - d = plt.scatter(-100, -100, s=sizes[2] * sizemod, c='r', edgecolor='none', alpha=0.3) - e = plt.scatter(-100, -100, s=sizes[3] * sizemod, c='r', edgecolor='none', alpha=0.3) - f = plt.scatter(-100, -100, s=sizes[4] * sizemod, c='r', edgecolor='none', alpha=0.3) - leg = ax.legend([b, c, d, e, f, a], names, loc=4) - leg.draggable(state=True) - for i in refpos: - if not refpos[i] == 0: - ax.axhspan(refpos[i] * args.bin_size, refpos[i] * args.bin_size - args.gap * args.bin_size, facecolor='g', alpha=0.3) - ax.axvspan(refpos[i] * args.bin_size, refpos[i] * args.bin_size - args.gap * args.bin_size, facecolor='g', alpha=0.3) - if cutb == float('inf'): - cutb = args.size * args.bin_size + cuta - plt.xlim([cuta - args.bin_size * 10, cutb]) - plt.ylim([cuta - args.bin_size * 10, cutb]) - plt.grid(True) - if not args.output_file is None: - plt.savefig(args.output_file, dpi=args.image_quality) - else: - plt.show() - - - -parser = argparse.ArgumentParser(prog='DiscoPlot', formatter_class=argparse.RawDescriptionHelpFormatter, description=''' -DiscoPlot - read mapping visualisation in the large - -USAGE: DiscoPlot -bam bamfile.bam -o output_file.bmp -size 5000 - Create a bmp file from a bamfile of paired-end reads with a width and height of 5000px - DiscoPlot -r reads.fa -B blast_prefix -r reference -o output_file.png -bin bin_size - Create a png file from reads.fa, generate blast file. 
Image size will be reference length / bin_size -''', epilog="Thanks for using DiscoPlot") -parser.add_argument('-r', '--read_file', action='store', default=None, help='read file') -parser.add_argument('-ref', '--reference_file', action='store', default=None, help='reference file') -parser.add_argument('-bam', '--bam_file', action='store', default=None, help='bam file') -parser.add_argument('-sam', '--sam_file', action='store', default=None, help='sam file') -parser.add_argument('-B', '--gen_blast', action='store', default=None, help='Generate blast files, use argument as prefix for output.') -parser.add_argument('-b', '--blast_file', action='store', default=None, help='Blast file (output format 6)') -parser.add_argument('-o', '--output_file', action='store', default=None, help='output file [gif/bmp/png]') -parser.add_argument('-s', '--size', action='store', type=int, default=None, help='Number of bins') -parser.add_argument('-bin', '--bin_size', action='store', type=int, default=None, help='Bin size (in bp)') -parser.add_argument('-g', '--gap', action='store', type=int, default=5, help='Gap size') -parser.add_argument('-sub', '--subsection', nargs='+', action='store', default=None, help='Only display subection of genome [ref]/[min_cutoff max_cutoff]/[ref min_cutoff max_cutoff]') -parser.add_argument('-c', '--min_hits', action='store', type=int, default=1, help='Min hits to be shown') -parser.add_argument('-m', '--max_hits', action='store', type=float, default=float('inf'), help='Bins with more hits than this will be skipped.') -parser.add_argument('-dpi', '--image_quality', action='store', type=int, default=1600, help='Image quality (in DPI)') - - -args = parser.parse_args() -if args.size is None and args.bin_size is None: - sys.stderr.write('Please give a image size or bin size.') - sys.exit() - -if not args.gen_blast is None: - if args.reference_file is None: - sys.stderr.write('Please provide a reference file') - sys.exit() - if args.read_file is None: - 
sys.stderr.write('Please provide a read file (FASTA)') - sys.exit() - generate_blast(args) - -if not args.output_file is None: - import matplotlib - matplotlib.use('Agg') - -import matplotlib.pyplot as plt - -if not args.size is None and not args.bin_size is None: - sys.stderr.write('Only provide bin size or image size, not both.') - sys.exit() -if not args.sam_file is None or not args.bam_file is None: - read_sbam(args) -elif args.blast_file is None: - sys.stderr.write('Please either generate or provide a BLAST comparison') - sys.exit() -else: - read_sing(args) -draw_dotplot(args) diff --git a/discoplot/__init__.py b/discoplot/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/do_release.sh b/do_release.sh deleted file mode 100755 index 9aa1d18..0000000 --- a/do_release.sh +++ /dev/null @@ -1,85 +0,0 @@ -# Release script -# Copyright (C) 2013-2015 Mitchell Jon Stanton-Cook -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -# -# m.stantoncook@gmail.com -# School of Chemistry & Molecular Biosciences -# The University of Queensland -# Brisbane, QLD 4072. -# Australia - - -#VERSION=1.0.2 - -# Perform an install-uninstall cycle -pip uninstall DiscoPlot -python setup.py install -pip uninstall DiscoPlot -python setup.py clean - - -# Do all the versioning stuff here.. 
-bumpversion patch - - -# Clean, test, build the source distribution & pip install it -# Need to get exit statuses here... -python setup.py clean -#python setup.py test -#STATUS=`echo $?` -#if [ $STATUS -eq 0 ]; then -# echo "" -#else -# echo "Tests failed. Will not release" -# exit -#fi - -python setup.py sdist bdist_wheel -pip install dist/DiscoPlot-$VERSION.tar.gz -STATUS=`echo $?` -if [ $STATUS -eq 0 ]; then - echo "" -else - echo "Package is not pip installable. Will not release" - exit -fi - - -# Docs -# Need to get exit statuses here... -cd docs -make clean -sphinx-apidoc -o API ../DiscoPlot -mv API/* . -rmdir API -make html -cd .. - -git push -# tag & push the tag to github -GIT=`git status` -CLEAN='# On branch master nothing to commit, working directory clean' -if [ "$s1" == "$s2" ]; then - git tag v$VERSION - git push --tags -else - echo "Git not clean. Will not release" - exit -fi - - -# Upload to PyPI & clean -twine upload -u mscook -p $PYPIPASS dist/* && python setup.py clean diff --git a/docs/DiscoPlot.rst b/docs/DiscoPlot.rst deleted file mode 100644 index cbdd31d..0000000 --- a/docs/DiscoPlot.rst +++ /dev/null @@ -1,22 +0,0 @@ -DiscoPlot package -================= - -Submodules ----------- - -DiscoPlot.DiscoPlot module --------------------------- - -.. automodule:: DiscoPlot.DiscoPlot - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: DiscoPlot - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 68b6717..0000000 --- a/docs/Makefile +++ /dev/null @@ -1,177 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = _build - -# User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. 
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . - -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext - -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " 
doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/DiscoPlot.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/DiscoPlot.qhc" - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/DiscoPlot" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/DiscoPlot" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
- -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." 
- -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 344e1ed..0000000 --- a/docs/conf.py +++ /dev/null @@ -1,271 +0,0 @@ -# -*- coding: utf-8 -*- -# -# DiscoPlot documentation build configuration file, created by -# sphinx-quickstart on Wed Feb 25 11:12:29 2015. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. 
-extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'DiscoPlot' -copyright = u'2015, Mitchell Sullivan' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '1.0.2' -# The full version, including alpha/beta/rc tags. -release = '1.0.2' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ['_build'] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. 
-pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'default' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -#html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. 
-#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_domain_indices = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Output file base name for HTML help builder. -htmlhelp_basename = 'DiscoPlotdoc' - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). 
-latex_documents = [ - ('index', 'DiscoPlot.tex', u'DiscoPlot Documentation', - u'Mitchell Sullivan', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'discoplot', u'DiscoPlot Documentation', - [u'Mitchell Sullivan'], 1) -] - -# If true, show URL addresses after external links. -#man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ('index', 'DiscoPlot', u'DiscoPlot Documentation', - u'Mitchell Sullivan', 'DiscoPlot', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False - - -# Example configuration for intersphinx: refer to the Python standard library. 
-intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index d403190..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,22 +0,0 @@ -.. DiscoPlot documentation master file, created by - sphinx-quickstart on Wed Feb 25 11:12:29 2015. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to DiscoPlot's documentation! -===================================== - -Contents: - -.. toctree:: - :maxdepth: 2 - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - diff --git a/docs/modules.rst b/docs/modules.rst deleted file mode 100644 index 48a6e6c..0000000 --- a/docs/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -DiscoPlot -========= - -.. toctree:: - :maxdepth: 4 - - DiscoPlot diff --git a/pictures/Figure_1.gif b/pictures/Figure_1.gif deleted file mode 100644 index b2bdd6c..0000000 Binary files a/pictures/Figure_1.gif and /dev/null differ diff --git a/pictures/Figure_2.gif b/pictures/Figure_2.gif deleted file mode 100644 index 4976e41..0000000 Binary files a/pictures/Figure_2.gif and /dev/null differ diff --git a/pictures/Figure_3.gif b/pictures/Figure_3.gif deleted file mode 100644 index b0d5d7e..0000000 Binary files a/pictures/Figure_3.gif and /dev/null differ diff --git a/pictures/Figure_3.png b/pictures/Figure_3.png new file mode 100644 index 0000000..c0f6b87 Binary files /dev/null and b/pictures/Figure_3.png differ diff --git a/pictures/Figure_3_lowres.gif b/pictures/Figure_3_lowres.gif deleted file mode 100644 index 81f5ef1..0000000 Binary files a/pictures/Figure_3_lowres.gif and /dev/null differ diff --git a/pictures/Figure_4.gif b/pictures/Figure_4.gif deleted file mode 100644 index 1357a46..0000000 Binary files a/pictures/Figure_4.gif and /dev/null differ diff --git a/pictures/Figure_4.png b/pictures/Figure_4.png new file mode 100644 index 0000000..3e7b9d6 Binary files /dev/null 
and b/pictures/Figure_4.png differ diff --git a/pictures/Figure_4_lowres.gif b/pictures/Figure_4_lowres.gif deleted file mode 100644 index 8dcbe17..0000000 Binary files a/pictures/Figure_4_lowres.gif and /dev/null differ diff --git a/pictures/Figure_5.png b/pictures/Figure_5.png index 0fa8d64..9c1d712 100644 Binary files a/pictures/Figure_5.png and b/pictures/Figure_5.png differ diff --git a/pictures/Figure_5_lowres.png b/pictures/Figure_5_lowres.png deleted file mode 100644 index f49c41f..0000000 Binary files a/pictures/Figure_5_lowres.png and /dev/null differ diff --git a/pictures/Figure_6.png b/pictures/Figure_6.png new file mode 100644 index 0000000..55c30fd Binary files /dev/null and b/pictures/Figure_6.png differ diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index dbcf58a..0000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,4 +0,0 @@ -Sphinx==1.2.3 -bumpversion==0.5.1 -wheel==0.24.0 -twine==1.4.0 diff --git a/requirements.txt b/requirements.txt index 99f79dc..bcc1303 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -numpy==1.8.1 -matplotlib==1.3.1 -pysam==0.8.1 +numpy>=1.9.2 +matplotlib>=1.4.3 +pysam>=0.8.1 diff --git a/setup.py b/setup.py index 4ee97a6..3f33bed 100644 --- a/setup.py +++ b/setup.py @@ -1,95 +1,19 @@ -#!/usr/bin/env python +# cx_Freeze setup file -import os import sys -import glob - -# Try and import pip. We'll stop if it is not present -try: - import pip -except ImportError: - print "Installation of DiscoPlot requires pip. Please install it! 
See -" - print "http://pip.readthedocs.org/en/latest/installing.html" - sys.exit(1) - -from setuptools import setup - -__title__ = 'DiscoPlot' -__version__ = '1.0.2' -__description__ = ("DiscoPlot: identify genomic rearrangements, misassemblies " - "and sequencing artefacts in NGS data") -__author__ = 'Mitchell Sullivan' -__license__ = 'GPLv3' -__author_email__ = "mjsull@gmail.com" -__url__ = 'https://github.com/BeatsonLab-MicrobialGenomics/DiscoPlot' - - -# Helper functions -if sys.argv[-1] == 'publish': - print "Please use twine or do_release.sh" - sys.exit() - -if sys.argv[-1] == 'clean': - os.system('rm -rf DiscoPlot.egg-info build dist') - sys.exit() - -if sys.argv[-1] == 'docs': - os.system('cd docs && make html') - sys.exit() - - -packages = [__title__, ] - -requires = [] -with open('requirements.txt') as fin: - lines = fin.readlines() - for line in lines: - requires.append(line.strip()) - -# Build lists to package the docs -html, sources, static = [], [], [] -html_f = glob.glob('docs/_build/html/*') -accessory = glob.glob('docs/_build/html/*/*') -for f in html_f: - if os.path.isfile(f): - html.append(f) -for f in accessory: - if f.find("_static") != -1: - if os.path.isfile(f): - static.append(f) - elif f.find("_sources"): - if os.path.isfile(f): - sources.append(f) - -setup( - name=__title__, - version=__version__, - description=__description__, - long_description=open('README.rst').read(), - author=__author__, - author_email=__author_email__, - url=__url__, - packages=packages, - test_suite="tests", - package_dir={__title__.lower(): __title__}, - scripts=[__title__.lower()+'/'+__title__], - package_data={}, - data_files=[('', ['LICENSE', 'requirements.txt', 'README.rst']), - ('docs', html), ('docs/_static', static), - ('docs/_sources', sources)], - include_package_data=True, - install_requires=requires, - license=__license__, - zip_safe=False, - classifiers=('Development Status :: 4 - Beta', - 'Environment :: X11 Applications', - 'Intended Audience :: 
Science/Research', - 'License :: OSI Approved', - 'Natural Language :: English', - 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 2 :: Only', - 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'Topic :: Scientific/Engineering :: Visualization',), -) +from cx_Freeze import setup, Executable + +# Dependencies are automatically detected, but it might need fine tuning. +build_exe_options = {"packages": ["pysam", "numpy", "matplotlib"]} + +# GUI applications require a different base on Windows (the default is for a +# console application). +base = None +if sys.platform == "win32": + base = "Win32GUI" + +setup( name = "DiscoPlot", + version = "1.0.4", + description = "Discordant read visualisation.", + options = {"build_exe": build_exe_options}, + executables = [Executable("DiscoPlot.py", base=base)]) \ No newline at end of file